Fix the legend

Create a small tool to measure the size of internal databases
Merge #3842
2025-11-22 20:56:04 +00:00 · 2023-06-24 14:53:32 +02:00 · 2023-06-23 22:57:57 +02:00 · 2023-06-22 18:01:10 +00:00 · 2023-06-22 21:59:00 +08:00 · 2023-06-20 13:35:33 +00:00
54 changed files with 1736 additions and 602 deletions
--- a/.github/scripts/check-release.sh
+++ b/.github/scripts/check-release.sh
@@ -1,24 +1,41 @@
-#!/bin/bash
+#!/usr/bin/env bash
+set -eu -o pipefail

-# check_tag $current_tag $file_tag $file_name
-function check_tag {
-  if [[ "$1" != "$2" ]]; then
-      echo "Error: the current tag does not match the version in Cargo.toml: found $2 - expected $1"
-      ret=1
-  fi
+# check_tag EXPECTED ACTUAL FILENAME: print an error to stderr and return 1 when ACTUAL differs from EXPECTED.
+check_tag() {
+    local expected=$1 actual=$2 filename=$3
+
+    # The right-hand side is quoted so it is compared literally instead of being used as a glob pattern.
+    if [[ $actual != "$expected" ]]; then
+        echo >&2 "Error: the current tag does not match the version in $filename: found $actual, expected $expected"
+        return 1
+    fi
 }

+read_version() {
+    grep '^version = ' | cut -d \" -f 2
+}
+
+if [[ -z "${GITHUB_REF:-}" ]]; then
+    echo >&2 "Error: GITHUB_REF is not set"
+    exit 1
+fi
+
+if [[ ! "$GITHUB_REF" =~ ^refs/tags/v[0-9]+\.[0-9]+\.[0-9]+(-[a-z0-9]+)?$ ]]; then
+    echo >&2 "Error: GITHUB_REF is not a valid tag: $GITHUB_REF"
+    exit 1
+fi
+
+current_tag=${GITHUB_REF#refs/tags/v}
 ret=0
-current_tag=${GITHUB_REF#'refs/tags/v'}

-file_tag="$(grep '^version = ' Cargo.toml | cut -d '=' -f 2 | tr -d '"' | tr -d ' ')"
-check_tag $current_tag $file_tag
+toml_tag="$(cat Cargo.toml | read_version)"
+check_tag "$current_tag" "$toml_tag" Cargo.toml || ret=1

-lock_file='Cargo.lock'
-lock_tag=$(grep -A 1 'name = "meilisearch-auth"' $lock_file | grep version | cut -d '=' -f 2 | tr -d '"' | tr -d ' ')
-check_tag $current_tag $lock_tag $lock_file
+lock_tag=$(grep -A 1 '^name = "meilisearch-auth"' Cargo.lock | read_version)
+check_tag "$current_tag" "$lock_tag" Cargo.lock || ret=1

-if [[ "$ret" -eq 0 ]] ; then
-  echo 'OK'
+if (( ret == 0 )); then
+    echo 'OK'
 fi
 exit $ret
--- a/.github/workflows/fuzzer-indexing.yml
+++ b/.github/workflows/fuzzer-indexing.yml
@@ -0,0 +1,24 @@
+name: Run the indexing fuzzer
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  fuzz:
+    name: Setup the action
+    runs-on: ubuntu-latest
+    timeout-minutes: 4320 # 72h
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions-rs/toolchain@v1
+        with:
+          profile: minimal
+          toolchain: stable
+          override: true
+
+      # Run the indexing fuzzer
+      - name: Run the fuzzer
+        run: |
+          cargo run --release --bin fuzz-indexing
--- a/.github/workflows/sdks-tests.yml
+++ b/.github/workflows/sdks-tests.yml
@@ -16,13 +16,28 @@ env:
  MEILI_NO_ANALYTICS: 'true'

 jobs:
+  define-docker-image:
+    runs-on: ubuntu-latest
+    outputs:
+      docker-image: ${{ steps.define-image.outputs.docker-image }}
+    steps:
+      - uses: actions/checkout@v3
+      - name: Define the Docker image we need to use
+        id: define-image
+        run: |
+          event=${{ github.event_name }}
+          echo "docker-image=nightly" >> $GITHUB_OUTPUT
+          if [[ $event == 'workflow_dispatch' ]]; then
+            echo "docker-image=${{ github.event.inputs.docker_image }}" >> $GITHUB_OUTPUT
+          fi

  meilisearch-js-tests:
+    needs: define-docker-image
    name: JS SDK tests
    runs-on: ubuntu-latest
    services:
      meilisearch:
-        image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
+        image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
        env:
          MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
          MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -52,11 +67,12 @@ jobs:
        run: yarn test:env:browser

  instant-meilisearch-tests:
+    needs: define-docker-image
    name: instant-meilisearch tests
    runs-on: ubuntu-latest
    services:
      meilisearch:
-        image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
+        image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
        env:
          MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
          MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -78,11 +94,12 @@ jobs:
        run: yarn build

  meilisearch-php-tests:
+    needs: define-docker-image
    name: PHP SDK tests
    runs-on: ubuntu-latest
    services:
      meilisearch:
-        image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
+        image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
        env:
          MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
          MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -108,11 +125,12 @@ jobs:
          composer remove --dev guzzlehttp/guzzle http-interop/http-factory-guzzle

  meilisearch-python-tests:
+    needs: define-docker-image
    name: Python SDK tests
    runs-on: ubuntu-latest
    services:
      meilisearch:
-        image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
+        image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
        env:
          MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
          MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -132,11 +150,12 @@ jobs:
        run: pipenv run pytest

  meilisearch-go-tests:
+    needs: define-docker-image
    name: Go SDK tests
    runs-on: ubuntu-latest
    services:
      meilisearch:
-        image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
+        image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
        env:
          MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
          MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -161,11 +180,12 @@ jobs:
        run: go test -v ./...

  meilisearch-ruby-tests:
+    needs: define-docker-image
    name: Ruby SDK tests
    runs-on: ubuntu-latest
    services:
      meilisearch:
-        image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
+        image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
        env:
          MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
          MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
@@ -185,11 +205,12 @@ jobs:
        run: bundle exec rspec

  meilisearch-rust-tests:
+    needs: define-docker-image
    name: Rust SDK tests
    runs-on: ubuntu-latest
    services:
      meilisearch:
-        image: getmeili/meilisearch:${{ github.event.inputs.docker_image }}
+        image: getmeili/meilisearch:${{ needs.define-docker-image.outputs.docker-image }}
        env:
          MEILI_MASTER_KEY: ${{ env.MEILI_MASTER_KEY }}
          MEILI_NO_ANALYTICS: ${{ env.MEILI_NO_ANALYTICS }}
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,10 +10,12 @@ members = [
    "file-store",
    "permissive-json-pointer",
    "milli",
+    "index-stats",
    "filter-parser",
    "flatten-serde-json",
    "json-depth-checker",
-    "benchmarks"
+    "benchmarks",
+    "fuzzers",
 ]

 [workspace.package]
--- a/fuzzers/Cargo.toml
+++ b/fuzzers/Cargo.toml
@@ -0,0 +1,20 @@
+[package]
+name = "fuzzers"
+publish = false
+
+version.workspace = true
+authors.workspace = true
+description.workspace = true
+homepage.workspace = true
+readme.workspace = true
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+arbitrary = { version = "1.3.0", features = ["derive"] }
+clap = { version = "4.3.0", features = ["derive"] }
+fastrand = "1.9.0"
+milli = { path = "../milli" }
+serde = { version = "1.0.160", features = ["derive"] }
+serde_json = { version = "1.0.95", features = ["preserve_order"] }
+tempfile = "3.5.0"
--- a/fuzzers/README.md
+++ b/fuzzers/README.md
@@ -0,0 +1,3 @@
+# Fuzzers
+
+The purpose of this crate is to contain all the handmade "fuzzers" we may need.
--- a/fuzzers/src/bin/fuzz-indexing.rs
+++ b/fuzzers/src/bin/fuzz-indexing.rs
@@ -0,0 +1,152 @@
+use std::num::NonZeroUsize;
+use std::path::PathBuf;
+use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
+use std::time::Duration;
+
+use arbitrary::{Arbitrary, Unstructured};
+use clap::Parser;
+use fuzzers::Operation;
+use milli::heed::EnvOpenOptions;
+use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig};
+use milli::Index;
+use tempfile::TempDir;
+
+#[derive(Debug, Arbitrary)]
+struct Batch([Operation; 5]);
+
+#[derive(Debug, Clone, Parser)]
+struct Opt {
+    /// The number of fuzzer to run in parallel.
+    #[clap(long)]
+    par: Option<NonZeroUsize>,
+    // We need to put a lot of newlines in the following documentation or else everything gets collapsed on one line
+    /// The path in which the databases will be created.
+    /// Using a ramdisk is recommended.
+    ///
+    /// Linux:
+    ///
+    /// sudo mount -t tmpfs -o size=2g tmpfs ramdisk # to create it
+    ///
+    /// sudo umount ramdisk # to remove it
+    ///
+    /// MacOS:
+    ///
+    /// diskutil erasevolume HFS+ 'RAM Disk' `hdiutil attach -nobrowse -nomount ram://4194304 # create it
+    ///
+    /// hdiutil detach /dev/:the_disk
+    #[clap(long)]
+    path: Option<PathBuf>,
+}
+
+fn main() {
+    let opt = Opt::parse();
+    let progression: &'static AtomicUsize = Box::leak(Box::new(AtomicUsize::new(0)));
+    let stop: &'static AtomicBool = Box::leak(Box::new(AtomicBool::new(false)));
+
+    let par = opt.par.unwrap_or_else(|| std::thread::available_parallelism().unwrap()).get();
+    let mut handles = Vec::with_capacity(par);
+
+    for _ in 0..par {
+        let opt = opt.clone();
+
+        let handle = std::thread::spawn(move || {
+            let mut options = EnvOpenOptions::new();
+            options.map_size(1024 * 1024 * 1024 * 1024);
+            let tempdir = match opt.path {
+                Some(path) => TempDir::new_in(path).unwrap(),
+                None => TempDir::new().unwrap(),
+            };
+            let index = Index::new(options, tempdir.path()).unwrap();
+            let indexer_config = IndexerConfig::default();
+            let index_documents_config = IndexDocumentsConfig::default();
+
+            std::thread::scope(|s| {
+                loop {
+                    if stop.load(Ordering::Relaxed) {
+                        return;
+                    }
+                    let v: Vec<u8> =
+                        std::iter::repeat_with(|| fastrand::u8(..)).take(1000).collect();
+
+                    let mut data = Unstructured::new(&v);
+                    let batches = <[Batch; 5]>::arbitrary(&mut data).unwrap();
+                    // will be used to display the error once a thread crashes
+                    let dbg_input = format!("{:#?}", batches);
+
+                    let handle = s.spawn(|| {
+                        let mut wtxn = index.write_txn().unwrap();
+
+                        for batch in batches {
+                            let mut builder = IndexDocuments::new(
+                                &mut wtxn,
+                                &index,
+                                &indexer_config,
+                                index_documents_config.clone(),
+                                |_| (),
+                                || false,
+                            )
+                            .unwrap();
+
+                            for op in batch.0 {
+                                match op {
+                                    Operation::AddDoc(doc) => {
+                                        let documents =
+                                            milli::documents::objects_from_json_value(doc.to_d());
+                                        let documents =
+                                            milli::documents::documents_batch_reader_from_objects(
+                                                documents,
+                                            );
+                                        let (b, _added) = builder.add_documents(documents).unwrap();
+                                        builder = b;
+                                    }
+                                    Operation::DeleteDoc(id) => {
+                                        let (b, _removed) =
+                                            builder.remove_documents(vec![id.to_s()]).unwrap();
+                                        builder = b;
+                                    }
+                                }
+                            }
+                            builder.execute().unwrap();
+
+                            // after executing a batch we check if the database is corrupted
+                            let res = index.search(&wtxn).execute().unwrap();
+                            index.documents(&wtxn, res.documents_ids).unwrap();
+                            progression.fetch_add(1, Ordering::Relaxed);
+                        }
+                        wtxn.abort().unwrap();
+                    });
+                    if let err @ Err(_) = handle.join() {
+                        stop.store(true, Ordering::Relaxed);
+                        err.expect(&dbg_input);
+                    }
+                }
+            });
+        });
+        handles.push(handle);
+    }
+
+    std::thread::spawn(|| {
+        let mut last_value = 0;
+        let start = std::time::Instant::now();
+        loop {
+            let total = progression.load(Ordering::Relaxed);
+            let elapsed = start.elapsed().as_secs();
+            if elapsed > 3600 {
+                // after 1 hour, stop the fuzzer, success
+                std::process::exit(0);
+            }
+            println!(
+                "Has been running for {:?} seconds. Tested {} new values for a total of {}.",
+                elapsed,
+                total - last_value,
+                total
+            );
+            last_value = total;
+            std::thread::sleep(Duration::from_secs(1));
+        }
+    });
+
+    for handle in handles {
+        handle.join().unwrap();
+    }
+}
--- a/fuzzers/src/lib.rs
+++ b/fuzzers/src/lib.rs
@@ -0,0 +1,46 @@
+use arbitrary::Arbitrary;
+use serde_json::{json, Value};
+
+#[derive(Debug, Arbitrary)]
+pub enum Document {
+    One,
+    Two,
+    Three,
+    Four,
+    Five,
+    Six,
+}
+
+impl Document {
+    pub fn to_d(&self) -> Value {
+        match self {
+            Document::One => json!({ "id": 0, "doggo": "bernese" }),
+            Document::Two => json!({ "id": 0, "doggo": "golden" }),
+            Document::Three => json!({ "id": 0, "catto": "jorts" }),
+            Document::Four => json!({ "id": 1, "doggo": "bernese" }),
+            Document::Five => json!({ "id": 1, "doggo": "golden" }),
+            Document::Six => json!({ "id": 1, "catto": "jorts" }),
+        }
+    }
+}
+
+#[derive(Debug, Arbitrary)]
+pub enum DocId {
+    Zero,
+    One,
+}
+
+impl DocId {
+    pub fn to_s(&self) -> String {
+        match self {
+            DocId::Zero => "0".to_string(),
+            DocId::One => "1".to_string(),
+        }
+    }
+}
+
+#[derive(Debug, Arbitrary)]
+pub enum Operation {
+    AddDoc(Document),
+    DeleteDoc(DocId),
+}
--- a/index-scheduler/src/autobatcher.rs
+++ b/index-scheduler/src/autobatcher.rs
@@ -160,7 +160,7 @@ impl BatchKind {
 impl BatchKind {
    /// Returns a `ControlFlow::Break` if you must stop right now.
    /// The boolean tell you if an index has been created by the batched task.
-    /// To ease the writting of the code. `true` can be returned when you don't need to create an index
+    /// To ease the writing of the code. `true` can be returned when you don't need to create an index
    /// but false can't be returned if you needs to create an index.
    // TODO use an AutoBatchKind as input
    pub fn new(
@@ -214,7 +214,7 @@ impl BatchKind {

    /// Returns a `ControlFlow::Break` if you must stop right now.
    /// The boolean tell you if an index has been created by the batched task.
-    /// To ease the writting of the code. `true` can be returned when you don't need to create an index
+    /// To ease the writing of the code. `true` can be returned when you don't need to create an index
    /// but false can't be returned if you needs to create an index.
    #[rustfmt::skip]
    fn accumulate(self, id: TaskId, kind: AutobatchKind, index_already_exists: bool, primary_key: Option<&str>) -> ControlFlow<BatchKind, BatchKind> {
@@ -321,9 +321,18 @@ impl BatchKind {
                })
            }
            (
-                this @ BatchKind::DocumentOperation { .. },
+                BatchKind::DocumentOperation { method, allow_index_creation, primary_key, mut operation_ids },
                K::DocumentDeletion,
-            ) => Break(this),
+            ) => {
+                operation_ids.push(id);
+
+                Continue(BatchKind::DocumentOperation {
+                    method,
+                    allow_index_creation,
+                    primary_key,
+                    operation_ids,
+                })
+            }
            // but we can't autobatch documents if it's not the same kind
            // this match branch MUST be AFTER the previous one
            (
@@ -346,7 +355,35 @@ impl BatchKind {
                deletion_ids.push(id);
                Continue(BatchKind::DocumentClear { ids: deletion_ids })
            }
-            // we can't autobatch a deletion and an import
+            // we can autobatch the deletion and import if the index already exists
+            (
+                BatchKind::DocumentDeletion { mut deletion_ids },
+                K::DocumentImport { method, allow_index_creation, primary_key }
+            ) if index_already_exists => {
+                deletion_ids.push(id);
+
+                Continue(BatchKind::DocumentOperation {
+                    method,
+                    allow_index_creation,
+                    primary_key,
+                    operation_ids: deletion_ids,
+                })
+            }
+            // we can autobatch the deletion and import if both can't create an index
+            (
+                BatchKind::DocumentDeletion { mut deletion_ids },
+                K::DocumentImport { method, allow_index_creation, primary_key }
+            ) if !allow_index_creation => {
+                deletion_ids.push(id);
+
+                Continue(BatchKind::DocumentOperation {
+                    method,
+                    allow_index_creation,
+                    primary_key,
+                    operation_ids: deletion_ids,
+                })
+            }
+            // we can't autobatch a deletion and an import if the index does not exist but would be created by an addition
            (
                this @ BatchKind::DocumentDeletion { .. },
                K::DocumentImport { .. }
@@ -648,36 +685,36 @@ mod tests {
        debug_snapshot!(autobatch_from(false,None,  [settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0] }, false))");
        debug_snapshot!(autobatch_from(false,None,  [settings(false), settings(false), settings(false)]), @"Some((Settings { allow_index_creation: false, settings_ids: [0, 1, 2] }, false))");

-        // We can't autobatch document addition with document deletion
-        debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
-        debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
-        debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
-        debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
-        debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
-        debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
-        debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
-        debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0] }, true))");
-        debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
-        debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0] }, true))"###);
-        debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
-        debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0] }, false))"###);
-        // we also can't do the only way around
-        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
-        debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @"Some((DocumentDeletion { deletion_ids: [0] }, false))");
+        // We can autobatch document addition with document deletion
+        debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
+        debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
+        debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
+        debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
+        debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
+        debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
+        debug_snapshot!(autobatch_from(true, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
+        debug_snapshot!(autobatch_from(true, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
+        debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
+        debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, true))");
+        debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
+        debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, None), doc_del()]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
+        debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
+        debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, true, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, true))"###);
+        debug_snapshot!(autobatch_from(false, None, [doc_imp(ReplaceDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
+        debug_snapshot!(autobatch_from(false, None, [doc_imp(UpdateDocuments, false, Some("catto")), doc_del()]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
+        // And the other way around
+        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))");
+        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: None, operation_ids: [0, 1] }, false))");
+        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
+        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
+        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
+        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, true, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: true, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
+        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
+        debug_snapshot!(autobatch_from(true, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
+        debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, None)]), @"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
+        debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, None)]), @"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: None, operation_ids: [0, 1] }, false))");
+        debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(ReplaceDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: ReplaceDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
+        debug_snapshot!(autobatch_from(false, None, [doc_del(), doc_imp(UpdateDocuments, false, Some("catto"))]), @r###"Some((DocumentOperation { method: UpdateDocuments, allow_index_creation: false, primary_key: Some("catto"), operation_ids: [0, 1] }, false))"###);
    }

    #[test]
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -998,7 +998,7 @@ impl IndexScheduler {
                }()
                .unwrap_or_default();

-                // The write transaction is directly owned and commited inside.
+                // The write transaction is directly owned and committed inside.
                match self.index_mapper.delete_index(wtxn, &index_uid) {
                    Ok(()) => (),
                    Err(Error::IndexNotFound(_)) if index_has_been_created => (),
--- a/index-scheduler/src/lib.rs
+++ b/index-scheduler/src/lib.rs
@@ -1785,7 +1785,7 @@ mod tests {
            assert_eq!(task.kind.as_kind(), k);
        }

-        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "everything_is_succesfully_registered");
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "everything_is_successfully_registered");
    }

    #[test]
@@ -2075,6 +2075,105 @@ mod tests {
        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "both_task_succeeded");
    }

+    #[test]
+    fn document_addition_and_document_deletion() {
+        let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
+        // Three documents; ids 1 and 2 are the ones targeted by the deletion registered below.
+        let content = r#"[
+            { "id": 1, "doggo": "jean bob" },
+            { "id": 2, "catto": "jorts" },
+            { "id": 3, "doggo": "bork" }
+        ]"#;
+        // Store the payload in an update file, then register the addition as the first task.
+        let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
+        let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap();
+        file.persist().unwrap();
+        index_scheduler
+            .register(KindWithContent::DocumentAdditionOrUpdate {
+                index_uid: S("doggos"),
+                primary_key: Some(S("id")),
+                method: ReplaceDocuments,
+                content_file: uuid,
+                documents_count,
+                allow_index_creation: true,
+            })
+            .unwrap();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task");
+        index_scheduler
+            .register(KindWithContent::DocumentDeletion {
+                index_uid: S("doggos"),
+                documents_ids: vec![S("1"), S("2")],
+            })
+            .unwrap();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task");
+
+        handle.advance_one_successful_batch(); // The addition AND deletion should've been batched together
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_processing_the_batch");
+        // Read back what is left in the index and snapshot it as pretty-printed JSON.
+        let index = index_scheduler.index("doggos").unwrap();
+        let rtxn = index.read_txn().unwrap();
+        let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
+        let field_ids = field_ids_map.ids().collect::<Vec<_>>();
+        let documents = index
+            .all_documents(&rtxn)
+            .unwrap()
+            .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
+            .collect::<Vec<_>>();
+        snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
+    }
+
+    #[test]
+    fn document_deletion_and_document_addition() {
+        let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
+        index_scheduler
+            .register(KindWithContent::DocumentDeletion {
+                index_uid: S("doggos"),
+                documents_ids: vec![S("1"), S("2")],
+            })
+            .unwrap();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_first_task");
+        // Then enqueue an addition on the same index, this one being allowed to create it.
+        let content = r#"[
+            { "id": 1, "doggo": "jean bob" },
+            { "id": 2, "catto": "jorts" },
+            { "id": 3, "doggo": "bork" }
+        ]"#;
+        // Store the payload in an update file before registering the addition.
+        let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0).unwrap();
+        let documents_count = read_json(content.as_bytes(), file.as_file_mut()).unwrap();
+        file.persist().unwrap();
+        index_scheduler
+            .register(KindWithContent::DocumentAdditionOrUpdate {
+                index_uid: S("doggos"),
+                primary_key: Some(S("id")),
+                method: ReplaceDocuments,
+                content_file: uuid,
+                documents_count,
+                allow_index_creation: true,
+            })
+            .unwrap();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "registered_the_second_task");
+
+        // The deletion should have failed because it can't create an index
+        handle.advance_one_failed_batch();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_failing_the_deletion");
+
+        // The addition should work
+        handle.advance_one_successful_batch();
+        snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_last_successful_addition");
+        // Read back the documents of the index and snapshot them as pretty-printed JSON.
+        let index = index_scheduler.index("doggos").unwrap();
+        let rtxn = index.read_txn().unwrap();
+        let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
+        let field_ids = field_ids_map.ids().collect::<Vec<_>>();
+        let documents = index
+            .all_documents(&rtxn)
+            .unwrap()
+            .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
+            .collect::<Vec<_>>();
+        snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents");
+    }
+
    #[test]
    fn do_not_batch_task_of_different_indexes() {
        let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
--- a/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/after_processing_the_batch.snap
+++ b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/after_processing_the_batch.snap
@@ -0,0 +1,43 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
+1 {uid: 1, status: succeeded, details: { received_document_ids: 2, deleted_documents: Some(2) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
+----------------------------------------------------------------------
+### Status:
+enqueued []
+succeeded [0,1,]
+----------------------------------------------------------------------
+### Kind:
+"documentAdditionOrUpdate" [0,]
+"documentDeletion" [1,]
+----------------------------------------------------------------------
+### Index Tasks:
+doggos [0,1,]
+----------------------------------------------------------------------
+### Index Mapper:
+doggos: { number_of_documents: 1, field_distribution: {"doggo": 1, "id": 1} }
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+[timestamp] [0,1,]
+----------------------------------------------------------------------
+### Finished At:
+[timestamp] [0,1,]
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
--- a/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/documents.snap
+++ b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/documents.snap
@@ -0,0 +1,9 @@
+---
+source: index-scheduler/src/lib.rs
+---
+[
+  {
+    "id": 3,
+    "doggo": "bork"
+  }
+]
--- a/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_first_task.snap
+++ b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_first_task.snap
@@ -0,0 +1,37 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,]
+----------------------------------------------------------------------
+### Kind:
+"documentAdditionOrUpdate" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+doggos [0,]
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+00000000-0000-0000-0000-000000000000
+
+----------------------------------------------------------------------
+
--- a/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_second_task.snap
+++ b/index-scheduler/src/snapshots/lib.rs/document_addition_and_document_deletion/registered_the_second_task.snap
@@ -0,0 +1,40 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
+1 {uid: 1, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,1,]
+----------------------------------------------------------------------
+### Kind:
+"documentAdditionOrUpdate" [0,]
+"documentDeletion" [1,]
+----------------------------------------------------------------------
+### Index Tasks:
+doggos [0,1,]
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+00000000-0000-0000-0000-000000000000
+
+----------------------------------------------------------------------
+
--- a/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap
+++ b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_failing_the_deletion.snap
@@ -0,0 +1,43 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
+1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
+----------------------------------------------------------------------
+### Status:
+enqueued [1,]
+failed [0,]
+----------------------------------------------------------------------
+### Kind:
+"documentAdditionOrUpdate" [1,]
+"documentDeletion" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+doggos [0,1,]
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+[timestamp] [0,]
+----------------------------------------------------------------------
+### Finished At:
+[timestamp] [0,]
+----------------------------------------------------------------------
+### File Store:
+00000000-0000-0000-0000-000000000000
+
+----------------------------------------------------------------------
+
--- a/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_last_successful_addition.snap
+++ b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/after_last_successful_addition.snap
@@ -0,0 +1,46 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: failed, error: ResponseError { code: 200, message: "Index `doggos` not found.", error_code: "index_not_found", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_not_found" }, details: { received_document_ids: 2, deleted_documents: Some(0) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
+1 {uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
+----------------------------------------------------------------------
+### Status:
+enqueued []
+succeeded [1,]
+failed [0,]
+----------------------------------------------------------------------
+### Kind:
+"documentAdditionOrUpdate" [1,]
+"documentDeletion" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+doggos [0,1,]
+----------------------------------------------------------------------
+### Index Mapper:
+doggos: { number_of_documents: 3, field_distribution: {"catto": 1, "doggo": 2, "id": 3} }
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Finished At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
--- a/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/documents.snap
+++ b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/documents.snap
@@ -0,0 +1,17 @@
+---
+source: index-scheduler/src/lib.rs
+---
+[
+  {
+    "id": 1,
+    "doggo": "jean bob"
+  },
+  {
+    "id": 2,
+    "catto": "jorts"
+  },
+  {
+    "id": 3,
+    "doggo": "bork"
+  }
+]
--- a/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_first_task.snap
+++ b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_first_task.snap
@@ -0,0 +1,36 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,]
+----------------------------------------------------------------------
+### Kind:
+"documentDeletion" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+doggos [0,]
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+
+----------------------------------------------------------------------
+
--- a/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_second_task.snap
+++ b/index-scheduler/src/snapshots/lib.rs/document_deletion_and_document_addition/registered_the_second_task.snap
@@ -0,0 +1,40 @@
+---
+source: index-scheduler/src/lib.rs
+---
+### Autobatching Enabled = true
+### Processing Tasks:
+[]
+----------------------------------------------------------------------
+### All Tasks:
+0 {uid: 0, status: enqueued, details: { received_document_ids: 2, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1", "2"] }}
+1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }}
+----------------------------------------------------------------------
+### Status:
+enqueued [0,1,]
+----------------------------------------------------------------------
+### Kind:
+"documentAdditionOrUpdate" [1,]
+"documentDeletion" [0,]
+----------------------------------------------------------------------
+### Index Tasks:
+doggos [0,1,]
+----------------------------------------------------------------------
+### Index Mapper:
+
+----------------------------------------------------------------------
+### Canceled By:
+
+----------------------------------------------------------------------
+### Enqueued At:
+[timestamp] [0,]
+[timestamp] [1,]
+----------------------------------------------------------------------
+### Started At:
+----------------------------------------------------------------------
+### Finished At:
+----------------------------------------------------------------------
+### File Store:
+00000000-0000-0000-0000-000000000000
+
+----------------------------------------------------------------------
+
--- a/index-scheduler/src/snapshots/lib.rs/register/everything_is_successfully_registered.snap
+++ b/index-scheduler/src/snapshots/lib.rs/register/everything_is_successfully_registered.snap
--- a/index-stats/Cargo.toml
+++ b/index-stats/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "index-stats"
+description = "A small program that computes internal stats of a Meilisearch index"
+version = "0.1.0"
+edition = "2021"
+publish = false
+
+[dependencies]
+anyhow = "1.0.71"
+clap = { version = "4.3.5", features = ["derive"] }
+milli = { path = "../milli" }
+piechart = "1.0.0"
--- a/index-stats/src/main.rs
+++ b/index-stats/src/main.rs
@@ -0,0 +1,224 @@
+use std::cmp::Reverse;
+use std::path::PathBuf;
+
+use clap::Parser;
+use milli::heed::{types::ByteSlice, EnvOpenOptions, PolyDatabase, RoTxn};
+use milli::index::db_name::*;
+use milli::index::Index;
+use piechart::{Chart, Color, Data};
+
+/// A small program that computes internal stats of a Meilisearch index
+#[derive(Parser, Debug)]
+#[command(author, version, about, long_about = None)]
+struct Args {
+    /// The path to the LMDB Meilisearch index database.
+    path: PathBuf,
+
+    /// The radius of the graphs
+    #[clap(long, default_value_t = 10)]
+    graph_radius: u16,
+
+    /// The aspect ratio of the graphs
+    #[clap(long, default_value_t = 6)]
+    graph_aspect_ratio: u16,
+}
+
+fn main() -> anyhow::Result<()> {
+    let Args { path, graph_radius, graph_aspect_ratio } = Args::parse();
+    let env = EnvOpenOptions::new().max_dbs(24).open(path)?;
+
+    // TODO not sure to keep that: this pattern names every `Index` field so that it stops
+    //      compiling when one is added; if removed put the pub(crate) back in the Index struct
+    matches!(
+        Option::<Index>::None,
+        Some(Index {
+            env: _,
+            main: _,
+            word_docids: _,
+            exact_word_docids: _,
+            word_prefix_docids: _,
+            exact_word_prefix_docids: _,
+            word_pair_proximity_docids: _,
+            word_prefix_pair_proximity_docids: _,
+            prefix_word_pair_proximity_docids: _,
+            word_position_docids: _,
+            word_fid_docids: _,
+            field_id_word_count_docids: _,
+            word_prefix_position_docids: _,
+            word_prefix_fid_docids: _,
+            script_language_docids: _,
+            facet_id_exists_docids: _,
+            facet_id_is_null_docids: _,
+            facet_id_is_empty_docids: _,
+            facet_id_f64_docids: _,
+            facet_id_string_docids: _,
+            field_id_docid_facet_f64s: _,
+            field_id_docid_facet_strings: _,
+            documents: _,
+        })
+    );
+    // Open (or create) every named database inside a single write transaction.
+    let mut wtxn = env.write_txn()?;
+    let main = env.create_poly_database(&mut wtxn, Some(MAIN))?;
+    let word_docids = env.create_poly_database(&mut wtxn, Some(WORD_DOCIDS))?;
+    let exact_word_docids = env.create_poly_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?;
+    let word_prefix_docids = env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
+    let exact_word_prefix_docids =
+        env.create_poly_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?;
+    let word_pair_proximity_docids =
+        env.create_poly_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
+    let script_language_docids =
+        env.create_poly_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?;
+    let word_prefix_pair_proximity_docids =
+        env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
+    let prefix_word_pair_proximity_docids =
+        env.create_poly_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
+    let word_position_docids = env.create_poly_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?;
+    let word_fid_docids = env.create_poly_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?;
+    let field_id_word_count_docids =
+        env.create_poly_database(&mut wtxn, Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
+    let word_prefix_position_docids =
+        env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_POSITION_DOCIDS))?;
+    let word_prefix_fid_docids =
+        env.create_poly_database(&mut wtxn, Some(WORD_PREFIX_FIELD_ID_DOCIDS))?;
+    let facet_id_f64_docids = env.create_poly_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
+    let facet_id_string_docids =
+        env.create_poly_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
+    let facet_id_exists_docids =
+        env.create_poly_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
+    let facet_id_is_null_docids =
+        env.create_poly_database(&mut wtxn, Some(FACET_ID_IS_NULL_DOCIDS))?;
+    let facet_id_is_empty_docids =
+        env.create_poly_database(&mut wtxn, Some(FACET_ID_IS_EMPTY_DOCIDS))?;
+    let field_id_docid_facet_f64s =
+        env.create_poly_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_F64S))?;
+    let field_id_docid_facet_strings =
+        env.create_poly_database(&mut wtxn, Some(FIELD_ID_DOCID_FACET_STRINGS))?;
+    let documents = env.create_poly_database(&mut wtxn, Some(DOCUMENTS))?;
+    wtxn.commit()?;
+    // Pair every database handle with its name, which later serves as the chart label.
+    let list = [
+        (main, MAIN),
+        (word_docids, WORD_DOCIDS),
+        (exact_word_docids, EXACT_WORD_DOCIDS),
+        (word_prefix_docids, WORD_PREFIX_DOCIDS),
+        (exact_word_prefix_docids, EXACT_WORD_PREFIX_DOCIDS),
+        (word_pair_proximity_docids, WORD_PAIR_PROXIMITY_DOCIDS),
+        (script_language_docids, SCRIPT_LANGUAGE_DOCIDS),
+        (word_prefix_pair_proximity_docids, WORD_PREFIX_PAIR_PROXIMITY_DOCIDS),
+        (prefix_word_pair_proximity_docids, PREFIX_WORD_PAIR_PROXIMITY_DOCIDS),
+        (word_position_docids, WORD_POSITION_DOCIDS),
+        (word_fid_docids, WORD_FIELD_ID_DOCIDS),
+        (field_id_word_count_docids, FIELD_ID_WORD_COUNT_DOCIDS),
+        (word_prefix_position_docids, WORD_PREFIX_POSITION_DOCIDS),
+        (word_prefix_fid_docids, WORD_PREFIX_FIELD_ID_DOCIDS),
+        (facet_id_f64_docids, FACET_ID_F64_DOCIDS),
+        (facet_id_string_docids, FACET_ID_STRING_DOCIDS),
+        (facet_id_exists_docids, FACET_ID_EXISTS_DOCIDS),
+        (facet_id_is_null_docids, FACET_ID_IS_NULL_DOCIDS),
+        (facet_id_is_empty_docids, FACET_ID_IS_EMPTY_DOCIDS),
+        (field_id_docid_facet_f64s, FIELD_ID_DOCID_FACET_F64S),
+        (field_id_docid_facet_strings, FIELD_ID_DOCID_FACET_STRINGS),
+        (documents, DOCUMENTS),
+    ];
+    // Compute the stats of each database under a single read transaction.
+    let rtxn = env.read_txn()?;
+    let result: Result<Vec<_>, _> =
+        list.into_iter().map(|(db, name)| compute_stats(&rtxn, db).map(|s| (s, name))).collect();
+    let mut stats = result?;
+    // Draw one chart and legend per metric, databases sorted from largest to smallest.
+    println!("{:1$} Number of Entries", "", graph_radius as usize * 2);
+    stats.sort_by_key(|(s, _)| Reverse(s.number_of_entries));
+    let data = compute_graph_data(stats.iter().map(|(s, n)| (s.number_of_entries as f32, *n)));
+    Chart::new().radius(graph_radius).aspect_ratio(graph_aspect_ratio).draw(&data);
+    display_legend(&data);
+    print!("\r\n");
+
+    println!("{:1$} Size of Entries", "", graph_radius as usize * 2);
+    stats.sort_by_key(|(s, _)| Reverse(s.size_of_entries));
+    let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_entries as f32, *n)));
+    Chart::new().radius(graph_radius).aspect_ratio(graph_aspect_ratio).draw(&data);
+    display_legend(&data);
+    print!("\r\n");
+
+    println!("{:1$} Size of Data", "", graph_radius as usize * 2);
+    stats.sort_by_key(|(s, _)| Reverse(s.size_of_data));
+    let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_data as f32, *n)));
+    Chart::new().radius(graph_radius).aspect_ratio(graph_aspect_ratio).draw(&data);
+    display_legend(&data);
+    print!("\r\n");
+
+    println!("{:1$} Size of Keys", "", graph_radius as usize * 2);
+    stats.sort_by_key(|(s, _)| Reverse(s.size_of_keys));
+    let data = compute_graph_data(stats.iter().map(|(s, n)| (s.size_of_keys as f32, *n)));
+    Chart::new().radius(graph_radius).aspect_ratio(graph_aspect_ratio).draw(&data);
+    display_legend(&data);
+
+    Ok(())
+}
+
+/// Prints one legend line per slice: colored fill glyph, label and share of the total.
+fn display_legend(data: &[Data]) {
+    let total: f32 = data.iter().map(|d| d.value).sum();
+    for Data { label, value, color, fill } in data {
+        // When every value is zero (e.g. an empty index) the division would print `NaN%`,
+        // so report 0% instead.
+        let percentage = if total > 0.0 { value / total * 100.0 } else { 0.0 };
+        let glyph = color.unwrap().paint(fill.to_string());
+        println!("{} {} {:.02}%", glyph, label, percentage);
+    }
+}
+
+/// Builds one chart slice per `(value, name)` pair, cycling through the colors and fill glyphs.
+fn compute_graph_data<'a>(stats: impl IntoIterator<Item = (f32, &'a str)>) -> Vec<Data> {
+    let palette = [
+        Color::Red,
+        Color::Green,
+        Color::Yellow,
+        Color::Blue,
+        Color::Purple,
+        Color::Cyan,
+        Color::White,
+    ];
+    let glyphs = ['▴', '▵', '▾', '▿', '▪', '▫', '•', '◦'];
+
+    let decorations = palette.into_iter().cycle().zip(glyphs.into_iter().cycle());
+    stats
+        .into_iter()
+        .zip(decorations)
+        .map(|((value, name), (color, fill))| Data {
+            label: name.into(),
+            value,
+            color: Some(color.into()),
+            fill,
+        })
+        .collect()
+}
+
+#[derive(Debug)]
+pub struct Stats {
+    pub number_of_entries: u64,
+    pub size_of_keys: u64,
+    pub size_of_data: u64,
+    pub size_of_entries: u64,
+}
+
+/// Computes the statistics of one database.
+///
+/// Scans every entry of `db` within `rtxn`, counting the entries and summing the byte
+/// lengths of their keys and values; `size_of_entries` is the sum of both sizes.
+fn compute_stats(rtxn: &RoTxn, db: PolyDatabase) -> anyhow::Result<Stats> {
+    let mut stats =
+        Stats { number_of_entries: 0, size_of_keys: 0, size_of_data: 0, size_of_entries: 0 };
+
+    // Keys and values are read as raw bytes, only their lengths matter here.
+    for entry in db.iter::<_, ByteSlice, ByteSlice>(rtxn)? {
+        let (key_bytes, value_bytes) = entry?;
+        stats.number_of_entries += 1;
+        stats.size_of_keys += key_bytes.len() as u64;
+        stats.size_of_data += value_bytes.len() as u64;
+    }
+
+    stats.size_of_entries = stats.size_of_keys + stats.size_of_data;
+    Ok(stats)
+}
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -75,9 +75,6 @@ maplit = "1.0.2"
 md5 = "0.7.0"
 rand = { version = "0.8.5", features = ["small_rng"] }

-[target.'cfg(fuzzing)'.dev-dependencies]
-fuzzcheck = "0.12.1"
-
 [features]
 all-tokenizations = ["charabia/default"]

--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -111,7 +111,6 @@ pub enum Error {
    Io(#[from] io::Error),
 }

-#[cfg(test)]
 pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
    let documents = match json {
        object @ serde_json::Value::Object(_) => vec![object],
@@ -141,7 +140,6 @@ macro_rules! documents {
    }};
 }

-#[cfg(test)]
 pub fn documents_batch_reader_from_objects(
    objects: impl IntoIterator<Item = Object>,
 ) -> DocumentsBatchReader<std::io::Cursor<Vec<u8>>> {
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -106,22 +106,30 @@ impl<'a> ExternalDocumentsIds<'a> {
        map
    }

+    /// Return an fst of the external ids, merging the hard and soft maps without the deleted ids.
+    pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
+        if self.soft.is_empty() { // nothing to merge: borrow the hard map as is
+            return Ok(Cow::Borrowed(&self.hard));
+        }
+        let union_op = self.hard.op().add(&self.soft).r#union();
+        // Rebuild a single map out of the union, skipping the entries marked as deleted.
+        let mut iter = union_op.into_stream();
+        let mut new_hard_builder = fst::MapBuilder::memory();
+        while let Some((external_id, marked_docids)) = iter.next() {
+            let value = indexed_last_value(marked_docids).unwrap();
+            if value != DELETED_ID {
+                new_hard_builder.insert(external_id, value)?;
+            }
+        }
+
+        drop(iter);
+
+        Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
+    }
+
    fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
        if self.soft.len() >= self.hard.len() / 2 {
-            let union_op = self.hard.op().add(&self.soft).r#union();
-
-            let mut iter = union_op.into_stream();
-            let mut new_hard_builder = fst::MapBuilder::memory();
-            while let Some((external_id, marked_docids)) = iter.next() {
-                let value = indexed_last_value(marked_docids).unwrap();
-                if value != DELETED_ID {
-                    new_hard_builder.insert(external_id, value)?;
-                }
-            }
-
-            drop(iter);
-
-            self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
+            self.hard = self.to_fst()?.into_owned();
            self.soft = fst::Map::default().map_data(Cow::Owned)?;
        }

--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -93,10 +93,10 @@ pub mod db_name {
 #[derive(Clone)]
 pub struct Index {
    /// The LMDB environment which this index is associated with.
-    pub(crate) env: heed::Env,
+    pub env: heed::Env,

    /// Contains many different types (e.g. the fields ids map).
-    pub(crate) main: PolyDatabase,
+    pub main: PolyDatabase,

    /// A word and all the documents ids containing the word.
    pub word_docids: Database<Str, RoaringBitmapCodec>,
@@ -150,7 +150,7 @@ pub struct Index {
    pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,

    /// Maps the document id to the document as an obkv store.
-    pub(crate) documents: Database<OwnedType<BEU32>, ObkvCodec>,
+    pub documents: Database<OwnedType<BEU32>, ObkvCodec>,
 }

 impl Index {
@@ -1466,9 +1466,9 @@ pub(crate) mod tests {

        db_snap!(index, field_distribution,
            @r###"
-        age              1     
-        id               2     
-        name             2     
+        age              1      |
+        id               2      |
+        name             2      |
        "###
        );

@@ -1486,9 +1486,9 @@ pub(crate) mod tests {

        db_snap!(index, field_distribution,
            @r###"
-        age              1     
-        id               2     
-        name             2     
+        age              1      |
+        id               2      |
+        name             2      |
        "###
        );

@@ -1502,9 +1502,9 @@ pub(crate) mod tests {

        db_snap!(index, field_distribution,
            @r###"
-        has_dog          1     
-        id               2     
-        name             2     
+        has_dog          1      |
+        id               2      |
+        name             2      |
        "###
        );
    }
--- a/milli/src/search/new/distinct.rs
+++ b/milli/src/search/new/distinct.rs
@@ -26,7 +26,6 @@ pub fn apply_distinct_rule(
    ctx: &mut SearchContext,
    field_id: u16,
    candidates: &RoaringBitmap,
-    // TODO: add a universe here, such that the `excluded` are a subset of the universe?
 ) -> Result<DistinctOutput> {
    let mut excluded = RoaringBitmap::new();
    let mut remaining = RoaringBitmap::new();
--- a/milli/src/search/new/exact_attribute.rs
+++ b/milli/src/search/new/exact_attribute.rs
@@ -206,7 +206,7 @@ impl State {
            )?;
            intersection &= &candidates;
            if !intersection.is_empty() {
-                // TODO: although not really worth it in terms of performance,
+                // Although not really worth it in terms of performance,
                // if would be good to put this in cache for the sake of consistency
                let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
                    ctx.index
--- a/milli/src/search/new/interner.rs
+++ b/milli/src/search/new/interner.rs
@@ -32,7 +32,7 @@ impl<T> Interned<T> {
 #[derive(Clone)]
 pub struct DedupInterner<T> {
    stable_store: Vec<T>,
-    lookup: FxHashMap<T, Interned<T>>, // TODO: Arc
+    lookup: FxHashMap<T, Interned<T>>,
 }
 impl<T> Default for DedupInterner<T> {
    fn default() -> Self {
--- a/milli/src/search/new/limits.rs
+++ b/milli/src/search/new/limits.rs
@@ -1,5 +1,4 @@
 /// Maximum number of tokens we consider in a single search.
-// TODO: Loic, find proper value here so we don't overflow the interner.
 pub const MAX_TOKEN_COUNT: usize = 1_000;

 /// Maximum number of prefixes that can be derived from a single word.
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -92,7 +92,7 @@ impl QueryGraph {
    /// which contains ngrams.
    pub fn from_query(
        ctx: &mut SearchContext,
-        // NOTE: the terms here must be consecutive
+        // The terms here must be consecutive
        terms: &[LocatedQueryTerm],
    ) -> Result<(QueryGraph, Vec<LocatedQueryTerm>)> {
        let mut new_located_query_terms = terms.to_vec();
@@ -103,7 +103,7 @@ impl QueryGraph {
        let root_node = 0;
        let end_node = 1;

-        // TODO: we could consider generalizing to 4,5,6,7,etc. ngrams
+        // We could consider generalizing to 4,5,6,7,etc. ngrams
        let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
            (vec![], vec![], vec![root_node]);

--- a/milli/src/search/new/query_term/mod.rs
+++ b/milli/src/search/new/query_term/mod.rs
@@ -132,7 +132,6 @@ impl QueryTermSubset {
        if full_query_term.ngram_words.is_some() {
            return None;
        }
-        // TODO: included in subset
        if let Some(phrase) = full_query_term.zero_typo.phrase {
            self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
        } else if let Some(word) = full_query_term.zero_typo.exact {
@@ -182,7 +181,6 @@ impl QueryTermSubset {
        let word = match &self.zero_typo_subset {
            NTypoTermSubset::All => Some(use_prefix_db),
            NTypoTermSubset::Subset { words, phrases: _ } => {
-                // TODO: use a subset of prefix words instead
                if words.contains(&use_prefix_db) {
                    Some(use_prefix_db)
                } else {
@@ -204,7 +202,6 @@ impl QueryTermSubset {
        ctx: &mut SearchContext,
    ) -> Result<BTreeSet<Word>> {
        let mut result = BTreeSet::default();
-        // TODO: a compute_partially funtion
        if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
            self.original.compute_fully_if_needed(ctx)?;
        }
@@ -300,7 +297,6 @@ impl QueryTermSubset {
        let mut result = BTreeSet::default();

        if !self.one_typo_subset.is_empty() {
-            // TODO: compute less than fully if possible
            self.original.compute_fully_if_needed(ctx)?;
        }
        let original = ctx.term_interner.get_mut(self.original);
--- a/milli/src/search/new/query_term/parse_query.rs
+++ b/milli/src/search/new/query_term/parse_query.rs
@@ -139,7 +139,6 @@ pub fn number_of_typos_allowed<'ctx>(
    let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
    let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;

-    // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
    let exact_words = ctx.index.exact_words(ctx.txn)?;

    Ok(Box::new(move |word: &str| {
@@ -250,8 +249,6 @@ impl PhraseBuilder {
        } else {
            // token has kind Word
            let word = ctx.word_interner.insert(token.lemma().to_string());
-            // TODO: in a phrase, check that every word exists
-            // otherwise return an empty term
            self.words.push(Some(word));
        }
    }
--- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -1,5 +1,48 @@
-#![allow(clippy::too_many_arguments)]
+/*! Implements a "PathVisitor" which finds all paths of a certain cost
+from the START to END node of a ranking rule graph.

+A path is a list of conditions. A condition is the data associated with
+an edge, given by the ranking rule. Some edges don't have a condition associated
+with them, they are "unconditional". These kinds of edges are used to "skip" a node.
+
+The algorithm uses a depth-first search. It benefits from two main optimisations:
+- The list of all possible costs to go from any node to the END node is precomputed
+- The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges
+untraversable depending on what other edges were selected.
+
+These two optimisations are meant to avoid traversing edges that wouldn't lead
+to a valid path. In practically all cases, we avoid the exponential complexity
+that is inherent to depth-first search in a large ranking rule graph.
+
+The DeadEndsCache is a sort of prefix tree which associates a list of forbidden
+conditions to a list of traversed conditions.
+For example, the DeadEndsCache could say the following:
+- Immediately, from the start, the conditions `[a,b]` are forbidden
+    - if we take the condition `c`, then the conditions `[e]` are also forbidden
+        - and if after that, we take `f`, then `[h,i]` are also forbidden
+            - etc.
+    - if we take `g`, then `[f]` is also forbidden
+        - etc.
+    - etc.
+As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden
+conditions in memory. Then, we know to avoid all edges which have a condition that is forbidden.
+
+When a path is found from START to END, we give it to the `visit` closure.
+This closure takes a mutable reference to the `DeadEndsCache`. This means that
+the caller can update this cache. Therefore, we must handle the case where the
+DeadEndsCache has been updated. This means potentially backtracking up to the point
+where the traversed conditions are all allowed by the new DeadEndsCache.
+
+The algorithm also implements the `TermsMatchingStrategy` logic.
+Some edges are augmented with a list of "nodes_to_skip". Skipping
+a node means "reaching this node through an unconditional edge". If we have
+already traversed (ie. not skipped) a node that is in this list, then we know that we
+can't traverse this edge. Otherwise, we traverse the edge but make sure to skip any
+future node that was present in the "nodes_to_skip" list.
+
+The caller can decide to stop the path finding algorithm
+by returning a `ControlFlow::Break` from the `visit` closure.
+*/
 use std::collections::{BTreeSet, VecDeque};
 use std::iter::FromIterator;
 use std::ops::ControlFlow;
@@ -12,30 +55,41 @@ use crate::search::new::query_graph::QueryNode;
 use crate::search::new::small_bitmap::SmallBitmap;
 use crate::Result;

+/// Closure which processes a path found by the `PathVisitor`
 type VisitFn<'f, G> = &'f mut dyn FnMut(
+    // the path as a list of conditions
    &[Interned<<G as RankingRuleGraphTrait>::Condition>],
    &mut RankingRuleGraph<G>,
+    // a mutable reference to the DeadEndsCache, to update it in case the given
+    // path doesn't resolve to any valid document ids
    &mut DeadEndsCache<<G as RankingRuleGraphTrait>::Condition>,
 ) -> Result<ControlFlow<()>>;

+/// A structure which is kept but not updated during the traversal of the graph.
+/// It can however be updated by the `visit` closure once a valid path has been found.
 struct VisitorContext<'a, G: RankingRuleGraphTrait> {
    graph: &'a mut RankingRuleGraph<G>,
    all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
    dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
 }

+/// The internal state of the traversal algorithm
 struct VisitorState<G: RankingRuleGraphTrait> {
+    /// Budget from the current node to the end node
    remaining_cost: u64,
-
+    /// Previously visited conditions, in order.
    path: Vec<Interned<G::Condition>>,
-
+    /// Previously visited conditions, as an efficient and compact set.
    visited_conditions: SmallBitmap<G::Condition>,
+    /// Previously visited (ie not skipped) nodes, as an efficient and compact set.
    visited_nodes: SmallBitmap<QueryNode>,
-
+    /// The conditions that cannot be visited anymore
    forbidden_conditions: SmallBitmap<G::Condition>,
-    forbidden_conditions_to_nodes: SmallBitmap<QueryNode>,
+    /// The nodes that cannot be visited anymore (they must be skipped)
+    nodes_to_skip: SmallBitmap<QueryNode>,
 }

+/// See module documentation
 pub struct PathVisitor<'a, G: RankingRuleGraphTrait> {
    state: VisitorState<G>,
    ctx: VisitorContext<'a, G>,
@@ -56,14 +110,13 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
                forbidden_conditions: SmallBitmap::for_interned_values_in(
                    &graph.conditions_interner,
                ),
-                forbidden_conditions_to_nodes: SmallBitmap::for_interned_values_in(
-                    &graph.query_graph.nodes,
-                ),
+                nodes_to_skip: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
            },
            ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache },
        }
    }

+    /// See module documentation
    pub fn visit_paths(mut self, visit: VisitFn<G>) -> Result<()> {
        let _ =
            self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?;
@@ -72,22 +125,31 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
 }

 impl<G: RankingRuleGraphTrait> VisitorState<G> {
+    /// Visits a node: traverse all its valid conditional and unconditional edges.
+    ///
+    /// Returns ControlFlow::Break if the path finding algorithm should stop.
+    /// Returns whether a valid path was found from this node otherwise.
    fn visit_node(
        &mut self,
        from_node: Interned<QueryNode>,
        visit: VisitFn<G>,
        ctx: &mut VisitorContext<G>,
    ) -> Result<ControlFlow<(), bool>> {
+        // Whether any valid path was found from this node.
+        // If a valid path was found, then we know that the DeadEndsCache may have been updated,
+        // and we will need to do more work to potentially backtrack.
        let mut any_valid = false;

        let edges = ctx.graph.edges_of_node.get(from_node).clone();
        for edge_idx in edges.iter() {
+            // could be none if the edge was deleted
            let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue };

            if self.remaining_cost < edge.cost as u64 {
                continue;
            }
            self.remaining_cost -= edge.cost as u64;
+
            let cf = match edge.condition {
                Some(condition) => self.visit_condition(
                    condition,
@@ -119,6 +181,10 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
        Ok(ControlFlow::Continue(any_valid))
    }

+    /// Visits an unconditional edge.
+    ///
+    /// Returns ControlFlow::Break if the path finding algorithm should stop.
+    /// Returns whether a valid path was found from this node otherwise.
    fn visit_no_condition(
        &mut self,
        dest_node: Interned<QueryNode>,
@@ -134,20 +200,29 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
        {
            return Ok(ControlFlow::Continue(false));
        }
+        // We've reached the END node!
        if dest_node == ctx.graph.query_graph.end_node {
            let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?;
+            // We could change the return type of the visit closure such that the caller
+            // tells us whether the dead ends cache was updated or not.
+            // Alternatively, maybe the DeadEndsCache should have a generation number
+            // to it, so that we don't need to play with these booleans at all.
            match control_flow {
                ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)),
                ControlFlow::Break(_) => Ok(ControlFlow::Break(())),
            }
        } else {
-            let old_fbct = self.forbidden_conditions_to_nodes.clone();
-            self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip);
+            let old_fbct = self.nodes_to_skip.clone();
+            self.nodes_to_skip.union(edge_new_nodes_to_skip);
            let cf = self.visit_node(dest_node, visit, ctx)?;
-            self.forbidden_conditions_to_nodes = old_fbct;
+            self.nodes_to_skip = old_fbct;
            Ok(cf)
        }
    }
+    /// Visits a conditional edge.
+    ///
+    /// Returns ControlFlow::Break if the path finding algorithm should stop.
+    /// Returns whether a valid path was found from this node otherwise.
    fn visit_condition(
        &mut self,
        condition: Interned<G::Condition>,
@@ -159,7 +234,7 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
        assert!(dest_node != ctx.graph.query_graph.end_node);

        if self.forbidden_conditions.contains(condition)
-            || self.forbidden_conditions_to_nodes.contains(dest_node)
+            || self.nodes_to_skip.contains(dest_node)
            || edge_new_nodes_to_skip.intersects(&self.visited_nodes)
        {
            return Ok(ControlFlow::Continue(false));
@@ -180,19 +255,19 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
        self.visited_nodes.insert(dest_node);
        self.visited_conditions.insert(condition);

-        let old_fc = self.forbidden_conditions.clone();
+        let old_forb_cond = self.forbidden_conditions.clone();
        if let Some(next_forbidden) =
            ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied())
        {
            self.forbidden_conditions.union(&next_forbidden);
        }
-        let old_fctn = self.forbidden_conditions_to_nodes.clone();
-        self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip);
+        let old_nodes_to_skip = self.nodes_to_skip.clone();
+        self.nodes_to_skip.union(edge_new_nodes_to_skip);

        let cf = self.visit_node(dest_node, visit, ctx)?;

-        self.forbidden_conditions_to_nodes = old_fctn;
-        self.forbidden_conditions = old_fc;
+        self.nodes_to_skip = old_nodes_to_skip;
+        self.forbidden_conditions = old_forb_cond;

        self.visited_conditions.remove(condition);
        self.visited_nodes.remove(dest_node);
--- a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs
+++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs
@@ -9,12 +9,8 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::SearchContext;
 use crate::Result;

-// TODO: give a generation to each universe, then be able to get the exact
-// delta of docids between two universes of different generations!
-
 /// A cache storing the document ids associated with each ranking rule edge
 pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
-    // TOOD: should be a mapped interner?
    pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
    _phantom: PhantomData<G>,
 }
@@ -54,7 +50,7 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
        }
        let condition = graph.conditions_interner.get_mut(interned_condition);
        let computed = G::resolve_condition(ctx, condition, universe)?;
-        // TODO: if computed.universe_len != universe.len() ?
+        // Can we put an assert here for computed.universe_len == universe.len() ?
        let _ = self.cache.insert(interned_condition, computed);
        let computed = &self.cache[&interned_condition];
        Ok(computed)
--- a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs
+++ b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs
@@ -2,6 +2,7 @@ use crate::search::new::interner::{FixedSizeInterner, Interned};
 use crate::search::new::small_bitmap::SmallBitmap;

 pub struct DeadEndsCache<T> {
+    // conditions and next could/should be part of the same vector
    conditions: Vec<Interned<T>>,
    next: Vec<Self>,
    pub forbidden: SmallBitmap<T>,
@@ -27,7 +28,7 @@ impl<T> DeadEndsCache<T> {
        self.forbidden.insert(condition);
    }

-    pub fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
+    fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
        if let Some(idx) = self.conditions.iter().position(|c| *c == condition) {
            Some(&mut self.next[idx])
        } else {
--- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs
@@ -69,14 +69,9 @@ impl RankingRuleGraphTrait for FidGraph {

        let mut edges = vec![];
        for fid in all_fields {
-            // TODO: We can improve performances and relevancy by storing
-            //       the term subsets associated to each field ids fetched.
            edges.push((
-                fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
-                conditions_interner.insert(FidCondition {
-                    term: term.clone(), // TODO remove this ugly clone
-                    fid,
-                }),
+                fid as u32 * term.term_ids.len() as u32,
+                conditions_interner.insert(FidCondition { term: term.clone(), fid }),
            ));
        }

--- a/milli/src/search/new/ranking_rule_graph/position/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs
@@ -94,14 +94,9 @@ impl RankingRuleGraphTrait for PositionGraph {
        let mut edges = vec![];

        for (cost, positions) in positions_for_costs {
-            // TODO: We can improve performances and relevancy by storing
-            //       the term subsets associated to each position fetched
            edges.push((
                cost,
-                conditions_interner.insert(PositionCondition {
-                    term: term.clone(), // TODO remove this ugly clone
-                    positions,
-                }),
+                conditions_interner.insert(PositionCondition { term: term.clone(), positions }),
            ));
        }

--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -65,13 +65,6 @@ pub fn compute_docids(
        }
    }

-    // TODO: add safeguard in case the cartesian product is too large!
-    // even if we restrict the word derivations to a maximum of 100, the size of the
-    // caterisan product could reach a maximum of 10_000 derivations, which is way too much.
-    // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
-    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
-    // reached
-
    for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
        // Before computing the edges, check that the left word and left phrase
        // aren't disjoint with the universe, but only do it if there is more than
@@ -111,8 +104,6 @@ pub fn compute_docids(
    Ok(ComputedCondition {
        docids,
        universe_len: universe.len(),
-        // TODO: think about whether we want to reduce the subset,
-        // we probably should!
        start_term_subset: Some(left_term.clone()),
        end_term_subset: right_term.clone(),
    })
@@ -203,12 +194,7 @@ fn compute_non_prefix_edges(
            *docids |= new_docids;
        }
    }
-    if backward_proximity >= 1
-            // TODO: for now, we don't do any swapping when either term is a phrase
-            // but maybe we should. We'd need to look at the first/last word of the phrase
-            // depending on the context.
-            && left_phrase.is_none() && right_phrase.is_none()
-    {
+    if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() {
        if let Some(new_docids) =
            ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
        {
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -33,8 +33,6 @@ pub fn compute_query_term_subset_docids(
    ctx: &mut SearchContext,
    term: &QueryTermSubset,
 ) -> Result<RoaringBitmap> {
-    // TODO Use the roaring::MultiOps trait
-
    let mut docids = RoaringBitmap::new();
    for word in term.all_single_words_except_prefix_db(ctx)? {
        if let Some(word_docids) = ctx.word_docids(word)? {
@@ -59,8 +57,6 @@ pub fn compute_query_term_subset_docids_within_field_id(
    term: &QueryTermSubset,
    fid: u16,
 ) -> Result<RoaringBitmap> {
-    // TODO Use the roaring::MultiOps trait
-
    let mut docids = RoaringBitmap::new();
    for word in term.all_single_words_except_prefix_db(ctx)? {
        if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? {
@@ -71,7 +67,6 @@ pub fn compute_query_term_subset_docids_within_field_id(
    for phrase in term.all_phrases(ctx)? {
        // There may be false positives when resolving a phrase, so we're not
        // guaranteed that all of its words are within a single fid.
-        // TODO: fix this?
        if let Some(word) = phrase.words(ctx).iter().flatten().next() {
            if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
                docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids;
@@ -95,7 +90,6 @@ pub fn compute_query_term_subset_docids_within_position(
    term: &QueryTermSubset,
    position: u16,
 ) -> Result<RoaringBitmap> {
-    // TODO Use the roaring::MultiOps trait
    let mut docids = RoaringBitmap::new();
    for word in term.all_single_words_except_prefix_db(ctx)? {
        if let Some(word_position_docids) =
@@ -108,7 +102,6 @@ pub fn compute_query_term_subset_docids_within_position(
    for phrase in term.all_phrases(ctx)? {
        // It's difficult to know the expected position of the words in the phrase,
        // so instead we just check the first one.
-        // TODO: fix this?
        if let Some(word) = phrase.words(ctx).iter().flatten().next() {
            if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
                docids |= ctx.get_phrase_docids(phrase)? & word_position_docids
@@ -132,9 +125,6 @@ pub fn compute_query_graph_docids(
    q: &QueryGraph,
    universe: &RoaringBitmap,
 ) -> Result<RoaringBitmap> {
-    // TODO: there must be a faster way to compute this big
-    // roaring bitmap expression
-
    let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes);
    let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new());

--- a/milli/src/search/new/sort.rs
+++ b/milli/src/search/new/sort.rs
@@ -141,10 +141,6 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx,
        universe: &RoaringBitmap,
    ) -> Result<Option<RankingRuleOutput<Query>>> {
        let iter = self.iter.as_mut().unwrap();
-        // TODO: we should make use of the universe in the function below
-        // good for correctness, but ideally iter.next_bucket would take the current universe into account,
-        // as right now it could return buckets that don't intersect with the universe, meaning we will make many
-        // unneeded calls.
        if let Some(mut bucket) = iter.next_bucket()? {
            bucket.candidates &= universe;
            Ok(Some(bucket))
--- a/milli/src/search/new/tests/distinct.rs
+++ b/milli/src/search/new/tests/distinct.rs
@@ -527,7 +527,7 @@ fn test_distinct_all_candidates() {
    let SearchResult { documents_ids, candidates, .. } = s.execute().unwrap();
    let candidates = candidates.iter().collect::<Vec<_>>();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
-    // TODO: this is incorrect!
+    // This is incorrect, but unfortunately impossible to do better efficiently.
    insta::assert_snapshot!(format!("{candidates:?}"), @"[1, 4, 7, 8, 14, 17, 19, 20, 23, 24, 25, 26]");
 }

--- a/milli/src/search/new/tests/proximity.rs
+++ b/milli/src/search/new/tests/proximity.rs
@@ -122,11 +122,11 @@ fn create_edge_cases_index() -> TempIndex {
            sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz
            "
        },
-        // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`. 
-        // If the search query is "sunflower", the split word "Sun Flower" will match some documents. 
+        // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`.
+        // If the search query is "sunflower", the split word "Sun Flower" will match some documents.
        // If the query is `sunflower wilting`, then we should make sure that
-        // the sprximity condition `flower wilting: sprx N` also comes with the condition
-        // `sun wilting: sprx N+1`. TODO: this is not the exact condition we use for now. 
+        // the proximity condition `flower wilting: sprx N` also comes with the condition
+        // `sun wilting: sprx N+1`, but this is not the exact condition we use for now.
        // We only check that the phrase `sun flower` exists and `flower wilting: sprx N`, which
        // is better than nothing but not the best.
        {
@@ -139,7 +139,7 @@ fn create_edge_cases_index() -> TempIndex {
        },
        {
            "id": 3,
-            // This document matches the query `sunflower wilting`, but the sprximity condition 
+            // This document matches the query `sunflower wilting`, but the proximity condition
            // between `sunflower` and `wilting` cannot be through the split-word `Sun Flower`
            // which would reduce to only `flower` and `wilting` being in sprximity.
            "text": "A flower wilting under the sun, unlike a sunflower"
@@ -299,7 +299,7 @@ fn test_proximity_split_word() {
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: "2" and "4" should be swapped ideally
+    // "2" and "4" should be swapped ideally
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
@@ -316,7 +316,7 @@ fn test_proximity_split_word() {
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: "2" and "4" should be swapped ideally
+    // "2" and "4" should be swapped ideally
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
@@ -341,7 +341,7 @@ fn test_proximity_split_word() {
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: "2" and "4" should be swapped ideally
+    // "2" and "4" should be swapped ideally
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
--- a/milli/src/search/new/tests/proximity_typo.rs
+++ b/milli/src/search/new/tests/proximity_typo.rs
@@ -2,9 +2,8 @@
 This module tests the interactions between the proximity and typo ranking rules.

 The proximity ranking rule should transform the query graph such that it
-only contains the word pairs that it used to compute its bucket.
-
-TODO: This is not currently implemented.
+only contains the word pairs that it used to compute its bucket, but this is not currently
+implemented.
 */

 use crate::index::tests::TempIndex;
@@ -64,7 +63,7 @@ fn test_trap_basic() {
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: this is incorrect, 1 should come before 0
+    // This is incorrect, 1 should come before 0
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"summer. holiday. sommer holidty\"",
--- a/milli/src/search/new/tests/typo.rs
+++ b/milli/src/search/new/tests/typo.rs
@@ -571,8 +571,8 @@ fn test_typo_synonyms() {
    s.terms_matching_strategy(TermsMatchingStrategy::All);
    s.query("the fast brownish fox jumps over the lackadaisical dog");

-    // TODO: is this correct? interaction of ngrams + synonyms means that the
-    // multi-word synonyms end up having a typo cost. This is probably not what we want.
+    // The interaction of ngrams + synonyms means that the multi-word synonyms end up having a typo cost.
+    // This is probably not what we want.
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -318,7 +318,7 @@ pub fn snap_field_distributions(index: &Index) -> String {
    let rtxn = index.read_txn().unwrap();
    let mut snap = String::new();
    for (field, count) in index.field_distribution(&rtxn).unwrap() {
-        writeln!(&mut snap, "{field:<16} {count:<6}").unwrap();
+        writeln!(&mut snap, "{field:<16} {count:<6} |").unwrap();
    }
    snap
 }
@@ -328,7 +328,7 @@ pub fn snap_fields_ids_map(index: &Index) -> String {
    let mut snap = String::new();
    for field_id in fields_ids_map.ids() {
        let name = fields_ids_map.name(field_id).unwrap();
-        writeln!(&mut snap, "{field_id:<3} {name:<16}").unwrap();
+        writeln!(&mut snap, "{field_id:<3} {name:<16} |").unwrap();
    }
    snap
 }
--- a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap
+++ b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap
@@ -1,7 +1,7 @@
 ---
 source: milli/src/index.rs
 ---
-age              1     
-id               2     
-name             2     
+age              1      |
+id               2      |
+name             2      |

--- a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap
+++ b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap
@@ -1,7 +1,7 @@
 ---
 source: milli/src/index.rs
 ---
-age              1     
-id               2     
-name             2     
+age              1      |
+id               2      |
+name             2      |

--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -71,7 +71,6 @@ impl std::fmt::Display for DeletionStrategy {
 pub(crate) struct DetailedDocumentDeletionResult {
    pub deleted_documents: u64,
    pub remaining_documents: u64,
-    pub soft_deletion_used: bool,
 }

 impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
@@ -108,11 +107,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
        Some(docid)
    }
    pub fn execute(self) -> Result<DocumentDeletionResult> {
-        let DetailedDocumentDeletionResult {
-            deleted_documents,
-            remaining_documents,
-            soft_deletion_used: _,
-        } = self.execute_inner()?;
+        let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } =
+            self.execute_inner()?;

        Ok(DocumentDeletionResult { deleted_documents, remaining_documents })
    }
@@ -133,7 +129,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            return Ok(DetailedDocumentDeletionResult {
                deleted_documents: 0,
                remaining_documents: 0,
-                soft_deletion_used: false,
            });
        }

@@ -149,7 +144,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            return Ok(DetailedDocumentDeletionResult {
                deleted_documents: current_documents_ids_len,
                remaining_documents,
-                soft_deletion_used: false,
            });
        }

@@ -218,7 +212,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            return Ok(DetailedDocumentDeletionResult {
                deleted_documents: self.to_delete_docids.len(),
                remaining_documents: documents_ids.len(),
-                soft_deletion_used: true,
            });
        }

@@ -441,7 +434,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
        Ok(DetailedDocumentDeletionResult {
            deleted_documents: self.to_delete_docids.len(),
            remaining_documents: documents_ids.len(),
-            soft_deletion_used: false,
        })
    }

--- a/milli/src/update/index_documents/helpers/clonable_mmap.rs
+++ b/milli/src/update/index_documents/helpers/clonable_mmap.rs
@@ -2,7 +2,7 @@ use std::sync::Arc;

 use memmap2::Mmap;

-/// Wrapper around Mmap allowing to virtualy clone grenad-chunks
+/// Wrapper around Mmap allowing to virtually clone grenad-chunks
 /// in a parallel process like the indexing.
 #[derive(Debug, Clone)]
 pub struct ClonableMmap {
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -236,7 +236,7 @@ where
            primary_key,
            fields_ids_map,
            field_distribution,
-            mut external_documents_ids,
+            new_external_documents_ids,
            new_documents_ids,
            replaced_documents_ids,
            documents_count,
@@ -363,9 +363,6 @@ where
            deletion_builder.delete_documents(&replaced_documents_ids);
            let deleted_documents_result = deletion_builder.execute_inner()?;
            debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
-            if !deleted_documents_result.soft_deletion_used {
-                external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
-            }
        }

        let index_documents_ids = self.index.documents_ids(self.wtxn)?;
@@ -445,6 +442,9 @@ where
        self.index.put_primary_key(self.wtxn, &primary_key)?;

        // We write the external documents ids into the main database.
+        let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
+        external_documents_ids.insert_ids(&new_external_documents_ids)?;
+        let external_documents_ids = external_documents_ids.into_static();
        self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;

        let all_documents_ids = index_documents_ids | new_documents_ids;
@@ -2514,4 +2514,170 @@ mod tests {
        db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
    }
+
+    #[test]
+    fn reproduce_the_bug() {
+        /*
+            [milli/examples/fuzz.rs:69] &batches = [
+            Batch(
+                [
+                    AddDoc(
+                        { "id": 1, "doggo": "bernese" }, => internal 0
+                    ),
+                ],
+            ),
+            Batch(
+                [
+                    DeleteDoc(
+                        1, => delete internal 0
+                    ),
+                    AddDoc(
+                        { "id": 0, "catto": "jorts" }, => internal 1
+                    ),
+                ],
+            ),
+            Batch(
+                [
+                    AddDoc(
+                        { "id": 1, "catto": "jorts" }, => internal 2
+                    ),
+                ],
+            ),
+        ]
+        */
+        let mut index = TempIndex::new();
+        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
+
+        // START OF BATCH
+
+        println!("--- ENTERING BATCH 1");
+
+        let mut wtxn = index.write_txn().unwrap();
+
+        let builder = IndexDocuments::new(
+            &mut wtxn,
+            &index,
+            &index.indexer_config,
+            index.index_documents_config.clone(),
+            |_| (),
+            || false,
+        )
+        .unwrap();
+
+        // OP
+
+        let documents = documents!([
+            { "id": 1, "doggo": "bernese" },
+        ]);
+        let (builder, added) = builder.add_documents(documents).unwrap();
+        insta::assert_display_snapshot!(added.unwrap(), @"1");
+
+        // FINISHING
+        let addition = builder.execute().unwrap();
+        insta::assert_debug_snapshot!(addition, @r###"
+        DocumentAdditionResult {
+            indexed_documents: 1,
+            number_of_documents: 1,
+        }
+        "###);
+        wtxn.commit().unwrap();
+
+        db_snap!(index, documents, @r###"
+        {"id":1,"doggo":"bernese"}
+        "###);
+        db_snap!(index, external_documents_ids, @r###"
+        soft:
+        hard:
+        1                        0
+        "###);
+
+        // A first batch of documents has been inserted
+
+        // BATCH 2
+
+        println!("--- ENTERING BATCH 2");
+
+        let mut wtxn = index.write_txn().unwrap();
+
+        let builder = IndexDocuments::new(
+            &mut wtxn,
+            &index,
+            &index.indexer_config,
+            index.index_documents_config.clone(),
+            |_| (),
+            || false,
+        )
+        .unwrap();
+
+        let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap();
+        insta::assert_display_snapshot!(removed.unwrap(), @"1");
+
+        let documents = documents!([
+            { "id": 0, "catto": "jorts" },
+        ]);
+        let (builder, added) = builder.add_documents(documents).unwrap();
+        insta::assert_display_snapshot!(added.unwrap(), @"1");
+
+        let addition = builder.execute().unwrap();
+        insta::assert_debug_snapshot!(addition, @r###"
+        DocumentAdditionResult {
+            indexed_documents: 1,
+            number_of_documents: 1,
+        }
+        "###);
+        wtxn.commit().unwrap();
+
+        db_snap!(index, documents, @r###"
+        {"id":0,"catto":"jorts"}
+        "###);
+
+        db_snap!(index, external_documents_ids, @r###"
+        soft:
+        hard:
+        0                        1
+        "###);
+
+        db_snap!(index, soft_deleted_documents_ids, @"[]");
+
+        // BATCH 3
+
+        println!("--- ENTERING BATCH 3");
+
+        let mut wtxn = index.write_txn().unwrap();
+
+        let builder = IndexDocuments::new(
+            &mut wtxn,
+            &index,
+            &index.indexer_config,
+            index.index_documents_config.clone(),
+            |_| (),
+            || false,
+        )
+        .unwrap();
+
+        let documents = documents!([
+            { "id": 1, "catto": "jorts" },
+        ]);
+        let (builder, added) = builder.add_documents(documents).unwrap();
+        insta::assert_display_snapshot!(added.unwrap(), @"1");
+
+        let addition = builder.execute().unwrap();
+        insta::assert_debug_snapshot!(addition, @r###"
+        DocumentAdditionResult {
+            indexed_documents: 1,
+            number_of_documents: 2,
+        }
+        "###);
+        wtxn.commit().unwrap();
+
+        db_snap!(index, documents, @r###"
+        {"id":1,"catto":"jorts"}
+        {"id":0,"catto":"jorts"}
+        "###);
+
+        // Ensure all the returned IDs actually exist
+        let rtxn = index.read_txn().unwrap();
+        let res = index.search(&rtxn).execute().unwrap();
+        index.documents(&rtxn, res.documents_ids).unwrap();
+    }
 }
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -21,15 +21,14 @@ use crate::error::{Error, InternalError, UserError};
 use crate::index::{db_name, main_key};
 use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
 use crate::{
-    ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
-    Result, BEU32,
+    FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
 };

 pub struct TransformOutput {
    pub primary_key: String,
    pub fields_ids_map: FieldsIdsMap,
    pub field_distribution: FieldDistribution,
-    pub external_documents_ids: ExternalDocumentsIds<'static>,
+    pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>,
    pub new_documents_ids: RoaringBitmap,
    pub replaced_documents_ids: RoaringBitmap,
    pub documents_count: usize,
@@ -568,8 +567,6 @@ impl<'a, 'i> Transform<'a, 'i> {
            }))?
            .to_string();

-        let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
-
        // We create a final writer to write the new documents in order from the sorter.
        let mut writer = create_writer(
            self.indexer_settings.chunk_compression_type,
@@ -651,13 +648,12 @@ impl<'a, 'i> Transform<'a, 'i> {
            fst_new_external_documents_ids_builder.insert(key, value)
        })?;
        let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map();
-        external_documents_ids.insert_ids(&new_external_documents_ids)?;

        Ok(TransformOutput {
            primary_key,
            fields_ids_map: self.fields_ids_map,
            field_distribution,
-            external_documents_ids: external_documents_ids.into_static(),
+            new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(),
            new_documents_ids: self.new_documents_ids,
            replaced_documents_ids: self.replaced_documents_ids,
            documents_count: self.documents_count,
@@ -691,7 +687,8 @@ impl<'a, 'i> Transform<'a, 'i> {
        let new_external_documents_ids = {
            let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
            external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
-            external_documents_ids
+            // This call should be free and can't fail since the previous method merged both fsts.
+            external_documents_ids.into_static().to_fst()?.into_owned()
        };

        let documents_ids = self.index.documents_ids(wtxn)?;
@@ -776,7 +773,7 @@ impl<'a, 'i> Transform<'a, 'i> {
            primary_key,
            fields_ids_map: new_fields_ids_map,
            field_distribution,
-            external_documents_ids: new_external_documents_ids.into_static(),
+            new_external_documents_ids,
            new_documents_ids: documents_ids,
            replaced_documents_ids: RoaringBitmap::default(),
            documents_count,
Author	SHA1	Message	Date
Kerollmops	a41c0ba755	Fix the legend	2023-06-24 14:53:32 +02:00
Kerollmops	ef9875256b	Create a small tool to measure the size of internal databases	2023-06-23 22:57:57 +02:00
meili-bors[bot]	040b5a5b6f	Merge #3842 3842: fix some typos r=dureuill a=cuishuang # Pull Request ## Related issue Fixes #<issue_number> ## What does this PR do? - fix some typos ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: cui fliter <imcusg@gmail.com>	2023-06-22 18:01:10 +00:00
cui fliter	530a3e2df3	fix some typos Signed-off-by: cui fliter <imcusg@gmail.com>	2023-06-22 21:59:00 +08:00
meili-bors[bot]	28404d56b7	Merge #3799 3799: Fix error messages in `check-release.sh` r=curquiza a=vvv - `check_tag`: Report file name correctly. Use named variables. - Introduce `read_version` helper function. Simplify the implementation. - Show meaningful error message if `GITHUB_REF` is not set or its format is incorrect. Co-authored-by: Valeriy V. Vorotyntsev <valery.vv@gmail.com>	2023-06-20 13:35:33 +00:00
meili-bors[bot]	262c1f2baf	Merge #3844 3844: Fix SDK CI (again) r=curquiza a=curquiza Following this PR: https://github.com/meilisearch/meilisearch/pull/3813 Sorry `@Kerollmops,` here is (I hope) the latest fix 🙏 I made tests last time that were not sufficient. I really did a lot this time. I hope I have not missed anything. Co-authored-by: curquiza <clementine@meilisearch.com>	2023-06-20 13:01:07 +00:00
Valeriy V. Vorotyntsev	cfed349aa3	Fix error messages in `check-release.sh` - `check_tag`: Report file name correctly. Use named variables. - Introduce `read_version` helper function. Simplify the implementation. - Show meaningful error message if `GITHUB_REF` is not set or its format is incorrect.	2023-06-20 13:58:09 +03:00
curquiza	bbc9f68ff5	Use the input from the previous job instead of the workflow dispatch	2023-06-19 18:49:15 +02:00
meili-bors[bot]	45636d315c	Merge #3670 3670: Fix addition deletion bug r=irevoire a=irevoire The first commit of this PR is a revert of https://github.com/meilisearch/meilisearch/pull/3667. It re-enables the auto-batching of addition and deletion tasks. No new changes have been introduced outside of `milli`. So all the changes you see on the autobatcher have actually already been reviewed. It fixes https://github.com/meilisearch/meilisearch/issues/3440. ### What was happening? The issue was that the `external_documents_ids` generated in the `transform` were used in a very strange way that wasn’t compatible with the deletion of documents. Instead of doing a clear merge between the external document IDs of the DB and the ones returned by the transform + writing it on disk, we were doing some weird tricks with the soft-deleted to avoid writing the fst on disk as much as possible. The new algorithm may be a bit slower but is way more straightforward and doesn’t change depending on if the soft deletion was used or not. Here is a list of the changes introduced: 1. We now make a clear distinction between the `new_external_documents_ids` coming from the transform and only held in RAM and the `external_documents_ids` coming from the DB. 2. The `new_external_documents_ids` (coming out of the transform) are now represented as an `fst`. We don't need to struggle with the hard, soft distinction + the soft_deleted => That's easier to understand 3. When indexing documents, we merge the `external_documents_ids` coming from the DB and the `new_external_documents_ids` coming from the transform. ### Other things introduced in this PR Since we constantly have to write small, very specialized fuzzers for this kind of bug, we decided to push the one used to reproduce this bug. It's not perfect, but it's easy to improve in the future. It'll also run for as long as possible on every merge on the main branch. 
Co-authored-by: Tamo <tamo@meilisearch.com> Co-authored-by: Loïc Lecrenier <loic.lecrenier@icloud.com>	2023-06-19 09:09:30 +00:00
meili-bors[bot]	cb9d78fc7f	Merge #3835 3835: Add more documentation to graph-based ranking rule algorithms + comment cleanup r=Kerollmops a=loiclec In addition to documenting the `cheapest_path.rs` file, this PR cleans up a few outdated comments as well as some TODOs. These TODOs have been moved to https://github.com/meilisearch/meilisearch/issues/3776 Co-authored-by: Loïc Lecrenier <loic.lecrenier@icloud.com>	2023-06-15 15:30:24 +00:00
meili-bors[bot]	01d2ee5cc1	Merge #3836 3836: Remove trailing whitespace in snapshots r=dureuill a=dureuill # Pull Request ## Related issue No issue, maintenance ## What does this PR do? - Remove trailing whitespace in snapshots by adding a trailing `\|` at the end of lines that would previously end with fixed-width integers - This allows contributors whose editor is configured to remove trailing whitespace not to modify the tests when changing an unrelated part of the file containing the tests Co-authored-by: Louis Dureuil <louis@meilisearch.com>	2023-06-14 13:00:52 +00:00
Louis Dureuil	e0c4682758	Fix tests	2023-06-14 13:30:52 +02:00
Louis Dureuil	d9b4b39922	Add trailing pipe to the snapshots so it doesn't end with trailing whitespace	2023-06-14 13:30:52 +02:00
Loïc Lecrenier	2da86b31a6	Remove comments and add documentation	2023-06-14 12:39:42 +02:00
Loïc Lecrenier	4e81445d42	Stop the fuzzer after an hour	2023-06-12 15:30:51 +02:00
meili-bors[bot]	4829348d6e	Merge #3813 3813: Fix SDK CI for scheduled jobs r=curquiza a=curquiza The SDK CI does not run for the scheduled job (`cron`) every day, and only works for manual triggers. I added a job to define the Docker image we use depending on the event: `workflow_dispatch` = manual triggering, or `scheduled` = cron jobs Co-authored-by: curquiza <clementine@meilisearch.com>	2023-06-12 08:41:03 +00:00
curquiza	b6b6a80b76	Fix SDK CI for scheduled jobs	2023-06-06 10:38:05 +02:00
Tamo	f03d99690d	run the indexing fuzzer on every merge for as long as possible	2023-05-29 14:56:15 +02:00
Tamo	23a5b45ebf	drop the old fuzz file	2023-05-29 14:02:37 +02:00
Tamo	46fa99f486	make the fuzzer stops if an error occurs	2023-05-29 13:44:32 +02:00
Tamo	67a583bedf	handle the panic happening in milli	2023-05-29 13:39:26 +02:00
Tamo	99e9057684	rename the indexing fuzzer to fuzz-indexing so it doesn't collide with other binary name when being called from the root of the workspace	2023-05-29 13:07:06 +02:00
Tamo	8d40d300a5	rename the fuzzer to indexing	2023-05-29 12:37:24 +02:00
Tamo	6c6387d05e	move the fuzzer to its own crate	2023-05-29 12:27:39 +02:00
Tamo	002f42875f	fix the fuzzer	2023-05-23 11:42:40 +02:00
Tamo	22213dc604	push the fuzzer	2023-05-23 09:14:26 +02:00
Tamo	602ad98cb8	improve the way we handle the fsts	2023-05-22 11:15:14 +02:00
Tamo	7f619ff0e4	get rids of the now unused soft_deletion_used parameter	2023-05-22 10:33:49 +02:00
Tamo	4391cba6ca	fix the addition + deletion bug	2023-05-17 18:28:57 +02:00
Tamo	d7ddf4925e	Revert "Disable autobatching of additions and deletions" This reverts commit `a94e78ffb0`.	2023-05-17 14:25:50 +02:00