Add spans

WIP add more logs
Make the merger multithreaded
2025-07-20 05:20:36 +00:00 · 2024-09-26 17:20:32 +02:00 · 2024-09-26 16:37:38 +02:00 · 2024-09-26 11:09:06 +02:00 · 2024-09-25 22:42:41 +02:00 · 2024-09-25 22:15:15 +02:00
75 changed files with 5356 additions and 648 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -527,7 +527,7 @@ dependencies = [
 "proc-macro2",
 "quote",
 "regex",
- "rustc-hash 1.1.0",
+ "rustc-hash",
 "shlex",
 "syn 2.0.60",
 ]
@ -934,8 +934,7 @@ dependencies = [
 [[package]]
 name = "charabia"
 version = "0.9.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "55ff52497324e7d168505a16949ae836c14595606fab94687238d2f6c8d4c798"
+source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71"
 dependencies = [
 "aho-corasick",
 "csv",
@ -2221,11 +2220,11 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 [[package]]
 name = "grenad"
 version = "0.4.7"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "350d89047298d3b1b40050acd11ab76e487b854a104b760ebc5a7f375093de77"
+source = "git+https://github.com/meilisearch/grenad?branch=various-improvements#58ac87d852413571102f44c5e55ca13509a3f1a0"
 dependencies = [
 "bytemuck",
 "byteorder",
+ "either",
 "rayon",
 "tempfile",
 ]
@ -2308,9 +2307,9 @@ dependencies = [

 [[package]]
 name = "hashbrown"
-version = "0.14.3"
+version = "0.14.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
+checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
 dependencies = [
 "ahash 0.8.11",
 "allocator-api2",
@ -2570,6 +2569,7 @@ dependencies = [
 "meili-snap",
 "meilisearch-auth",
 "meilisearch-types",
+ "memmap2",
 "page_size",
 "rayon",
 "roaring",
@ -2591,7 +2591,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
 dependencies = [
 "equivalent",
- "hashbrown 0.14.3",
+ "hashbrown 0.14.5",
 "serde",
 ]

@ -2650,8 +2650,7 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
 [[package]]
 name = "irg-kvariants"
 version = "0.1.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26"
+source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71"
 dependencies = [
 "csv",
 "once_cell",
@ -3567,6 +3566,7 @@ dependencies = [
 "fxhash",
 "geoutils",
 "grenad",
+ "hashbrown 0.14.5",
 "heed",
 "hf-hub",
 "indexmap",
@ -3836,9 +3836,8 @@ dependencies = [

 [[package]]
 name = "obkv"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a2e27bcfe835a379d32352112f6b8dbae2d99d16a5fff42abe6e5ba5386c1e5a"
+version = "0.3.0"
+source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#ce535874008ecac554f02e0c670e6caf62134d6b"

 [[package]]
 name = "once_cell"
@ -4314,7 +4313,7 @@ dependencies = [
 "pin-project-lite",
 "quinn-proto",
 "quinn-udp",
- "rustc-hash 1.1.0",
+ "rustc-hash",
 "rustls",
 "thiserror",
 "tokio",
@ -4323,14 +4322,14 @@ dependencies = [

 [[package]]
 name = "quinn-proto"
-version = "0.11.8"
+version = "0.11.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6"
+checksum = "ddf517c03a109db8100448a4be38d498df8a210a99fe0e1b9eaf39e78c640efe"
 dependencies = [
 "bytes",
 "rand",
 "ring",
- "rustc-hash 2.0.0",
+ "rustc-hash",
 "rustls",
 "slab",
 "thiserror",
@ -4655,8 +4654,7 @@ dependencies = [
 [[package]]
 name = "roaring"
 version = "0.10.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8f4b84ba6e838ceb47b41de5194a60244fac43d9fe03b71dbe8c5a201081d6d1"
+source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#6bba84b1a47da1d6e52d5c4dc0ce8593ae4646a5"
 dependencies = [
 "bytemuck",
 "byteorder",
@ -4703,12 +4701,6 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"

-[[package]]
-name = "rustc-hash"
-version = "2.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152"
-
 [[package]]
 name = "rustc_version"
 version = "0.4.0"
@ -4847,9 +4839,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"

 [[package]]
 name = "serde"
-version = "1.0.209"
+version = "1.0.210"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
+checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
 dependencies = [
 "serde_derive",
 ]
@ -4865,9 +4857,9 @@ dependencies = [

 [[package]]
 name = "serde_derive"
-version = "1.0.209"
+version = "1.0.210"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
+checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
 dependencies = [
 "proc-macro2",
 "quote",
@ -5360,7 +5352,7 @@ dependencies = [
 "fancy-regex 0.12.0",
 "lazy_static",
 "parking_lot",
- "rustc-hash 1.1.0",
+ "rustc-hash",
 ]

 [[package]]
@ -6048,7 +6040,7 @@ version = "0.16.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0"
 dependencies = [
- "hashbrown 0.14.3",
+ "hashbrown 0.14.5",
 "once_cell",
 ]

--- a/Cargo.toml
+++ b/Cargo.toml
@ -44,23 +44,5 @@ opt-level = 3
 [profile.dev.package.roaring]
 opt-level = 3

-[profile.dev.package.lindera-ipadic-builder]
-opt-level = 3
-[profile.dev.package.encoding]
-opt-level = 3
-[profile.dev.package.yada]
-opt-level = 3
-
-[profile.release.package.lindera-ipadic-builder]
-opt-level = 3
-[profile.release.package.encoding]
-opt-level = 3
-[profile.release.package.yada]
-opt-level = 3
-
-[profile.bench.package.lindera-ipadic-builder]
-opt-level = 3
-[profile.bench.package.encoding]
-opt-level = 3
-[profile.bench.package.yada]
-opt-level = 3
+[patch.crates-io]
+roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "clone-iter-slice" }
--- a/index-scheduler/Cargo.toml
+++ b/index-scheduler/Cargo.toml
@ -29,6 +29,7 @@ serde_json = { version = "1.0.120", features = ["preserve_order"] }
 synchronoise = "1.0.1"
 tempfile = "3.10.1"
 thiserror = "1.0.61"
+memmap2 = "0.9.4"
 time = { version = "0.3.36", features = [
    "serde-well-known",
    "formatting",
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@ -28,6 +28,9 @@ use meilisearch_types::error::Code;
 use meilisearch_types::heed::{RoTxn, RwTxn};
 use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
 use meilisearch_types::milli::heed::CompactionOption;
+use meilisearch_types::milli::update::new::indexer::{
+    self, retrieve_or_guess_primary_key, DocumentChanges,
+};
 use meilisearch_types::milli::update::{
    IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
 };
@ -875,10 +878,8 @@ impl IndexScheduler {
                            while let Some(doc) =
                                cursor.next_document().map_err(milli::Error::from)?
                            {
-                                dump_content_file.push_document(&obkv_to_object(
-                                    &doc,
-                                    &documents_batch_index,
-                                )?)?;
+                                dump_content_file
+                                    .push_document(&obkv_to_object(doc, &documents_batch_index)?)?;
                            }
                            dump_content_file.flush()?;
                        }
@ -1252,58 +1253,52 @@ impl IndexScheduler {
                let must_stop_processing = self.must_stop_processing.clone();
                let indexer_config = self.index_mapper.indexer_config();

-                if let Some(primary_key) = primary_key {
-                    match index.primary_key(index_wtxn)? {
-                        // if a primary key was set AND had already been defined in the index
-                        // but to a different value, we can make the whole batch fail.
-                        Some(pk) => {
-                            if primary_key != pk {
-                                return Err(milli::Error::from(
-                                    milli::UserError::PrimaryKeyCannotBeChanged(pk.to_string()),
-                                )
-                                .into());
-                            }
-                        }
-                        // if the primary key was set and there was no primary key set for this index
-                        // we set it to the received value before starting the indexing process.
-                        None => {
-                            let mut builder =
-                                milli::update::Settings::new(index_wtxn, index, indexer_config);
-                            builder.set_primary_key(primary_key);
-                            builder.execute(
-                                |indexing_step| tracing::debug!(update = ?indexing_step),
-                                || must_stop_processing.clone().get(),
-                            )?;
-                            primary_key_has_been_set = true;
+                /// TODO manage errors correctly
+                let rtxn = index.read_txn()?;
+                let first_addition_uuid = operations
+                    .iter()
+                    .find_map(|op| match op {
+                        DocumentOperation::Add(content_uuid) => Some(content_uuid),
+                        _ => None,
+                    })
+                    .unwrap();
+
+                let mut content_files = Vec::new();
+                for operation in &operations {
+                    if let DocumentOperation::Add(content_uuid) = operation {
+                        let content_file = self.file_store.get_update(*content_uuid)?;
+                        let mmap = unsafe { memmap2::Mmap::map(&content_file)? };
+                        if !mmap.is_empty() {
+                            content_files.push(mmap);
                        }
                    }
                }

-                let config = IndexDocumentsConfig { update_method: method, ..Default::default() };
+                let mut fields_ids_map = index.fields_ids_map(&rtxn)?;
+                let first_document = match content_files.first() {
+                    Some(mmap) => {
+                        let mut iter = serde_json::Deserializer::from_slice(mmap).into_iter();
+                        iter.next().transpose().map_err(|e| e.into()).map_err(Error::IoError)?
+                    }
+                    None => None,
+                };

-                let embedder_configs = index.embedding_configs(index_wtxn)?;
-                // TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense)
-                let embedders = self.embedders(embedder_configs)?;
-
-                let mut builder = milli::update::IndexDocuments::new(
-                    index_wtxn,
+                let primary_key = retrieve_or_guess_primary_key(
+                    &rtxn,
                    index,
-                    indexer_config,
-                    config,
-                    |indexing_step| tracing::trace!(?indexing_step, "Update"),
-                    || must_stop_processing.get(),
-                )?;
+                    &mut fields_ids_map,
+                    first_document.as_ref(),
+                )?
+                .unwrap();

+                let mut content_files_iter = content_files.iter();
+                let mut indexer = indexer::DocumentOperation::new(method);
                for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) {
                    match operation {
-                        DocumentOperation::Add(content_uuid) => {
-                            let content_file = self.file_store.get_update(content_uuid)?;
-                            let reader = DocumentsBatchReader::from_reader(content_file)
-                                .map_err(milli::Error::from)?;
-                            let (new_builder, user_result) = builder.add_documents(reader)?;
-                            builder = new_builder;
-
-                            builder = builder.with_embedders(embedders.clone());
+                        DocumentOperation::Add(_content_uuid) => {
+                            let mmap = content_files_iter.next().unwrap();
+                            let stats = indexer.add_documents(mmap)?;
+                            // builder = builder.with_embedders(embedders.clone());

                            let received_documents =
                                if let Some(Details::DocumentAdditionOrUpdate {
@ -1317,30 +1312,17 @@ impl IndexScheduler {
                                    unreachable!();
                                };

-                            match user_result {
-                                Ok(count) => {
-                                    task.status = Status::Succeeded;
-                                    task.details = Some(Details::DocumentAdditionOrUpdate {
-                                        received_documents,
-                                        indexed_documents: Some(count),
-                                    })
-                                }
-                                Err(e) => {
-                                    task.status = Status::Failed;
-                                    task.details = Some(Details::DocumentAdditionOrUpdate {
-                                        received_documents,
-                                        indexed_documents: Some(0),
-                                    });
-                                    task.error = Some(milli::Error::from(e).into());
-                                }
-                            }
+                            task.status = Status::Succeeded;
+                            task.details = Some(Details::DocumentAdditionOrUpdate {
+                                received_documents,
+                                indexed_documents: Some(stats.document_count as u64),
+                            })
                        }
                        DocumentOperation::Delete(document_ids) => {
-                            let (new_builder, user_result) =
-                                builder.remove_documents(document_ids)?;
-                            builder = new_builder;
+                            let count = document_ids.len();
+                            indexer.delete_documents(document_ids);
                            // Uses Invariant: remove documents actually always returns Ok for the inner result
-                            let count = user_result.unwrap();
+                            // let count = user_result.unwrap();
                            let provided_ids =
                                if let Some(Details::DocumentDeletion { provided_ids, .. }) =
                                    task.details
@ -1354,26 +1336,35 @@ impl IndexScheduler {
                            task.status = Status::Succeeded;
                            task.details = Some(Details::DocumentDeletion {
                                provided_ids,
-                                deleted_documents: Some(count),
+                                deleted_documents: Some(count as u64),
                            });
                        }
                    }
                }

                if !tasks.iter().all(|res| res.error.is_some()) {
-                    let addition = builder.execute()?;
-                    tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
-                } else if primary_key_has_been_set {
-                    // Everything failed but we've set a primary key.
-                    // We need to remove it.
-                    let mut builder =
-                        milli::update::Settings::new(index_wtxn, index, indexer_config);
-                    builder.reset_primary_key();
-                    builder.execute(
-                        |indexing_step| tracing::trace!(update = ?indexing_step),
-                        || must_stop_processing.clone().get(),
-                    )?;
+                    /// TODO create a pool if needed
+                    // let pool = indexer_config.thread_pool.unwrap();
+                    let pool = rayon::ThreadPoolBuilder::new().build().unwrap();
+
+                    let param = (index, &rtxn, &primary_key);
+                    let document_changes = indexer.document_changes(&mut fields_ids_map, param)?;
+                    /// TODO pass/write the FieldsIdsMap
+                    indexer::index(index_wtxn, index, fields_ids_map, &pool, document_changes)?;
+
+                    // tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
                }
+                // else if primary_key_has_been_set {
+                //     // Everything failed but we've set a primary key.
+                //     // We need to remove it.
+                //     let mut builder =
+                //         milli::update::Settings::new(index_wtxn, index, indexer_config);
+                //     builder.reset_primary_key();
+                //     builder.execute(
+                //         |indexing_step| tracing::trace!(update = ?indexing_step),
+                //         || must_stop_processing.clone().get(),
+                //     )?;
+                // }

                Ok(tasks)
            }
--- a/meilisearch-types/src/document_formats.rs
+++ b/meilisearch-types/src/document_formats.rs
@ -1,20 +1,22 @@
 use std::fmt::{self, Debug, Display};
 use std::fs::File;
-use std::io::{self, BufWriter, Write};
+use std::io::{self, BufWriter};
 use std::marker::PhantomData;

-use memmap2::MmapOptions;
-use milli::documents::{DocumentsBatchBuilder, Error};
+use memmap2::Mmap;
+use milli::documents::Error;
+use milli::update::new::TopLevelMap;
 use milli::Object;
 use serde::de::{SeqAccess, Visitor};
 use serde::{Deserialize, Deserializer};
 use serde_json::error::Category;
+use serde_json::{to_writer, Map, Value};

 use crate::error::{Code, ErrorCode};

 type Result<T> = std::result::Result<T, DocumentFormatError>;

-#[derive(Debug)]
+#[derive(Debug, Clone, Copy)]
 pub enum PayloadType {
    Ndjson,
    Json,
@ -88,6 +90,26 @@ impl From<(PayloadType, Error)> for DocumentFormatError {
    }
 }

+impl From<(PayloadType, serde_json::Error)> for DocumentFormatError {
+    fn from((ty, error): (PayloadType, serde_json::Error)) -> Self {
+        if error.classify() == Category::Data {
+            Self::Io(error.into())
+        } else {
+            Self::MalformedPayload(Error::Json(error), ty)
+        }
+    }
+}
+
+impl From<(PayloadType, csv::Error)> for DocumentFormatError {
+    fn from((ty, error): (PayloadType, csv::Error)) -> Self {
+        if error.is_io_error() {
+            Self::Io(error.into())
+        } else {
+            Self::MalformedPayload(Error::Csv(error), ty)
+        }
+    }
+}
+
 impl From<io::Error> for DocumentFormatError {
    fn from(error: io::Error) -> Self {
        Self::Io(error)
@ -103,67 +125,140 @@ impl ErrorCode for DocumentFormatError {
    }
 }

-/// Reads CSV from input and write an obkv batch to writer.
-pub fn read_csv(file: &File, writer: impl Write, delimiter: u8) -> Result<u64> {
-    let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
-    let mmap = unsafe { MmapOptions::new().map(file)? };
-    let csv = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(mmap.as_ref());
-    builder.append_csv(csv).map_err(|e| (PayloadType::Csv { delimiter }, e))?;
-
-    let count = builder.documents_count();
-    let _ = builder.into_inner().map_err(DocumentFormatError::Io)?;
-
-    Ok(count as u64)
+// TODO remove that from the place I've borrowed it
+#[derive(Debug)]
+enum AllowedType {
+    String,
+    Boolean,
+    Number,
 }

-/// Reads JSON from temporary file and write an obkv batch to writer.
-pub fn read_json(file: &File, writer: impl Write) -> Result<u64> {
-    let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
-    let mmap = unsafe { MmapOptions::new().map(file)? };
-    let mut deserializer = serde_json::Deserializer::from_slice(&mmap);
+fn parse_csv_header(header: &str) -> (&str, AllowedType) {
+    // if there are several separators we only split on the last one.
+    match header.rsplit_once(':') {
+        Some((field_name, field_type)) => match field_type {
+            "string" => (field_name, AllowedType::String),
+            "boolean" => (field_name, AllowedType::Boolean),
+            "number" => (field_name, AllowedType::Number),
+            // if the pattern isn't recognized, we keep the whole field.
+            _otherwise => (header, AllowedType::String),
+        },
+        None => (header, AllowedType::String),
+    }
+}

-    match array_each(&mut deserializer, |obj| builder.append_json_object(&obj)) {
+/// Reads CSV from a file and writes it as NDJSON to the output, checking it along the way.
+pub fn read_csv(input: &File, output: impl io::Write, delimiter: u8) -> Result<u64> {
+    let ptype = PayloadType::Csv { delimiter };
+    let mut output = BufWriter::new(output);
+    let mut reader = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(input);
+
+    let headers = reader.headers().map_err(|e| DocumentFormatError::from((ptype, e)))?.clone();
+    let typed_fields: Vec<_> = headers.iter().map(parse_csv_header).collect();
+    let mut object: Map<_, _> =
+        typed_fields.iter().map(|(k, _)| (k.to_string(), Value::Null)).collect();
+
+    let mut line = 0;
+    let mut record = csv::StringRecord::new();
+    while reader.read_record(&mut record).map_err(|e| DocumentFormatError::from((ptype, e)))? {
+        // We increment here and not at the end of the loop
+        // to take the header offset into account.
+        line += 1;
+
+        // Reset the document values
+        object.iter_mut().for_each(|(_, v)| *v = Value::Null);
+
+        for (i, (name, atype)) in typed_fields.iter().enumerate() {
+            let value = &record[i];
+            let trimmed_value = value.trim();
+            let value = match atype {
+                AllowedType::Number if trimmed_value.is_empty() => Value::Null,
+                AllowedType::Number => match trimmed_value.parse::<i64>() {
+                    Ok(integer) => Value::from(integer),
+                    Err(_) => match trimmed_value.parse::<f64>() {
+                        Ok(float) => Value::from(float),
+                        Err(error) => {
+                            return Err(DocumentFormatError::MalformedPayload(
+                                Error::ParseFloat { error, line, value: value.to_string() },
+                                ptype,
+                            ))
+                        }
+                    },
+                },
+                AllowedType::Boolean if trimmed_value.is_empty() => Value::Null,
+                AllowedType::Boolean => match trimmed_value.parse::<bool>() {
+                    Ok(bool) => Value::from(bool),
+                    Err(error) => {
+                        return Err(DocumentFormatError::MalformedPayload(
+                            Error::ParseBool { error, line, value: value.to_string() },
+                            ptype,
+                        ))
+                    }
+                },
+                AllowedType::String if value.is_empty() => Value::Null,
+                AllowedType::String => Value::from(value),
+            };
+
+            *object.get_mut(*name).expect("encountered an unknown field") = value;
+        }
+
+        to_writer(&mut output, &object).map_err(|e| DocumentFormatError::from((ptype, e)))?;
+    }
+
+    Ok(line as u64)
+}
+
+/// Reads JSON from a file and writes it as NDJSON to the output, checking it along the way.
+pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {
+    // We memory map to be able to deserialize into a TopLevelMap<'pl> that
+    // does not allocate when possible and only materializes the first/top level.
+    let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
+
+    let mut out = BufWriter::new(output);
+    let mut deserializer = serde_json::Deserializer::from_slice(&input);
+    let count = match array_each(&mut deserializer, |obj: TopLevelMap| to_writer(&mut out, &obj)) {
        // The json data has been deserialized and does not need to be processed again.
        // The data has been transferred to the writer during the deserialization process.
-        Ok(Ok(_)) => (),
-        Ok(Err(e)) => return Err(DocumentFormatError::Io(e)),
+        Ok(Ok(count)) => count,
+        Ok(Err(e)) => return Err(DocumentFormatError::from((PayloadType::Json, e))),
        Err(e) => {
            // Attempt to deserialize a single json string when the cause of the exception is not Category.data
            // Other types of deserialisation exceptions are returned directly to the front-end
-            if e.classify() != serde_json::error::Category::Data {
-                return Err(DocumentFormatError::MalformedPayload(
-                    Error::Json(e),
-                    PayloadType::Json,
-                ));
+            if e.classify() != Category::Data {
+                return Err(DocumentFormatError::from((PayloadType::Json, e)));
            }

-            let content: Object = serde_json::from_slice(&mmap)
+            let content: Object = serde_json::from_slice(&input)
                .map_err(Error::Json)
                .map_err(|e| (PayloadType::Json, e))?;
-            builder.append_json_object(&content).map_err(DocumentFormatError::Io)?;
+            to_writer(&mut out, &content)
+                .map(|_| 1)
+                .map_err(|e| DocumentFormatError::from((PayloadType::Json, e)))?
        }
+    };
+
+    match out.into_inner() {
+        Ok(_) => Ok(count),
+        Err(ie) => Err(DocumentFormatError::Io(ie.into_error())),
    }
-
-    let count = builder.documents_count();
-    let _ = builder.into_inner().map_err(DocumentFormatError::Io)?;
-
-    Ok(count as u64)
 }

-/// Reads JSON from temporary file  and write an obkv batch to writer.
-pub fn read_ndjson(file: &File, writer: impl Write) -> Result<u64> {
-    let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
-    let mmap = unsafe { MmapOptions::new().map(file)? };
+/// Reads NDJSON from a file and writes it as NDJSON to the output, checking it along the way.
+pub fn read_ndjson(input: &File, output: impl io::Write) -> Result<u64> {
+    // We memory map to be able to deserialize into a TopLevelMap<'pl> that
+    // does not allocate when possible and only materializes the first/top level.
+    let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
+    let mut output = BufWriter::new(output);

-    for result in serde_json::Deserializer::from_slice(&mmap).into_iter() {
-        let object = result.map_err(Error::Json).map_err(|e| (PayloadType::Ndjson, e))?;
-        builder.append_json_object(&object).map_err(Into::into).map_err(DocumentFormatError::Io)?;
+    let mut count = 0;
+    for result in serde_json::Deserializer::from_slice(&input).into_iter() {
+        count += 1;
+        result
+            .and_then(|map: TopLevelMap| to_writer(&mut output, &map))
+            .map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?;
    }

-    let count = builder.documents_count();
-    let _ = builder.into_inner().map_err(Into::into).map_err(DocumentFormatError::Io)?;
-
-    Ok(count as u64)
+    Ok(count)
 }

 /// The actual handling of the deserialization process in serde
@ -172,20 +267,23 @@ pub fn read_ndjson(file: &File, writer: impl Write) -> Result<u64> {
 /// ## References
 /// <https://serde.rs/stream-array.html>
 /// <https://github.com/serde-rs/json/issues/160>
-fn array_each<'de, D, T, F>(deserializer: D, f: F) -> std::result::Result<io::Result<u64>, D::Error>
+fn array_each<'de, D, T, F>(
+    deserializer: D,
+    f: F,
+) -> std::result::Result<serde_json::Result<u64>, D::Error>
 where
    D: Deserializer<'de>,
    T: Deserialize<'de>,
-    F: FnMut(T) -> io::Result<()>,
+    F: FnMut(T) -> serde_json::Result<()>,
 {
    struct SeqVisitor<T, F>(F, PhantomData<T>);

    impl<'de, T, F> Visitor<'de> for SeqVisitor<T, F>
    where
        T: Deserialize<'de>,
-        F: FnMut(T) -> io::Result<()>,
+        F: FnMut(T) -> serde_json::Result<()>,
    {
-        type Value = io::Result<u64>;
+        type Value = serde_json::Result<u64>;

        fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
            formatter.write_str("a nonempty sequence")
@ -194,7 +292,7 @@ where
        fn visit_seq<A>(
            mut self,
            mut seq: A,
-        ) -> std::result::Result<io::Result<u64>, <A as SeqAccess<'de>>::Error>
+        ) -> std::result::Result<serde_json::Result<u64>, <A as SeqAccess<'de>>::Error>
        where
            A: SeqAccess<'de>,
        {
@ -203,7 +301,7 @@ where
                match self.0(value) {
                    Ok(()) => max += 1,
                    Err(e) => return Ok(Err(e)),
-                };
+                }
            }
            Ok(Ok(max))
        }
--- a/meilisearch/Cargo.toml
+++ b/meilisearch/Cargo.toml
@ -57,7 +57,7 @@ meilisearch-types = { path = "../meilisearch-types" }
 mimalloc = { version = "0.1.43", default-features = false }
 mime = "0.3.17"
 num_cpus = "1.16.0"
-obkv = "0.2.2"
+obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" }
 once_cell = "1.19.0"
 ordered-float = "4.2.1"
 parking_lot = "0.12.3"
--- a/meilisearch/src/search/mod.rs
+++ b/meilisearch/src/search/mod.rs
@ -1682,7 +1682,7 @@ fn add_non_formatted_ids_to_formatted_options(
 fn make_document(
    displayed_attributes: &BTreeSet<FieldId>,
    field_ids_map: &FieldsIdsMap,
-    obkv: obkv::KvReaderU16,
+    obkv: &obkv::KvReaderU16,
 ) -> Result<Document, MeilisearchHttpError> {
    let mut document = serde_json::Map::new();

--- a/meilitool/src/main.rs
+++ b/meilitool/src/main.rs
@ -682,7 +682,7 @@ fn export_a_dump(
                        format!("While iterating on content file {:?}", content_file_uuid)
                    })? {
                        dump_content_file
-                            .push_document(&obkv_to_object(&doc, &documents_batch_index)?)?;
+                            .push_document(&obkv_to_object(doc, &documents_batch_index)?)?;
                    }
                    dump_content_file.flush()?;
                    count += 1;
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@ -12,12 +12,14 @@ readme.workspace = true
 license.workspace = true

 [dependencies]
+big_s = "1.0.2"
 bimap = { version = "0.6.3", features = ["serde"] }
 bincode = "1.3.3"
 bstr = "1.9.1"
 bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
 byteorder = "1.5.0"
-charabia = { version = "0.9.1", default-features = false }
+# charabia = { version = "0.9.0", default-features = false }
+charabia = { git = "https://github.com/meilisearch/charabia", branch = "mutualize-char-normalizer", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.13"
 deserr = "0.6.2"
@ -27,9 +29,9 @@ fst = "0.4.7"
 fxhash = "0.2.1"
 geoutils = "0.5.1"
 grenad = { version = "0.4.7", default-features = false, features = [
-    "rayon",
-    "tempfile",
-] }
+    "rayon", # TODO Should we keep this feature
+    "tempfile"
+], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" }
 heed = { version = "0.20.3", default-features = false, features = [
    "serde-json",
    "serde-bincode",
@ -40,14 +42,14 @@ json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
 memchr = "2.5.0"
 memmap2 = "0.9.4"
-obkv = "0.2.2"
+obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" }
 once_cell = "1.19.0"
 ordered-float = "4.2.1"
 rayon = "1.10.0"
 roaring = { version = "0.10.6", features = ["serde"] }
 rstar = { version = "0.12.0", features = ["serde"] }
 serde = { version = "1.0.204", features = ["derive"] }
-serde_json = { version = "1.0.120", features = ["preserve_order"] }
+serde_json = { version = "1.0.120", features = ["preserve_order", "raw_value"] }
 slice-group-by = "0.3.1"
 smallstr = { version = "0.3.0", features = ["serde"] }
 smallvec = "1.13.2"
@ -86,10 +88,10 @@ tracing = "0.1.40"
 ureq = { version = "2.10.0", features = ["json"] }
 url = "2.5.2"
 rayon-par-bridge = "0.1.0"
+hashbrown = "0.14.5"

 [dev-dependencies]
 mimalloc = { version = "0.1.43", default-features = false }
-big_s = "1.0.2"
 insta = "1.39.0"
 maplit = "1.0.2"
 md5 = "0.7.0"
--- a/milli/src/documents/builder.rs
+++ b/milli/src/documents/builder.rs
@ -292,7 +292,7 @@ mod test {
            .unwrap()
            .into_cursor_and_fields_index();
        let doc = cursor.next_document().unwrap().unwrap();
-        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
@ -321,7 +321,7 @@ mod test {
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
-        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
@ -348,7 +348,7 @@ mod test {
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
-        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
@ -375,7 +375,7 @@ mod test {
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
-        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
@ -402,7 +402,7 @@ mod test {
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
-        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
@ -429,7 +429,7 @@ mod test {
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
-        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
@ -456,7 +456,7 @@ mod test {
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
-        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
@ -483,7 +483,7 @@ mod test {
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
-        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
@ -510,7 +510,7 @@ mod test {
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
-        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
@ -555,7 +555,7 @@ mod test {
            .into_cursor_and_fields_index();

        let doc = cursor.next_document().unwrap().unwrap();
-        let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
+        let val = obkv_to_object(doc, &index).map(Value::from).unwrap();

        assert_eq!(
            val,
--- a/milli/src/documents/enriched.rs
+++ b/milli/src/documents/enriched.rs
@ -69,7 +69,7 @@ impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {

 #[derive(Debug, Clone)]
 pub struct EnrichedDocument<'a> {
-    pub document: KvReader<'a, FieldId>,
+    pub document: &'a KvReader<FieldId>,
    pub document_id: DocumentId,
 }

--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@ -27,7 +27,7 @@ use crate::{FieldId, Object, Result};
 const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes();

 /// Helper function to convert an obkv reader into a JSON object.
-pub fn obkv_to_object(obkv: &KvReader<'_, FieldId>, index: &DocumentsBatchIndex) -> Result<Object> {
+pub fn obkv_to_object(obkv: &KvReader<FieldId>, index: &DocumentsBatchIndex) -> Result<Object> {
    obkv.iter()
        .map(|(field_id, value)| {
            let field_name = index
@ -76,7 +76,7 @@ impl DocumentsBatchIndex {
        self.0.get_by_right(name).cloned()
    }

-    pub fn recreate_json(&self, document: &obkv::KvReaderU16<'_>) -> Result<Object> {
+    pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result<Object> {
        let mut map = Object::new();

        for (k, v) in document.iter() {
--- a/milli/src/documents/primary_key.rs
+++ b/milli/src/documents/primary_key.rs
@ -1,8 +1,10 @@
+use std::borrow::Cow;
 use std::iter;
 use std::result::Result as StdResult;

-use serde_json::Value;
+use serde_json::{from_str, Value};

+use crate::update::new::{CowStr, TopLevelMap};
 use crate::{FieldId, InternalError, Object, Result, UserError};

 /// The symbol used to define levels in a nested primary key.
@ -52,7 +54,7 @@ impl<'a> PrimaryKey<'a> {

    pub fn document_id(
        &self,
-        document: &obkv::KvReader<'_, FieldId>,
+        document: &obkv::KvReader<FieldId>,
        fields: &impl FieldIdMapper,
    ) -> Result<StdResult<String, DocumentIdExtractionError>> {
        match self {
@ -100,6 +102,45 @@ impl<'a> PrimaryKey<'a> {
        }
    }

+    /// Returns the document ID based on the primary key, searching for it
+    /// recursively in a zero-copy-deserialized document.
+    ///
+    /// The primary key name is split on `PRIMARY_KEY_SPLIT_SYMBOL` and each
+    /// segment is looked up in turn: every segment but the last must resolve
+    /// to a value that itself deserializes as a `TopLevelMap`.
+    ///
+    /// The outer `Result` carries internal errors (an intermediate value that
+    /// fails to deserialize), while the inner one reports a missing or
+    /// invalid document ID.
+    pub fn document_id_from_top_level_map<'p>(
+        &self,
+        document: &TopLevelMap<'p>,
+    ) -> Result<StdResult<CowStr<'p>, DocumentIdExtractionError>> {
+        // Walks `primary_key` one segment at a time, descending into nested maps.
+        fn get_docid<'p>(
+            document: &TopLevelMap<'p>,
+            primary_key: &[&str],
+        ) -> Result<StdResult<CowStr<'p>, DocumentIdExtractionError>> {
+            match primary_key {
+                // The only caller builds the slice with `str::split`, which yields
+                // at least one item, and the recursion stops on the last segment.
+                [] => unreachable!("arrrgh"), // would None be ok?
+                // Last segment: the value found here is the document ID itself.
+                [primary_key] => match document.0.get(*primary_key) {
+                    // A value that parses as a `u64` is returned in its decimal string form.
+                    Some(value) => match from_str::<u64>(value.get()) {
+                        Ok(value) => Ok(Ok(CowStr(Cow::Owned(value.to_string())))),
+                        // Otherwise the raw value is deserialized as a `CowStr`.
+                        Err(_) => match from_str(value.get()) {
+                            Ok(document_id) => Ok(Ok(document_id)),
+                            Err(e) => Ok(Err(DocumentIdExtractionError::InvalidDocumentId(
+                                UserError::SerdeJson(e),
+                            ))),
+                        },
+                    },
+                    None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
+                },
+                // Intermediate segment: deserialize the nested value and recurse on the rest.
+                [head, tail @ ..] => match document.0.get(*head) {
+                    Some(value) => {
+                        let document = from_str(value.get()).map_err(InternalError::SerdeJson)?;
+                        get_docid(&document, tail)
+                    }
+                    None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
+                },
+            }
+        }
+
+        // TODO do not allocate a vec everytime here
+        let primary_key: Vec<_> = self.name().split(PRIMARY_KEY_SPLIT_SYMBOL).collect();
+        get_docid(document, &primary_key)
+    }
+
    /// Returns an `Iterator` that gives all the possible fields names the primary key
    /// can have depending of the first level name and depth of the objects.
    pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
--- a/milli/src/documents/reader.rs
+++ b/milli/src/documents/reader.rs
@ -72,15 +72,24 @@ impl<R> DocumentsBatchCursor<R> {
 }

 impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> {
+    /// Returns the document stored under the given `offset` key, if any.
+    ///
+    /// The cursor is moved onto the key equal to the big-endian encoding of
+    /// `offset`. `None` is returned when no such key exists or when the key
+    /// found equals `DOCUMENTS_BATCH_INDEX_KEY`.
+    pub fn get(
+        &mut self,
+        offset: u32,
+    ) -> Result<Option<&KvReader<FieldId>>, DocumentsBatchCursorError> {
+        match self.cursor.move_on_key_equal_to(offset.to_be_bytes())? {
+            Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => Ok(Some(value.into())),
+            _otherwise => Ok(None),
+        }
+    }
+
    /// Returns the next document, starting from the first one. Subsequent calls to
    /// `next_document` advance the document reader until all the documents have been read.
    pub fn next_document(
        &mut self,
-    ) -> Result<Option<KvReader<'_, FieldId>>, DocumentsBatchCursorError> {
+    ) -> Result<Option<&KvReader<FieldId>>, DocumentsBatchCursorError> {
        match self.cursor.move_on_next()? {
-            Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => {
-                Ok(Some(KvReader::new(value)))
-            }
+            Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => Ok(Some(value.into())),
            _otherwise => Ok(None),
        }
    }
--- a/milli/src/fields_ids_map.rs
+++ b/milli/src/fields_ids_map.rs
@ -4,6 +4,9 @@ use serde::{Deserialize, Serialize};

 use crate::FieldId;

+mod global;
+pub use global::GlobalFieldsIdsMap;
+
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct FieldsIdsMap {
    names_ids: BTreeMap<String, FieldId>,
--- a/milli/src/fields_ids_map/global.rs
+++ b/milli/src/fields_ids_map/global.rs
@ -0,0 +1,86 @@
+use std::collections::BTreeMap;
+use std::sync::RwLock;
+
+use crate::{FieldId, FieldsIdsMap};
+
+/// A fields ids map that can be globally updated to add fields.
+///
+/// Lookups are served from the `local` copy first and only fall back to the
+/// shared, lock-protected `global` map on a miss.
+#[derive(Debug, Clone)]
+pub struct GlobalFieldsIdsMap<'indexing> {
+    /// The shared map, behind a `RwLock`.
+    global: &'indexing RwLock<FieldsIdsMap>,
+    /// Copy of the entries known so far, consulted before taking any lock.
+    local: LocalFieldsIdsMap,
+}
+
+/// Lock-free copy of a fields ids map, indexed in both directions.
+#[derive(Debug, Clone)]
+struct LocalFieldsIdsMap {
+    /// Field name to field id.
+    names_ids: BTreeMap<String, FieldId>,
+    /// Field id to field name.
+    ids_names: BTreeMap<FieldId, String>,
+}
+
+impl LocalFieldsIdsMap {
+    /// Builds a local map by cloning both indexes of the global map under a read lock.
+    ///
+    /// Panics if the lock is poisoned.
+    fn new(global: &RwLock<FieldsIdsMap>) -> Self {
+        let global = global.read().unwrap();
+        Self { names_ids: global.names_ids.clone(), ids_names: global.ids_names.clone() }
+    }
+
+    /// Records the `name` <-> `field_id` association in both indexes.
+    fn insert(&mut self, name: &str, field_id: FieldId) {
+        self.names_ids.insert(name.to_owned(), field_id);
+        self.ids_names.insert(field_id, name.to_owned());
+    }
+
+    /// Returns the name associated with `id`, if known locally.
+    fn name(&self, id: FieldId) -> Option<&str> {
+        self.ids_names.get(&id).map(String::as_str)
+    }
+
+    /// Returns the id associated with `name`, if known locally.
+    fn id(&self, name: &str) -> Option<FieldId> {
+        self.names_ids.get(name).copied()
+    }
+}
+
+impl<'indexing> GlobalFieldsIdsMap<'indexing> {
+    /// Creates a map backed by `global`, seeding the local copy with its current content.
+    pub fn new(global: &'indexing RwLock<FieldsIdsMap>) -> Self {
+        Self { local: LocalFieldsIdsMap::new(global), global }
+    }
+
+    /// Returns the field id related to a field name, it will create a new field id if the
+    /// name is not already known. Returns `None` if the maximum field id has been reached.
+    ///
+    /// Panics if the global lock is poisoned.
+    pub fn id_or_insert(&mut self, name: &str) -> Option<FieldId> {
+        // Fast path: no lock needed when the name is already known locally.
+        if let Some(field_id) = self.local.id(name) {
+            return Some(field_id);
+        }
+
+        {
+            // optimistically lookup the global map
+            let global = self.global.read().unwrap();
+
+            if let Some(field_id) = global.id(name) {
+                self.local.insert(name, field_id);
+                return Some(field_id);
+            }
+        }
+
+        {
+            let mut global = self.global.write().unwrap();
+
+            // The read lock was released above, so the name may have been
+            // inserted in the meantime: check again before inserting.
+            if let Some(field_id) = global.id(name) {
+                self.local.insert(name, field_id);
+                return Some(field_id);
+            }
+
+            let field_id = global.insert(name)?;
+            self.local.insert(name, field_id);
+            Some(field_id)
+        }
+    }
+
+    /// Get the name of a field based on its id.
+    ///
+    /// On a local miss the global map is consulted under a read lock and the
+    /// entry, if found, is copied locally. Returns `None` when the id is
+    /// unknown to both maps. Panics if the global lock is poisoned.
+    pub fn name(&mut self, id: FieldId) -> Option<&str> {
+        if self.local.name(id).is_none() {
+            let global = self.global.read().unwrap();
+
+            let name = global.name(id)?;
+            self.local.insert(name, id);
+        }
+
+        self.local.name(id)
+    }
+}
--- a/milli/src/heed_codec/obkv_codec.rs
+++ b/milli/src/heed_codec/obkv_codec.rs
@ -6,10 +6,10 @@ use obkv::{KvReaderU16, KvWriterU16};
 pub struct ObkvCodec;

 impl<'a> heed::BytesDecode<'a> for ObkvCodec {
-    type DItem = KvReaderU16<'a>;
+    type DItem = &'a KvReaderU16;

    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
-        Ok(KvReaderU16::new(bytes))
+        Ok(KvReaderU16::from_slice(bytes))
    }
 }

--- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
+++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
@ -122,7 +122,7 @@ impl CboRoaringBitmapCodec {

    /// Merges a DelAdd delta into a CboRoaringBitmap.
    pub fn merge_deladd_into<'a>(
-        deladd: KvReaderDelAdd<'_>,
+        deladd: &KvReaderDelAdd,
        previous: &[u8],
        buffer: &'a mut Vec<u8>,
    ) -> io::Result<Option<&'a [u8]>> {
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -1251,12 +1251,20 @@ impl Index {

    /* documents */

+    /// Returns a document by using the document id.
+    ///
+    /// Fails with `UserError::UnknownInternalDocumentId` when no document is
+    /// stored under `id`.
+    pub fn document<'t>(&self, rtxn: &'t RoTxn, id: DocumentId) -> Result<&'t obkv::KvReaderU16> {
+        self.documents
+            .get(rtxn, &id)?
+            .ok_or(UserError::UnknownInternalDocumentId { document_id: id })
+            .map_err(Into::into)
+    }
+
    /// Returns an iterator over the requested documents. The next item will be an error if a document is missing.
    pub fn iter_documents<'a, 't: 'a>(
        &'a self,
        rtxn: &'t RoTxn<'t>,
        ids: impl IntoIterator<Item = DocumentId> + 'a,
-    ) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
+    ) -> Result<impl Iterator<Item = Result<(DocumentId, &'t obkv::KvReaderU16)>> + 'a> {
        Ok(ids.into_iter().map(move |id| {
            let kv = self
                .documents
@ -1271,7 +1279,7 @@ impl Index {
        &self,
        rtxn: &'t RoTxn<'t>,
        ids: impl IntoIterator<Item = DocumentId>,
-    ) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
+    ) -> Result<Vec<(DocumentId, &'t obkv::KvReaderU16)>> {
        self.iter_documents(rtxn, ids)?.collect()
    }

@ -1279,7 +1287,7 @@ impl Index {
    pub fn all_documents<'a, 't: 'a>(
        &'a self,
        rtxn: &'t RoTxn<'t>,
-    ) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
+    ) -> Result<impl Iterator<Item = Result<(DocumentId, &'t obkv::KvReaderU16)>> + 'a> {
        self.iter_documents(rtxn, self.documents_ids(rtxn)?)
    }

@ -1303,7 +1311,7 @@ impl Index {
        })?;
        Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> {
            let (_docid, obkv) = entry?;
-            match primary_key.document_id(&obkv, &fields)? {
+            match primary_key.document_id(obkv, &fields)? {
                Ok(document_id) => Ok(document_id),
                Err(_) => Err(InternalError::DocumentsError(
                    crate::documents::Error::InvalidDocumentFormat,
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@ -55,7 +55,7 @@ pub use self::error::{
 };
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fieldids_weights_map::FieldidsWeightsMap;
-pub use self::fields_ids_map::FieldsIdsMap;
+pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap};
 pub use self::heed_codec::{
    BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
@ -214,7 +214,7 @@ pub fn bucketed_position(relative: u16) -> u16 {
 pub fn obkv_to_json(
    displayed_fields: &[FieldId],
    fields_ids_map: &FieldsIdsMap,
-    obkv: obkv::KvReaderU16<'_>,
+    obkv: &obkv::KvReaderU16,
 ) -> Result<Object> {
    displayed_fields
        .iter()
@ -232,10 +232,7 @@ pub fn obkv_to_json(
 }

 /// Transform every field of a raw obkv store into a JSON Object.
-pub fn all_obkv_to_json(
-    obkv: obkv::KvReaderU16<'_>,
-    fields_ids_map: &FieldsIdsMap,
-) -> Result<Object> {
+pub fn all_obkv_to_json(obkv: &obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result<Object> {
    let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
    obkv_to_json(all_keys.as_slice(), fields_ids_map, obkv)
 }
@ -434,7 +431,7 @@ mod tests {
        writer.insert(id1, b"1234").unwrap();
        writer.insert(id2, b"4321").unwrap();
        let contents = writer.into_inner().unwrap();
-        let obkv = obkv::KvReaderU16::new(&contents);
+        let obkv = obkv::KvReaderU16::from_slice(&contents);

        let expected = json!({
            "field1": 1234,
--- a/milli/src/prompt/document.rs
+++ b/milli/src/prompt/document.rs
@ -30,13 +30,13 @@ impl ParsedValue {

 impl<'a> Document<'a> {
    pub fn new(
-        data: obkv::KvReaderU16<'a>,
+        data: &'a obkv::KvReaderU16,
        side: DelAdd,
        inverted_field_map: &'a FieldsIdsMap,
    ) -> Self {
        let mut out_data = BTreeMap::new();
        for (fid, raw) in data {
-            let obkv = KvReaderDelAdd::new(raw);
+            let obkv = KvReaderDelAdd::from_slice(raw);
            let Some(raw) = obkv.get(side) else {
                continue;
            };
--- a/milli/src/prompt/mod.rs
+++ b/milli/src/prompt/mod.rs
@ -111,7 +111,7 @@ impl Prompt {

    pub fn render(
        &self,
-        document: obkv::KvReaderU16<'_>,
+        document: &obkv::KvReaderU16,
        side: DelAdd,
        field_id_map: &FieldsIdsMapWithMetadata,
    ) -> Result<String, RenderPromptError> {
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@ -3,6 +3,7 @@ use std::collections::hash_map::Entry;
 use std::hash::Hash;

 use fxhash::FxHashMap;
+use grenad::MergeFunction;
 use heed::types::Bytes;
 use heed::{BytesEncode, Database, RoTxn};
 use roaring::RoaringBitmap;
@ -11,7 +12,7 @@ use super::interner::Interned;
 use super::Word;
 use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
 use crate::proximity::ProximityPrecision;
-use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
+use crate::update::MergeCboRoaringBitmaps;
 use crate::{
    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
 };
@ -110,19 +111,21 @@ impl<'ctx> DatabaseCache<'ctx> {
            .map_err(Into::into)
    }

-    fn get_value_from_keys<'v, K1, KC>(
+    fn get_value_from_keys<'v, K1, KC, MF>(
        txn: &'ctx RoTxn<'_>,
        cache_key: K1,
        db_keys: &'v [KC::EItem],
        cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
        db: Database<KC, Bytes>,
        universe: Option<&RoaringBitmap>,
-        merger: MergeFn,
+        merger: MF,
    ) -> Result<Option<RoaringBitmap>>
    where
        K1: Copy + Eq + Hash,
        KC: BytesEncode<'v>,
        KC::EItem: Sized,
+        MF: MergeFunction,
+        crate::Error: From<MF::Error>,
    {
        if let Entry::Vacant(entry) = cache.entry(cache_key) {
            let bitmap_ptr: Option<Cow<'ctx, [u8]>> = match db_keys {
@ -138,7 +141,7 @@ impl<'ctx> DatabaseCache<'ctx> {
                    if bitmaps.is_empty() {
                        None
                    } else {
-                        Some(merger(&[], &bitmaps[..])?)
+                        Some(merger.merge(&[], &bitmaps[..])?)
                    }
                }
            };
@ -213,17 +216,17 @@ impl<'ctx> SearchContext<'ctx> {
                let keys: Vec<_> =
                    restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();

-                DatabaseCache::get_value_from_keys::<_, _>(
+                DatabaseCache::get_value_from_keys(
                    self.txn,
                    word,
                    &keys[..],
                    &mut self.db_cache.word_docids,
                    self.index.word_fid_docids.remap_data_type::<Bytes>(),
                    universe,
-                    merge_cbo_roaring_bitmaps,
+                    MergeCboRoaringBitmaps,
                )
            }
-            None => DatabaseCache::get_value::<_, _>(
+            None => DatabaseCache::get_value(
                self.txn,
                word,
                self.word_interner.get(word).as_str(),
@ -245,17 +248,17 @@ impl<'ctx> SearchContext<'ctx> {
                let keys: Vec<_> =
                    restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();

-                DatabaseCache::get_value_from_keys::<_, _>(
+                DatabaseCache::get_value_from_keys(
                    self.txn,
                    word,
                    &keys[..],
                    &mut self.db_cache.exact_word_docids,
                    self.index.word_fid_docids.remap_data_type::<Bytes>(),
                    universe,
-                    merge_cbo_roaring_bitmaps,
+                    MergeCboRoaringBitmaps,
                )
            }
-            None => DatabaseCache::get_value::<_, _>(
+            None => DatabaseCache::get_value(
                self.txn,
                word,
                self.word_interner.get(word).as_str(),
@ -302,17 +305,17 @@ impl<'ctx> SearchContext<'ctx> {
                let keys: Vec<_> =
                    restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();

-                DatabaseCache::get_value_from_keys::<_, _>(
+                DatabaseCache::get_value_from_keys(
                    self.txn,
                    prefix,
                    &keys[..],
                    &mut self.db_cache.word_prefix_docids,
                    self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
                    universe,
-                    merge_cbo_roaring_bitmaps,
+                    MergeCboRoaringBitmaps,
                )
            }
-            None => DatabaseCache::get_value::<_, _>(
+            None => DatabaseCache::get_value(
                self.txn,
                prefix,
                self.word_interner.get(prefix).as_str(),
@ -334,17 +337,17 @@ impl<'ctx> SearchContext<'ctx> {
                let keys: Vec<_> =
                    restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();

-                DatabaseCache::get_value_from_keys::<_, _>(
+                DatabaseCache::get_value_from_keys(
                    self.txn,
                    prefix,
                    &keys[..],
                    &mut self.db_cache.exact_word_prefix_docids,
                    self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
                    universe,
-                    merge_cbo_roaring_bitmaps,
+                    MergeCboRoaringBitmaps,
                )
            }
-            None => DatabaseCache::get_value::<_, _>(
+            None => DatabaseCache::get_value(
                self.txn,
                prefix,
                self.word_interner.get(prefix).as_str(),
@ -405,7 +408,7 @@ impl<'ctx> SearchContext<'ctx> {

                Ok(docids)
            }
-            ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _>(
+            ProximityPrecision::ByWord => DatabaseCache::get_value(
                self.txn,
                (proximity, word1, word2),
                &(
@ -538,7 +541,7 @@ impl<'ctx> SearchContext<'ctx> {
            return Ok(None);
        }

-        DatabaseCache::get_value::<_, _>(
+        DatabaseCache::get_value(
            self.txn,
            (word, fid),
            &(self.word_interner.get(word).as_str(), fid),
@ -559,7 +562,7 @@ impl<'ctx> SearchContext<'ctx> {
            return Ok(None);
        }

-        DatabaseCache::get_value::<_, _>(
+        DatabaseCache::get_value(
            self.txn,
            (word_prefix, fid),
            &(self.word_interner.get(word_prefix).as_str(), fid),
@ -629,7 +632,7 @@ impl<'ctx> SearchContext<'ctx> {
        word: Interned<String>,
        position: u16,
    ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _>(
+        DatabaseCache::get_value(
            self.txn,
            (word, position),
            &(self.word_interner.get(word).as_str(), position),
@ -645,7 +648,7 @@ impl<'ctx> SearchContext<'ctx> {
        word_prefix: Interned<String>,
        position: u16,
    ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _>(
+        DatabaseCache::get_value(
            self.txn,
            (word_prefix, position),
            &(self.word_interner.get(word_prefix).as_str(), position),
--- a/milli/src/update/available_documents_ids.rs
+++ b/milli/src/update/available_documents_ids.rs
@ -3,12 +3,12 @@ use std::ops::RangeInclusive;

 use roaring::bitmap::{IntoIter, RoaringBitmap};

-pub struct AvailableDocumentsIds {
+pub struct AvailableIds {
    iter: Chain<IntoIter, RangeInclusive<u32>>,
 }

-impl AvailableDocumentsIds {
-    pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
+impl AvailableIds {
+    pub fn new(docids: &RoaringBitmap) -> AvailableIds {
        match docids.max() {
            Some(last_id) => {
                let mut available = RoaringBitmap::from_iter(0..last_id);
@ -20,17 +20,17 @@ impl AvailableDocumentsIds {
                    None => 1..=0, // empty range iterator
                };

-                AvailableDocumentsIds { iter: available.into_iter().chain(iter) }
+                AvailableIds { iter: available.into_iter().chain(iter) }
            }
            None => {
                let empty = RoaringBitmap::new().into_iter();
-                AvailableDocumentsIds { iter: empty.chain(0..=u32::MAX) }
+                AvailableIds { iter: empty.chain(0..=u32::MAX) }
            }
        }
    }
 }

-impl Iterator for AvailableDocumentsIds {
+impl Iterator for AvailableIds {
    type Item = u32;

    fn next(&mut self) -> Option<Self::Item> {
@ -45,7 +45,7 @@ mod tests {
    #[test]
    fn empty() {
        let base = RoaringBitmap::new();
-        let left = AvailableDocumentsIds::from_documents_ids(&base);
+        let left = AvailableIds::new(&base);
        let right = 0..=u32::MAX;
        left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
    }
@ -58,7 +58,7 @@ mod tests {
        base.insert(100);
        base.insert(405);

-        let left = AvailableDocumentsIds::from_documents_ids(&base);
+        let left = AvailableIds::new(&base);
        let right = (0..=u32::MAX).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
        left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
    }
--- a/milli/src/update/concurrent_available_ids.rs
+++ b/milli/src/update/concurrent_available_ids.rs
@ -0,0 +1,59 @@
+use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
+
+use roaring::RoaringBitmap;
+
+/// A concurrent ID generator that will never return the same ID twice.
+#[derive(Debug)]
+pub struct ConcurrentAvailableIds {
+    /// The next ID to hand out once there are no other IDs available in `available`.
+    current: AtomicU32,
+    /// The total number of IDs used.
+    used: AtomicU64,
+
+    /// A list of IDs to exhaust before picking IDs from `current`.
+    available: RoaringBitmap,
+    /// The current Nth ID to select in the bitmap.
+    select_in_bitmap: AtomicU32,
+    /// Tells if you should look in the roaring bitmap or if all the IDs are already exhausted.
+    look_into_bitmap: AtomicBool,
+}
+
+impl ConcurrentAvailableIds {
+    /// Creates an ID generator returning unique IDs, avoiding the specified used IDs.
+    ///
+    /// The holes below the largest used ID are handed out first, then the IDs
+    /// following it.
+    pub fn new(used: RoaringBitmap) -> ConcurrentAvailableIds {
+        // `saturating_add` instead of `+ 1`: when `used` contains `u32::MAX` a plain
+        // addition overflows (panic in debug, wrap to 0 in release, which would make
+        // the generator hand out IDs that are already used). In the saturated case
+        // `available` covers every free ID, and the `used` counter in `next` reaches
+        // its limit exactly when the bitmap is exhausted, so `current` is never read.
+        let last_id = used.max().map_or(0, |id| id.saturating_add(1));
+        let used_ids = used.len();
+        let available = RoaringBitmap::from_sorted_iter(0..last_id).unwrap() - used;
+
+        ConcurrentAvailableIds {
+            current: AtomicU32::new(last_id),
+            used: AtomicU64::new(used_ids),
+            select_in_bitmap: AtomicU32::new(0),
+            look_into_bitmap: AtomicBool::new(!available.is_empty()),
+            available,
+        }
+    }
+
+    /// Returns a new unique ID and increase the count of IDs used.
+    ///
+    /// Returns `None` once the count of used IDs exceeds `u32::MAX`, i.e. when
+    /// the whole `u32` ID space has been handed out.
+    pub fn next(&self) -> Option<u32> {
+        if self.used.fetch_add(1, Ordering::Relaxed) > u32::MAX as u64 {
+            None
+        } else if self.look_into_bitmap.load(Ordering::Relaxed) {
+            // Each caller claims a distinct rank in the bitmap of free IDs.
+            let current = self.select_in_bitmap.fetch_add(1, Ordering::Relaxed);
+            match self.available.select(current) {
+                Some(id) => Some(id),
+                None => {
+                    // The bitmap is exhausted: stop consulting it and fall back
+                    // to the monotonically increasing counter.
+                    self.look_into_bitmap.store(false, Ordering::Relaxed);
+                    Some(self.current.fetch_add(1, Ordering::Relaxed))
+                }
+            }
+        } else {
+            Some(self.current.fetch_add(1, Ordering::Relaxed))
+        }
+    }
+
+    /// Returns the number of used ids in total.
+    pub fn used(&self) -> u64 {
+        self.used.load(Ordering::Relaxed)
+    }
+}
--- a/milli/src/update/del_add.rs
+++ b/milli/src/update/del_add.rs
@ -1,7 +1,7 @@
 use obkv::Key;

 pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
-pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;
+pub type KvReaderDelAdd = obkv::KvReader<DelAdd>;

 /// DelAdd defines the new value to add in the database and old value to delete from the database.
 ///
@ -36,7 +36,7 @@ impl Key for DelAdd {
 /// Addition: put all the values under DelAdd::Addition,
 /// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition,
 pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
-    reader: obkv::KvReader<'_, K>,
+    reader: &obkv::KvReader<K>,
    operation: DelAddOperation,
    buffer: &mut Vec<u8>,
 ) -> Result<(), std::io::Error> {
@ -46,7 +46,7 @@ pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
 /// Akin to the [into_del_add_obkv] function but lets you
 /// conditionally define the `DelAdd` variant based on the obkv key.
 pub fn into_del_add_obkv_conditional_operation<K, F>(
-    reader: obkv::KvReader<'_, K>,
+    reader: &obkv::KvReader<K>,
    buffer: &mut Vec<u8>,
    operation: F,
 ) -> std::io::Result<()>
@ -86,8 +86,8 @@ pub enum DelAddOperation {
 /// putting each deletion obkv's keys under an DelAdd::Deletion
 /// and putting each addition obkv's keys under an DelAdd::Addition
 pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
-    deletion: &obkv::KvReader<'_, K>,
-    addition: &obkv::KvReader<'_, K>,
+    deletion: &obkv::KvReader<K>,
+    addition: &obkv::KvReader<K>,
    buffer: &mut Vec<u8>,
 ) -> Result<(), std::io::Error> {
    use itertools::merge_join_by;
@ -121,7 +121,7 @@ pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
    writer.finish()
 }

-pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd<'_>) -> bool {
+pub fn is_noop_del_add_obkv(del_add: &KvReaderDelAdd) -> bool {
    del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
 }

@ -136,5 +136,5 @@ pub fn deladd_serialize_add_side<'a>(
    obkv: &'a [u8],
    _buffer: &mut Vec<u8>,
 ) -> crate::Result<&'a [u8]> {
-    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
+    Ok(KvReaderDelAdd::from_slice(obkv).get(DelAdd::Addition).unwrap_or_default())
 }
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@ -14,7 +14,7 @@ use crate::heed_codec::facet::{
 use crate::heed_codec::BytesRefCodec;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
-use crate::update::MergeFn;
+use crate::update::MergeDeladdCboRoaringBitmaps;
 use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};

 /// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases
@ -29,7 +29,7 @@ pub struct FacetsUpdateBulk<'i> {
    facet_type: FacetType,
    field_ids: Vec<FieldId>,
    // None if level 0 does not need to be updated
-    delta_data: Option<Merger<BufReader<File>, MergeFn>>,
+    delta_data: Option<Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>>,
 }

 impl<'i> FacetsUpdateBulk<'i> {
@ -37,7 +37,7 @@ impl<'i> FacetsUpdateBulk<'i> {
        index: &'i Index,
        field_ids: Vec<FieldId>,
        facet_type: FacetType,
-        delta_data: Merger<BufReader<File>, MergeFn>,
+        delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
        group_size: u8,
        min_level_size: u8,
    ) -> FacetsUpdateBulk<'i> {
@ -90,7 +90,7 @@ impl<'i> FacetsUpdateBulk<'i> {
 /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
 pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
    pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
-    pub delta_data: Option<Merger<R, MergeFn>>,
+    pub delta_data: Option<Merger<R, MergeDeladdCboRoaringBitmaps>>,
    pub group_size: u8,
    pub min_level_size: u8,
 }
@ -135,7 +135,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                if !valid_lmdb_key(key) {
                    continue;
                }
-                let value = KvReaderDelAdd::new(value);
+                let value = KvReaderDelAdd::from_slice(value);

                // DB is empty, it is safe to ignore Del operations
                let Some(value) = value.get(DelAdd::Addition) else {
@ -161,7 +161,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                    continue;
                }

-                let value = KvReaderDelAdd::new(value);
+                let value = KvReaderDelAdd::from_slice(value);

                // the value is a CboRoaringBitmap, but I still need to prepend the
                // group size for level 0 (= 1) to it
--- a/milli/src/update/facet/incremental.rs
+++ b/milli/src/update/facet/incremental.rs
@ -15,7 +15,7 @@ use crate::heed_codec::BytesRefCodec;
 use crate::search::facet::get_highest_level;
 use crate::update::del_add::DelAdd;
 use crate::update::index_documents::valid_lmdb_key;
-use crate::update::MergeFn;
+use crate::update::MergeDeladdCboRoaringBitmaps;
 use crate::{CboRoaringBitmapCodec, Index, Result};

 /// Enum used as a return value for the facet incremental indexing.
@ -57,14 +57,14 @@ enum ModificationResult {
 /// `facet_id_(string/f64)_docids` databases.
 pub struct FacetsUpdateIncremental {
    inner: FacetsUpdateIncrementalInner,
-    delta_data: Merger<BufReader<File>, MergeFn>,
+    delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
 }

 impl FacetsUpdateIncremental {
    pub fn new(
        index: &Index,
        facet_type: FacetType,
-        delta_data: Merger<BufReader<File>, MergeFn>,
+        delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
        group_size: u8,
        min_level_size: u8,
        max_group_size: u8,
@ -109,7 +109,7 @@ impl FacetsUpdateIncremental {
            }
            current_field_id = Some(key.field_id);

-            let value = KvReader::new(value);
+            let value = KvReader::from_slice(value);
            let docids_to_delete = value
                .get(DelAdd::Deletion)
                .map(CboRoaringBitmapCodec::bytes_decode)
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@ -86,12 +86,11 @@ use time::OffsetDateTime;
 use tracing::debug;

 use self::incremental::FacetsUpdateIncremental;
-use super::FacetsUpdateBulk;
+use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps};
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
 use crate::heed_codec::BytesRefCodec;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
-use crate::update::MergeFn;
 use crate::{try_split_array_at, FieldId, Index, Result};

 pub mod bulk;
@ -105,8 +104,8 @@ pub struct FacetsUpdate<'i> {
    index: &'i Index,
    database: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
    facet_type: FacetType,
-    delta_data: Merger<BufReader<File>, MergeFn>,
-    normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
+    delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
+    normalized_delta_data: Option<Merger<BufReader<File>, MergeDeladdBtreesetString>>,
    group_size: u8,
    max_group_size: u8,
    min_level_size: u8,
@ -116,8 +115,8 @@ impl<'i> FacetsUpdate<'i> {
    pub fn new(
        index: &'i Index,
        facet_type: FacetType,
-        delta_data: Merger<BufReader<File>, MergeFn>,
-        normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
+        delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
+        normalized_delta_data: Option<Merger<BufReader<File>, MergeDeladdBtreesetString>>,
        data_size: u64,
    ) -> Self {
        let database = match facet_type {
@ -182,12 +181,12 @@ impl<'i> FacetsUpdate<'i> {

 fn index_facet_search(
    wtxn: &mut heed::RwTxn<'_>,
-    normalized_delta_data: Merger<BufReader<File>, MergeFn>,
+    normalized_delta_data: Merger<BufReader<File>, MergeDeladdBtreesetString>,
    index: &Index,
 ) -> Result<()> {
    let mut iter = normalized_delta_data.into_stream_merger_iter()?;
    while let Some((key_bytes, delta_bytes)) = iter.next()? {
-        let deladd_reader = KvReaderDelAdd::new(delta_bytes);
+        let deladd_reader = KvReaderDelAdd::from_slice(delta_bytes);

        let database_set = index
            .facet_id_normalized_string_strings
@ -298,8 +297,8 @@ pub(crate) mod test_helpers {
    use crate::search::facet::get_highest_level;
    use crate::snapshot_tests::display_bitmap;
    use crate::update::del_add::{DelAdd, KvWriterDelAdd};
-    use crate::update::index_documents::merge_deladd_cbo_roaring_bitmaps;
-    use crate::update::{FacetsUpdateIncrementalInner, MergeFn};
+    use crate::update::index_documents::MergeDeladdCboRoaringBitmaps;
+    use crate::update::FacetsUpdateIncrementalInner;
    use crate::CboRoaringBitmapCodec;

    /// Utility function to generate a string whose position in a lexicographically
@ -484,7 +483,7 @@ pub(crate) mod test_helpers {
            }
            writer.finish().unwrap();
            let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
-            let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
+            let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
            builder.push(reader.into_cursor().unwrap());
            let merger = builder.build();

--- a/milli/src/update/index_documents/enrich.rs
+++ b/milli/src/update/index_documents/enrich.rs
@ -47,7 +47,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
                return match cursor.next_document()? {
                    Some(first_document) => Ok(Err(UserError::MissingDocumentId {
                        primary_key: primary_key.to_string(),
-                        document: obkv_to_object(&first_document, &documents_batch_index)?,
+                        document: obkv_to_object(first_document, &documents_batch_index)?,
                    })),
                    None => unreachable!("Called with reader.is_empty()"),
                };
@ -106,7 +106,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
    let mut count = 0;
    while let Some(document) = cursor.next_document()? {
        let document_id = match fetch_or_generate_document_id(
-            &document,
+            document,
            &documents_batch_index,
            primary_key,
            autogenerate_docids,
@ -145,7 +145,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
 #[tracing::instrument(level = "trace", skip(uuid_buffer, documents_batch_index, document)
 target = "indexing::documents")]
 fn fetch_or_generate_document_id(
-    document: &obkv::KvReader<'_, FieldId>,
+    document: &obkv::KvReader<FieldId>,
    documents_batch_index: &DocumentsBatchIndex,
    primary_key: PrimaryKey<'_>,
    autogenerate_docids: bool,
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@ -8,7 +8,7 @@ use obkv::{KvReader, KvWriterU16};
 use roaring::RoaringBitmap;
 use serde_json::Value;

-use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
+use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepLatestObkv};
 use crate::error::{InternalError, SerializationError};
 use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
 use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
@ -35,7 +35,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    let mut documents_ids = RoaringBitmap::new();
    let mut docid_word_positions_sorter = create_sorter(
        grenad::SortAlgorithm::Stable,
-        keep_latest_obkv,
+        KeepLatestObkv,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -80,10 +80,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
            .try_into()
            .map(u32::from_be_bytes)
            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-        let obkv = KvReader::<FieldId>::new(value);
+        let obkv = KvReader::<FieldId>::from_slice(value);

        // if the searchable fields didn't change, skip the searchable indexing for this document.
-        if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
+        if !force_reindexing && !searchable_fields_changed(obkv, settings_diff) {
            continue;
        }

@ -98,7 +98,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
            || {
                // deletions
                tokens_from_document(
-                    &obkv,
+                    obkv,
                    &settings_diff.old,
                    &del_tokenizer,
                    max_positions_per_attributes,
@ -109,7 +109,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
            || {
                // additions
                tokens_from_document(
-                    &obkv,
+                    obkv,
                    &settings_diff.new,
                    &add_tokenizer,
                    max_positions_per_attributes,
@ -126,13 +126,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
        // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
        value_buffer.clear();
        del_add_from_two_obkvs(
-            &KvReader::<FieldId>::new(del_obkv),
-            &KvReader::<FieldId>::new(add_obkv),
+            KvReader::<FieldId>::from_slice(del_obkv),
+            KvReader::<FieldId>::from_slice(add_obkv),
            &mut value_buffer,
        )?;

        // write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
-        let obkv = KvReader::<FieldId>::new(&value_buffer);
+        let obkv = KvReader::<FieldId>::from_slice(&value_buffer);
        for (field_id, value) in obkv.iter() {
            key_buffer.truncate(mem::size_of::<u32>());
            key_buffer.extend_from_slice(&field_id.to_be_bytes());
@ -146,13 +146,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(

 /// Check if any searchable fields of a document changed.
 fn searchable_fields_changed(
-    obkv: &KvReader<'_, FieldId>,
+    obkv: &KvReader<FieldId>,
    settings_diff: &InnerIndexSettingsDiff,
 ) -> bool {
    let searchable_fields = &settings_diff.new.searchable_fields_ids;
    for (field_id, field_bytes) in obkv.iter() {
        if searchable_fields.contains(&field_id) {
-            let del_add = KvReaderDelAdd::new(field_bytes);
+            let del_add = KvReaderDelAdd::from_slice(field_bytes);
            match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
                // if both fields are None, check the next field.
                (None, None) => (),
@ -189,7 +189,7 @@ fn tokenizer_builder<'a>(

 /// Extract words mapped with their positions of a document.
 fn tokens_from_document<'a>(
-    obkv: &KvReader<'a, FieldId>,
+    obkv: &'a KvReader<FieldId>,
    settings: &InnerIndexSettings,
    tokenizer: &Tokenizer<'_>,
    max_positions_per_attributes: u32,
@ -202,7 +202,7 @@ fn tokens_from_document<'a>(
        // if field is searchable.
        if settings.searchable_fields_ids.contains(&field_id) {
            // extract deletion or addition only.
-            if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
+            if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) {
                // parse json.
                let value =
                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
--- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
@ -4,7 +4,7 @@ use std::io::{self, BufReader};
 use heed::{BytesDecode, BytesEncode};

 use super::helpers::{
-    create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
+    create_sorter, sorter_into_reader, GrenadParameters, MergeDeladdCboRoaringBitmaps,
 };
 use crate::heed_codec::facet::{
    FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
@ -27,7 +27,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(

    let mut facet_number_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Unstable,
-        merge_deladd_cbo_roaring_bitmaps,
+        MergeDeladdCboRoaringBitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -45,7 +45,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(

        buffer.clear();
        let mut obkv = KvWriterDelAdd::new(&mut buffer);
-        for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
+        for (deladd_key, _) in KvReaderDelAdd::from_slice(deladd_obkv_bytes).iter() {
            obkv.insert(deladd_key, document_id.to_ne_bytes())?;
        }
        obkv.finish()?;
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@ -15,7 +15,7 @@ use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
 use crate::localized_attributes_rules::LocalizedFieldIds;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::helpers::{
-    merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
+    MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
 };
 use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
@ -56,7 +56,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(

    let mut facet_string_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Stable,
-        merge_deladd_cbo_roaring_bitmaps,
+        MergeDeladdCboRoaringBitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -65,7 +65,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(

    let mut normalized_facet_string_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Stable,
-        merge_deladd_btreeset_string,
+        MergeDeladdBtreesetString,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -75,7 +75,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
    let mut buffer = Vec::new();
    let mut cursor = docid_fid_facet_string.into_cursor()?;
    while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
-        let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
+        let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes);

        let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
            && deladd_reader.get(DelAdd::Addition).is_some();
@ -144,7 +144,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(

    let mut facet_string_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Stable,
-        merge_deladd_cbo_roaring_bitmaps,
+        MergeDeladdCboRoaringBitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -153,7 +153,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(

    let mut normalized_facet_string_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Stable,
-        merge_deladd_btreeset_string,
+        MergeDeladdBtreesetString,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -163,7 +163,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
    let mut buffer = Vec::new();
    let mut cursor = docid_fid_facet_string.into_cursor()?;
    while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
-        let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
+        let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes);

        let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
            && deladd_reader.get(DelAdd::Addition).is_some();
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@ -1,10 +1,8 @@
-use std::borrow::Cow;
 use std::collections::{BTreeMap, BTreeSet};
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::{self, BufReader};
 use std::mem::size_of;
-use std::result::Result as StdResult;

 use bytemuck::bytes_of;
 use grenad::Sorter;
@ -15,13 +13,13 @@ use roaring::RoaringBitmap;
 use serde_json::{from_slice, Value};
 use FilterableValues::{Empty, Null, Values};

-use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
+use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepFirst};
 use crate::error::InternalError;
 use crate::facet::value_encoding::f64_into_bytes;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{create_writer, writer_into_reader};
 use crate::update::settings::InnerIndexSettingsDiff;
-use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
+use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, MAX_FACET_VALUE_LENGTH};

 /// The length of the elements that are always in the buffer when inserting new values.
 const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
@ -50,7 +48,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(

    let mut fid_docid_facet_numbers_sorter = create_sorter(
        grenad::SortAlgorithm::Stable,
-        keep_first,
+        KeepFirst,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -59,7 +57,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(

    let mut fid_docid_facet_strings_sorter = create_sorter(
        grenad::SortAlgorithm::Stable,
-        keep_first,
+        KeepFirst,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -83,10 +81,10 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
    if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
        let mut cursor = obkv_documents.into_cursor()?;
        while let Some((docid_bytes, value)) = cursor.move_on_next()? {
-            let obkv = obkv::KvReader::new(value);
+            let obkv = obkv::KvReader::from_slice(value);
            let get_document_json_value = move |field_id, side| {
                obkv.get(field_id)
-                    .map(KvReaderDelAdd::new)
+                    .map(KvReaderDelAdd::from_slice)
                    .and_then(|kv| kv.get(side))
                    .map(from_slice)
                    .transpose()
@ -330,15 +328,12 @@ fn truncate_str(s: &str) -> &str {

 /// Computes the diff between both Del and Add numbers and
 /// only inserts the parts that differ in the sorter.
-fn insert_numbers_diff<MF>(
-    fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
+fn insert_numbers_diff(
+    fid_docid_facet_numbers_sorter: &mut Sorter<KeepFirst>,
    key_buffer: &mut Vec<u8>,
    mut del_numbers: Vec<f64>,
    mut add_numbers: Vec<f64>,
-) -> Result<()>
-where
-    MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
-{
+) -> Result<()> {
    // We sort and dedup the float numbers
    del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
    add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
@ -390,15 +385,12 @@ where

 /// Computes the diff between both Del and Add strings and
 /// only inserts the parts that differ in the sorter.
-fn insert_strings_diff<MF>(
-    fid_docid_facet_strings_sorter: &mut Sorter<MF>,
+fn insert_strings_diff(
+    fid_docid_facet_strings_sorter: &mut Sorter<KeepFirst>,
    key_buffer: &mut Vec<u8>,
    mut del_strings: Vec<(String, String)>,
    mut add_strings: Vec<(String, String)>,
-) -> Result<()>
-where
-    MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
-{
+) -> Result<()> {
    // We sort and dedup the normalized and original strings
    del_strings.sort_unstable();
    add_strings.sort_unstable();
--- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@ -4,8 +4,8 @@ use std::io::{self, BufReader};
 use obkv::KvReaderU16;

 use super::helpers::{
-    create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
-    GrenadParameters,
+    create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
+    MergeDeladdCboRoaringBitmaps,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
@ -30,7 +30,7 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(

    let mut fid_word_count_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Unstable,
-        merge_deladd_cbo_roaring_bitmaps,
+        MergeDeladdCboRoaringBitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -45,19 +45,23 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
        let document_id = u32::from_be_bytes(document_id_bytes);

-        let del_add_reader = KvReaderDelAdd::new(value);
+        let del_add_reader = KvReaderDelAdd::from_slice(value);
        let deletion = del_add_reader
            // get deleted words
            .get(DelAdd::Deletion)
            // count deleted words
-            .map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count())
+            .map(|deletion| {
+                KvReaderU16::from_slice(deletion).iter().take(MAX_COUNTED_WORDS + 1).count()
+            })
            // keep the count if under or equal to MAX_COUNTED_WORDS
            .filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
        let addition = del_add_reader
            // get added words
            .get(DelAdd::Addition)
            // count added words
-            .map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count())
+            .map(|addition| {
+                KvReaderU16::from_slice(addition).iter().take(MAX_COUNTED_WORDS + 1).count()
+            })
            // keep the count if under or equal to MAX_COUNTED_WORDS
            .filter(|&word_count| word_count <= MAX_COUNTED_WORDS);

--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@ -29,22 +29,20 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(

    let mut cursor = obkv_documents.into_cursor()?;
    while let Some((docid_bytes, value)) = cursor.move_on_next()? {
-        let obkv = obkv::KvReader::new(value);
+        let obkv = obkv::KvReader::from_slice(value);
        // since we only need the primary key when we throw an error
        // we create this getter to lazily get it when needed
        let document_id = || -> Value {
-            let reader = KvReaderDelAdd::new(obkv.get(primary_key_id).unwrap());
+            let reader = KvReaderDelAdd::from_slice(obkv.get(primary_key_id).unwrap());
            let document_id =
                reader.get(DelAdd::Deletion).or(reader.get(DelAdd::Addition)).unwrap();
            serde_json::from_slice(document_id).unwrap()
        };

        // extract old version
-        let del_lat_lng =
-            extract_lat_lng(&obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
+        let del_lat_lng = extract_lat_lng(obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
        // extract new version
-        let add_lat_lng =
-            extract_lat_lng(&obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
+        let add_lat_lng = extract_lat_lng(obkv, &settings_diff.new, DelAdd::Addition, document_id)?;

        if del_lat_lng != add_lat_lng {
            let mut obkv = KvWriterDelAdd::memory();
@ -68,15 +66,17 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(

 /// Extract the finite floats lat and lng from two bytes slices.
 fn extract_lat_lng(
-    document: &obkv::KvReader<'_, FieldId>,
+    document: &obkv::KvReader<FieldId>,
    settings: &InnerIndexSettings,
    deladd: DelAdd,
    document_id: impl Fn() -> Value,
 ) -> Result<Option<[f64; 2]>> {
    match settings.geo_fields_ids {
        Some((lat_fid, lng_fid)) => {
-            let lat = document.get(lat_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
-            let lng = document.get(lng_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
+            let lat =
+                document.get(lat_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd));
+            let lng =
+                document.get(lng_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd));
            let (lat, lng) = match (lat, lng) {
                (Some(lat), Some(lng)) => (lat, lng),
                (Some(_), None) => {
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@ -313,7 +313,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
        debug_assert!(from_utf8(external_id_bytes).is_ok());
        let docid = DocumentId::from_be_bytes(docid_bytes);

-        let obkv = obkv::KvReader::new(value);
+        let obkv = obkv::KvReader::from_slice(value);
        key_buffer.clear();
        key_buffer.extend_from_slice(docid_bytes.as_slice());

@ -481,7 +481,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
 #[allow(clippy::too_many_arguments)] // feel free to find efficient way to factor arguments
 fn extract_vector_document_diff(
    docid: DocumentId,
-    obkv: obkv::KvReader<'_, FieldId>,
+    obkv: &obkv::KvReader<FieldId>,
    prompt: &Prompt,
    (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
    (old, new): (VectorState, VectorState),
@ -526,7 +526,7 @@ fn extract_vector_document_diff(
            // Do we keep this document?
            let document_is_kept = obkv
                .iter()
-                .map(|(_, deladd)| KvReaderDelAdd::new(deladd))
+                .map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
                .any(|deladd| deladd.get(DelAdd::Addition).is_some());

            if document_is_kept {
@ -562,7 +562,7 @@ fn extract_vector_document_diff(
            // Do we keep this document?
            let document_is_kept = obkv
                .iter()
-                .map(|(_, deladd)| KvReaderDelAdd::new(deladd))
+                .map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
                .any(|deladd| deladd.get(DelAdd::Addition).is_some());
            if document_is_kept {
                if embedder_is_manual {
@ -588,7 +588,7 @@ fn extract_vector_document_diff(
            // Do we keep this document?
            let document_is_kept = obkv
                .iter()
-                .map(|(_, deladd)| KvReaderDelAdd::new(deladd))
+                .map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
                .any(|deladd| deladd.get(DelAdd::Addition).is_some());
            if document_is_kept {
                // if the new version of documents has the vectors in the DB,
@ -606,7 +606,7 @@ fn extract_vector_document_diff(
 }

 fn regenerate_if_prompt_changed(
-    obkv: obkv::KvReader<'_, FieldId>,
+    obkv: &obkv::KvReader<FieldId>,
    (old_prompt, new_prompt): (&Prompt, &Prompt),
    (old_fields_ids_map, new_fields_ids_map): (
        &FieldsIdsMapWithMetadata,
@ -624,7 +624,7 @@ fn regenerate_if_prompt_changed(
 }

 fn regenerate_prompt(
-    obkv: obkv::KvReader<'_, FieldId>,
+    obkv: &obkv::KvReader<FieldId>,
    prompt: &Prompt,
    new_fields_ids_map: &FieldsIdsMapWithMetadata,
 ) -> Result<VectorStateDelta> {
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@ -7,8 +7,8 @@ use obkv::KvReaderU16;
 use roaring::RoaringBitmap;

 use super::helpers::{
-    create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
-    writer_into_reader, GrenadParameters,
+    create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
+    MergeDeladdCboRoaringBitmaps,
 };
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
@ -16,7 +16,6 @@ use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::helpers::sorter_into_reader;
 use crate::update::settings::InnerIndexSettingsDiff;
-use crate::update::MergeFn;
 use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};

 /// Extracts the word and the documents ids where this word appear.
@ -40,7 +39,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(

    let mut word_fid_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Unstable,
-        merge_deladd_cbo_roaring_bitmaps,
+        MergeDeladdCboRoaringBitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -58,17 +57,17 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
        let document_id = u32::from_be_bytes(document_id_bytes);
        let fid = u16::from_be_bytes(fid_bytes);

-        let del_add_reader = KvReaderDelAdd::new(value);
+        let del_add_reader = KvReaderDelAdd::from_slice(value);
        // extract all unique words to remove.
        if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
-            for (_pos, word) in KvReaderU16::new(deletion).iter() {
+            for (_pos, word) in KvReaderU16::from_slice(deletion).iter() {
                del_words.insert(word.to_vec());
            }
        }

        // extract all unique additional words.
        if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
-            for (_pos, word) in KvReaderU16::new(addition).iter() {
+            for (_pos, word) in KvReaderU16::from_slice(addition).iter() {
                add_words.insert(word.to_vec());
            }
        }
@ -94,7 +93,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(

    let mut word_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Unstable,
-        merge_deladd_cbo_roaring_bitmaps,
+        MergeDeladdCboRoaringBitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -103,7 +102,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(

    let mut exact_word_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Unstable,
-        merge_deladd_cbo_roaring_bitmaps,
+        MergeDeladdCboRoaringBitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -115,7 +114,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
    // NOTE: replacing sorters by bitmap merging is less efficient, so, use sorters.
    while let Some((key, value)) = iter.next()? {
        // only keep the value if their is a change to apply in the DB.
-        if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
+        if !is_noop_del_add_obkv(KvReaderDelAdd::from_slice(value)) {
            word_fid_docids_writer.insert(key, value)?;
        }

@ -123,7 +122,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
            .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;

        // merge all deletions
-        let obkv = KvReaderDelAdd::new(value);
+        let obkv = KvReaderDelAdd::from_slice(value);
        if let Some(value) = obkv.get(DelAdd::Deletion) {
            let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
            buffer.clear();
@ -163,7 +162,7 @@ fn words_into_sorter(
    key_buffer: &mut Vec<u8>,
    del_words: &BTreeSet<Vec<u8>>,
    add_words: &BTreeSet<Vec<u8>>,
-    word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
+    word_fid_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
 ) -> Result<()> {
    use itertools::merge_join_by;
    use itertools::EitherOrBoth::{Both, Left, Right};
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@ -6,8 +6,8 @@ use std::{cmp, io};
 use obkv::KvReaderU16;

 use super::helpers::{
-    create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
-    writer_into_reader, GrenadParameters, MergeFn,
+    create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
+    MergeDeladdCboRoaringBitmaps,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
@ -44,7 +44,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
        .map(|_| {
            create_sorter(
                grenad::SortAlgorithm::Unstable,
-                merge_deladd_cbo_roaring_bitmaps,
+                MergeDeladdCboRoaringBitmaps,
                indexer.chunk_compression_type,
                indexer.chunk_compression_level,
                indexer.max_nb_chunks,
@ -92,8 +92,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
                }

                // deletions
-                if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
-                    for (position, word) in KvReaderU16::new(deletion).iter() {
+                if let Some(deletion) = KvReaderDelAdd::from_slice(value).get(DelAdd::Deletion) {
+                    for (position, word) in KvReaderU16::from_slice(deletion).iter() {
                        // drain the proximity window until the head word is considered close to the word we are inserting.
                        while del_word_positions.front().map_or(false, |(_w, p)| {
                            index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
@ -125,8 +125,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
                }

                // additions
-                if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
-                    for (position, word) in KvReaderU16::new(addition).iter() {
+                if let Some(addition) = KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
+                    for (position, word) in KvReaderU16::from_slice(addition).iter() {
                        // drain the proximity window until the head word is considered close to the word we are inserting.
                        while add_word_positions.front().map_or(false, |(_w, p)| {
                            index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
@ -197,7 +197,7 @@ fn document_word_positions_into_sorter(
    document_id: DocumentId,
    del_word_pair_proximity: &BTreeMap<(String, String), u8>,
    add_word_pair_proximity: &BTreeMap<(String, String), u8>,
-    word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>],
+    word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeDeladdCboRoaringBitmaps>],
 ) -> Result<()> {
    use itertools::merge_join_by;
    use itertools::EitherOrBoth::{Both, Left, Right};
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@ -5,14 +5,13 @@ use std::io::{self, BufReader};
 use obkv::KvReaderU16;

 use super::helpers::{
-    create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
-    GrenadParameters,
+    create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
+    MergeDeladdCboRoaringBitmaps,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::settings::InnerIndexSettingsDiff;
-use crate::update::MergeFn;
 use crate::{bucketed_position, DocumentId, Result};

 /// Extracts the word positions and the documents ids where this word appear.
@ -29,7 +28,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(

    let mut word_position_docids_sorter = create_sorter(
        grenad::SortAlgorithm::Unstable,
-        merge_deladd_cbo_roaring_bitmaps,
+        MergeDeladdCboRoaringBitmaps,
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
        indexer.max_nb_chunks,
@ -60,10 +59,10 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(

        current_document_id = Some(document_id);

-        let del_add_reader = KvReaderDelAdd::new(value);
+        let del_add_reader = KvReaderDelAdd::from_slice(value);
        // extract all unique words to remove.
        if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
-            for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
+            for (position, word_bytes) in KvReaderU16::from_slice(deletion).iter() {
                let position = bucketed_position(position);
                del_word_positions.insert((position, word_bytes.to_vec()));
            }
@ -71,7 +70,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(

        // extract all unique additional words.
        if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
-            for (position, word_bytes) in KvReaderU16::new(addition).iter() {
+            for (position, word_bytes) in KvReaderU16::from_slice(addition).iter() {
                let position = bucketed_position(position);
                add_word_positions.insert((position, word_bytes.to_vec()));
            }
@ -100,7 +99,7 @@ fn words_position_into_sorter(
    key_buffer: &mut Vec<u8>,
    del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
    add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
-    word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
+    word_position_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
 ) -> Result<()> {
    use itertools::merge_join_by;
    use itertools::EitherOrBoth::{Both, Left, Right};
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@ -1,11 +1,10 @@
-use std::borrow::Cow;
 use std::fs::File;
 use std::io::{self, BufReader, BufWriter, Seek};

-use grenad::{CompressionType, Sorter};
+use grenad::{CompressionType, MergeFunction, Sorter};
 use heed::types::Bytes;

-use super::{ClonableMmap, MergeFn};
+use super::ClonableMmap;
 use crate::update::index_documents::valid_lmdb_key;
 use crate::Result;

@ -31,14 +30,14 @@ pub fn create_writer<R: io::Write>(
 /// A helper function that creates a grenad sorter
 /// with the given parameters. The max memory is
 /// clamped to something reasonable.
-pub fn create_sorter(
+pub fn create_sorter<MF: MergeFunction>(
    sort_algorithm: grenad::SortAlgorithm,
-    merge: MergeFn,
+    merge: MF,
    chunk_compression_type: grenad::CompressionType,
    chunk_compression_level: Option<u32>,
    max_nb_chunks: Option<usize>,
    max_memory: Option<usize>,
-) -> grenad::Sorter<MergeFn> {
+) -> grenad::Sorter<MF> {
    let mut builder = grenad::Sorter::builder(merge);
    builder.chunk_compression_type(chunk_compression_type);
    if let Some(level) = chunk_compression_level {
@ -57,10 +56,14 @@ pub fn create_sorter(
 }

 #[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
-pub fn sorter_into_reader(
-    sorter: grenad::Sorter<MergeFn>,
+pub fn sorter_into_reader<MF>(
+    sorter: grenad::Sorter<MF>,
    indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
+) -> Result<grenad::Reader<BufReader<File>>>
+where
+    MF: MergeFunction,
+    crate::Error: From<MF::Error>,
+{
    let mut writer = create_writer(
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
@ -169,8 +172,8 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
 /// Write provided sorter in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
-pub fn write_sorter_into_database<K, V, FS, FM>(
-    sorter: Sorter<MergeFn>,
+pub fn write_sorter_into_database<K, V, FS, FM, MF>(
+    sorter: Sorter<MF>,
    database: &heed::Database<K, V>,
    wtxn: &mut heed::RwTxn<'_>,
    index_is_empty: bool,
@ -180,6 +183,8 @@ pub fn write_sorter_into_database<K, V, FS, FM>(
 where
    FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
    FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
+    MF: MergeFunction,
+    crate::Error: From<MF::Error>,
 {
    let mut buffer = Vec::new();
    let database = database.remap_types::<Bytes, Bytes>();
@ -207,8 +212,3 @@ where

    Ok(())
 }
-
-/// Used when trying to merge readers, but you don't actually care about the values.
-pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
-    Ok(Cow::Owned(Vec::new()))
-}
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@ -3,6 +3,8 @@ use std::collections::BTreeSet;
 use std::io;
 use std::result::Result as StdResult;

+use either::Either;
+use grenad::MergeFunction;
 use roaring::RoaringBitmap;

 use crate::heed_codec::CboRoaringBitmapCodec;
@ -10,7 +12,8 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::transform::Operation;
 use crate::Result;

-pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
+pub type EitherObkvMerge =
+    Either<ObkvsKeepLastAdditionMergeDeletions, ObkvsMergeAdditionsAndDeletions>;

 pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
    buffer.clear();
@ -18,35 +21,53 @@ pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) ->
    bitmap.serialize_into(buffer)
 }

-pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
-    if values.len() == 1 {
-        Ok(values[0].clone())
-    } else {
-        let merged = values
-            .iter()
-            .map(AsRef::as_ref)
-            .map(RoaringBitmap::deserialize_from)
-            .map(StdResult::unwrap)
-            .reduce(|a, b| a | b)
-            .unwrap();
-        let mut buffer = Vec::new();
-        serialize_roaring_bitmap(&merged, &mut buffer)?;
-        Ok(Cow::Owned(buffer))
+pub struct MergeRoaringBitmaps;
+
+impl MergeFunction for MergeRoaringBitmaps {
+    type Error = crate::Error;
+
+    fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+        if values.len() == 1 {
+            Ok(values[0].clone())
+        } else {
+            let merged = values
+                .iter()
+                .map(AsRef::as_ref)
+                .map(RoaringBitmap::deserialize_from)
+                .map(StdResult::unwrap)
+                .reduce(|a, b| a | b)
+                .unwrap();
+            let mut buffer = Vec::new();
+            serialize_roaring_bitmap(&merged, &mut buffer)?;
+            Ok(Cow::Owned(buffer))
+        }
    }
 }

-pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
-    Ok(values[0].clone())
+pub struct KeepFirst;
+
+impl MergeFunction for KeepFirst {
+    type Error = crate::Error;
+
+    fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+        Ok(values[0].clone())
+    }
 }

 /// Only the last value associated with an id is kept.
-pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
-    Ok(obkvs.last().unwrap().clone())
+pub struct KeepLatestObkv;
+
+impl MergeFunction for KeepLatestObkv {
+    type Error = crate::Error;
+
+    fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+        Ok(obkvs.last().unwrap().clone())
+    }
 }

 pub fn merge_two_del_add_obkvs(
-    base: obkv::KvReaderU16<'_>,
-    update: obkv::KvReaderU16<'_>,
+    base: &obkv::KvReaderU16,
+    update: &obkv::KvReaderU16,
    merge_additions: bool,
    buffer: &mut Vec<u8>,
 ) {
@ -66,7 +87,7 @@ pub fn merge_two_del_add_obkvs(
                    // If merge_additions is false, recreate an obkv keeping the deletions only.
                    value_buffer.clear();
                    let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
-                    let base_reader = KvReaderDelAdd::new(v);
+                    let base_reader = KvReaderDelAdd::from_slice(v);

                    if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
                        value_writer.insert(DelAdd::Deletion, deletion).unwrap();
@ -80,8 +101,8 @@ pub fn merge_two_del_add_obkvs(
                // merge deletions and additions.
                value_buffer.clear();
                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
-                let base_reader = KvReaderDelAdd::new(base);
-                let update_reader = KvReaderDelAdd::new(update);
+                let base_reader = KvReaderDelAdd::from_slice(base);
+                let update_reader = KvReaderDelAdd::from_slice(update);

                // keep newest deletion.
                if let Some(deletion) = update_reader
@ -131,8 +152,8 @@ fn inner_merge_del_add_obkvs<'a>(
            break;
        }

-        let newest = obkv::KvReader::new(&acc);
-        let oldest = obkv::KvReader::new(&current[1..]);
+        let newest = obkv::KvReader::from_slice(&acc);
+        let oldest = obkv::KvReader::from_slice(&current[1..]);
        merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);

        // we want the result of the merge into our accumulator.
@ -145,65 +166,79 @@ fn inner_merge_del_add_obkvs<'a>(
 }

 /// Merge all the obkvs from the newest to the oldest.
-pub fn obkvs_merge_additions_and_deletions<'a>(
-    _key: &[u8],
-    obkvs: &[Cow<'a, [u8]>],
-) -> Result<Cow<'a, [u8]>> {
-    inner_merge_del_add_obkvs(obkvs, true)
+#[derive(Copy, Clone)]
+pub struct ObkvsMergeAdditionsAndDeletions;
+
+impl MergeFunction for ObkvsMergeAdditionsAndDeletions {
+    type Error = crate::Error;
+
+    fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+        inner_merge_del_add_obkvs(obkvs, true)
+    }
 }

 /// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
-pub fn obkvs_keep_last_addition_merge_deletions<'a>(
-    _key: &[u8],
-    obkvs: &[Cow<'a, [u8]>],
-) -> Result<Cow<'a, [u8]>> {
-    inner_merge_del_add_obkvs(obkvs, false)
+#[derive(Copy, Clone)]
+pub struct ObkvsKeepLastAdditionMergeDeletions;
+
+impl MergeFunction for ObkvsKeepLastAdditionMergeDeletions {
+    type Error = crate::Error;
+
+    fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+        inner_merge_del_add_obkvs(obkvs, false)
+    }
 }

 /// Do a union of all the CboRoaringBitmaps in the values.
-pub fn merge_cbo_roaring_bitmaps<'a>(
-    _key: &[u8],
-    values: &[Cow<'a, [u8]>],
-) -> Result<Cow<'a, [u8]>> {
-    if values.len() == 1 {
-        Ok(values[0].clone())
-    } else {
-        let mut vec = Vec::new();
-        CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
-        Ok(Cow::from(vec))
+pub struct MergeCboRoaringBitmaps;
+
+impl MergeFunction for MergeCboRoaringBitmaps {
+    type Error = crate::Error;
+
+    fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+        if values.len() == 1 {
+            Ok(values[0].clone())
+        } else {
+            let mut vec = Vec::new();
+            CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
+            Ok(Cow::from(vec))
+        }
    }
 }

 /// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
 /// separately and outputs a new DelAdd with both unions.
-pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
-    _key: &[u8],
-    values: &[Cow<'a, [u8]>],
-) -> Result<Cow<'a, [u8]>> {
-    if values.len() == 1 {
-        Ok(values[0].clone())
-    } else {
-        // Retrieve the bitmaps from both sides
-        let mut del_bitmaps_bytes = Vec::new();
-        let mut add_bitmaps_bytes = Vec::new();
-        for value in values {
-            let obkv = KvReaderDelAdd::new(value);
-            if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
-                del_bitmaps_bytes.push(bitmap_bytes);
-            }
-            if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
-                add_bitmaps_bytes.push(bitmap_bytes);
-            }
-        }
+pub struct MergeDeladdCboRoaringBitmaps;

-        let mut output_deladd_obkv = KvWriterDelAdd::memory();
-        let mut buffer = Vec::new();
-        CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
-        output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
-        buffer.clear();
-        CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
-        output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
-        output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
+impl MergeFunction for MergeDeladdCboRoaringBitmaps {
+    type Error = crate::Error;
+
+    fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+        if values.len() == 1 {
+            Ok(values[0].clone())
+        } else {
+            // Retrieve the bitmaps from both sides
+            let mut del_bitmaps_bytes = Vec::new();
+            let mut add_bitmaps_bytes = Vec::new();
+            for value in values {
+                let obkv = KvReaderDelAdd::from_slice(value);
+                if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
+                    del_bitmaps_bytes.push(bitmap_bytes);
+                }
+                if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
+                    add_bitmaps_bytes.push(bitmap_bytes);
+                }
+            }
+
+            let mut output_deladd_obkv = KvWriterDelAdd::memory();
+            let mut buffer = Vec::new();
+            CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
+            output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
+            buffer.clear();
+            CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
+            output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
+            output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
+        }
    }
 }

@ -217,7 +252,7 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
    buffer: &'a mut Vec<u8>,
 ) -> Result<Option<&'a [u8]>> {
    Ok(CboRoaringBitmapCodec::merge_deladd_into(
-        KvReaderDelAdd::new(deladd_obkv),
+        KvReaderDelAdd::from_slice(deladd_obkv),
        previous,
        buffer,
    )?)
@ -225,37 +260,55 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(

 /// Do a union of BtreeSet on both sides of a DelAdd obkv
 /// separately and outputs a new DelAdd with both unions.
-pub fn merge_deladd_btreeset_string<'a>(
-    _key: &[u8],
-    values: &[Cow<'a, [u8]>],
-) -> Result<Cow<'a, [u8]>> {
-    if values.len() == 1 {
-        Ok(values[0].clone())
-    } else {
-        // Retrieve the bitmaps from both sides
-        let mut del_set = BTreeSet::new();
-        let mut add_set = BTreeSet::new();
-        for value in values {
-            let obkv = KvReaderDelAdd::new(value);
-            if let Some(bytes) = obkv.get(DelAdd::Deletion) {
-                let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
-                for value in set {
-                    del_set.insert(value);
-                }
-            }
-            if let Some(bytes) = obkv.get(DelAdd::Addition) {
-                let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
-                for value in set {
-                    add_set.insert(value);
-                }
-            }
-        }
+pub struct MergeDeladdBtreesetString;

-        let mut output_deladd_obkv = KvWriterDelAdd::memory();
-        let del = serde_json::to_vec(&del_set).unwrap();
-        output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
-        let add = serde_json::to_vec(&add_set).unwrap();
-        output_deladd_obkv.insert(DelAdd::Addition, &add)?;
-        output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
+impl MergeFunction for MergeDeladdBtreesetString {
+    type Error = crate::Error;
+
+    fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+        if values.len() == 1 {
+            Ok(values[0].clone())
+        } else {
+            // Retrieve the string sets from both sides
+            let mut del_set = BTreeSet::new();
+            let mut add_set = BTreeSet::new();
+            for value in values {
+                let obkv = KvReaderDelAdd::from_slice(value);
+                if let Some(bytes) = obkv.get(DelAdd::Deletion) {
+                    let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
+                    for value in set {
+                        del_set.insert(value);
+                    }
+                }
+                if let Some(bytes) = obkv.get(DelAdd::Addition) {
+                    let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
+                    for value in set {
+                        add_set.insert(value);
+                    }
+                }
+            }
+
+            let mut output_deladd_obkv = KvWriterDelAdd::memory();
+            let del = serde_json::to_vec(&del_set).unwrap();
+            output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
+            let add = serde_json::to_vec(&add_set).unwrap();
+            output_deladd_obkv.insert(DelAdd::Addition, &add)?;
+            output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
+        }
+    }
+}
+
+/// Used when trying to merge readers, but you don't actually care about the values.
+pub struct MergeIgnoreValues;
+
+impl MergeFunction for MergeIgnoreValues {
+    type Error = crate::Error;
+
+    fn merge<'a>(
+        &self,
+        _key: &[u8],
+        _values: &[Cow<'a, [u8]>],
+    ) -> std::result::Result<Cow<'a, [u8]>, Self::Error> {
+        Ok(Cow::Owned(Vec::new()))
    }
 }
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@ -7,17 +7,8 @@ use std::convert::{TryFrom, TryInto};

 pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
-pub use grenad_helpers::{
-    as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
-    merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
-    GrenadParameters,
-};
-pub use merge_functions::{
-    keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_deladd_btreeset_string,
-    merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
-    merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
-    obkvs_merge_additions_and_deletions, MergeFn,
-};
+pub use grenad_helpers::*;
+pub use merge_functions::*;

 use crate::MAX_WORD_LENGTH;

--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -27,13 +27,7 @@ use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};

 use self::enrich::enrich_documents_batch;
 pub use self::enrich::{extract_finite_float_from_value, DocumentId};
-pub use self::helpers::{
-    as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
-    fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
-    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
-    valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn,
-};
-use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
+pub use self::helpers::*;
 pub use self::transform::{Transform, TransformOutput};
 use crate::documents::{obkv_to_object, DocumentsBatchBuilder, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
@ -605,7 +599,7 @@ where
                                let cloneable_chunk =
                                    unsafe { as_cloneable_grenad(&word_docids_reader)? };
                                let word_docids = word_docids.get_or_insert_with(|| {
-                                    MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
+                                    MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
                                });
                                word_docids.push(cloneable_chunk.into_cursor()?);
                                let cloneable_chunk =
@ -613,14 +607,14 @@ where
                                let exact_word_docids =
                                    exact_word_docids.get_or_insert_with(|| {
                                        MergerBuilder::new(
-                                            merge_deladd_cbo_roaring_bitmaps as MergeFn,
+                                            MergeDeladdCboRoaringBitmaps,
                                        )
                                    });
                                exact_word_docids.push(cloneable_chunk.into_cursor()?);
                                let cloneable_chunk =
                                    unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
                                let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
-                                    MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
+                                    MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
                                });
                                word_fid_docids.push(cloneable_chunk.into_cursor()?);
                                TypedChunk::WordDocids {
@ -634,7 +628,7 @@ where
                                let word_position_docids =
                                    word_position_docids.get_or_insert_with(|| {
                                        MergerBuilder::new(
-                                            merge_deladd_cbo_roaring_bitmaps as MergeFn,
+                                            MergeDeladdCboRoaringBitmaps,
                                        )
                                    });
                                word_position_docids.push(cloneable_chunk.into_cursor()?);
@ -747,10 +741,10 @@ where
    )]
    pub fn execute_prefix_databases(
        self,
-        word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
-        exact_word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
-        word_position_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
-        word_fid_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
+        word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
+        exact_word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
+        word_position_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
+        word_fid_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
    ) -> Result<()>
    where
        FP: Fn(UpdateIndexingStep) + Sync,
@ -930,7 +924,7 @@ where
 )]
 fn execute_word_prefix_docids(
    txn: &mut heed::RwTxn<'_>,
-    merger: Merger<CursorClonableMmap, MergeFn>,
+    merger: Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>,
    word_docids_db: Database<Str, CboRoaringBitmapCodec>,
    word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
    indexer_config: &IndexerConfig,
--- a/milli/src/update/index_documents/parallel.rs
+++ b/milli/src/update/index_documents/parallel.rs
@ -31,14 +31,14 @@ impl<'t> ImmutableObkvs<'t> {
    }

    /// Returns the OBKVs identified by the given ID.
-    pub fn obkv(&self, docid: DocumentId) -> heed::Result<Option<KvReaderU16<'t>>> {
+    pub fn obkv(&self, docid: DocumentId) -> heed::Result<Option<&'t KvReaderU16>> {
        match self
            .ids
            .rank(docid)
            .checked_sub(1)
            .and_then(|offset| self.slices.get(offset as usize))
        {
-            Some(bytes) => Ok(Some(KvReaderU16::new(bytes))),
+            Some(&bytes) => Ok(Some(bytes.into())),
            None => Ok(None),
        }
    }
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@ -5,6 +5,7 @@ use std::collections::{BTreeMap, HashMap, HashSet};
 use std::fs::File;
 use std::io::{Read, Seek};

+use either::Either;
 use fxhash::FxHashMap;
 use itertools::Itertools;
 use obkv::{KvReader, KvReaderU16, KvWriter};
@ -13,10 +14,10 @@ use serde_json::Value;
 use smartstring::SmartString;

 use super::helpers::{
-    create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions,
-    obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn,
+    create_sorter, create_writer, sorter_into_reader, EitherObkvMerge,
+    ObkvsKeepLastAdditionMergeDeletions, ObkvsMergeAdditionsAndDeletions,
 };
-use super::{IndexDocumentsMethod, IndexerConfig};
+use super::{IndexDocumentsMethod, IndexerConfig, KeepFirst};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::{db_name, main_key};
@ -26,7 +27,7 @@ use crate::update::del_add::{
 };
 use crate::update::index_documents::GrenadParameters;
 use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
-use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
+use crate::update::{AvailableIds, UpdateIndexingStep};
 use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
 use crate::vector::settings::WriteBackToDocuments;
 use crate::vector::ArroyWrapper;
@ -55,13 +56,13 @@ pub struct Transform<'a, 'i> {

    indexer_settings: &'a IndexerConfig,
    pub index_documents_method: IndexDocumentsMethod,
-    available_documents_ids: AvailableDocumentsIds,
+    available_documents_ids: AvailableIds,

    // Both grenad sorters follow the same format:
    // key | value
    // u32 | 1 byte for the Operation byte, the rest is the obkv of the document stored
-    original_sorter: grenad::Sorter<MergeFn>,
-    flattened_sorter: grenad::Sorter<MergeFn>,
+    original_sorter: grenad::Sorter<EitherObkvMerge>,
+    flattened_sorter: grenad::Sorter<EitherObkvMerge>,

    replaced_documents_ids: RoaringBitmap,
    new_documents_ids: RoaringBitmap,
@ -109,17 +110,19 @@ impl<'a, 'i> Transform<'a, 'i> {
        index_documents_method: IndexDocumentsMethod,
        _autogenerate_docids: bool,
    ) -> Result<Self> {
+        use IndexDocumentsMethod::{ReplaceDocuments, UpdateDocuments};
+
        // We must choose the appropriate merge function for when two or more documents
        // with the same user id must be merged or fully replaced in the same batch.
        let merge_function = match index_documents_method {
-            IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions,
-            IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions,
+            ReplaceDocuments => Either::Left(ObkvsKeepLastAdditionMergeDeletions),
+            UpdateDocuments => Either::Right(ObkvsMergeAdditionsAndDeletions),
        };

        // We initialize the sorter with the user indexing settings.
        let original_sorter = create_sorter(
            grenad::SortAlgorithm::Stable,
-            merge_function,
+            merge_function.clone(),
            indexer_settings.chunk_compression_type,
            indexer_settings.chunk_compression_level,
            indexer_settings.max_nb_chunks,
@ -141,7 +144,7 @@ impl<'a, 'i> Transform<'a, 'i> {
            index,
            fields_ids_map: index.fields_ids_map(wtxn)?,
            indexer_settings,
-            available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
+            available_documents_ids: AvailableIds::new(&documents_ids),
            original_sorter,
            flattened_sorter,
            index_documents_method,
@ -279,21 +282,21 @@ impl<'a, 'i> Transform<'a, 'i> {
                    document_sorter_value_buffer.clear();
                    document_sorter_value_buffer.push(Operation::Addition as u8);
                    into_del_add_obkv(
-                        KvReaderU16::new(base_obkv),
+                        KvReaderU16::from_slice(base_obkv),
                        deladd_operation,
                        &mut document_sorter_value_buffer,
                    )?;
                    self.original_sorter
                        .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
-                    let base_obkv = KvReader::new(base_obkv);
+                    let base_obkv = KvReader::from_slice(base_obkv);
                    if let Some(flattened_obkv) =
-                        Self::flatten_from_fields_ids_map(&base_obkv, &mut self.fields_ids_map)?
+                        Self::flatten_from_fields_ids_map(base_obkv, &mut self.fields_ids_map)?
                    {
                        // we recreate our buffer with the flattened documents
                        document_sorter_value_buffer.clear();
                        document_sorter_value_buffer.push(Operation::Addition as u8);
                        into_del_add_obkv(
-                            KvReaderU16::new(&flattened_obkv),
+                            KvReaderU16::from_slice(&flattened_obkv),
                            deladd_operation,
                            &mut document_sorter_value_buffer,
                        )?;
@ -312,7 +315,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                document_sorter_value_buffer.clear();
                document_sorter_value_buffer.push(Operation::Addition as u8);
                into_del_add_obkv(
-                    KvReaderU16::new(&obkv_buffer),
+                    KvReaderU16::from_slice(&obkv_buffer),
                    DelAddOperation::Addition,
                    &mut document_sorter_value_buffer,
                )?;
@ -320,14 +323,14 @@ impl<'a, 'i> Transform<'a, 'i> {
                self.original_sorter
                    .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;

-                let flattened_obkv = KvReader::new(&obkv_buffer);
+                let flattened_obkv = KvReader::from_slice(&obkv_buffer);
                if let Some(obkv) =
-                    Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)?
+                    Self::flatten_from_fields_ids_map(flattened_obkv, &mut self.fields_ids_map)?
                {
                    document_sorter_value_buffer.clear();
                    document_sorter_value_buffer.push(Operation::Addition as u8);
                    into_del_add_obkv(
-                        KvReaderU16::new(&obkv),
+                        KvReaderU16::from_slice(&obkv),
                        DelAddOperation::Addition,
                        &mut document_sorter_value_buffer,
                    )?
@ -520,22 +523,22 @@ impl<'a, 'i> Transform<'a, 'i> {
        document_sorter_value_buffer.clear();
        document_sorter_value_buffer.push(Operation::Deletion as u8);
        into_del_add_obkv(
-            KvReaderU16::new(base_obkv),
+            KvReaderU16::from_slice(base_obkv),
            DelAddOperation::Deletion,
            document_sorter_value_buffer,
        )?;
        self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;

        // flatten it and push it as a deletion into the flattened_sorter
-        let flattened_obkv = KvReader::new(base_obkv);
+        let flattened_obkv = KvReader::from_slice(base_obkv);
        if let Some(obkv) =
-            Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)?
+            Self::flatten_from_fields_ids_map(flattened_obkv, &mut self.fields_ids_map)?
        {
            // we recreate our buffer with the flattened documents
            document_sorter_value_buffer.clear();
            document_sorter_value_buffer.push(Operation::Deletion as u8);
            into_del_add_obkv(
-                KvReaderU16::new(&obkv),
+                KvReaderU16::from_slice(&obkv),
                DelAddOperation::Deletion,
                document_sorter_value_buffer,
            )?;
@ -553,7 +556,7 @@ impl<'a, 'i> Transform<'a, 'i> {
        target = "indexing::transform"
    )]
    fn flatten_from_fields_ids_map(
-        obkv: &KvReader<'_, FieldId>,
+        obkv: &KvReader<FieldId>,
        fields_ids_map: &mut FieldsIdsMap,
    ) -> Result<Option<Vec<u8>>> {
        if obkv
@ -721,10 +724,10 @@ impl<'a, 'i> Transform<'a, 'i> {
                total_documents: self.documents_count,
            });

-            for (key, value) in KvReader::new(val) {
-                let reader = KvReaderDelAdd::new(value);
+            for (key, value) in KvReader::from_slice(val) {
+                let reader = KvReaderDelAdd::from_slice(value);
                match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
-                    (None, None) => {}
+                    (None, None) => (),
                    (None, Some(_)) => {
                        // New field
                        let name = self.fields_ids_map.name(key).ok_or(
@ -838,7 +841,7 @@ impl<'a, 'i> Transform<'a, 'i> {
    /// then fill the provided buffers with delta documents using KvWriterDelAdd.
    #[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo
    fn rebind_existing_document(
-        old_obkv: KvReader<'_, FieldId>,
+        old_obkv: &KvReader<FieldId>,
        settings_diff: &InnerIndexSettingsDiff,
        modified_faceted_fields: &HashSet<String>,
        mut injected_vectors: serde_json::Map<String, serde_json::Value>,
@ -926,7 +929,7 @@ impl<'a, 'i> Transform<'a, 'i> {
        }

        let data = obkv_writer.into_inner()?;
-        let obkv = KvReader::<FieldId>::new(&data);
+        let obkv = KvReader::<FieldId>::from_slice(&data);

        if let Some(original_obkv_buffer) = original_obkv_buffer {
            original_obkv_buffer.clear();
@ -936,8 +939,8 @@ impl<'a, 'i> Transform<'a, 'i> {
        if let Some(flattened_obkv_buffer) = flattened_obkv_buffer {
            // take the non-flattened version if flatten_from_fields_ids_map returns None.
            let mut fields_ids_map = settings_diff.new.fields_ids_map.clone();
-            let flattened = Self::flatten_from_fields_ids_map(&obkv, &mut fields_ids_map)?;
-            let flattened = flattened.as_deref().map_or(obkv, KvReader::new);
+            let flattened = Self::flatten_from_fields_ids_map(obkv, &mut fields_ids_map)?;
+            let flattened = flattened.as_deref().map_or(obkv, KvReader::from_slice);

            flattened_obkv_buffer.clear();
            into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| {
@ -980,7 +983,7 @@ impl<'a, 'i> Transform<'a, 'i> {
        let mut original_sorter = if settings_diff.reindex_vectors() {
            Some(create_sorter(
                grenad::SortAlgorithm::Stable,
-                keep_first,
+                KeepFirst,
                self.indexer_settings.chunk_compression_type,
                self.indexer_settings.chunk_compression_level,
                self.indexer_settings.max_nb_chunks,
@ -1022,7 +1025,7 @@ impl<'a, 'i> Transform<'a, 'i> {
            if settings_diff.reindex_searchable() || settings_diff.reindex_facets() {
                Some(create_sorter(
                    grenad::SortAlgorithm::Stable,
-                    keep_first,
+                    KeepFirst,
                    self.indexer_settings.chunk_compression_type,
                    self.indexer_settings.chunk_compression_level,
                    self.indexer_settings.max_nb_chunks,
@ -1152,6 +1155,8 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {

 #[cfg(test)]
 mod test {
+    use grenad::MergeFunction;
+
    use super::*;

    #[test]
@ -1163,21 +1168,21 @@ mod test {
        kv_writer.insert(0_u8, [0]).unwrap();
        let buffer = kv_writer.into_inner().unwrap();
        into_del_add_obkv(
-            KvReaderU16::new(&buffer),
+            KvReaderU16::from_slice(&buffer),
            DelAddOperation::Addition,
            &mut additive_doc_0,
        )
        .unwrap();
        additive_doc_0.insert(0, Operation::Addition as u8);
        into_del_add_obkv(
-            KvReaderU16::new(&buffer),
+            KvReaderU16::from_slice(&buffer),
            DelAddOperation::Deletion,
            &mut deletive_doc_0,
        )
        .unwrap();
        deletive_doc_0.insert(0, Operation::Deletion as u8);
        into_del_add_obkv(
-            KvReaderU16::new(&buffer),
+            KvReaderU16::from_slice(&buffer),
            DelAddOperation::DeletionAndAddition,
            &mut del_add_doc_0,
        )
@ -1189,7 +1194,7 @@ mod test {
        kv_writer.insert(1_u8, [1]).unwrap();
        let buffer = kv_writer.into_inner().unwrap();
        into_del_add_obkv(
-            KvReaderU16::new(&buffer),
+            KvReaderU16::from_slice(&buffer),
            DelAddOperation::Addition,
            &mut additive_doc_1,
        )
@ -1202,32 +1207,39 @@ mod test {
        kv_writer.insert(1_u8, [1]).unwrap();
        let buffer = kv_writer.into_inner().unwrap();
        into_del_add_obkv(
-            KvReaderU16::new(&buffer),
+            KvReaderU16::from_slice(&buffer),
            DelAddOperation::Addition,
            &mut additive_doc_0_1,
        )
        .unwrap();
        additive_doc_0_1.insert(0, Operation::Addition as u8);

-        let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])
-            .unwrap();
+        let ret = MergeFunction::merge(
+            &ObkvsMergeAdditionsAndDeletions,
+            &[],
+            &[Cow::from(additive_doc_0.as_slice())],
+        )
+        .unwrap();
        assert_eq!(*ret, additive_doc_0);

-        let ret = obkvs_merge_additions_and_deletions(
+        let ret = MergeFunction::merge(
+            &ObkvsMergeAdditionsAndDeletions,
            &[],
            &[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())],
        )
        .unwrap();
        assert_eq!(*ret, del_add_doc_0);

-        let ret = obkvs_merge_additions_and_deletions(
+        let ret = MergeFunction::merge(
+            &ObkvsMergeAdditionsAndDeletions,
            &[],
            &[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())],
        )
        .unwrap();
        assert_eq!(*ret, deletive_doc_0);

-        let ret = obkvs_merge_additions_and_deletions(
+        let ret = MergeFunction::merge(
+            &ObkvsMergeAdditionsAndDeletions,
            &[],
            &[
                Cow::from(additive_doc_1.as_slice()),
@ -1238,21 +1250,24 @@ mod test {
        .unwrap();
        assert_eq!(*ret, del_add_doc_0);

-        let ret = obkvs_merge_additions_and_deletions(
+        let ret = MergeFunction::merge(
+            &ObkvsMergeAdditionsAndDeletions,
            &[],
            &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
        )
        .unwrap();
        assert_eq!(*ret, additive_doc_0_1);

-        let ret = obkvs_keep_last_addition_merge_deletions(
+        let ret = MergeFunction::merge(
+            &ObkvsKeepLastAdditionMergeDeletions,
            &[],
            &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
        )
        .unwrap();
        assert_eq!(*ret, additive_doc_0);

-        let ret = obkvs_keep_last_addition_merge_deletions(
+        let ret = MergeFunction::merge(
+            &ObkvsKeepLastAdditionMergeDeletions,
            &[],
            &[
                Cow::from(deletive_doc_0.as_slice()),
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@ -4,18 +4,17 @@ use std::fs::File;
 use std::io::{self, BufReader};

 use bytemuck::allocation::pod_collect_to_vec;
-use grenad::{Merger, MergerBuilder};
+use grenad::{MergeFunction, Merger, MergerBuilder};
 use heed::types::Bytes;
 use heed::{BytesDecode, RwTxn};
 use obkv::{KvReader, KvWriter};
 use roaring::RoaringBitmap;

 use super::helpers::{
-    self, keep_first, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
-    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, valid_lmdb_key,
-    CursorClonableMmap,
+    self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    CursorClonableMmap, KeepFirst, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
+    MergeIgnoreValues,
 };
-use super::MergeFn;
 use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
 use crate::index::db_name::DOCUMENTS;
@ -24,7 +23,7 @@ use crate::proximity::MAX_DISTANCE;
 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{
-    as_cloneable_grenad, keep_latest_obkv, try_split_array_at,
+    as_cloneable_grenad, try_split_array_at, KeepLatestObkv,
 };
 use crate::update::settings::InnerIndexSettingsDiff;
 use crate::vector::ArroyWrapper;
@ -141,7 +140,7 @@ pub(crate) fn write_typed_chunk_into_index(
            let vectors_fid =
                fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);

-            let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn);
+            let mut builder = MergerBuilder::new(KeepLatestObkv);
            for typed_chunk in typed_chunks {
                let TypedChunk::Documents(chunk) = typed_chunk else {
                    unreachable!();
@ -163,7 +162,7 @@ pub(crate) fn write_typed_chunk_into_index(
            let mut vectors_buffer = Vec::new();
            while let Some((key, reader)) = iter.next()? {
                let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
-                let reader: KvReader<'_, FieldId> = KvReader::new(reader);
+                let reader: &KvReader<FieldId> = reader.into();

                let (document_id_bytes, external_id_bytes) = try_split_array_at(key)
                    .ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?;
@ -171,7 +170,7 @@ pub(crate) fn write_typed_chunk_into_index(
                let external_id = std::str::from_utf8(external_id_bytes)?;

                for (field_id, value) in reader.iter() {
-                    let del_add_reader = KvReaderDelAdd::new(value);
+                    let del_add_reader = KvReaderDelAdd::from_slice(value);

                    if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
                        let addition = if vectors_fid == Some(field_id) {
@ -235,7 +234,7 @@ pub(crate) fn write_typed_chunk_into_index(
                tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids");
            let _entered = span.enter();

-            let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
+            let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
            for typed_chunk in typed_chunks {
                let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else {
                    unreachable!();
@ -258,13 +257,10 @@ pub(crate) fn write_typed_chunk_into_index(
            let span = tracing::trace_span!(target: "indexing::write_db", "word_docids");
            let _entered = span.enter();

-            let mut word_docids_builder =
-                MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
-            let mut exact_word_docids_builder =
-                MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
-            let mut word_fid_docids_builder =
-                MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
-            let mut fst_merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
+            let mut word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+            let mut exact_word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+            let mut word_fid_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+            let mut fst_merger_builder = MergerBuilder::new(MergeIgnoreValues);
            for typed_chunk in typed_chunks {
                let TypedChunk::WordDocids {
                    word_docids_reader,
@ -329,7 +325,7 @@ pub(crate) fn write_typed_chunk_into_index(
            let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids");
            let _entered = span.enter();

-            let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
+            let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
            for typed_chunk in typed_chunks {
                let TypedChunk::WordPositionDocids(chunk) = typed_chunk else {
                    unreachable!();
@ -353,7 +349,7 @@ pub(crate) fn write_typed_chunk_into_index(
                tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids");
            let _entered = span.enter();

-            let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
+            let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
            let mut data_size = 0;
            for typed_chunk in typed_chunks {
                let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk
@ -375,10 +371,9 @@ pub(crate) fn write_typed_chunk_into_index(
                tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids");
            let _entered = span.enter();

-            let mut facet_id_string_builder =
-                MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
+            let mut facet_id_string_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
            let mut normalized_facet_id_string_builder =
-                MergerBuilder::new(merge_deladd_btreeset_string as MergeFn);
+                MergerBuilder::new(MergeDeladdBtreesetString);
            let mut data_size = 0;
            for typed_chunk in typed_chunks {
                let TypedChunk::FieldIdFacetStringDocids((
@ -412,7 +407,7 @@ pub(crate) fn write_typed_chunk_into_index(
                tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids");
            let _entered = span.enter();

-            let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
+            let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
            for typed_chunk in typed_chunks {
                let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else {
                    unreachable!();
@ -436,7 +431,7 @@ pub(crate) fn write_typed_chunk_into_index(
                tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids");
            let _entered = span.enter();

-            let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
+            let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
            for typed_chunk in typed_chunks {
                let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else {
                    unreachable!();
@ -459,7 +454,7 @@ pub(crate) fn write_typed_chunk_into_index(
            let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids");
            let _entered = span.enter();

-            let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
+            let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
            for typed_chunk in typed_chunks {
                let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else {
                    unreachable!();
@ -483,7 +478,7 @@ pub(crate) fn write_typed_chunk_into_index(
                tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids");
            let _entered = span.enter();

-            let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
+            let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
            for typed_chunk in typed_chunks {
                let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else {
                    unreachable!();
@ -516,7 +511,7 @@ pub(crate) fn write_typed_chunk_into_index(
                tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers");
            let _entered = span.enter();

-            let mut builder = MergerBuilder::new(keep_first as MergeFn);
+            let mut builder = MergerBuilder::new(KeepFirst);
            for typed_chunk in typed_chunks {
                let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else {
                    unreachable!();
@ -530,7 +525,7 @@ pub(crate) fn write_typed_chunk_into_index(
                index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>();
            let mut iter = merger.into_stream_merger_iter()?;
            while let Some((key, value)) = iter.next()? {
-                let reader = KvReaderDelAdd::new(value);
+                let reader = KvReaderDelAdd::from_slice(value);
                if valid_lmdb_key(key) {
                    match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
                        (None, None) => {}
@ -550,7 +545,7 @@ pub(crate) fn write_typed_chunk_into_index(
                tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings");
            let _entered = span.enter();

-            let mut builder = MergerBuilder::new(keep_first as MergeFn);
+            let mut builder = MergerBuilder::new(KeepFirst);
            for typed_chunk in typed_chunks {
                let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else {
                    unreachable!();
@ -564,7 +559,7 @@ pub(crate) fn write_typed_chunk_into_index(
                index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>();
            let mut iter = merger.into_stream_merger_iter()?;
            while let Some((key, value)) = iter.next()? {
-                let reader = KvReaderDelAdd::new(value);
+                let reader = KvReaderDelAdd::from_slice(value);
                if valid_lmdb_key(key) {
                    match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
                        (None, None) => {}
@ -583,7 +578,7 @@ pub(crate) fn write_typed_chunk_into_index(
            let span = tracing::trace_span!(target: "indexing::write_db", "geo_points");
            let _entered = span.enter();

-            let mut builder = MergerBuilder::new(keep_first as MergeFn);
+            let mut builder = MergerBuilder::new(KeepFirst);
            for typed_chunk in typed_chunks {
                let TypedChunk::GeoPoints(chunk) = typed_chunk else {
                    unreachable!();
@ -601,7 +596,7 @@ pub(crate) fn write_typed_chunk_into_index(
                // convert the key back to a u32 (4 bytes)
                let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();

-                let deladd_obkv = KvReaderDelAdd::new(value);
+                let deladd_obkv = KvReaderDelAdd::from_slice(value);
                if let Some(value) = deladd_obkv.get(DelAdd::Deletion) {
                    let geopoint = extract_geo_point(value, docid);
                    rtree.remove(&geopoint);
@ -620,9 +615,9 @@ pub(crate) fn write_typed_chunk_into_index(
            let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
            let _entered = span.enter();

-            let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
-            let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
-            let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
+            let mut remove_vectors_builder = MergerBuilder::new(KeepFirst);
+            let mut manual_vectors_builder = MergerBuilder::new(KeepFirst);
+            let mut embeddings_builder = MergerBuilder::new(KeepFirst);
            let mut add_to_user_provided = RoaringBitmap::new();
            let mut remove_from_user_provided = RoaringBitmap::new();
            let mut params = None;
@ -729,7 +724,7 @@ pub(crate) fn write_typed_chunk_into_index(
                let (left, _index) = try_split_array_at(key).unwrap();
                let docid = DocumentId::from_be_bytes(left);

-                let vector_deladd_obkv = KvReaderDelAdd::new(value);
+                let vector_deladd_obkv = KvReaderDelAdd::from_slice(value);
                if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
                    let vector: Vec<f32> = pod_collect_to_vec(value);

@ -797,9 +792,13 @@ fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
    GeoPoint::new(xyz_point, (docid, point))
 }

-fn merge_word_docids_reader_into_fst(
-    merger: Merger<CursorClonableMmap, MergeFn>,
-) -> Result<fst::Set<Vec<u8>>> {
+fn merge_word_docids_reader_into_fst<MF>(
+    merger: Merger<CursorClonableMmap, MF>,
+) -> Result<fst::Set<Vec<u8>>>
+where
+    MF: MergeFunction,
+    crate::Error: From<MF::Error>,
+{
    let mut iter = merger.into_stream_merger_iter()?;
    let mut builder = fst::SetBuilder::memory();

@ -813,8 +812,8 @@ fn merge_word_docids_reader_into_fst(
 /// Write the provided entries into the database using the serialize_value function.
 /// The merge_values function is used if an entry already exists in the database.
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
-fn write_entries_into_database<R, K, V, FS, FM>(
-    merger: Merger<R, MergeFn>,
+fn write_entries_into_database<R, K, V, FS, FM, MF>(
+    merger: Merger<R, MF>,
    database: &heed::Database<K, V>,
    wtxn: &mut RwTxn<'_>,
    serialize_value: FS,
@ -824,6 +823,8 @@ where
    R: io::Read + io::Seek,
    FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
    FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
+    MF: MergeFunction,
+    crate::Error: From<MF::Error>,
 {
    let mut buffer = Vec::new();
    let database = database.remap_types::<Bytes, Bytes>();
@ -850,20 +851,22 @@ where
 /// Akin to the `write_entries_into_database` function but specialized
 /// for the case when we index additional searchable fields only.
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
-fn write_proximity_entries_into_database_additional_searchables<R>(
-    merger: Merger<R, MergeFn>,
+fn write_proximity_entries_into_database_additional_searchables<R, MF>(
+    merger: Merger<R, MF>,
    database: &heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
    wtxn: &mut RwTxn<'_>,
 ) -> Result<()>
 where
    R: io::Read + io::Seek,
+    MF: MergeFunction,
+    crate::Error: From<MF::Error>,
 {
    let mut iter = merger.into_stream_merger_iter()?;
    while let Some((key, value)) = iter.next()? {
        if valid_lmdb_key(key) {
            let (proximity_to_insert, word1, word2) =
                U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?;
-            let data_to_insert = match KvReaderDelAdd::new(value).get(DelAdd::Addition) {
+            let data_to_insert = match KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
                Some(value) => {
                    CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
                }
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@ -1,11 +1,9 @@
-pub use self::available_documents_ids::AvailableDocumentsIds;
+pub use self::available_ids::AvailableIds;
 pub use self::clear_documents::ClearDocuments;
+pub use self::concurrent_available_ids::ConcurrentAvailableIds;
 pub use self::facet::bulk::FacetsUpdateBulk;
 pub use self::facet::incremental::FacetsUpdateIncrementalInner;
-pub use self::index_documents::{
-    merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
-    IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
-};
+pub use self::index_documents::*;
 pub use self::indexer_config::IndexerConfig;
 pub use self::settings::{validate_embedding_settings, Setting, Settings};
 pub use self::update_step::UpdateIndexingStep;
@ -13,12 +11,14 @@ pub use self::word_prefix_docids::WordPrefixDocids;
 pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids;
 pub use self::words_prefixes_fst::WordsPrefixesFst;

-mod available_documents_ids;
+mod available_ids;
 mod clear_documents;
+mod concurrent_available_ids;
 pub(crate) mod del_add;
 pub(crate) mod facet;
 mod index_documents;
 mod indexer_config;
+pub mod new;
 mod settings;
 mod update_step;
 mod word_prefix_docids;
--- a/milli/src/update/new/channel.rs
+++ b/milli/src/update/new/channel.rs
@ -0,0 +1,522 @@
+use std::marker::PhantomData;
+use std::sync::atomic::Ordering;
+
+use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
+use heed::types::Bytes;
+use memmap2::Mmap;
+
+use super::extract::{FacetKind, HashMapMerger};
+use super::StdResult;
+use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY};
+use crate::update::new::KvReaderFieldId;
+use crate::{DocumentId, Index};
+
+/// Creates the bounded channel carrying `WriterOperation`s from the merger to the writer.
+///
+/// The capacity of the channel is currently in number of messages.
+/// The statistics counters of the returned `MergerSender` all start at their default value.
+pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) {
+    let (sender, receiver) = crossbeam_channel::bounded(cap);
+    (
+        MergerSender {
+            sender,
+            send_count: Default::default(),
+            writer_contentious_count: Default::default(),
+            merger_contentious_count: Default::default(),
+        },
+        WriterReceiver(receiver),
+    )
+}
+
+/// Creates the bounded channel carrying `MergerOperation`s from the extractors to the merger.
+///
+/// The capacity of the channel is currently in number of messages.
+pub fn extractors_merger_channels(cap: usize) -> (ExtractorSender, MergerReceiver) {
+    let (sender, receiver) = crossbeam_channel::bounded(cap);
+    (ExtractorSender(sender), MergerReceiver(receiver))
+}
+
+/// A key/value pair to write into a database.
+pub enum KeyValueEntry {
+    /// Key and value stored back to back in `data`; the key is `data[..key_length]`.
+    SmallInMemory { key_length: usize, data: Box<[u8]> },
+    /// The key is owned in memory while the value is read from a memory map.
+    LargeOnDisk { key: Box<[u8]>, value: Mmap },
+}
+
+impl KeyValueEntry {
+    /// Copies `key` then `value` into a single allocation and remembers where the key ends.
+    pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self {
+        let mut data = Vec::with_capacity(key.len() + value.len());
+        data.extend_from_slice(key);
+        data.extend_from_slice(value);
+        KeyValueEntry::SmallInMemory { key_length: key.len(), data: data.into_boxed_slice() }
+    }
+
+    /// Copies `key` and takes ownership of the memory-mapped `value` without copying it.
+    pub fn from_large_key_value(key: &[u8], value: Mmap) -> Self {
+        KeyValueEntry::LargeOnDisk { key: key.to_vec().into_boxed_slice(), value }
+    }
+
+    /// Returns the key bytes of this entry.
+    pub fn key(&self) -> &[u8] {
+        match self {
+            // The key is the prefix of the shared buffer.
+            KeyValueEntry::SmallInMemory { key_length, data } => &data.as_ref()[..*key_length],
+            KeyValueEntry::LargeOnDisk { key, value: _ } => key.as_ref(),
+        }
+    }
+
+    /// Returns the value bytes of this entry.
+    pub fn value(&self) -> &[u8] {
+        match self {
+            // The value is everything that follows the key in the shared buffer.
+            KeyValueEntry::SmallInMemory { key_length, data } => &data.as_ref()[*key_length..],
+            KeyValueEntry::LargeOnDisk { key: _, value } => value.as_ref(),
+        }
+    }
+}
+
+pub struct KeyEntry {
+    data: Box<[u8]>,
+}
+
+impl KeyEntry {
+    pub fn from_key(key: &[u8]) -> Self {
+        KeyEntry { data: key.to_vec().into_boxed_slice() }
+    }
+
+    pub fn entry(&self) -> &[u8] {
+        self.data.as_ref()
+    }
+}
+
+pub enum EntryOperation {
+    Delete(KeyEntry),
+    Write(KeyValueEntry),
+}
+
+pub struct DocumentEntry {
+    docid: DocumentId,
+    content: Box<[u8]>,
+}
+
+impl DocumentEntry {
+    pub fn new_uncompressed(docid: DocumentId, content: Box<KvReaderFieldId>) -> Self {
+        DocumentEntry { docid, content: content.into() }
+    }
+
+    pub fn new_compressed(docid: DocumentId, content: Box<[u8]>) -> Self {
+        DocumentEntry { docid, content }
+    }
+
+    pub fn key(&self) -> [u8; 4] {
+        self.docid.to_be_bytes()
+    }
+
+    pub fn content(&self) -> &[u8] {
+        &self.content
+    }
+}
+
+pub struct DocumentDeletionEntry(DocumentId);
+
+impl DocumentDeletionEntry {
+    pub fn key(&self) -> [u8; 4] {
+        self.0.to_be_bytes()
+    }
+}
+
+pub struct WriterOperation {
+    database: Database,
+    entry: EntryOperation,
+}
+
+pub enum Database {
+    Documents,
+    ExactWordDocids,
+    FidWordCountDocids,
+    Main,
+    WordDocids,
+    WordFidDocids,
+    WordPairProximityDocids,
+    WordPositionDocids,
+    FacetIdIsNullDocids,
+    FacetIdIsEmptyDocids,
+    FacetIdExistsDocids,
+    FacetIdF64NumberDocids,
+    FacetIdStringDocids,
+}
+
+impl Database {
+    pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
+        match self {
+            Database::Documents => index.documents.remap_types(),
+            Database::ExactWordDocids => index.exact_word_docids.remap_types(),
+            Database::Main => index.main.remap_types(),
+            Database::WordDocids => index.word_docids.remap_types(),
+            Database::WordFidDocids => index.word_fid_docids.remap_types(),
+            Database::WordPositionDocids => index.word_position_docids.remap_types(),
+            Database::FidWordCountDocids => index.field_id_word_count_docids.remap_types(),
+            Database::WordPairProximityDocids => index.word_pair_proximity_docids.remap_types(),
+            Database::FacetIdIsNullDocids => index.facet_id_is_null_docids.remap_types(),
+            Database::FacetIdIsEmptyDocids => index.facet_id_is_empty_docids.remap_types(),
+            Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(),
+            Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(),
+            Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(),
+        }
+    }
+}
+
+impl WriterOperation {
+    pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
+        self.database.database(index)
+    }
+
+    pub fn entry(self) -> EntryOperation {
+        self.entry
+    }
+}
+
+pub struct WriterReceiver(Receiver<WriterOperation>);
+
+impl IntoIterator for WriterReceiver {
+    type Item = WriterOperation;
+    type IntoIter = IntoIter<Self::Item>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.0.into_iter()
+    }
+}
+
+/// Sending half of the channel of `WriterOperation`s, instrumented with usage counters.
+pub struct MergerSender {
+    sender: Sender<WriterOperation>,
+    /// The number of messages we sent in total in the channel.
+    send_count: std::sync::atomic::AtomicUsize,
+    /// The number of times we sent something in a channel that was full.
+    writer_contentious_count: std::sync::atomic::AtomicUsize,
+    /// The number of times we sent something in a channel that was empty.
+    merger_contentious_count: std::sync::atomic::AtomicUsize,
+}
+
+impl Drop for MergerSender {
+    /// Prints the channel usage statistics gathered by `send` on stderr.
+    fn drop(&mut self) {
+        // Load every counter exactly once so that all the figures of the
+        // report are derived from the same values.
+        let sends = self.send_count.load(Ordering::SeqCst);
+        let writer_contentions = self.writer_contentious_count.load(Ordering::SeqCst);
+        let merger_contentions = self.merger_contentious_count.load(Ordering::SeqCst);
+
+        // A sender that never sent anything would otherwise report `NaN%` (0.0 / 0.0).
+        let percentage = |count: usize| {
+            if sends == 0 {
+                0.0
+            } else {
+                (count as f32 / sends as f32) * 100.0
+            }
+        };
+
+        eprintln!(
+            "Merger channel stats: {} sends, {} writer contentions ({}%), {} merger contentions ({}%)",
+            sends,
+            writer_contentions,
+            percentage(writer_contentions),
+            merger_contentions,
+            percentage(merger_contentions)
+        )
+    }
+}
+
+impl MergerSender {
+    /// Returns a sender whose operations target `Database::Main`.
+    pub fn main(&self) -> MainSender<'_> {
+        MainSender(self)
+    }
+
+    /// Returns a sender whose operations target the database selected by `D`.
+    pub fn docids<D: DatabaseType>(&self) -> WordDocidsSender<'_, D> {
+        WordDocidsSender { sender: self, _marker: PhantomData }
+    }
+
+    /// Returns a sender that routes each key to one of the facet databases.
+    pub fn facet_docids(&self) -> FacetDocidsSender<'_> {
+        FacetDocidsSender { sender: self }
+    }
+
+    /// Returns a sender whose operations target `Database::Documents`.
+    pub fn documents(&self) -> DocumentsSender<'_> {
+        DocumentsSender(self)
+    }
+
+    /// Writes `bitmap` under `DOCUMENTS_IDS_KEY` in the main database.
+    pub fn send_documents_ids(&self, bitmap: &[u8]) -> StdResult<(), SendError<()>> {
+        let key_value = KeyValueEntry::from_small_key_value(DOCUMENTS_IDS_KEY.as_bytes(), bitmap);
+        let entry = EntryOperation::Write(key_value);
+        self.send(WriterOperation { database: Database::Main, entry })
+    }
+
+    /// Updates the statistics counters then pushes `op` into the channel.
+    ///
+    /// The payload of a failed send is dropped: the error only signals the failure.
+    fn send(&self, op: WriterOperation) -> StdResult<(), SendError<()>> {
+        // The channel state is sampled before sending.
+        if self.sender.is_full() {
+            self.writer_contentious_count.fetch_add(1, Ordering::SeqCst);
+        }
+        if self.sender.is_empty() {
+            self.merger_contentious_count.fetch_add(1, Ordering::SeqCst);
+        }
+        self.send_count.fetch_add(1, Ordering::SeqCst);
+
+        self.sender.send(op).map_err(|SendError(_)| SendError(()))
+    }
+}
+
+pub struct MainSender<'a>(&'a MergerSender);
+
+impl MainSender<'_> {
+    /// Writes the memory-mapped `value` under `WORDS_FST_KEY` in the main database.
+    pub fn write_words_fst(&self, value: Mmap) -> StdResult<(), SendError<()>> {
+        let key_value = KeyValueEntry::from_large_key_value(WORDS_FST_KEY.as_bytes(), value);
+        let entry = EntryOperation::Write(key_value);
+        self.0.send(WriterOperation { database: Database::Main, entry })
+    }
+
+    /// Deletes `key` from the main database.
+    pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
+        let entry = EntryOperation::Delete(KeyEntry::from_key(key));
+        self.0.send(WriterOperation { database: Database::Main, entry })
+    }
+}
+
+pub enum ExactWordDocids {}
+pub enum FidWordCountDocids {}
+pub enum WordDocids {}
+pub enum WordFidDocids {}
+pub enum WordPairProximityDocids {}
+pub enum WordPositionDocids {}
+pub enum FacetDocids {}
+
+pub trait DatabaseType {
+    const DATABASE: Database;
+}
+
+pub trait MergerOperationType {
+    fn new_merger_operation(merger: HashMapMerger) -> MergerOperation;
+}
+
+impl DatabaseType for ExactWordDocids {
+    const DATABASE: Database = Database::ExactWordDocids;
+}
+
+impl MergerOperationType for ExactWordDocids {
+    fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
+        MergerOperation::ExactWordDocidsMerger(merger)
+    }
+}
+
+impl DatabaseType for FidWordCountDocids {
+    const DATABASE: Database = Database::FidWordCountDocids;
+}
+
+impl MergerOperationType for FidWordCountDocids {
+    fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
+        MergerOperation::FidWordCountDocidsMerger(merger)
+    }
+}
+
+impl DatabaseType for WordDocids {
+    const DATABASE: Database = Database::WordDocids;
+}
+
+impl MergerOperationType for WordDocids {
+    fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
+        MergerOperation::WordDocidsMerger(merger)
+    }
+}
+
+impl DatabaseType for WordFidDocids {
+    const DATABASE: Database = Database::WordFidDocids;
+}
+
+impl MergerOperationType for WordFidDocids {
+    fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
+        MergerOperation::WordFidDocidsMerger(merger)
+    }
+}
+
+impl DatabaseType for WordPairProximityDocids {
+    const DATABASE: Database = Database::WordPairProximityDocids;
+}
+
+impl MergerOperationType for WordPairProximityDocids {
+    fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
+        MergerOperation::WordPairProximityDocidsMerger(merger)
+    }
+}
+
+impl DatabaseType for WordPositionDocids {
+    const DATABASE: Database = Database::WordPositionDocids;
+}
+
+impl MergerOperationType for WordPositionDocids {
+    fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
+        MergerOperation::WordPositionDocidsMerger(merger)
+    }
+}
+
+impl MergerOperationType for FacetDocids {
+    fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
+        MergerOperation::FacetDocidsMerger(merger)
+    }
+}
+
+pub trait DocidsSender {
+    fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>;
+    fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>;
+}
+
+pub struct WordDocidsSender<'a, D> {
+    sender: &'a MergerSender,
+    _marker: PhantomData<D>,
+}
+
+impl<D: DatabaseType> DocidsSender for WordDocidsSender<'_, D> {
+    /// Sends a write of `key`/`value` into the database designated by `D::DATABASE`.
+    fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
+        let key_value = KeyValueEntry::from_small_key_value(key, value);
+        let operation =
+            WriterOperation { database: D::DATABASE, entry: EntryOperation::Write(key_value) };
+        self.sender.send(operation)
+    }
+
+    /// Sends a deletion of `key` from the database designated by `D::DATABASE`.
+    fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
+        let key_entry = KeyEntry::from_key(key);
+        let operation =
+            WriterOperation { database: D::DATABASE, entry: EntryOperation::Delete(key_entry) };
+        self.sender.send(operation)
+    }
+}
+
+pub struct FacetDocidsSender<'a> {
+    sender: &'a MergerSender,
+}
+
+impl DocidsSender for FacetDocidsSender<'_> {
+    /// Sends a write of `value` into the facet database selected by the first byte of `key`.
+    fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
+        let (database, stripped_key) = self.extract_database(key);
+        let key_value = KeyValueEntry::from_small_key_value(stripped_key, value);
+        self.sender.send(WriterOperation { database, entry: EntryOperation::Write(key_value) })
+    }
+
+    /// Sends a deletion into the facet database selected by the first byte of `key`.
+    fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
+        let (database, stripped_key) = self.extract_database(key);
+        let key_entry = KeyEntry::from_key(stripped_key);
+        self.sender.send(WriterOperation { database, entry: EntryOperation::Delete(key_entry) })
+    }
+}
+
+impl FacetDocidsSender<'_> {
+    /// Splits a facet key into the database it targets and the key to use in that database.
+    ///
+    /// The first byte of `key` is decoded with `FacetKind::from` to select the database,
+    /// the remaining bytes are returned untouched.
+    ///
+    /// Panics when `key` is empty because of the `key[0]` access.
+    fn extract_database<'a>(&self, key: &'a [u8]) -> (Database, &'a [u8]) {
+        let database = match FacetKind::from(key[0]) {
+            FacetKind::Number => Database::FacetIdF64NumberDocids,
+            FacetKind::String => Database::FacetIdStringDocids,
+            FacetKind::Null => Database::FacetIdIsNullDocids,
+            FacetKind::Empty => Database::FacetIdIsEmptyDocids,
+            FacetKind::Exists => Database::FacetIdExistsDocids,
+        };
+        // Strip the kind byte: it is only used for routing.
+        (database, &key[1..])
+    }
+}
+
+pub struct DocumentsSender<'a>(&'a MergerSender);
+
+impl DocumentsSender<'_> {
+    /// Sends a write of the uncompressed `document` under the big-endian bytes of `docid`.
+    ///
+    /// TODO do that efficiently
+    pub fn uncompressed(
+        &self,
+        docid: DocumentId,
+        document: &KvReaderFieldId,
+    ) -> StdResult<(), SendError<()>> {
+        let key = docid.to_be_bytes();
+        let key_value = KeyValueEntry::from_small_key_value(&key, document.as_bytes());
+        let entry = EntryOperation::Write(key_value);
+        self.0.send(WriterOperation { database: Database::Documents, entry })
+    }
+
+    /// Sends a deletion of the document stored under the big-endian bytes of `docid`.
+    pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> {
+        let key = docid.to_be_bytes();
+        let entry = EntryOperation::Delete(KeyEntry::from_key(&key));
+        self.0.send(WriterOperation { database: Database::Documents, entry })
+    }
+}
+
+pub enum MergerOperation {
+    ExactWordDocidsMerger(HashMapMerger),
+    FidWordCountDocidsMerger(HashMapMerger),
+    WordDocidsMerger(HashMapMerger),
+    WordFidDocidsMerger(HashMapMerger),
+    WordPairProximityDocidsMerger(HashMapMerger),
+    WordPositionDocidsMerger(HashMapMerger),
+    FacetDocidsMerger(HashMapMerger),
+    DeleteDocument { docid: DocumentId },
+    InsertDocument { docid: DocumentId, document: Box<KvReaderFieldId> },
+    FinishedDocument,
+}
+
+pub struct MergerReceiver(Receiver<MergerOperation>);
+
+impl IntoIterator for MergerReceiver {
+    type Item = MergerOperation;
+    type IntoIter = IntoIter<Self::Item>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.0.into_iter()
+    }
+}
+
+pub struct ExtractorSender(Sender<MergerOperation>);
+
+impl ExtractorSender {
+    /// Returns a sender dedicated to document insertions and deletions.
+    pub fn document_sender(&self) -> DocumentSender<'_> {
+        DocumentSender(Some(&self.0))
+    }
+
+    /// Wraps `merger` into the `MergerOperation` chosen by `D` and sends it.
+    ///
+    /// The payload of a failed send is dropped: the error only signals the failure.
+    pub fn send_searchable<D: MergerOperationType>(
+        &self,
+        merger: HashMapMerger,
+    ) -> StdResult<(), SendError<()>> {
+        let operation = D::new_merger_operation(merger);
+        self.0.send(operation).map_err(|SendError(_)| SendError(()))
+    }
+}
+
+pub struct DocumentSender<'a>(Option<&'a Sender<MergerOperation>>);
+
+impl DocumentSender<'_> {
+    /// Sends an `InsertDocument` operation for `docid`.
+    pub fn insert(
+        &self,
+        docid: DocumentId,
+        document: Box<KvReaderFieldId>,
+    ) -> StdResult<(), SendError<()>> {
+        let operation = MergerOperation::InsertDocument { docid, document };
+        self.0.unwrap().send(operation).map_err(|SendError(_)| SendError(()))
+    }
+
+    /// Sends a `DeleteDocument` operation for `docid`.
+    pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> {
+        let operation = MergerOperation::DeleteDocument { docid };
+        self.0.unwrap().send(operation).map_err(|SendError(_)| SendError(()))
+    }
+
+    /// Sends `FinishedDocument` and consumes the sender.
+    ///
+    /// The inner sender is taken out first, so the `Drop` implementation
+    /// finds `None` and does not send the message a second time.
+    pub fn finish(mut self) -> StdResult<(), SendError<()>> {
+        let channel = self.0.take().unwrap();
+        channel.send(MergerOperation::FinishedDocument).map_err(|SendError(_)| SendError(()))
+    }
+}
+
+impl Drop for DocumentSender<'_> {
+    /// Sends `FinishedDocument` when the sender is dropped without `finish` being called.
+    fn drop(&mut self) {
+        if let Some(sender) = self.0.take() {
+            // `drop` has no way to report an error, so the `#[must_use]` result is
+            // discarded explicitly instead of being left as an unused value.
+            let _ = sender.send(MergerOperation::FinishedDocument);
+        }
+    }
+}
--- a/milli/src/update/new/document_change.rs
+++ b/milli/src/update/new/document_change.rs
@ -0,0 +1,96 @@
+use heed::RoTxn;
+use obkv::KvReader;
+
+use crate::update::new::KvReaderFieldId;
+use crate::{DocumentId, FieldId, Index, Result};
+
+pub enum DocumentChange {
+    Deletion(Deletion),
+    Update(Update),
+    Insertion(Insertion),
+}
+
+pub struct Deletion {
+    docid: DocumentId,
+    current: Box<KvReaderFieldId>,
+}
+
+pub struct Update {
+    docid: DocumentId,
+    current: Box<KvReaderFieldId>,
+    new: Box<KvReaderFieldId>,
+}
+
+pub struct Insertion {
+    docid: DocumentId,
+    new: Box<KvReaderFieldId>,
+}
+
+impl DocumentChange {
+    /// Returns the id of the document targeted by this change, whatever its kind.
+    pub fn docid(&self) -> DocumentId {
+        match self {
+            DocumentChange::Deletion(deletion) => deletion.docid(),
+            DocumentChange::Update(update) => update.docid(),
+            DocumentChange::Insertion(insertion) => insertion.docid(),
+        }
+    }
+}
+
+impl Deletion {
+    /// Builds a deletion of the document `docid` whose current content is `current`.
+    pub fn create(docid: DocumentId, current: Box<KvReaderFieldId>) -> Self {
+        Self { docid, current }
+    }
+
+    /// Returns the id of the document to delete.
+    pub fn docid(&self) -> DocumentId {
+        self.docid
+    }
+
+    /// Fetches the document stored under `docid` in `index.documents` using `rtxn`.
+    ///
+    /// Note that this reads the database and does not return the `current`
+    /// field given to `create`.
+    // TODO shouldn't we use the one in self?
+    pub fn current<'a>(
+        &self,
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<&'a KvReader<FieldId>>> {
+        index.documents.get(rtxn, &self.docid).map_err(crate::Error::from)
+    }
+}
+
+impl Insertion {
+    /// Builds an insertion of the document `docid` with the content `new`.
+    pub fn create(docid: DocumentId, new: Box<KvReaderFieldId>) -> Self {
+        Insertion { docid, new }
+    }
+
+    /// Returns the id of the document to insert.
+    pub fn docid(&self) -> DocumentId {
+        self.docid
+    }
+
+    /// Returns the content of the document to insert.
+    pub fn new(&self) -> &KvReader<FieldId> {
+        self.new.as_ref()
+    }
+}
+
+impl Update {
+    /// Builds an update of the document `docid` from its `current` and `new` contents.
+    pub fn create(
+        docid: DocumentId,
+        current: Box<KvReaderFieldId>,
+        new: Box<KvReaderFieldId>,
+    ) -> Self {
+        Update { docid, current, new }
+    }
+
+    /// Returns the id of the document to update.
+    pub fn docid(&self) -> DocumentId {
+        self.docid
+    }
+
+    /// Fetches the document stored under `docid` in `index.documents` using `rtxn`.
+    ///
+    /// Note that this reads the database and does not return the `current`
+    /// field given to `create`.
+    pub fn current<'a>(
+        &self,
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<&'a KvReader<FieldId>>> {
+        index.documents.get(rtxn, &self.docid).map_err(crate::Error::from)
+    }
+
+    /// Returns the new content of the document.
+    pub fn new(&self) -> &KvReader<FieldId> {
+        self.new.as_ref()
+    }
+}
--- a/milli/src/update/new/extract/cache.rs
+++ b/milli/src/update/new/extract/cache.rs
@ -0,0 +1,149 @@
+use std::collections::HashMap;
+
+use roaring::RoaringBitmap;
+use smallvec::SmallVec;
+
+pub const KEY_SIZE: usize = 12;
+
+/// An in-memory map from keys to the bitmaps of document ids to delete and to add.
+#[derive(Debug)]
+pub struct CboCachedSorter {
+    /// Keys of at most `KEY_SIZE` bytes fit in the inline storage of the `SmallVec`.
+    cache: HashMap<SmallVec<[u8; KEY_SIZE]>, DelAddRoaringBitmap>,
+    /// Number of times a key that was not yet in the cache has been inserted.
+    total_insertions: usize,
+    /// Number of inserted keys whose length is at most `KEY_SIZE` bytes.
+    fitted_in_key: usize,
+}
+
+impl CboCachedSorter {
+    /// Creates an empty cache with its statistics counters set to zero.
+    pub fn new() -> Self {
+        CboCachedSorter { cache: HashMap::new(), total_insertions: 0, fitted_in_key: 0 }
+    }
+}
+
+impl Default for CboCachedSorter {
+    /// Equivalent to [`CboCachedSorter::new`]: a type with a public argument-less
+    /// constructor is expected to implement `Default` too.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl CboCachedSorter {
+    /// Adds `n` to the deletion bitmap of `key`, creating the entry when `key` is new.
+    pub fn insert_del_u32(&mut self, key: &[u8], n: u32) {
+        match self.cache.get_mut(key) {
+            Some(DelAddRoaringBitmap { del, add: _ }) => {
+                del.get_or_insert_with(RoaringBitmap::default).insert(n);
+            }
+            None => {
+                self.total_insertions += 1;
+                self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
+                let value = DelAddRoaringBitmap::new_del_u32(n);
+                assert!(self.cache.insert(key.into(), value).is_none());
+            }
+        }
+    }
+
+    /// Unions `bitmap` into the deletion bitmap of `key`, creating the entry when `key` is new.
+    pub fn insert_del(&mut self, key: &[u8], bitmap: RoaringBitmap) {
+        match self.cache.get_mut(key) {
+            Some(DelAddRoaringBitmap { del, add: _ }) => {
+                *del.get_or_insert_with(RoaringBitmap::default) |= bitmap;
+            }
+            None => {
+                self.total_insertions += 1;
+                self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
+                let value = DelAddRoaringBitmap::new_del(bitmap);
+                assert!(self.cache.insert(key.into(), value).is_none());
+            }
+        }
+    }
+
+    /// Adds `n` to the addition bitmap of `key`, creating the entry when `key` is new.
+    pub fn insert_add_u32(&mut self, key: &[u8], n: u32) {
+        match self.cache.get_mut(key) {
+            Some(DelAddRoaringBitmap { del: _, add }) => {
+                add.get_or_insert_with(RoaringBitmap::default).insert(n);
+            }
+            None => {
+                self.total_insertions += 1;
+                self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
+                let value = DelAddRoaringBitmap::new_add_u32(n);
+                assert!(self.cache.insert(key.into(), value).is_none());
+            }
+        }
+    }
+
+    /// Unions `bitmap` into the addition bitmap of `key`, creating the entry when `key` is new.
+    pub fn insert_add(&mut self, key: &[u8], bitmap: RoaringBitmap) {
+        match self.cache.get_mut(key) {
+            Some(DelAddRoaringBitmap { del: _, add }) => {
+                *add.get_or_insert_with(RoaringBitmap::default) |= bitmap;
+            }
+            None => {
+                self.total_insertions += 1;
+                self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
+                let value = DelAddRoaringBitmap::new_add(bitmap);
+                assert!(self.cache.insert(key.into(), value).is_none());
+            }
+        }
+    }
+
+    /// Adds `n` to both the deletion and the addition bitmaps of `key`,
+    /// creating the entry when `key` is new.
+    pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) {
+        match self.cache.get_mut(key) {
+            Some(DelAddRoaringBitmap { del, add }) => {
+                del.get_or_insert_with(RoaringBitmap::default).insert(n);
+                add.get_or_insert_with(RoaringBitmap::default).insert(n);
+            }
+            None => {
+                self.total_insertions += 1;
+                self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
+                let value = DelAddRoaringBitmap::new_del_add_u32(n);
+                assert!(self.cache.insert(key.into(), value).is_none());
+            }
+        }
+    }
+
+    /// Consumes the cache and returns the underlying map, after printing on stderr
+    /// how many of the inserted keys were at most `KEY_SIZE` bytes long.
+    pub fn into_sorter(self) -> HashMap<SmallVec<[u8; KEY_SIZE]>, DelAddRoaringBitmap> {
+        // A cache that never received a key would otherwise report `NaN%` (0.0 / 0.0).
+        let fitted_percentage = if self.total_insertions == 0 {
+            0.0
+        } else {
+            (self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0
+        };
+
+        eprintln!(
+            "LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions",
+            self.fitted_in_key, fitted_percentage, self.total_insertions,
+        );
+
+        self.cache
+    }
+}
+
+#[derive(Debug, Clone, Default)]
+pub struct DelAddRoaringBitmap {
+    pub(crate) del: Option<RoaringBitmap>,
+    pub(crate) add: Option<RoaringBitmap>,
+}
+
+impl DelAddRoaringBitmap {
+    /// Builds an entry where `n` is both deleted and added.
+    fn new_del_add_u32(n: u32) -> Self {
+        let del = RoaringBitmap::from([n]);
+        let add = RoaringBitmap::from([n]);
+        Self { del: Some(del), add: Some(add) }
+    }
+
+    /// Builds an entry that only deletes the ids of `bitmap`.
+    fn new_del(bitmap: RoaringBitmap) -> Self {
+        Self { del: Some(bitmap), add: None }
+    }
+
+    /// Builds an entry that only deletes `n`.
+    fn new_del_u32(n: u32) -> Self {
+        Self::new_del(RoaringBitmap::from([n]))
+    }
+
+    /// Builds an entry that only adds the ids of `bitmap`.
+    fn new_add(bitmap: RoaringBitmap) -> Self {
+        Self { del: None, add: Some(bitmap) }
+    }
+
+    /// Builds an entry that only adds `n`.
+    fn new_add_u32(n: u32) -> Self {
+        Self::new_add(RoaringBitmap::from([n]))
+    }
+
+    /// Unions the deletions and the additions of `other` into `self`, side by side.
+    pub fn merge_with(&mut self, other: DelAddRoaringBitmap) {
+        /// Union of two optional bitmaps; `None` only when both sides are `None`.
+        fn union(
+            lhs: Option<RoaringBitmap>,
+            rhs: Option<RoaringBitmap>,
+        ) -> Option<RoaringBitmap> {
+            match (lhs, rhs) {
+                (Some(lhs), Some(rhs)) => Some(lhs | rhs),
+                (lhs, rhs) => lhs.or(rhs),
+            }
+        }
+
+        let DelAddRoaringBitmap { del, add } = other;
+        self.del = union(self.del.take(), del);
+        self.add = union(self.add.take(), add);
+    }
+}
--- a/milli/src/update/new/extract/faceted/extract_facets.rs
+++ b/milli/src/update/new/extract/faceted/extract_facets.rs
@ -0,0 +1,240 @@
+use std::collections::HashSet;
+
+use heed::RoTxn;
+use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
+use serde_json::Value;
+
+use super::super::cache::CboCachedSorter;
+use super::facet_document::extract_document_facets;
+use super::FacetKind;
+use crate::facet::value_encoding::f64_into_bytes;
+use crate::update::new::extract::{DocidsExtractor, HashMapMerger};
+use crate::update::new::{DocumentChange, ItemsPool};
+use crate::update::GrenadParameters;
+use crate::{DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result, MAX_FACET_VALUE_LENGTH};
+pub struct FacetedDocidsExtractor;
+
+impl FacetedDocidsExtractor {
+    /// Registers the facet values touched by one document change into `cached_sorter`.
+    ///
+    /// - `Deletion`: the facets of the document currently stored in the index are
+    ///   registered with `insert_del_u32`.
+    /// - `Update`: the facets of the stored document are registered with `insert_del_u32`,
+    ///   then the facets of the new version with `insert_add_u32`.
+    /// - `Insertion`: the facets of the new document are registered with `insert_add_u32`.
+    ///
+    /// Panics (through `unwrap`) when the current document of a deletion or of an
+    /// update cannot be found in the index.
+    fn extract_document_change(
+        rtxn: &RoTxn,
+        index: &Index,
+        buffer: &mut Vec<u8>,
+        fields_ids_map: &mut GlobalFieldsIdsMap,
+        attributes_to_extract: &[&str],
+        cached_sorter: &mut CboCachedSorter,
+        document_change: DocumentChange,
+    ) -> Result<()> {
+        match document_change {
+            DocumentChange::Deletion(inner) => extract_document_facets(
+                attributes_to_extract,
+                inner.current(rtxn, index)?.unwrap(),
+                fields_ids_map,
+                &mut |fid, value| {
+                    Self::facet_fn_with_options(
+                        buffer,
+                        cached_sorter,
+                        CboCachedSorter::insert_del_u32,
+                        inner.docid(),
+                        fid,
+                        value,
+                    )
+                },
+            ),
+            DocumentChange::Update(inner) => {
+                // First pass: the stored version of the document feeds the deletions.
+                extract_document_facets(
+                    attributes_to_extract,
+                    inner.current(rtxn, index)?.unwrap(),
+                    fields_ids_map,
+                    &mut |fid, value| {
+                        Self::facet_fn_with_options(
+                            buffer,
+                            cached_sorter,
+                            CboCachedSorter::insert_del_u32,
+                            inner.docid(),
+                            fid,
+                            value,
+                        )
+                    },
+                )?;
+
+                // Second pass: the new version of the document feeds the additions.
+                extract_document_facets(
+                    attributes_to_extract,
+                    inner.new(),
+                    fields_ids_map,
+                    &mut |fid, value| {
+                        Self::facet_fn_with_options(
+                            buffer,
+                            cached_sorter,
+                            CboCachedSorter::insert_add_u32,
+                            inner.docid(),
+                            fid,
+                            value,
+                        )
+                    },
+                )
+            }
+            DocumentChange::Insertion(inner) => extract_document_facets(
+                attributes_to_extract,
+                inner.new(),
+                fields_ids_map,
+                &mut |fid, value| {
+                    Self::facet_fn_with_options(
+                        buffer,
+                        cached_sorter,
+                        CboCachedSorter::insert_add_u32,
+                        inner.docid(),
+                        fid,
+                        value,
+                    )
+                },
+            ),
+        }
+    }
+
+    /// Registers in `cached_sorter`, through `cache_fn`, the facet keys of one field value.
+    ///
+    /// `buffer` is a scratch space: it is cleared and refilled for every generated key.
+    /// Every key starts with the `FacetKind` byte followed by the big-endian field id.
+    /// An `Exists` key is always emitted, followed by at most one more key that
+    /// depends on the JSON type of `value`.
+    fn facet_fn_with_options(
+        buffer: &mut Vec<u8>,
+        cached_sorter: &mut CboCachedSorter,
+        cache_fn: impl Fn(&mut CboCachedSorter, &[u8], u32),
+        docid: DocumentId,
+        fid: FieldId,
+        value: &Value,
+    ) -> Result<()> {
+        // Exists
+        // key: fid
+        buffer.clear();
+        buffer.push(FacetKind::Exists as u8);
+        buffer.extend_from_slice(&fid.to_be_bytes());
+        cache_fn(cached_sorter, &*buffer, docid);
+
+        match value {
+            // Number
+            // key: fid - level - orderedf64 - originalf64
+            Value::Number(number) => {
+                // Numbers that cannot be converted by `f64_into_bytes` are skipped.
+                if let Some((n, ordered)) =
+                    number.as_f64().and_then(|n| f64_into_bytes(n).map(|ordered| (n, ordered)))
+                {
+                    buffer.clear();
+                    buffer.push(FacetKind::Number as u8);
+                    buffer.extend_from_slice(&fid.to_be_bytes());
+                    // The level byte must be 0, the key previously carried a 1 here.
+                    buffer.push(0); // level 0
+                    buffer.extend_from_slice(&ordered);
+                    buffer.extend_from_slice(&n.to_be_bytes());
+
+                    Ok(cache_fn(cached_sorter, &*buffer, docid))
+                } else {
+                    Ok(())
+                }
+            }
+            // String
+            // key: fid - level - truncated_string
+            Value::String(s) => {
+                let truncated = truncate_str(s);
+                buffer.clear();
+                buffer.push(FacetKind::String as u8);
+                buffer.extend_from_slice(&fid.to_be_bytes());
+                // The level byte must be 0, the key previously carried a 1 here.
+                buffer.push(0); // level 0
+                buffer.extend_from_slice(truncated.as_bytes());
+                Ok(cache_fn(cached_sorter, &*buffer, docid))
+            }
+            // Null
+            // key: fid
+            Value::Null => {
+                buffer.clear();
+                buffer.push(FacetKind::Null as u8);
+                buffer.extend_from_slice(&fid.to_be_bytes());
+                Ok(cache_fn(cached_sorter, &*buffer, docid))
+            }
+            // Empty
+            // key: fid
+            Value::Array(a) if a.is_empty() => {
+                buffer.clear();
+                buffer.push(FacetKind::Empty as u8);
+                buffer.extend_from_slice(&fid.to_be_bytes());
+                Ok(cache_fn(cached_sorter, &*buffer, docid))
+            }
+            Value::Object(o) if o.is_empty() => {
+                buffer.clear();
+                buffer.push(FacetKind::Empty as u8);
+                buffer.extend_from_slice(&fid.to_be_bytes());
+                Ok(cache_fn(cached_sorter, &*buffer, docid))
+            }
+            // Otherwise, do nothing
+            // TODO: What about Value::Bool?
+            _ => Ok(()),
+        }
+    }
+
+    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
+        index.user_defined_faceted_fields(rtxn)
+    }
+}
+
+/// Returns the longest prefix of `s` that is at most `MAX_FACET_VALUE_LENGTH` bytes
+/// long and that ends on a character boundary.
+fn truncate_str(s: &str) -> &str {
+    if s.len() <= MAX_FACET_VALUE_LENGTH {
+        return s;
+    }
+
+    // Walk back from the byte limit to the closest character boundary;
+    // offset 0 is always a boundary so the loop terminates.
+    let mut end = MAX_FACET_VALUE_LENGTH;
+    while !s.is_char_boundary(end) {
+        end -= 1;
+    }
+
+    &s[..end]
+}
+
+impl DocidsExtractor for FacetedDocidsExtractor {
+    /// Extracts the facet docids of every document change and gathers them in a `HashMapMerger`.
+    ///
+    /// The changes are processed in parallel. Each worker borrows from a pool a context made
+    /// of a read transaction, a clone of the fields ids map, a key buffer and a
+    /// `CboCachedSorter`. Once all the changes are processed, the caches of all the
+    /// contexts are handed over to the returned merger.
+    #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
+    fn run_extraction(
+        index: &Index,
+        fields_ids_map: &GlobalFieldsIdsMap,
+        indexer: GrenadParameters,
+        document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
+    ) -> Result<HashMapMerger> {
+        // Computed but not used by the in-memory caches below, hence the underscore.
+        let _max_memory = indexer.max_memory_by_thread();
+
+        let rtxn = index.read_txn()?;
+        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
+        let attributes_to_extract: Vec<_> =
+            attributes_to_extract.iter().map(|s| s.as_ref()).collect();
+
+        let context_pool = ItemsPool::new(|| {
+            Ok((index.read_txn()?, fields_ids_map.clone(), Vec::new(), CboCachedSorter::new()))
+        });
+
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
+            let _entered = span.enter();
+            document_changes.into_par_iter().try_for_each(|document_change| {
+                context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| {
+                    Self::extract_document_change(
+                        &*rtxn,
+                        index,
+                        buffer,
+                        fields_ids_map,
+                        &attributes_to_extract,
+                        cached_sorter,
+                        document_change?,
+                    )
+                })
+            })?;
+        }
+        {
+            let mut builder = HashMapMerger::new();
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
+            let _entered = span.enter();
+
+            // The bindings mirror the tuple built by the pool above:
+            // (read transaction, fields ids map, key buffer, cache).
+            let readers: Vec<_> = context_pool
+                .into_items()
+                .par_bridge()
+                .map(|(_rtxn, _fields_ids_map, _buffer, cached_sorter)| {
+                    cached_sorter.into_sorter()
+                })
+                .collect();
+
+            builder.extend(readers);
+
+            Ok(builder)
+        }
+    }
+}
--- a/milli/src/update/new/extract/faceted/facet_document.rs
+++ b/milli/src/update/new/extract/faceted/facet_document.rs
@ -0,0 +1,52 @@
+use serde_json::Value;
+
+use crate::update::new::extract::perm_json_p;
+use crate::update::new::KvReaderFieldId;
+use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError};
+
+/// Visits every field stored in `obkv` and hands each leaf JSON value that is
+/// selected by `attributes_to_extract` to `facet_fn`.
+///
+/// Nested objects and arrays are flattened through `perm_json_p`. Each flattened
+/// name is resolved with `field_id_map.id_or_insert`, and the call fails with
+/// `UserError::AttributeLimitReached` when no id can be obtained for it.
+/// A field whose bytes are not valid JSON yields `InternalError::SerdeJson`.
+pub fn extract_document_facets(
+    attributes_to_extract: &[&str],
+    obkv: &KvReaderFieldId,
+    field_id_map: &mut GlobalFieldsIdsMap,
+    facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
+) -> Result<()> {
+    // The name is copied into this buffer so that `field_id_map` can be used
+    // again (by `id_or_insert`) while the name is still alive.
+    let mut name_buffer = String::new();
+    for (field_id, field_bytes) in obkv {
+        let Some(field_name) = field_id_map.name(field_id).map(|name| {
+            name_buffer.clear();
+            name_buffer.push_str(name);
+            &name_buffer
+        }) else {
+            unreachable!("field id not found in field id map");
+        };
+
+        // Ignore fields that are neither selected nor related to a selected attribute.
+        if !perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
+            continue;
+        }
+
+        // Called once per leaf value with its flattened name.
+        let mut on_leaf = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {
+            None => Err(UserError::AttributeLimitReached.into()),
+            Some(leaf_id) => facet_fn(leaf_id, value),
+        };
+
+        let parsed: Value =
+            serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
+        match parsed {
+            Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
+                &object,
+                Some(attributes_to_extract),
+                &[], // skip no attributes
+                field_name,
+                &mut on_leaf,
+            )?,
+            Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
+                &array,
+                Some(attributes_to_extract),
+                &[], // skip no attributes
+                field_name,
+                &mut on_leaf,
+            )?,
+            leaf => on_leaf(field_name, &leaf)?,
+        }
+    }
+
+    Ok(())
+}
--- a/milli/src/update/new/extract/faceted/mod.rs
+++ b/milli/src/update/new/extract/faceted/mod.rs
@ -0,0 +1,26 @@
+mod extract_facets;
+mod facet_document;
+
+pub use extract_facets::FacetedDocidsExtractor;
+
+/// Kind of a facet entry, represented as a single byte (`#[repr(u8)]`).
+///
+/// Every discriminant is written out explicitly because `From<u8> for FacetKind`
+/// decodes these exact values; reordering the variants must not change them.
+#[repr(u8)]
+pub enum FacetKind {
+    Number = 0,
+    String = 1,
+    Null = 2,
+    Empty = 3,
+    Exists = 4,
+}
+
+impl From<u8> for FacetKind {
+    /// Decodes a discriminant byte into the matching `FacetKind`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when `value` is not one of the known discriminants (`0..=4`).
+    fn from(value: u8) -> Self {
+        match value {
+            0 => Self::Number,
+            1 => Self::String,
+            2 => Self::Null,
+            3 => Self::Empty,
+            4 => Self::Exists,
+            // Any other byte is a caller bug: report which value was received.
+            invalid => unreachable!("invalid FacetKind discriminant: {invalid}"),
+        }
+    }
+}
--- a/milli/src/update/new/extract/lru.rs
+++ b/milli/src/update/new/extract/lru.rs
@ -0,0 +1,234 @@
+use std::borrow::Borrow;
+use std::hash::{BuildHasher, Hash};
+use std::iter::repeat_with;
+use std::mem;
+use std::num::NonZeroUsize;
+
+use hashbrown::hash_map::{DefaultHashBuilder, Entry};
+use hashbrown::HashMap;
+
+/// Fixed-capacity cache built from a hash map and a slot-based linked list.
+///
+/// Accessing or pushing a key moves its node to the front of the list; when the
+/// list is full, `push` reuses the slot found at the back of the list.
+#[derive(Debug)]
+pub struct Lru<K, V, S = DefaultHashBuilder> {
+    /// Maps each cached key to the index of its node in `storage`.
+    lookup: HashMap<K, usize, S>,
+    /// Slots holding the cached key/value pairs and their list links.
+    storage: FixedSizeList<LruNode<K, V>>,
+}
+
+impl<K: Eq + Hash, V> Lru<K, V> {
+    /// Builds an empty cache able to store up to `capacity` entries,
+    /// hashing keys with the default hash builder.
+    pub fn new(capacity: NonZeroUsize) -> Self {
+        let storage = FixedSizeList::new(capacity.get());
+        let lookup = HashMap::new();
+        Self { lookup, storage }
+    }
+}
+
+impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
+    /// Builds an empty cache able to store up to `capacity` entries,
+    /// hashing keys with the given `hash_builder`.
+    pub fn with_hasher(capacity: NonZeroUsize, hash_builder: S) -> Lru<K, V, S> {
+        let storage = FixedSizeList::new(capacity.get());
+        let lookup = HashMap::with_hasher(hash_builder);
+        Self { lookup, storage }
+    }
+}
+
+impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
+    /// Looks `key` up and, when it is cached, moves its node to the head of the
+    /// LRU list and returns a mutable reference to its value.
+    ///
+    /// Returns `None` when the key is not present in the cache.
+    pub fn get_mut<Q>(&mut self, key: &Q) -> Option<&mut V>
+    where
+        K: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
+    {
+        let slot = self.lookup.get(key).copied()?;
+        let node = self.storage.move_front(slot)?;
+        Some(&mut node.value)
+    }
+}
+
+impl<K: Clone + Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
+    /// Inserts `key`/`value` and moves the entry to the front of the list.
+    ///
+    /// Returns:
+    /// * `Some((old_key, old_value))` when `key` was already cached: the value is
+    ///   replaced and the previous pair is handed back,
+    /// * `Some((evicted_key, evicted_value))` when the cache was full: the node at
+    ///   the back of the list is overwritten and its previous content handed back,
+    /// * `None` when the entry was stored in a free slot.
+    pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> {
+        match self.lookup.entry(key) {
+            Entry::Occupied(occ) => {
+                // It's fine to unwrap here because:
+                // * the entry already exists
+                let node = self.storage.move_front(*occ.get()).unwrap();
+                let old_value = mem::replace(&mut node.value, value);
+                // Stores the new key in the map and gives back the one it replaced.
+                let old_key = occ.replace_key();
+                Some((old_key, old_value))
+            }
+            Entry::Vacant(vac) => {
+                // One copy of the key goes in the node, the other stays in the map.
+                let key = vac.key().clone();
+                if self.storage.is_full() {
+                    // It's fine to unwrap here because:
+                    // * the cache capacity is non zero
+                    // * the cache is full
+                    let idx = self.storage.back_idx();
+                    let node = self.storage.move_front(idx).unwrap();
+                    // From here on `key`/`value` name the evicted pair.
+                    let LruNode { key, value } = mem::replace(node, LruNode { key, value });
+                    vac.insert(idx);
+                    // The evicted key must no longer point to the reused slot.
+                    self.lookup.remove(&key);
+                    Some((key, value))
+                } else {
+                    // It's fine to unwrap here because:
+                    // * the cache capacity is non zero
+                    // * the cache is not full
+                    let (idx, _) = self.storage.push_front(LruNode { key, value }).unwrap();
+                    vac.insert(idx);
+                    None
+                }
+            }
+        }
+    }
+}
+
+impl<K, V, S> IntoIterator for Lru<K, V, S> {
+    type Item = (K, V);
+    type IntoIter = IntoIter<K, V>;
+
+    /// Consumes the cache and iterates over its key/value pairs,
+    /// following the iteration order of the lookup map.
+    fn into_iter(self) -> Self::IntoIter {
+        let Lru { lookup, storage } = self;
+        IntoIter { lookup_iter: lookup.into_iter(), nodes: storage.nodes }
+    }
+}
+
+/// Owning iterator over the entries of an `Lru`.
+pub struct IntoIter<K, V> {
+    /// Drives the iteration: yields each key with the index of its node.
+    lookup_iter: hashbrown::hash_map::IntoIter<K, usize>,
+    /// Node slots taken from the cache; a slot is emptied once its entry is yielded.
+    nodes: Box<[Option<FixedSizeListNode<LruNode<K, V>>>]>,
+}
+
+impl<K, V> Iterator for IntoIter<K, V> {
+    type Item = (K, V);
+
+    /// Yields the key/value pair stored in the slot designated by the next
+    /// lookup entry. The key coming from the lookup map itself is dropped.
+    fn next(&mut self) -> Option<Self::Item> {
+        let (_lookup_key, slot_idx) = self.lookup_iter.next()?;
+        let slot = self.nodes.get_mut(slot_idx)?;
+        let entry = slot.take()?;
+        let LruNode { key, value } = entry.data;
+        Some((key, value))
+    }
+}
+
+/// Key/value pair stored in one slot of the LRU list.
+#[derive(Debug)]
+struct LruNode<K, V> {
+    key: K,
+    value: V,
+}
+
+/// Node of a `FixedSizeList`; links are slot indexes rather than pointers.
+#[derive(Debug)]
+struct FixedSizeListNode<T> {
+    /// Index of the previous node; `usize::MAX` is stored for the front node.
+    prev: usize,
+    /// Index of the next node, copied from the list's `front` cursor on insertion.
+    next: usize,
+    /// Payload carried by the node.
+    data: T,
+}
+
+/// Doubly linked list stored in a fixed number of preallocated slots.
+#[derive(Debug)]
+struct FixedSizeList<T> {
+    /// One optional node per slot; the slice length is the list capacity.
+    nodes: Box<[Option<FixedSizeListNode<T>>]>,
+    /// Number of slots handed out so far.
+    /// Also corresponds to the first `None` in the nodes.
+    length: usize,
+    // TODO We probably do not need both the front and the back cursors.
+    /// Index of the front node, `usize::MAX` when the list is empty.
+    front: usize,
+    /// Index of the back node, `usize::MAX` when the list is empty.
+    back: usize,
+}
+
+impl<T> FixedSizeList<T> {
+    /// Creates an empty list with `capacity` free slots.
+    fn new(capacity: usize) -> Self {
+        Self {
+            nodes: repeat_with(|| None).take(capacity).collect::<Vec<_>>().into_boxed_slice(),
+            length: 0,
+            // `usize::MAX` never designates a node: lookups at that index return `None`.
+            front: usize::MAX,
+            back: usize::MAX,
+        }
+    }
+
+    /// Total number of slots.
+    #[inline]
+    fn capacity(&self) -> usize {
+        self.nodes.len()
+    }
+
+    /// Number of slots handed out so far.
+    #[inline]
+    fn len(&self) -> usize {
+        self.length
+    }
+
+    /// Returns `true` when no slot has been handed out yet.
+    #[inline]
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Returns `true` when every slot has been handed out.
+    #[inline]
+    fn is_full(&self) -> bool {
+        self.len() == self.capacity()
+    }
+
+    /// Index of the node at the back of the list.
+    #[inline]
+    fn back_idx(&self) -> usize {
+        self.back
+    }
+
+    /// Reserves the next unused slot and returns its index,
+    /// or `None` when the list is full.
+    #[inline]
+    fn next(&mut self) -> Option<usize> {
+        if self.is_full() {
+            None
+        } else {
+            let current_free = self.length;
+            self.length += 1;
+            Some(current_free)
+        }
+    }
+
+    /// Mutable access to the node stored at `idx`, if the index is in bounds
+    /// and the slot is occupied.
+    #[inline]
+    fn node_mut(&mut self, idx: usize) -> Option<&mut FixedSizeListNode<T>> {
+        self.nodes.get_mut(idx).and_then(|node| node.as_mut())
+    }
+
+    /// Shared access to the node stored at `idx`, if the index is in bounds
+    /// and the slot is occupied.
+    #[inline]
+    fn node_ref(&self, idx: usize) -> Option<&FixedSizeListNode<T>> {
+        self.nodes.get(idx).and_then(|node| node.as_ref())
+    }
+
+    /// Unlinks the node stored at `idx` and links it back at the front of the
+    /// list, returning a mutable reference to its data.
+    ///
+    /// Returns `None` when `idx` is out of bounds or the slot is empty.
+    #[inline]
+    fn move_front(&mut self, idx: usize) -> Option<&mut T> {
+        // The slot is emptied while the links of its neighbours are rewired.
+        let node = self.nodes.get_mut(idx)?.take()?;
+        // Detach from the previous node, or advance `front` if there is none.
+        if let Some(prev) = self.node_mut(node.prev) {
+            prev.next = node.next;
+        } else {
+            self.front = node.next;
+        }
+        // Detach from the next node, or move `back` if there is none.
+        if let Some(next) = self.node_mut(node.next) {
+            next.prev = node.prev;
+        } else {
+            self.back = node.prev;
+        }
+
+        // Link the current front node (if any) behind the moved node.
+        if let Some(front) = self.node_mut(self.front) {
+            front.prev = idx;
+        }
+        // The list is empty without this node: it is the back as well.
+        if self.node_ref(self.back).is_none() {
+            self.back = idx;
+        }
+
+        // The unwrap cannot fail: `idx` was validated by the `get_mut` above.
+        let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode {
+            prev: usize::MAX,
+            next: self.front,
+            data: node.data,
+        });
+        self.front = idx;
+        Some(&mut node.data)
+    }
+
+    /// Stores `data` in a newly reserved slot linked at the front of the list.
+    ///
+    /// Returns the slot index and a mutable reference to the stored data,
+    /// or `None` when the list is full.
+    #[inline]
+    fn push_front(&mut self, data: T) -> Option<(usize, &mut T)> {
+        let idx = self.next()?;
+        // Link the current front node (if any) behind the new node.
+        if let Some(front) = self.node_mut(self.front) {
+            front.prev = idx;
+        }
+        // First node of the list: it is the back as well.
+        if self.node_ref(self.back).is_none() {
+            self.back = idx;
+        }
+        let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode {
+            prev: usize::MAX,
+            next: self.front,
+            data,
+        });
+        self.front = idx;
+        Some((idx, &mut node.data))
+    }
+}
--- a/milli/src/update/new/extract/mod.rs
+++ b/milli/src/update/new/extract/mod.rs
@ -0,0 +1,218 @@
+mod cache;
+mod faceted;
+mod lru;
+mod searchable;
+
+use std::collections::HashMap;
+use std::mem;
+
+pub use faceted::*;
+use grenad::MergeFunction;
+use rayon::iter::{IntoParallelIterator, ParallelIterator as _};
+use rayon::slice::ParallelSliceMut as _;
+pub use searchable::*;
+use smallvec::SmallVec;
+
+use super::DocumentChange;
+use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
+use crate::{GlobalFieldsIdsMap, Index, Result};
+
+/// Common entry point of the docids extractors.
+pub trait DocidsExtractor {
+    /// Runs the extraction over `document_changes`, consumed as a parallel
+    /// iterator, and returns the extracted entries as a `HashMapMerger`.
+    ///
+    /// An `Err` item in `document_changes` or a failing extraction is reported
+    /// through the returned `Result`.
+    fn run_extraction(
+        index: &Index,
+        fields_ids_map: &GlobalFieldsIdsMap,
+        indexer: GrenadParameters,
+        document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
+    ) -> Result<HashMapMerger>;
+}
+
+/// Set of hash maps associating keys to `DelAddRoaringBitmap`s.
+///
+/// Iterating over it by value yields the entries of all the maps sorted by key,
+/// the bitmaps of identical keys being merged together.
+pub struct HashMapMerger {
+    /// The maps to merge, in the order they were added.
+    maps: Vec<HashMap<SmallVec<[u8; cache::KEY_SIZE]>, cache::DelAddRoaringBitmap>>,
+}
+
+impl HashMapMerger {
+    /// Creates a merger that contains no map.
+    pub fn new() -> HashMapMerger {
+        Self { maps: Vec::new() }
+    }
+
+    /// Appends every map produced by `iter` to the maps to merge.
+    pub fn extend<I>(&mut self, iter: I)
+    where
+        I: IntoIterator<
+            Item = HashMap<SmallVec<[u8; cache::KEY_SIZE]>, cache::DelAddRoaringBitmap>,
+        >,
+    {
+        let maps = &mut self.maps;
+        maps.extend(iter);
+    }
+}
+
+impl IntoIterator for HashMapMerger {
+    // Same key type as the one stored in `HashMapMerger::maps`.
+    type Item = (SmallVec<[u8; cache::KEY_SIZE]>, cache::DelAddRoaringBitmap);
+    type IntoIter = IntoIter;
+
+    /// Flattens all the maps into a single vector, sorts it by key in parallel
+    /// and returns an iterator that merges the entries sharing the same key.
+    fn into_iter(self) -> Self::IntoIter {
+        let mut entries = {
+            let span = tracing::trace_span!(target: "indexing::documents::merge", "into_par_iter");
+            let _entered = span.enter();
+            let entries: Vec<_> =
+                self.maps.into_par_iter().flat_map(|m| m.into_par_iter()).collect();
+            // Reported through `tracing` like the spans, never on stderr directly.
+            tracing::trace!(
+                target: "indexing::documents::merge",
+                "There are {} entries in the HashMapMerger",
+                entries.len()
+            );
+            entries
+        };
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::merge", "par_sort_unstable_by");
+            let _entered = span.enter();
+            // Equal keys become adjacent, which is what the returned iterator relies on.
+            entries.par_sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb));
+            IntoIter {
+                sorted_entries: entries.into_iter(),
+                current_key: None,
+                current_deladd: cache::DelAddRoaringBitmap::default(),
+            }
+        }
+    }
+}
+
+/// Iterator over the entries of a `HashMapMerger`, sorted by key.
+pub struct IntoIter {
+    /// Remaining entries, sorted by key.
+    sorted_entries: std::vec::IntoIter<(SmallVec<[u8; 12]>, cache::DelAddRoaringBitmap)>,
+    /// Key of the group being accumulated, `None` before the first entry is read.
+    current_key: Option<SmallVec<[u8; 12]>>,
+    /// Bitmaps merged so far for `current_key`.
+    current_deladd: cache::DelAddRoaringBitmap,
+}
+
+impl Iterator for IntoIter {
+    type Item = (SmallVec<[u8; 12]>, cache::DelAddRoaringBitmap);
+
+    /// Returns the next key along with the merge of all the bitmaps that were
+    /// stored under that key in the sorted entries.
+    fn next(&mut self) -> Option<Self::Item> {
+        while let Some((key, deladd)) = self.sorted_entries.next() {
+            // Same key as the group being accumulated: keep merging.
+            if self.current_key.as_deref() == Some(key.as_slice()) {
+                self.current_deladd.merge_with(deladd);
+                continue;
+            }
+
+            // A new key starts: the previous group, if any, is complete.
+            let finished_deladd = mem::replace(&mut self.current_deladd, deladd);
+            if let Some(finished_key) = self.current_key.replace(key) {
+                return Some((finished_key, finished_deladd));
+            }
+        }
+
+        // The entries are exhausted: flush the last group, if there is one.
+        let last_deladd = mem::take(&mut self.current_deladd);
+        self.current_key.take().map(|last_key| (last_key, last_deladd))
+    }
+}
+
+/// TODO move in permissive json pointer
+pub mod perm_json_p {
+    use serde_json::{Map, Value};
+
+    use crate::Result;
+    const SPLIT_SYMBOL: char = '.';
+
+    /// Returns `true` when `selector` is `key` itself or a path nested under `key`.
+    ///
+    /// ```text
+    /// selector            key              result
+    /// `animaux`           `animaux`        true
+    /// `animaux.chien`     `animaux`        true
+    /// `animaux.chien.nom` `animaux`        true
+    /// `animaux.chien.nom` `animaux.chien`  true
+    /// -------------------------------------------
+    /// `animaux`           `animaux.chien`  false
+    /// `animaux.ch`        `animaux.chien`  false
+    /// `animau`            `animaux`        false
+    /// ```
+    pub fn contained_in(selector: &str, key: &str) -> bool {
+        match selector.strip_prefix(key) {
+            // What follows `key` must be nothing, or the start of a new path segment.
+            Some(rest) => rest.is_empty() || rest.starts_with(SPLIT_SYMBOL),
+            None => false,
+        }
+    }
+
+    /// Calls `seeker` on every selected leaf value of the object `value`, the
+    /// name of each leaf being prefixed by `base_key`.
+    ///
+    /// An empty object is reported to `seeker` as is, under `base_key`.
+    pub fn seek_leaf_values_in_object(
+        value: &Map<String, Value>,
+        selectors: Option<&[&str]>,
+        skip_selectors: &[&str],
+        base_key: &str,
+        seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
+    ) -> Result<()> {
+        if value.is_empty() {
+            seeker(base_key, &Value::Object(Map::with_capacity(0)))?;
+        }
+
+        for (key, child) in value.iter() {
+            let full_key = if base_key.is_empty() {
+                key.to_string()
+            } else {
+                format!("{}{}{}", base_key, SPLIT_SYMBOL, key)
+            };
+
+            // `select_field` checks both directions: a selector that only names a
+            // parent (e.g. `doggo`) still requires visiting every field under it.
+            if !select_field(&full_key, selectors, skip_selectors) {
+                continue;
+            }
+
+            match child {
+                Value::Object(object) => seek_leaf_values_in_object(
+                    object,
+                    selectors,
+                    skip_selectors,
+                    &full_key,
+                    seeker,
+                )?,
+                Value::Array(array) => seek_leaf_values_in_array(
+                    array,
+                    selectors,
+                    skip_selectors,
+                    &full_key,
+                    seeker,
+                )?,
+                leaf => seeker(&full_key, leaf)?,
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Calls `seeker` on every leaf value found in the array `values`; all the
+    /// elements are reported under the same `base_key`.
+    ///
+    /// An empty array is reported to `seeker` as is, under `base_key`.
+    pub fn seek_leaf_values_in_array(
+        values: &[Value],
+        selectors: Option<&[&str]>,
+        skip_selectors: &[&str],
+        base_key: &str,
+        seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
+    ) -> Result<()> {
+        if values.is_empty() {
+            seeker(base_key, &Value::Array(vec![]))?;
+        }
+
+        for element in values {
+            match element {
+                Value::Object(object) => {
+                    seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker)?
+                }
+                Value::Array(array) => {
+                    seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker)?
+                }
+                leaf => seeker(base_key, leaf)?,
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Returns `true` when `field_name` is related to one of the `selectors`
+    /// (or when `selectors` is `None`) and to none of the `skip_selectors`.
+    pub fn select_field(
+        field_name: &str,
+        selectors: Option<&[&str]>,
+        skip_selectors: &[&str],
+    ) -> bool {
+        let is_selected = match selectors {
+            Some(selectors) => selectors.iter().any(|selector| related(field_name, selector)),
+            None => true,
+        };
+
+        is_selected && !skip_selectors.iter().any(|skipped| related(field_name, skipped))
+    }
+
+    /// Returns `true` when one of the two paths is contained in the other.
+    fn related(field_name: &str, selector: &str) -> bool {
+        contained_in(selector, field_name) || contained_in(field_name, selector)
+    }
+}
--- a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs
+++ b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs
@ -0,0 +1,124 @@
+use std::collections::HashMap;
+
+use heed::RoTxn;
+
+use super::tokenize_document::DocumentTokenizer;
+use super::SearchableExtractor;
+use crate::update::new::extract::cache::CboCachedSorter;
+use crate::update::new::DocumentChange;
+use crate::update::MergeDeladdCboRoaringBitmaps;
+use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
+
+const MAX_COUNTED_WORDS: usize = 30;
+
+/// Extractor filling the keys made of a field id followed by a word count with
+/// the ids of the documents having that number of words in that field.
+pub struct FidWordCountDocidsExtractor;
+impl SearchableExtractor for FidWordCountDocidsExtractor {
+    fn attributes_to_extract<'a>(
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>> {
+        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
+    }
+
+    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
+        Ok(vec![])
+    }
+
+    // This method is reimplemented to count the number of words in the document in each field
+    // and to store the docids of the documents that have a number of words in a given field
+    // equal to or under MAX_COUNTED_WORDS.
+    //
+    // A field is only counted when at least one token is found in it, so a key with a count
+    // of zero is never written.
+    fn extract_document_change(
+        rtxn: &RoTxn,
+        index: &Index,
+        document_tokenizer: &DocumentTokenizer,
+        fields_ids_map: &mut GlobalFieldsIdsMap,
+        cached_sorter: &mut CboCachedSorter,
+        document_change: DocumentChange,
+    ) -> Result<()> {
+        let mut key_buffer = Vec::new();
+        match document_change {
+            DocumentChange::Deletion(inner) => {
+                let mut fid_word_count = HashMap::new();
+                let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
+                    fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
+                    Ok(())
+                };
+                document_tokenizer.tokenize_document(
+                    inner.current(rtxn, index)?.unwrap(),
+                    fields_ids_map,
+                    &mut token_fn,
+                )?;
+
+                // The docids of the documents that have a number of words in a given field
+                // equal to or under MAX_COUNTED_WORDS are deleted.
+                for (fid, count) in fid_word_count.iter() {
+                    if *count <= MAX_COUNTED_WORDS {
+                        let key = build_key(*fid, *count as u8, &mut key_buffer);
+                        cached_sorter.insert_del_u32(key, inner.docid());
+                    }
+                }
+            }
+            DocumentChange::Update(inner) => {
+                // Maps each field id to (count in the current version, count in the new one).
+                let mut fid_word_count = HashMap::new();
+                let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
+                    fid_word_count
+                        .entry(fid)
+                        .and_modify(|(current_count, _new_count)| *current_count += 1)
+                        .or_insert((1, 0));
+                    Ok(())
+                };
+                document_tokenizer.tokenize_document(
+                    inner.current(rtxn, index)?.unwrap(),
+                    fields_ids_map,
+                    &mut token_fn,
+                )?;
+
+                let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
+                    fid_word_count
+                        .entry(fid)
+                        .and_modify(|(_current_count, new_count)| *new_count += 1)
+                        .or_insert((0, 1));
+                    Ok(())
+                };
+                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
+
+                // Only the fields that have a change in the number of words are updated.
+                for (fid, (current_count, new_count)) in fid_word_count.iter() {
+                    if *current_count != *new_count {
+                        // A count of zero means that the field has no word on that side of
+                        // the update: there is no entry to delete or to add for it, exactly
+                        // like in the deletion and insertion cases.
+                        if *current_count != 0 && *current_count <= MAX_COUNTED_WORDS {
+                            let key = build_key(*fid, *current_count as u8, &mut key_buffer);
+                            cached_sorter.insert_del_u32(key, inner.docid());
+                        }
+                        if *new_count != 0 && *new_count <= MAX_COUNTED_WORDS {
+                            let key = build_key(*fid, *new_count as u8, &mut key_buffer);
+                            cached_sorter.insert_add_u32(key, inner.docid());
+                        }
+                    }
+                }
+            }
+            DocumentChange::Insertion(inner) => {
+                let mut fid_word_count = HashMap::new();
+                let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
+                    fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
+                    Ok(())
+                };
+                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
+
+                // The docids of the documents that have a number of words in a given field
+                // equal to or under MAX_COUNTED_WORDS are stored.
+                for (fid, count) in fid_word_count.iter() {
+                    if *count <= MAX_COUNTED_WORDS {
+                        let key = build_key(*fid, *count as u8, &mut key_buffer);
+                        cached_sorter.insert_add_u32(key, inner.docid());
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Overwrites `key_buffer` with the big-endian bytes of `fid` followed by
+/// `count`, and returns the buffer content as a slice.
+fn build_key(fid: FieldId, count: u8, key_buffer: &mut Vec<u8>) -> &[u8] {
+    let fid_bytes = fid.to_be_bytes();
+    key_buffer.clear();
+    key_buffer.extend_from_slice(&fid_bytes);
+    key_buffer.push(count);
+    &key_buffer[..]
+}
--- a/milli/src/update/new/extract/searchable/extract_word_docids.rs
+++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs
@ -0,0 +1,594 @@
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::fs::File;
+use std::num::NonZero;
+
+use grenad::{Merger, MergerBuilder};
+use heed::RoTxn;
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
+
+use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
+use super::SearchableExtractor;
+use crate::update::new::extract::cache::CboCachedSorter;
+use crate::update::new::extract::perm_json_p::contained_in;
+use crate::update::new::extract::HashMapMerger;
+use crate::update::new::{DocumentChange, ItemsPool};
+use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
+use crate::{
+    bucketed_position, DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result,
+    MAX_POSITION_PER_ATTRIBUTE,
+};
+
+const MAX_COUNTED_WORDS: usize = 30;
+
+/// Description of a word-based docids extractor: how its keys are built and
+/// which attributes it covers.
+///
+/// Every implementor gets `SearchableExtractor` through a blanket implementation.
+trait ProtoWordDocidsExtractor {
+    /// Builds the key under which the docid is stored for `word`, found at
+    /// `position` in the field `field_id`.
+    fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>;
+    /// Attributes to extract; the meaning of `None` is defined by the consumer
+    /// of the `SearchableExtractor` trait.
+    fn attributes_to_extract<'a>(
+        _rtxn: &'a RoTxn,
+        _index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>>;
+
+    /// Attributes that must be skipped by the extraction.
+    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
+}
+
+impl<T> SearchableExtractor for T
+where
+    T: ProtoWordDocidsExtractor,
+{
+    /// Tokenizes the versions of the document involved in `document_change` and
+    /// records its docid under the key built for every token: as a deletion for
+    /// the current version, as an addition for the new version.
+    fn extract_document_change(
+        rtxn: &RoTxn,
+        index: &Index,
+        document_tokenizer: &DocumentTokenizer,
+        fields_ids_map: &mut GlobalFieldsIdsMap,
+        cached_sorter: &mut CboCachedSorter,
+        document_change: DocumentChange,
+    ) -> Result<()> {
+        match document_change {
+            DocumentChange::Deletion(deletion) => {
+                let mut on_token = |_fname: &str, fid, pos, word: &str| {
+                    let key = <T as ProtoWordDocidsExtractor>::build_key(fid, pos, word);
+                    cached_sorter.insert_del_u32(&key, deletion.docid());
+                    Ok(())
+                };
+                let current = deletion.current(rtxn, index)?.unwrap();
+                document_tokenizer.tokenize_document(current, fields_ids_map, &mut on_token)?;
+            }
+            DocumentChange::Update(update) => {
+                // First remove the docid from the keys of the current version...
+                let mut on_old_token = |_fname: &str, fid, pos, word: &str| {
+                    let key = <T as ProtoWordDocidsExtractor>::build_key(fid, pos, word);
+                    cached_sorter.insert_del_u32(&key, update.docid());
+                    Ok(())
+                };
+                let current = update.current(rtxn, index)?.unwrap();
+                document_tokenizer.tokenize_document(current, fields_ids_map, &mut on_old_token)?;
+
+                // ...then add it to the keys of the new version.
+                let mut on_new_token = |_fname: &str, fid, pos, word: &str| {
+                    let key = <T as ProtoWordDocidsExtractor>::build_key(fid, pos, word);
+                    cached_sorter.insert_add_u32(&key, update.docid());
+                    Ok(())
+                };
+                document_tokenizer.tokenize_document(
+                    update.new(),
+                    fields_ids_map,
+                    &mut on_new_token,
+                )?;
+            }
+            DocumentChange::Insertion(insertion) => {
+                let mut on_token = |_fname: &str, fid, pos, word: &str| {
+                    let key = <T as ProtoWordDocidsExtractor>::build_key(fid, pos, word);
+                    cached_sorter.insert_add_u32(&key, insertion.docid());
+                    Ok(())
+                };
+                document_tokenizer.tokenize_document(
+                    insertion.new(),
+                    fields_ids_map,
+                    &mut on_token,
+                )?;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Forwards to `ProtoWordDocidsExtractor::attributes_to_extract`.
+    fn attributes_to_extract<'a>(
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>> {
+        <T as ProtoWordDocidsExtractor>::attributes_to_extract(rtxn, index)
+    }
+
+    /// Forwards to `ProtoWordDocidsExtractor::attributes_to_skip`.
+    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
+        <T as ProtoWordDocidsExtractor>::attributes_to_skip(rtxn, index)
+    }
+}
+
+/// Extractor whose keys are the bytes of the words, for every attribute
+/// except the exact ones.
+pub struct WordDocidsExtractor;
+impl ProtoWordDocidsExtractor for WordDocidsExtractor {
+    fn attributes_to_extract<'a>(
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>> {
+        let searchable_fields = index.user_defined_searchable_fields(rtxn)?;
+        Ok(searchable_fields)
+    }
+
+    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
+        // exact attributes must be skipped and stored in a separate DB, see `ExactWordDocidsExtractor`.
+        let exact_attributes = index.exact_attributes(rtxn)?;
+        Ok(exact_attributes)
+    }
+
+    /// TODO write in an external Vec buffer
+    fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
+        let word_bytes = word.as_bytes();
+        Cow::Borrowed(word_bytes)
+    }
+}
+
+/// Extractor whose keys are the bytes of the words, restricted to the exact
+/// attributes of the index.
+pub struct ExactWordDocidsExtractor;
+impl ProtoWordDocidsExtractor for ExactWordDocidsExtractor {
+    fn attributes_to_extract<'a>(
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>> {
+        let exact_attributes = index.exact_attributes(rtxn)?;
+        let attributes = match index.user_defined_searchable_fields(rtxn)? {
+            // Only keep the exact attributes that are also user-defined searchable fields.
+            Some(searchable_attributes) => exact_attributes
+                .into_iter()
+                .filter(|attr| searchable_attributes.contains(attr))
+                .collect(),
+            // No user-defined searchable fields: every exact attribute is kept.
+            None => exact_attributes,
+        };
+        Ok(Some(attributes))
+    }
+
+    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
+        Ok(Vec::new())
+    }
+
+    fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
+        let word_bytes = word.as_bytes();
+        Cow::Borrowed(word_bytes)
+    }
+}
+
+/// Extractor whose keys are made of the word, a zero byte and the big-endian
+/// bytes of the field id.
+pub struct WordFidDocidsExtractor;
+impl ProtoWordDocidsExtractor for WordFidDocidsExtractor {
+    fn attributes_to_extract<'a>(
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>> {
+        let searchable_fields = index.user_defined_searchable_fields(rtxn)?;
+        Ok(searchable_fields)
+    }
+
+    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
+        Ok(Vec::new())
+    }
+
+    fn build_key(field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
+        let word_bytes = word.as_bytes();
+        let fid_bytes = field_id.to_be_bytes();
+
+        let mut key = Vec::with_capacity(word_bytes.len() + 1 + fid_bytes.len());
+        key.extend_from_slice(word_bytes);
+        key.push(0);
+        key.extend_from_slice(&fid_bytes);
+        Cow::Owned(key)
+    }
+}
+
+/// Extractor whose keys are made of the word, a zero byte and the big-endian
+/// bytes of the bucketed position.
+pub struct WordPositionDocidsExtractor;
+impl ProtoWordDocidsExtractor for WordPositionDocidsExtractor {
+    fn attributes_to_extract<'a>(
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>> {
+        let searchable_fields = index.user_defined_searchable_fields(rtxn)?;
+        Ok(searchable_fields)
+    }
+
+    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
+        Ok(Vec::new())
+    }
+
+    fn build_key(_field_id: FieldId, position: u16, word: &str) -> Cow<[u8]> {
+        // position must be bucketed to reduce the number of keys in the DB.
+        let bucket_bytes = bucketed_position(position).to_be_bytes();
+        let word_bytes = word.as_bytes();
+
+        let mut key = Vec::with_capacity(word_bytes.len() + 1 + bucket_bytes.len());
+        key.extend_from_slice(word_bytes);
+        key.push(0);
+        key.extend_from_slice(&bucket_bytes);
+        Cow::Owned(key)
+    }
+}
+
+// V2
+
+/// Groups the sorters of all the word-based extractions so that they can be
+/// fed together, token by token.
+struct WordDocidsCachedSorters {
+    word_fid_docids: CboCachedSorter,
+    word_docids: CboCachedSorter,
+    exact_word_docids: CboCachedSorter,
+    word_position_docids: CboCachedSorter,
+    fid_word_count_docids: CboCachedSorter,
+    /// Per field id: (number of deleted tokens, number of added tokens) counted
+    /// for `current_docid`.
+    fid_word_count: HashMap<FieldId, (usize, usize)>,
+    /// Docid of the last token that was inserted, `None` before the first one.
+    current_docid: Option<DocumentId>,
+}
+
+impl WordDocidsCachedSorters {
+    /// Creates the sorters, all empty.
+    ///
+    /// `indexer`, `max_memory` and `capacity` are accepted for the callers but
+    /// are not used yet: `CboCachedSorter::new()` takes no parameter.
+    pub fn new(
+        indexer: GrenadParameters,
+        max_memory: Option<usize>,
+        capacity: NonZero<usize>,
+    ) -> Self {
+        // Explicitly ignored rather than computing values that are never read.
+        let _ = (indexer, max_memory, capacity);
+
+        Self {
+            word_fid_docids: CboCachedSorter::new(),
+            word_docids: CboCachedSorter::new(),
+            exact_word_docids: CboCachedSorter::new(),
+            word_position_docids: CboCachedSorter::new(),
+            fid_word_count_docids: CboCachedSorter::new(),
+            fid_word_count: HashMap::new(),
+            current_docid: None,
+        }
+    }
+
+    /// Records `docid` as an addition for `word` in the word, word-fid and
+    /// word-position sorters, and counts the word for the field `field_id`.
+    ///
+    /// `exact` selects `exact_word_docids` instead of `word_docids`.
+    /// `buffer` is a scratch buffer used to build the composite keys.
+    fn insert_add_u32(
+        &mut self,
+        field_id: FieldId,
+        position: u16,
+        word: &str,
+        exact: bool,
+        docid: u32,
+        buffer: &mut Vec<u8>,
+    ) -> Result<()> {
+        let key = word.as_bytes();
+        if exact {
+            self.exact_word_docids.insert_add_u32(key, docid);
+        } else {
+            self.word_docids.insert_add_u32(key, docid);
+        }
+
+        // word-fid key: word, zero byte, field id (same as `WordFidDocidsExtractor`).
+        buffer.clear();
+        buffer.extend_from_slice(word.as_bytes());
+        buffer.push(0);
+        buffer.extend_from_slice(&field_id.to_be_bytes());
+        self.word_fid_docids.insert_add_u32(buffer, docid);
+
+        // word-position key: word, zero byte, position (same as `WordPositionDocidsExtractor`).
+        // NOTE(review): `WordPositionDocidsExtractor::build_key` buckets the position with
+        // `bucketed_position` first; confirm whether callers pass a bucketed position here.
+        buffer.clear();
+        buffer.extend_from_slice(word.as_bytes());
+        buffer.push(0);
+        buffer.extend_from_slice(&position.to_be_bytes());
+        self.word_position_docids.insert_add_u32(buffer, docid);
+
+        // A token of another document: flush the counts of the previous one first.
+        if self.current_docid.map_or(false, |id| docid != id) {
+            self.flush_fid_word_count(buffer)?;
+        }
+
+        self.fid_word_count
+            .entry(field_id)
+            .and_modify(|(_current_count, new_count)| *new_count += 1)
+            .or_insert((0, 1));
+        self.current_docid = Some(docid);
+
+        Ok(())
+    }
+
+    fn insert_del_u32(
+        &mut self,
+        field_id: FieldId,
+        position: u16,
+        word: &str,
+        exact: bool,
+        docid: u32,
+        buffer: &mut Vec<u8>,
+    ) -> Result<()> {
+        let key = word.as_bytes();
+        if exact {
+            self.exact_word_docids.insert_del_u32(key, docid);
+        } else {
+            self.word_docids.insert_del_u32(key, docid);
+        }
+
+        buffer.clear();
+        buffer.extend_from_slice(word.as_bytes());
+        buffer.push(0);
+        buffer.extend_from_slice(&position.to_be_bytes());
+        self.word_fid_docids.insert_del_u32(buffer, docid);
+
+        buffer.clear();
+        buffer.extend_from_slice(word.as_bytes());
+        buffer.push(0);
+        buffer.extend_from_slice(&field_id.to_be_bytes());
+        self.word_position_docids.insert_del_u32(buffer, docid);
+
+        if self.current_docid.map_or(false, |id| docid != id) {
+            self.flush_fid_word_count(buffer)?;
+        }
+
+        self.fid_word_count
+            .entry(field_id)
+            .and_modify(|(current_count, _new_count)| *current_count += 1)
+            .or_insert((1, 0));
+        self.current_docid = Some(docid);
+
+        Ok(())
+    }
+
+    fn flush_fid_word_count(&mut self, buffer: &mut Vec<u8>) -> Result<()> {
+        for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
+            if current_count != new_count {
+                if current_count <= MAX_COUNTED_WORDS {
+                    buffer.clear();
+                    buffer.extend_from_slice(&fid.to_be_bytes());
+                    buffer.push(current_count as u8);
+                    self.fid_word_count_docids.insert_del_u32(buffer, self.current_docid.unwrap());
+                }
+                if new_count <= MAX_COUNTED_WORDS {
+                    buffer.clear();
+                    buffer.extend_from_slice(&fid.to_be_bytes());
+                    buffer.push(new_count as u8);
+                    self.fid_word_count_docids.insert_add_u32(buffer, self.current_docid.unwrap());
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Accumulates, database by database, the outputs of every
+/// `WordDocidsCachedSorters` before they are handed out as `WordDocidsMergers`.
+struct WordDocidsMergerBuilders {
+    // One merger per word-related database; field names mirror `WordDocidsCachedSorters`.
+    word_fid_docids: HashMapMerger,
+    word_docids: HashMapMerger,
+    exact_word_docids: HashMapMerger,
+    word_position_docids: HashMapMerger,
+    fid_word_count_docids: HashMapMerger,
+}
+
+/// Result of `WordDocidsExtractors::run_extraction`: one merger per
+/// word-related database, produced by `WordDocidsMergerBuilders::build`.
+pub struct WordDocidsMergers {
+    pub word_fid_docids: HashMapMerger,
+    pub word_docids: HashMapMerger,
+    pub exact_word_docids: HashMapMerger,
+    pub word_position_docids: HashMapMerger,
+    pub fid_word_count_docids: HashMapMerger,
+}
+
+impl WordDocidsMergerBuilders {
+    /// Creates a builder holding one empty merger per database.
+    fn new() -> Self {
+        let word_fid_docids = HashMapMerger::new();
+        let word_docids = HashMapMerger::new();
+        let exact_word_docids = HashMapMerger::new();
+        let word_position_docids = HashMapMerger::new();
+        let fid_word_count_docids = HashMapMerger::new();
+
+        Self {
+            word_fid_docids,
+            word_docids,
+            exact_word_docids,
+            word_position_docids,
+            fid_word_count_docids,
+        }
+    }
+
+    /// Converts the five caches of `other` with `into_sorter`, each conversion
+    /// running as its own rayon task, and feeds the results to the matching mergers.
+    fn add_sorters(&mut self, other: WordDocidsCachedSorters) -> Result<()> {
+        // The word-count bookkeeping of the sorters is not needed past extraction.
+        let WordDocidsCachedSorters {
+            word_fid_docids: fid_cache,
+            word_docids: word_cache,
+            exact_word_docids: exact_cache,
+            word_position_docids: position_cache,
+            fid_word_count_docids: count_cache,
+            ..
+        } = other;
+
+        // Placeholders, overwritten by the tasks spawned below.
+        let mut fid_readers = HashMap::new();
+        let mut word_readers = HashMap::new();
+        let mut exact_readers = HashMap::new();
+        let mut position_readers = HashMap::new();
+        let mut count_readers = HashMap::new();
+
+        // `rayon::scope` only returns once every spawned task has completed.
+        rayon::scope(|scope| {
+            scope.spawn(|_| fid_readers = fid_cache.into_sorter());
+            scope.spawn(|_| word_readers = word_cache.into_sorter());
+            scope.spawn(|_| exact_readers = exact_cache.into_sorter());
+            scope.spawn(|_| position_readers = position_cache.into_sorter());
+            scope.spawn(|_| count_readers = count_cache.into_sorter());
+        });
+
+        self.word_fid_docids.extend([fid_readers]);
+        self.word_docids.extend([word_readers]);
+        self.exact_word_docids.extend([exact_readers]);
+        self.word_position_docids.extend([position_readers]);
+        self.fid_word_count_docids.extend([count_readers]);
+
+        Ok(())
+    }
+
+    /// Hands the accumulated mergers over as the public `WordDocidsMergers`.
+    fn build(self) -> WordDocidsMergers {
+        let Self {
+            word_fid_docids,
+            word_docids,
+            exact_word_docids,
+            word_position_docids,
+            fid_word_count_docids,
+        } = self;
+
+        WordDocidsMergers {
+            word_fid_docids,
+            word_docids,
+            exact_word_docids,
+            word_position_docids,
+            fid_word_count_docids,
+        }
+    }
+}
+
+/// Namespace for the extraction that fills all the word-related docids
+/// databases in a single pass over the document changes.
+pub struct WordDocidsExtractors;
+
+impl WordDocidsExtractors {
+    /// Tokenizes every document change in parallel and returns one merger per
+    /// word-related database.
+    ///
+    /// The tokenizer is configured from the index settings (stop words, allowed
+    /// separators, dictionary, localized attributes rules). Each pooled context
+    /// owns its own read transaction, fields-ids map clone and
+    /// `WordDocidsCachedSorters`.
+    ///
+    /// Returns the first error produced by a document change, by opening a read
+    /// transaction, or by reading the settings.
+    pub fn run_extraction(
+        index: &Index,
+        fields_ids_map: &GlobalFieldsIdsMap,
+        indexer: GrenadParameters,
+        document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
+    ) -> Result<WordDocidsMergers> {
+        let max_memory = indexer.max_memory_by_thread();
+
+        // Build the tokenizer from the index settings.
+        let rtxn = index.read_txn()?;
+        let stop_words = index.stop_words(&rtxn)?;
+        let allowed_separators = index.allowed_separators(&rtxn)?;
+        let allowed_separators: Option<Vec<_>> =
+            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let dictionary = index.dictionary(&rtxn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let builder = tokenizer_builder(
+            stop_words.as_ref(),
+            allowed_separators.as_deref(),
+            dictionary.as_deref(),
+        );
+        let tokenizer = builder.into_tokenizer();
+
+        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
+        let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
+        let localized_attributes_rules =
+            index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
+
+        let document_tokenizer = DocumentTokenizer {
+            tokenizer: &tokenizer,
+            attribute_to_extract: attributes_to_extract.as_deref(),
+            attribute_to_skip: attributes_to_skip.as_slice(),
+            localized_attributes_rules: &localized_attributes_rules,
+            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
+        };
+
+        // Factory for the pooled contexts: a read transaction, the shared
+        // tokenizer, a fields-ids map clone and a fresh set of caches.
+        let context_pool = ItemsPool::new(|| {
+            Ok((
+                index.read_txn()?,
+                &document_tokenizer,
+                fields_ids_map.clone(),
+                WordDocidsCachedSorters::new(
+                    indexer,
+                    max_memory,
+                    // TODO use a better value
+                    200_000.try_into().unwrap(),
+                ),
+            ))
+        });
+
+        // Extraction phase: every document change is processed with a pooled context.
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
+            let _entered = span.enter();
+            document_changes.into_par_iter().try_for_each(|document_change| {
+                context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| {
+                    Self::extract_document_change(
+                        &*rtxn,
+                        index,
+                        document_tokenizer,
+                        fields_ids_map,
+                        cached_sorter,
+                        document_change?,
+                    )
+                })
+            })?;
+        }
+
+        // Merge phase: gather the caches of every pooled context.
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
+            let _entered = span.enter();
+            let mut builder = WordDocidsMergerBuilders::new();
+            for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() {
+                builder.add_sorters(cache)?;
+            }
+
+            Ok(builder.build())
+        }
+    }
+
+    /// Tokenizes one document change and records its words in `cached_sorter`.
+    ///
+    /// - `Deletion`: the words of the current version are recorded as deletions.
+    /// - `Update`: the words of the current version are recorded as deletions,
+    ///   then the words of the new version as additions.
+    /// - `Insertion`: the words of the new version are recorded as additions.
+    ///
+    /// The per-field word counts of the document are flushed before returning.
+    ///
+    /// Panics if the current version of a deleted or updated document is
+    /// missing (`inner.current(..)` returning `None`).
+    fn extract_document_change(
+        rtxn: &RoTxn,
+        index: &Index,
+        document_tokenizer: &DocumentTokenizer,
+        fields_ids_map: &mut GlobalFieldsIdsMap,
+        cached_sorter: &mut WordDocidsCachedSorters,
+        document_change: DocumentChange,
+    ) -> Result<()> {
+        // NOTE(review): the exact attributes are read from the index for every
+        // document change; presumably they could be fetched once per extraction.
+        let exact_attributes = index.exact_attributes(rtxn)?;
+        let is_exact_attribute =
+            |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
+        // Scratch buffer shared by all the key constructions of this document.
+        let mut buffer = Vec::new();
+        match document_change {
+            DocumentChange::Deletion(inner) => {
+                let mut token_fn = |fname: &str, fid, pos, word: &str| {
+                    cached_sorter
+                        .insert_del_u32(
+                            fid,
+                            pos,
+                            word,
+                            is_exact_attribute(fname),
+                            inner.docid(),
+                            &mut buffer,
+                        )
+                        .map_err(crate::Error::from)
+                };
+                document_tokenizer.tokenize_document(
+                    inner.current(rtxn, index)?.unwrap(),
+                    fields_ids_map,
+                    &mut token_fn,
+                )?;
+            }
+            DocumentChange::Update(inner) => {
+                // First pass: words of the version currently stored in the index.
+                let mut token_fn = |fname: &str, fid, pos, word: &str| {
+                    cached_sorter
+                        .insert_del_u32(
+                            fid,
+                            pos,
+                            word,
+                            is_exact_attribute(fname),
+                            inner.docid(),
+                            &mut buffer,
+                        )
+                        .map_err(crate::Error::from)
+                };
+                document_tokenizer.tokenize_document(
+                    inner.current(rtxn, index)?.unwrap(),
+                    fields_ids_map,
+                    &mut token_fn,
+                )?;
+
+                // Second pass: words of the new version.
+                let mut token_fn = |fname: &str, fid, pos, word: &str| {
+                    cached_sorter
+                        .insert_add_u32(
+                            fid,
+                            pos,
+                            word,
+                            is_exact_attribute(fname),
+                            inner.docid(),
+                            &mut buffer,
+                        )
+                        .map_err(crate::Error::from)
+                };
+                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
+            }
+            DocumentChange::Insertion(inner) => {
+                let mut token_fn = |fname: &str, fid, pos, word: &str| {
+                    cached_sorter
+                        .insert_add_u32(
+                            fid,
+                            pos,
+                            word,
+                            is_exact_attribute(fname),
+                            inner.docid(),
+                            &mut buffer,
+                        )
+                        .map_err(crate::Error::from)
+                };
+                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
+            }
+        }
+
+        // Write the word counts of this document before the context is reused.
+        cached_sorter.flush_fid_word_count(&mut buffer)
+    }
+
+    /// Returns the user-defined searchable fields of the index, if any.
+    fn attributes_to_extract<'a>(
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>> {
+        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
+    }
+
+    /// No attribute is skipped by this extractor.
+    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
+        Ok(vec![])
+    }
+}
--- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
@ -0,0 +1,160 @@
+use std::collections::VecDeque;
+use std::rc::Rc;
+
+use heed::RoTxn;
+use obkv::KvReader;
+
+use super::tokenize_document::DocumentTokenizer;
+use super::SearchableExtractor;
+use crate::proximity::{index_proximity, MAX_DISTANCE};
+use crate::update::new::extract::cache::CboCachedSorter;
+use crate::update::new::DocumentChange;
+use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
+
+/// Extractor producing the `proximity, word1, word2 -> docids` entries.
+pub struct WordPairProximityDocidsExtractor;
+impl SearchableExtractor for WordPairProximityDocidsExtractor {
+    /// Returns the user-defined searchable fields of the index, if any.
+    fn attributes_to_extract<'a>(
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>> {
+        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
+    }
+
+    /// No attribute is skipped by this extractor.
+    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
+        Ok(vec![])
+    }
+
+    /// Collects the word pairs of the old and/or new version of the document,
+    /// keeps a single proximity per pair (the smallest one, thanks to the sort
+    /// followed by the dedup) and records them as deletions and additions.
+    fn extract_document_change(
+        rtxn: &RoTxn,
+        index: &Index,
+        document_tokenizer: &DocumentTokenizer,
+        fields_ids_map: &mut GlobalFieldsIdsMap,
+        cached_sorter: &mut CboCachedSorter,
+        document_change: DocumentChange,
+    ) -> Result<()> {
+        let docid = document_change.docid();
+
+        // Sliding window of the last words seen, shared by both passes: it is
+        // always left empty by `process_document_tokens`.
+        let mut window: VecDeque<(Rc<str>, u16)> = VecDeque::with_capacity(MAX_DISTANCE as usize);
+        let mut removed_pairs = Vec::new();
+        let mut inserted_pairs = Vec::new();
+
+        match document_change {
+            DocumentChange::Deletion(inner) => {
+                let old_version = inner.current(rtxn, index)?.unwrap();
+                process_document_tokens(
+                    old_version,
+                    document_tokenizer,
+                    fields_ids_map,
+                    &mut window,
+                    &mut |pair, prox| removed_pairs.push((pair, prox)),
+                )?;
+            }
+            DocumentChange::Update(inner) => {
+                let old_version = inner.current(rtxn, index)?.unwrap();
+                process_document_tokens(
+                    old_version,
+                    document_tokenizer,
+                    fields_ids_map,
+                    &mut window,
+                    &mut |pair, prox| removed_pairs.push((pair, prox)),
+                )?;
+
+                let new_version = inner.new();
+                process_document_tokens(
+                    new_version,
+                    document_tokenizer,
+                    fields_ids_map,
+                    &mut window,
+                    &mut |pair, prox| inserted_pairs.push((pair, prox)),
+                )?;
+            }
+            DocumentChange::Insertion(inner) => {
+                let new_version = inner.new();
+                process_document_tokens(
+                    new_version,
+                    document_tokenizer,
+                    fields_ids_map,
+                    &mut window,
+                    &mut |pair, prox| inserted_pairs.push((pair, prox)),
+                )?;
+            }
+        }
+
+        let mut key_buffer = Vec::new();
+
+        // Sorting puts the smallest proximity of each pair first; the dedup keeps it.
+        removed_pairs.sort_unstable();
+        removed_pairs.dedup_by(|(lhs, _), (rhs, _)| lhs == rhs);
+        for ((w1, w2), prox) in &removed_pairs {
+            cached_sorter.insert_del_u32(build_key(*prox, w1, w2, &mut key_buffer), docid);
+        }
+
+        inserted_pairs.sort_unstable();
+        inserted_pairs.dedup_by(|(lhs, _), (rhs, _)| lhs == rhs);
+        for ((w1, w2), prox) in &inserted_pairs {
+            cached_sorter.insert_add_u32(build_key(*prox, w1, w2, &mut key_buffer), docid);
+        }
+
+        Ok(())
+    }
+}
+
+/// Serializes a word pair key into `key_buffer` as `prox`, `w1`, a `0` byte, `w2`
+/// and returns the written bytes. Any previous content of the buffer is discarded.
+fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec<u8>) -> &'a [u8] {
+    key_buffer.clear();
+    key_buffer.extend(
+        std::iter::once(prox).chain(w1.bytes()).chain(std::iter::once(0)).chain(w2.bytes()),
+    );
+    &key_buffer[..]
+}
+
+/// Pops the head of the window and reports its pairing with every word still
+/// in the window whose proximity is strictly between `0` and `MAX_DISTANCE`.
+///
+/// Panics if `word_positions` is empty.
+fn word_positions_into_word_pair_proximity(
+    word_positions: &mut VecDeque<(Rc<str>, u16)>,
+    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
+) -> Result<()> {
+    let (head_word, head_position) = word_positions.pop_front().unwrap();
+    let head_position = head_position as u32;
+
+    word_positions
+        .iter()
+        .map(|(word, position)| (word, index_proximity(head_position, *position as u32) as u8))
+        .filter(|(_, prox)| (1..MAX_DISTANCE as u8).contains(prox))
+        .for_each(|(word, prox)| word_pair_proximity((head_word.clone(), word.clone()), prox));
+
+    Ok(())
+}
+
+/// Tokenizes `document` and reports every pair of close words to
+/// `word_pair_proximity`.
+///
+/// `word_positions` is the sliding window of the words seen so far in the
+/// current field. It is emptied whenever the field changes, so that words
+/// belonging to different fields are never paired, and it is always left empty
+/// when the function returns successfully.
+fn process_document_tokens(
+    document: &KvReader<FieldId>,
+    document_tokenizer: &DocumentTokenizer,
+    fields_ids_map: &mut GlobalFieldsIdsMap,
+    word_positions: &mut VecDeque<(Rc<str>, u16)>,
+    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
+) -> Result<()> {
+    // Field the words currently in the window belong to.
+    let mut current_field_id: Option<FieldId> = None;
+    let mut token_fn = |_fname: &str, fid: FieldId, pos: u16, word: &str| {
+        // positions restart from 0 in every field: flush the window when the field
+        // changes, otherwise words of two different fields would be considered close.
+        if current_field_id != Some(fid) {
+            while !word_positions.is_empty() {
+                word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
+            }
+            current_field_id = Some(fid);
+        }
+
+        // drain the proximity window until the head word is considered close to the word we are inserting.
+        while word_positions
+            .front()
+            .map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
+        {
+            word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
+        }
+
+        // insert the new word.
+        word_positions.push_back((Rc::from(word), pos));
+        Ok(())
+    };
+    document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
+
+    // flush the words remaining in the window.
+    while !word_positions.is_empty() {
+        word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
+    }
+
+    Ok(())
+}
--- a/milli/src/update/new/extract/searchable/mod.rs
+++ b/milli/src/update/new/extract/searchable/mod.rs
@ -0,0 +1,126 @@
+mod extract_fid_word_count_docids;
+mod extract_word_docids;
+mod extract_word_pair_proximity_docids;
+mod tokenize_document;
+
+use std::fs::File;
+
+pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers};
+pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
+use grenad::Merger;
+use heed::RoTxn;
+use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
+use tokenize_document::{tokenizer_builder, DocumentTokenizer};
+
+use super::cache::CboCachedSorter;
+use super::{DocidsExtractor, HashMapMerger};
+use crate::update::new::{DocumentChange, ItemsPool};
+use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
+use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
+
+/// Common driver for the extractors that tokenize the searchable fields of
+/// the documents and fill a single `CboCachedSorter`.
+///
+/// Implementors provide `extract_document_change` and the attribute
+/// selection; `run_extraction` takes care of the tokenizer setup, the
+/// parallel iteration and the final merge.
+pub trait SearchableExtractor {
+    /// Tokenizes every document change in parallel and returns the merger
+    /// gathering the caches of all the pooled contexts.
+    ///
+    /// Returns the first error produced by a document change, by opening a read
+    /// transaction, or by reading the settings.
+    fn run_extraction(
+        index: &Index,
+        fields_ids_map: &GlobalFieldsIdsMap,
+        indexer: GrenadParameters,
+        document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
+    ) -> Result<HashMapMerger> {
+        // NOTE(review): computed but not used anywhere in this default implementation.
+        let max_memory = indexer.max_memory_by_thread();
+
+        // Build the tokenizer from the index settings.
+        let rtxn = index.read_txn()?;
+        let stop_words = index.stop_words(&rtxn)?;
+        let allowed_separators = index.allowed_separators(&rtxn)?;
+        let allowed_separators: Option<Vec<_>> =
+            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let dictionary = index.dictionary(&rtxn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let builder = tokenizer_builder(
+            stop_words.as_ref(),
+            allowed_separators.as_deref(),
+            dictionary.as_deref(),
+        );
+        let tokenizer = builder.into_tokenizer();
+
+        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
+        let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
+        let localized_attributes_rules =
+            index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
+
+        let document_tokenizer = DocumentTokenizer {
+            tokenizer: &tokenizer,
+            attribute_to_extract: attributes_to_extract.as_deref(),
+            attribute_to_skip: attributes_to_skip.as_slice(),
+            localized_attributes_rules: &localized_attributes_rules,
+            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
+        };
+
+        // Factory for the pooled contexts: a read transaction, the shared
+        // tokenizer, a fields-ids map clone and a fresh cache.
+        let context_pool = ItemsPool::new(|| {
+            Ok((
+                index.read_txn()?,
+                &document_tokenizer,
+                fields_ids_map.clone(),
+                CboCachedSorter::new(),
+            ))
+        });
+
+        // Extraction phase: every document change is processed with a pooled context.
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
+            let _entered = span.enter();
+            document_changes.into_par_iter().try_for_each(|document_change| {
+                context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| {
+                    Self::extract_document_change(
+                        &*rtxn,
+                        index,
+                        document_tokenizer,
+                        fields_ids_map,
+                        cached_sorter,
+                        document_change?,
+                    )
+                })
+            })?;
+        }
+        // Merge phase: convert the cache of every pooled context in parallel.
+        {
+            let mut builder = HashMapMerger::new();
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
+            let _entered = span.enter();
+
+            let readers: Vec<_> = context_pool
+                .into_items()
+                .par_bridge()
+                .map(|(_rtxn, _tokenizer, _fields_ids_map, cached_sorter)| {
+                    cached_sorter.into_sorter()
+                })
+                .collect();
+            builder.extend(readers);
+            Ok(builder)
+        }
+    }
+
+    /// Tokenizes one document change and records the resulting entries in
+    /// `cached_sorter`.
+    fn extract_document_change(
+        rtxn: &RoTxn,
+        index: &Index,
+        document_tokenizer: &DocumentTokenizer,
+        fields_ids_map: &mut GlobalFieldsIdsMap,
+        cached_sorter: &mut CboCachedSorter,
+        document_change: DocumentChange,
+    ) -> Result<()>;
+
+    /// Attributes to tokenize; handed to the tokenizer as `attribute_to_extract`.
+    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
+        -> Result<Option<Vec<&'a str>>>;
+
+    /// Attributes to ignore; handed to the tokenizer as `attribute_to_skip`.
+    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
+}
+
+/// Every `SearchableExtractor` is a `DocidsExtractor`: the extraction is
+/// delegated to the default `SearchableExtractor::run_extraction` driver.
+impl<T: SearchableExtractor> DocidsExtractor for T {
+    fn run_extraction(
+        index: &Index,
+        fields_ids_map: &GlobalFieldsIdsMap,
+        indexer: GrenadParameters,
+        document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
+    ) -> Result<HashMapMerger> {
+        // Both traits expose a `run_extraction` associated function: name the
+        // trait explicitly so the call can neither be ambiguous nor recurse
+        // into this very function.
+        <Self as SearchableExtractor>::run_extraction(
+            index,
+            fields_ids_map,
+            indexer,
+            document_changes,
+        )
+    }
+}
--- a/milli/src/update/new/extract/searchable/tokenize_document.rs
+++ b/milli/src/update/new/extract/searchable/tokenize_document.rs
@ -0,0 +1,273 @@
+use std::collections::HashMap;
+
+use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
+use serde_json::Value;
+
+use crate::proximity::MAX_DISTANCE;
+use crate::update::new::extract::perm_json_p::{
+    seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
+};
+use crate::update::new::KvReaderFieldId;
+use crate::{
+    FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
+    MAX_WORD_LENGTH,
+};
+
+/// Tokenizes the selected fields of a document and reports every word
+/// together with its field and position.
+pub struct DocumentTokenizer<'a> {
+    /// Tokenizer applied to the string values.
+    pub tokenizer: &'a Tokenizer<'a>,
+    /// Passed to `select_field` and the `seek_leaf_values_*` helpers to select fields.
+    pub attribute_to_extract: Option<&'a [&'a str]>,
+    /// Passed to `select_field` and the `seek_leaf_values_*` helpers to exclude fields.
+    pub attribute_to_skip: &'a [&'a str],
+    /// Rules used to pick the locales handed to the tokenizer for a field.
+    pub localized_attributes_rules: &'a [LocalizedAttributesRule],
+    /// Words located at or after this position in a field are ignored.
+    pub max_positions_per_attributes: u32,
+}
+
+impl<'a> DocumentTokenizer<'a> {
+    /// Walks the fields of `obkv` and calls `token_fn(field name, field id,
+    /// position, word)` for every word of every selected field.
+    ///
+    /// Nested objects and arrays are flattened: each leaf is tokenized under its
+    /// own name, whose id is inserted in `field_id_map` when missing.
+    ///
+    /// Returns `UserError::AttributeLimitReached` when a new field id cannot be
+    /// allocated, `InternalError::SerdeJson` when a field is not valid JSON, and
+    /// forwards any error returned by `token_fn`.
+    ///
+    /// Panics if a field id of `obkv` is unknown to `field_id_map`.
+    pub fn tokenize_document(
+        &self,
+        obkv: &KvReaderFieldId,
+        field_id_map: &mut GlobalFieldsIdsMap,
+        token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
+    ) -> Result<()> {
+        // Last position used in each (possibly nested) field of this document.
+        let mut field_position = HashMap::new();
+        // Owned copy of the current top-level field name, reused across iterations.
+        let mut field_name = String::new();
+        for (field_id, field_bytes) in obkv {
+            let Some(field_name) = field_id_map.name(field_id).map(|s| {
+                field_name.clear();
+                field_name.push_str(s);
+                &field_name
+            }) else {
+                unreachable!("field id not found in field id map");
+            };
+
+            // Called for every leaf value found in the field.
+            let mut tokenize_field = |name: &str, value: &Value| {
+                let Some(field_id) = field_id_map.id_or_insert(name) else {
+                    return Err(UserError::AttributeLimitReached.into());
+                };
+
+                // Values sharing the same field id (array items for instance) are
+                // separated by MAX_DISTANCE positions.
+                let position = field_position
+                    .entry(field_id)
+                    .and_modify(|counter| *counter += MAX_DISTANCE)
+                    .or_insert(0);
+                if *position as u32 >= self.max_positions_per_attributes {
+                    return Ok(());
+                }
+
+                match value {
+                    Value::Number(n) => {
+                        let token = n.to_string();
+                        // positions that do not fit in the callback's position type are dropped.
+                        if let Ok(position) = (*position).try_into() {
+                            token_fn(name, field_id, position, token.as_str())?;
+                        }
+
+                        Ok(())
+                    }
+                    Value::String(text) => {
+                        // create an iterator of token with their positions.
+                        // NOTE(review): the rules are matched against the top-level
+                        // field name, not the nested `name` — confirm this is intended.
+                        let locales = self
+                            .localized_attributes_rules
+                            .iter()
+                            .find(|rule| rule.match_str(field_name))
+                            .map(|rule| rule.locales());
+                        let tokens = process_tokens(
+                            *position,
+                            self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
+                        )
+                        .take_while(|(p, _)| (*p as u32) < self.max_positions_per_attributes);
+
+                        for (index, token) in tokens {
+                            // keep a word only if it is not empty and fit in a LMDB key.
+                            let token = token.lemma().trim();
+                            if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                                // remember the last position used in this field.
+                                *position = index;
+                                if let Ok(position) = (*position).try_into() {
+                                    token_fn(name, field_id, position, token)?;
+                                }
+                            }
+                        }
+
+                        Ok(())
+                    }
+                    // other kinds of leaf values are not tokenized.
+                    _ => Ok(()),
+                }
+            };
+
+            // if the current field is searchable or contains a searchable attribute
+            if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) {
+                // parse json.
+                match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
+                    Value::Object(object) => seek_leaf_values_in_object(
+                        &object,
+                        self.attribute_to_extract,
+                        self.attribute_to_skip,
+                        field_name,
+                        &mut tokenize_field,
+                    )?,
+                    Value::Array(array) => seek_leaf_values_in_array(
+                        &array,
+                        self.attribute_to_extract,
+                        self.attribute_to_skip,
+                        field_name,
+                        &mut tokenize_field,
+                    )?,
+                    value => tokenize_field(field_name, &value)?,
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Takes an iterator over tokens and computes the position of each word,
+/// starting from `start_offset`, depending on the kind of the separators met.
+///
+/// After a `Hard` separator the next word is placed `MAX_DISTANCE` positions
+/// further, otherwise the standard distance of 1 between words is kept.
+/// Leading separators are skipped and only the tokens for which `is_word()`
+/// holds are yielded.
+fn process_tokens<'a>(
+    start_offset: u32,
+    tokens: impl Iterator<Item = Token<'a>>,
+) -> impl Iterator<Item = (u32, Token<'a>)> {
+    tokens
+        .skip_while(|token| token.is_separator())
+        // state: (position of the last word, kind of the last significant token)
+        .scan((start_offset, None), |(offset, prev_kind), mut token| {
+            match token.kind {
+                TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
+                    *offset += match *prev_kind {
+                        Some(TokenKind::Separator(SeparatorKind::Hard)) => MAX_DISTANCE,
+                        Some(_) => 1,
+                        // first word: it sits exactly at `start_offset`.
+                        None => 0,
+                    };
+                    *prev_kind = Some(token.kind)
+                }
+                TokenKind::Separator(SeparatorKind::Hard) => {
+                    *prev_kind = Some(token.kind);
+                }
+                // a soft separator never overrides a previously seen hard separator.
+                TokenKind::Separator(SeparatorKind::Soft)
+                    if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
+                {
+                    *prev_kind = Some(token.kind);
+                }
+                // anything else is marked as unknown.
+                _ => token.kind = TokenKind::Unknown,
+            }
+            Some((*offset, token))
+        })
+        .filter(|(_, t)| t.is_word())
+}
+
+/// Creates a `TokenizerBuilder` and applies the settings that are provided:
+/// stop words first, then the dictionary, then the allowed separators.
+/// A `None` setting leaves the builder untouched for that setting.
+pub fn tokenizer_builder<'a>(
+    stop_words: Option<&'a fst::Set<&'a [u8]>>,
+    allowed_separators: Option<&'a [&str]>,
+    dictionary: Option<&'a [&str]>,
+) -> TokenizerBuilder<'a, &'a [u8]> {
+    let mut builder = TokenizerBuilder::new();
+
+    match stop_words {
+        Some(words) => {
+            builder.stop_words(words);
+        }
+        None => (),
+    }
+    match dictionary {
+        Some(words) => {
+            builder.words_dict(words);
+        }
+        None => (),
+    }
+    match allowed_separators {
+        Some(separators) => {
+            builder.separators(separators);
+        }
+        None => (),
+    }
+
+    builder
+}
+
+#[cfg(test)]
+mod test {
+    use charabia::TokenizerBuilder;
+    use meili_snap::snapshot;
+    use obkv::KvReader;
+    use serde_json::json;
+
+    use super::*;
+    use crate::FieldsIdsMap;
+
+    /// Tokenizes a small document mixing nested objects, arrays and skipped
+    /// attributes, and snapshots the `[field id, position] -> word` map.
+    #[test]
+    fn test_tokenize_document() {
+        let mut fields_ids_map = FieldsIdsMap::new();
+
+        let field_1 = json!({
+                "name": "doggo",
+                "age": 10,
+        });
+
+        let field_2 = json!({
+                "catto": {
+                    "name": "pesti",
+                    "age": 23,
+                }
+        });
+
+        let field_3 = json!(["doggo", "catto"]);
+        let field_4 = json!("UNSEARCHABLE");
+        let field_5 = json!({"nope": "unsearchable"});
+
+        let mut obkv = obkv::KvWriter::memory();
+        let field_1_id = fields_ids_map.insert("doggo").unwrap();
+        let field_1 = serde_json::to_string(&field_1).unwrap();
+        obkv.insert(field_1_id, field_1.as_bytes()).unwrap();
+        let field_2_id = fields_ids_map.insert("catto").unwrap();
+        let field_2 = serde_json::to_string(&field_2).unwrap();
+        obkv.insert(field_2_id, field_2.as_bytes()).unwrap();
+        let field_3_id = fields_ids_map.insert("doggo.name").unwrap();
+        let field_3 = serde_json::to_string(&field_3).unwrap();
+        obkv.insert(field_3_id, field_3.as_bytes()).unwrap();
+        let field_4_id = fields_ids_map.insert("not-me").unwrap();
+        let field_4 = serde_json::to_string(&field_4).unwrap();
+        obkv.insert(field_4_id, field_4.as_bytes()).unwrap();
+        let field_5_id = fields_ids_map.insert("me-nether").unwrap();
+        let field_5 = serde_json::to_string(&field_5).unwrap();
+        obkv.insert(field_5_id, field_5.as_bytes()).unwrap();
+        let value = obkv.into_inner().unwrap();
+        let obkv = KvReader::from_slice(value.as_slice());
+
+        let mut tb = TokenizerBuilder::default();
+        let document_tokenizer = DocumentTokenizer {
+            tokenizer: &tb.build(),
+            attribute_to_extract: None,
+            attribute_to_skip: &["not-me", "me-nether.nope"],
+            localized_attributes_rules: &[],
+            max_positions_per_attributes: 1000,
+        };
+
+        let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map);
+        let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
+
+        let mut words = std::collections::BTreeMap::new();
+        document_tokenizer
+            .tokenize_document(obkv, &mut global_fields_ids_map, &mut |_fname, fid, pos, word| {
+                words.insert([fid, pos], word.to_string());
+                Ok(())
+            })
+            .unwrap();
+
+        // The keys are `[field id, position]` integers: the second "doggo" sits
+        // MAX_DISTANCE (8) positions after the first one, and the debug output
+        // can only contain the number, never the constant name.
+        // NOTE(review): the field ids 3, 4 and 5 below presumably collide with the
+        // ids already given to "not-me" and "me-nether" — regenerate the snapshot
+        // (`cargo insta review`) to confirm the remaining entries.
+        snapshot!(format!("{:#?}", words), @r###"
+        {
+            [
+                2,
+                0,
+            ]: "doggo",
+            [
+                2,
+                8,
+            ]: "doggo",
+            [
+                2,
+                16,
+            ]: "catto",
+            [
+                3,
+                0,
+            ]: "10",
+            [
+                4,
+                0,
+            ]: "pesti",
+            [
+                5,
+                0,
+            ]: "23",
+        }
+        "###);
+    }
+}
--- a/milli/src/update/new/indexer/document_deletion.rs
+++ b/milli/src/update/new/indexer/document_deletion.rs
@ -0,0 +1,42 @@
+use std::sync::Arc;
+
+use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
+use roaring::RoaringBitmap;
+
+use super::DocumentChanges;
+use crate::update::new::{Deletion, DocumentChange, ItemsPool};
+use crate::{FieldsIdsMap, Index, Result};
+
+/// Source of document changes that deletes a set of documents
+/// designated by their document ids.
+pub struct DocumentDeletion {
+    /// Ids of the documents to delete; each one is looked up in the index
+    /// when the changes are generated.
+    pub to_delete: RoaringBitmap,
+}
+
+impl DocumentDeletion {
+    /// Creates a deletion operation with an empty set of documents to delete.
+    pub fn new() -> Self {
+        Self { to_delete: Default::default() }
+    }
+
+    /// Adds the given document ids to the set of documents to delete.
+    ///
+    /// Calling this method multiple times accumulates the ids (set union).
+    pub fn delete_documents_by_docids(&mut self, docids: RoaringBitmap) {
+        self.to_delete |= docids;
+    }
+}
+
+impl Default for DocumentDeletion {
+    /// Equivalent to [`DocumentDeletion::new`]: nothing to delete.
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl<'p> DocumentChanges<'p> for DocumentDeletion {
+    type Parameter = &'p Index;
+
+    /// Produces one [`DocumentChange::Deletion`] per document id to delete.
+    ///
+    /// The current version of every document is read from the index through
+    /// a pool of read transactions shared between the parallel tasks.
+    /// The fields ids map is not used by this implementation.
+    fn document_changes(
+        self,
+        _fields_ids_map: &mut FieldsIdsMap,
+        param: Self::Parameter,
+    ) -> Result<impl IndexedParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
+        let index = param;
+        // The bitmap is collected into a Vec to get an indexed parallel iterator.
+        let docids: Vec<_> = self.to_delete.into_iter().collect();
+        let rtxn_pool = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from)));
+
+        let changes = docids.into_par_iter().map_with(rtxn_pool, |pool, docid| {
+            pool.with(|rtxn| {
+                let current = index.document(rtxn, docid)?;
+                let deletion = Deletion::create(docid, current.boxed());
+                Ok(DocumentChange::Deletion(deletion))
+            })
+        });
+
+        Ok(changes)
+    }
+}
--- a/milli/src/update/new/indexer/document_operation.rs
+++ b/milli/src/update/new/indexer/document_operation.rs
@ -0,0 +1,392 @@
+use std::borrow::Cow;
+use std::collections::{BTreeMap, HashMap};
+use std::sync::Arc;
+
+use heed::types::Bytes;
+use heed::RoTxn;
+use memmap2::Mmap;
+use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
+use IndexDocumentsMethod as Idm;
+
+use super::super::document_change::DocumentChange;
+use super::super::items_pool::ItemsPool;
+use super::super::{CowStr, TopLevelMap};
+use super::DocumentChanges;
+use crate::documents::{DocumentIdExtractionError, PrimaryKey};
+use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update};
+use crate::update::{AvailableIds, IndexDocumentsMethod};
+use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError};
+
+/// Accumulates the payloads (document additions and deletions) to apply
+/// to an index, in the order they were registered.
+pub struct DocumentOperation<'pl> {
+    /// The registered payloads, oldest first.
+    operations: Vec<Payload<'pl>>,
+    /// Selects the merge strategy (replacement or update) applied when
+    /// the document changes are generated.
+    index_documents_method: IndexDocumentsMethod,
+}
+
+/// A single operation registered on a [`DocumentOperation`].
+pub enum Payload<'pl> {
+    /// Raw bytes of a payload; they are later read as a stream of JSON objects.
+    Addition(&'pl [u8]),
+    /// External ids of the documents to delete.
+    Deletion(Vec<String>),
+}
+
+/// Statistics returned when a payload is registered with `add_documents`.
+pub struct PayloadStats {
+    /// Number of documents in the payload, derived from the number of `}{`
+    /// byte sequences found in it (plus one).
+    pub document_count: usize,
+    /// Size of the payload in bytes.
+    pub bytes: u64,
+}
+
+/// An operation recorded for one specific document while the payloads are scanned.
+#[derive(Clone)]
+enum InnerDocOp<'pl> {
+    /// A version of the document found in an addition payload.
+    Addition(DocumentOffset<'pl>),
+    /// The document was requested to be deleted.
+    Deletion,
+}
+
+/// Represents the location of one version of a document
+/// inside an addition payload.
+#[derive(Clone)]
+pub struct DocumentOffset<'pl> {
+    /// The bytes of the payload that hold this document;
+    /// they are deserialized as a JSON object when the changes are merged.
+    pub content: &'pl [u8],
+}
+
+impl<'pl> DocumentOperation<'pl> {
+    /// Creates an empty list of operations that will be
+    /// merged according to the given indexing `method`.
+    pub fn new(method: IndexDocumentsMethod) -> Self {
+        Self { operations: Vec::new(), index_documents_method: method }
+    }
+
+    /// Registers a payload of documents to add and returns its statistics.
+    ///
+    /// The payload is later read as a stream of JSON objects. The returned
+    /// document count is the number of `}{` sequences in the payload plus one.
+    ///
+    /// TODO please give me a type
+    pub fn add_documents(&mut self, payload: &'pl Mmap) -> Result<PayloadStats> {
+        payload.advise(memmap2::Advice::Sequential)?;
+
+        let bytes: &'pl [u8] = &payload[..];
+        let boundaries = memchr::memmem::find_iter(bytes, "}{").count();
+        let stats = PayloadStats {
+            document_count: boundaries.saturating_add(1),
+            bytes: bytes.len() as u64,
+        };
+
+        self.operations.push(Payload::Addition(bytes));
+        Ok(stats)
+    }
+
+    /// Registers the deletion of the documents designated by their external ids.
+    pub fn delete_documents(&mut self, to_delete: Vec<String>) {
+        let payload = Payload::Deletion(to_delete);
+        self.operations.push(payload);
+    }
+}
+
+impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> {
+    type Parameter = (&'p Index, &'p RoTxn<'p>, &'p PrimaryKey<'p>);
+
+    /// Scans the registered payloads sequentially, groups the operations by
+    /// external document id and returns a parallel iterator that merges them
+    /// into one [`DocumentChange`] per document.
+    ///
+    /// Every field name found in the addition payloads is inserted into
+    /// `fields_ids_map`; a clone of the map is then given to the parallel iterator.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when a payload is not a valid stream of JSON objects,
+    /// when a document id cannot be extracted from a document, when the database
+    /// cannot be read or when there is no more available document id.
+    fn document_changes(
+        self,
+        fields_ids_map: &mut FieldsIdsMap,
+        param: Self::Parameter,
+    ) -> Result<impl IndexedParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
+        let (index, rtxn, primary_key) = param;
+
+        let documents_ids = index.documents_ids(rtxn)?;
+        let mut available_docids = AvailableIds::new(&documents_ids);
+        // Maps an external document id to its internal id and its list of operations.
+        let mut docids_version_offsets = HashMap::<CowStr<'pl>, _>::new();
+
+        for operation in self.operations {
+            match operation {
+                Payload::Addition(payload) => {
+                    let mut iter =
+                        serde_json::Deserializer::from_slice(payload).into_iter::<TopLevelMap>();
+
+                    let mut previous_offset = 0;
+                    // A malformed payload is reported to the caller instead of panicking.
+                    // NOTE(review): it is reported as an internal serde_json error,
+                    // a dedicated user error may be more appropriate.
+                    while let Some(document) =
+                        iter.next().transpose().map_err(crate::InternalError::SerdeJson)?
+                    {
+                        // TODO Fetch all document fields to fill the fields ids map
+                        document.0.keys().for_each(|key| {
+                            fields_ids_map.insert(key.as_ref());
+                        });
+
+                        // TODO we must manage the TooManyDocumentIds,InvalidDocumentId
+                        //      we must manage the unwrap
+                        let external_document_id =
+                            match primary_key.document_id_from_top_level_map(&document)? {
+                                Ok(document_id) => Ok(document_id),
+                                Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e),
+                                Err(DocumentIdExtractionError::MissingDocumentId) => {
+                                    Err(UserError::MissingDocumentId {
+                                        primary_key: primary_key.name().to_string(),
+                                        document: document.try_into().unwrap(),
+                                    })
+                                }
+                                Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
+                                    Err(UserError::TooManyDocumentIds {
+                                        primary_key: primary_key.name().to_string(),
+                                        document: document.try_into().unwrap(),
+                                    })
+                                }
+                            }?;
+
+                        // The bytes consumed by the deserializer since the previous
+                        // document are the ones that hold the current document.
+                        let current_offset = iter.byte_offset();
+                        let document_operation = InnerDocOp::Addition(DocumentOffset {
+                            content: &payload[previous_offset..current_offset],
+                        });
+
+                        match docids_version_offsets.get_mut(external_document_id.as_ref()) {
+                            None => {
+                                // First time we see this document in the payloads: reuse
+                                // its internal id if it is already indexed or pick a new one.
+                                let docid = match index
+                                    .external_documents_ids()
+                                    .get(rtxn, &external_document_id)?
+                                {
+                                    Some(docid) => docid,
+                                    None => available_docids
+                                        .next()
+                                        .ok_or(Error::UserError(UserError::DocumentLimitReached))?,
+                                };
+
+                                docids_version_offsets.insert(
+                                    external_document_id,
+                                    (docid, vec![document_operation]),
+                                );
+                            }
+                            Some((_, offsets)) => {
+                                let useless_previous_addition = match self.index_documents_method {
+                                    IndexDocumentsMethod::ReplaceDocuments => {
+                                        MergeDocumentForReplacement::USELESS_PREVIOUS_CHANGES
+                                    }
+                                    IndexDocumentsMethod::UpdateDocuments => {
+                                        MergeDocumentForUpdates::USELESS_PREVIOUS_CHANGES
+                                    }
+                                };
+
+                                if useless_previous_addition {
+                                    offsets.clear();
+                                }
+
+                                offsets.push(document_operation);
+                            }
+                        }
+
+                        previous_offset = current_offset;
+                    }
+                }
+                Payload::Deletion(to_delete) => {
+                    for external_document_id in to_delete {
+                        match docids_version_offsets.get_mut(external_document_id.as_str()) {
+                            None => {
+                                let docid = match index
+                                    .external_documents_ids()
+                                    .get(rtxn, &external_document_id)?
+                                {
+                                    Some(docid) => docid,
+                                    None => available_docids
+                                        .next()
+                                        .ok_or(Error::UserError(UserError::DocumentLimitReached))?,
+                                };
+
+                                docids_version_offsets.insert(
+                                    CowStr(external_document_id.into()),
+                                    (docid, vec![InnerDocOp::Deletion]),
+                                );
+                            }
+                            Some((_, offsets)) => {
+                                // A deletion makes every previous operation useless.
+                                offsets.clear();
+                                offsets.push(InnerDocOp::Deletion);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        // TODO is it the best way to provide FieldsIdsMap to the parallel iterator?
+        let fields_ids_map = fields_ids_map.clone();
+        // TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
+        let mut docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect();
+        // Reorder the offsets to make sure we iterate on the file sequentially
+        let sort_function_key = match self.index_documents_method {
+            Idm::ReplaceDocuments => MergeDocumentForReplacement::sort_key,
+            Idm::UpdateDocuments => MergeDocumentForUpdates::sort_key,
+        };
+
+        // And finally sort them
+        docids_version_offsets.sort_unstable_by_key(|(_, (_, docops))| sort_function_key(docops));
+
+        Ok(docids_version_offsets.into_par_iter().map_with(
+            Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))),
+            move |context_pool, (external_docid, (internal_docid, operations))| {
+                context_pool.with(|rtxn| {
+                    let document_merge_function = match self.index_documents_method {
+                        Idm::ReplaceDocuments => MergeDocumentForReplacement::merge,
+                        Idm::UpdateDocuments => MergeDocumentForUpdates::merge,
+                    };
+
+                    document_merge_function(
+                        rtxn,
+                        index,
+                        &fields_ids_map,
+                        internal_docid,
+                        external_docid.to_string(), // TODO do not clone
+                        &operations,
+                    )
+                })
+            },
+        ))
+    }
+}
+
+/// Strategy used to turn the list of operations recorded
+/// for one document into a single [`DocumentChange`].
+trait MergeChanges {
+    /// Whether the payloads in the list of operations are useless or not.
+    ///
+    /// When `true`, the operations already recorded for a document are
+    /// discarded as soon as a new addition for this document is found.
+    const USELESS_PREVIOUS_CHANGES: bool;
+
+    /// Returns a key that is used to order the payloads the right way.
+    fn sort_key(docops: &[InnerDocOp]) -> usize;
+
+    /// Merges the `operations` recorded for the document `docid` with the
+    /// version stored in the `index`, if any, and returns the resulting change.
+    fn merge(
+        rtxn: &RoTxn,
+        index: &Index,
+        fields_ids_map: &FieldsIdsMap,
+        docid: DocumentId,
+        external_docid: String,
+        operations: &[InnerDocOp],
+    ) -> Result<DocumentChange>;
+}
+
+/// Merge strategy where the last version of a document replaces all the others.
+struct MergeDocumentForReplacement;
+
+impl MergeChanges for MergeDocumentForReplacement {
+    const USELESS_PREVIOUS_CHANGES: bool = true;
+
+    /// Uses the address of the last addition as the key so that only
+    /// the last change of every document is read, in payload order.
+    fn sort_key(docops: &[InnerDocOp]) -> usize {
+        docops
+            .iter()
+            .rev()
+            .find_map(|docop| match docop {
+                InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize),
+                InnerDocOp::Deletion => None,
+            })
+            .unwrap_or(0)
+    }
+
+    /// Builds the change that corresponds to the most recent operation on a document.
+    ///
+    /// This function is only meant to be used when doing a replacement and not an update.
+    fn merge(
+        rtxn: &RoTxn,
+        index: &Index,
+        fields_ids_map: &FieldsIdsMap,
+        docid: DocumentId,
+        external_docid: String,
+        operations: &[InnerDocOp],
+    ) -> Result<DocumentChange> {
+        // The version of the document currently stored in the database, if any.
+        let stored = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
+        let current: Option<&KvReaderFieldId> = stored.map(Into::into);
+
+        match operations.last() {
+            None => unreachable!("We must not have empty set of operations on a document"),
+            Some(InnerDocOp::Deletion) => match current {
+                Some(current) => {
+                    let deletion = Deletion::create(docid, current.boxed());
+                    Ok(DocumentChange::Deletion(deletion))
+                }
+                None => todo!("Do that with Louis"),
+            },
+            Some(InnerDocOp::Addition(DocumentOffset { content })) => {
+                let map: TopLevelMap = serde_json::from_slice(content).unwrap();
+
+                // Entries must be written in field id order.
+                let mut entries: Vec<_> = map
+                    .0
+                    .into_iter()
+                    .map(|(name, value)| (fields_ids_map.id(name.as_ref()).unwrap(), value))
+                    .collect();
+                entries.sort_unstable_by_key(|(fid, _)| *fid);
+
+                let mut writer = KvWriterFieldId::memory();
+                for (fid, value) in entries {
+                    writer.insert(fid, value.get()).unwrap();
+                }
+                let new = writer.into_boxed();
+
+                let change = match current {
+                    Some(current) => {
+                        DocumentChange::Update(Update::create(docid, current.boxed(), new))
+                    }
+                    None => DocumentChange::Insertion(Insertion::create(docid, new)),
+                };
+                Ok(change)
+            }
+        }
+    }
+}
+
+/// Merge strategy where the successive versions of a document are
+/// merged field by field on top of the version stored in the database.
+struct MergeDocumentForUpdates;
+
+impl MergeChanges for MergeDocumentForUpdates {
+    const USELESS_PREVIOUS_CHANGES: bool = false;
+
+    /// Reorders to read the first changes first so that it's faster to read the first one and then the rest.
+    fn sort_key(docops: &[InnerDocOp]) -> usize {
+        let f = |ido: &_| match ido {
+            InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize),
+            InnerDocOp::Deletion => None,
+        };
+        docops.iter().find_map(f).unwrap_or(0)
+    }
+
+    /// Reads the previous version of a document from the database, the new versions
+    /// from the payloads and merges them to generate a new boxed obkv.
+    ///
+    /// This function is only meant to be used when doing an update and not a replacement.
+    fn merge(
+        rtxn: &RoTxn,
+        index: &Index,
+        fields_ids_map: &FieldsIdsMap,
+        docid: DocumentId,
+        external_docid: String,
+        operations: &[InnerDocOp],
+    ) -> Result<DocumentChange> {
+        let mut document = BTreeMap::<_, Cow<_>>::new();
+        let current = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
+        let current: Option<&KvReaderFieldId> = current.map(Into::into);
+
+        if operations.is_empty() {
+            unreachable!("We must not have empty set of operations on a document");
+        }
+
+        // Only the operations that follow the last deletion matter.
+        let last_deletion = operations.iter().rposition(|op| matches!(op, InnerDocOp::Deletion));
+        let operations = &operations[last_deletion.map_or(0, |i| i + 1)..];
+
+        // If there was a deletion we must not start
+        // from the original document but from scratch.
+        if last_deletion.is_none() {
+            if let Some(current) = current {
+                current.into_iter().for_each(|(k, v)| {
+                    document.insert(k, v.into());
+                });
+            }
+        }
+
+        // No operation remains after the last deletion: the document is deleted.
+        if operations.is_empty() {
+            let deletion = match current {
+                Some(current) => Deletion::create(docid, current.boxed()),
+                None => todo!("Do that with Louis"),
+            };
+            return Ok(DocumentChange::Deletion(deletion));
+        }
+
+        // Apply the additions in order: later versions overwrite the fields of earlier ones.
+        for operation in operations {
+            let DocumentOffset { content } = match operation {
+                InnerDocOp::Addition(offset) => offset,
+                InnerDocOp::Deletion => {
+                    unreachable!("Deletion in document operations")
+                }
+            };
+
+            let map: TopLevelMap = serde_json::from_slice(content).unwrap();
+            for (key, v) in map.0 {
+                let id = fields_ids_map.id(key.as_ref()).unwrap();
+                document.insert(id, v.get().as_bytes().to_vec().into());
+            }
+        }
+
+        // The BTreeMap yields the fields ordered by field id.
+        let mut writer = KvWriterFieldId::memory();
+        document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap());
+        let new = writer.into_boxed();
+
+        match current {
+            Some(current) => {
+                let update = Update::create(docid, current.boxed(), new);
+                Ok(DocumentChange::Update(update))
+            }
+            None => {
+                let insertion = Insertion::create(docid, new);
+                Ok(DocumentChange::Insertion(insertion))
+            }
+        }
+    }
+}
--- a/milli/src/update/new/indexer/mod.rs
+++ b/milli/src/update/new/indexer/mod.rs
@ -0,0 +1,316 @@
+use std::sync::RwLock;
+use std::thread::{self, Builder};
+
+use big_s::S;
+pub use document_deletion::DocumentDeletion;
+pub use document_operation::DocumentOperation;
+use heed::{RoTxn, RwTxn};
+pub use partial_dump::PartialDump;
+use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
+use rayon::ThreadPool;
+pub use update_by_function::UpdateByFunction;
+
+use super::channel::*;
+use super::document_change::DocumentChange;
+use super::extract::*;
+use super::merger::merge_grenad_entries;
+use super::{ItemsPool, StdResult, TopLevelMap};
+use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
+use crate::update::new::channel::ExtractorSender;
+use crate::update::GrenadParameters;
+use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
+
+mod document_deletion;
+mod document_operation;
+mod partial_dump;
+mod update_by_function;
+
+/// A source of document changes that can be given to the [`index`] function.
+pub trait DocumentChanges<'p> {
+    /// Extra data required by the implementation to generate the changes.
+    type Parameter: 'p;
+
+    /// Consumes the source and returns a cloneable, indexed parallel
+    /// iterator over the changes to apply to the documents.
+    ///
+    /// Implementations receive a mutable `fields_ids_map`; whether they
+    /// modify it depends on the implementation.
+    fn document_changes(
+        self,
+        fields_ids_map: &mut FieldsIdsMap,
+        param: Self::Parameter,
+    ) -> Result<impl IndexedParallelIterator<Item = Result<DocumentChange>> + Clone + 'p>;
+}
+
+/// This is the main function of this crate.
+///
+/// Give it the output of the [`DocumentChanges::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
+///
+/// Two scoped threads are spawned: one runs the extractors in the given `pool`
+/// and one merges the extracted entries. The calling thread applies the operations
+/// it receives from the merger to the databases with `wtxn`. Once everything is
+/// written, the fields ids map is stored in the index.
+///
+/// # Errors
+///
+/// Returns the first error raised by the extraction, the merge,
+/// the database writes or the spawning of the threads.
+///
+/// # Panics
+///
+/// Panics when a channel is unexpectedly closed or when one of the threads panics.
+///
+/// TODO return stats
+pub fn index<PI>(
+    wtxn: &mut RwTxn,
+    index: &Index,
+    fields_ids_map: FieldsIdsMap,
+    pool: &ThreadPool,
+    document_changes: PI,
+) -> Result<()>
+where
+    PI: IndexedParallelIterator<Item = Result<DocumentChange>> + Send + Clone,
+{
+    let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
+    // This channel acts as a rendezvous point to ensure that we are one task ahead
+    let (extractor_sender, merger_receiver) = extractors_merger_channels(4);
+
+    let fields_ids_map_lock = RwLock::new(fields_ids_map);
+    let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
+    let global_fields_ids_map_clone = global_fields_ids_map.clone();
+
+    thread::scope(|s| {
+        // TODO manage the errors correctly
+        let current_span = tracing::Span::current();
+        let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
+            pool.in_place_scope(|_s| {
+                    let span = tracing::trace_span!(target: "indexing::documents", parent: &current_span, "extract");
+                    let _entered = span.enter();
+                    let document_changes = document_changes.into_par_iter();
+
+                    // Send the documents themselves first.
+                    // TODO we need to create a function that collects and compresses documents.
+                    let document_sender = extractor_sender.document_sender();
+                    document_changes.clone().into_par_iter().try_for_each(|result| {
+                        match result? {
+                            DocumentChange::Deletion(deletion) => {
+                                let docid = deletion.docid();
+                                document_sender.delete(docid).unwrap();
+                            }
+                            DocumentChange::Update(update) => {
+                                let docid = update.docid();
+                                let content = update.new();
+                                document_sender.insert(docid, content.boxed()).unwrap();
+                            }
+                            DocumentChange::Insertion(insertion) => {
+                                let docid = insertion.docid();
+                                let content = insertion.new();
+                                document_sender.insert(docid, content.boxed()).unwrap();
+                                // extracted_dictionary_sender.send(self, dictionary: &[u8]);
+                            }
+                        }
+                        Ok(()) as Result<_>
+                    })?;
+
+                    document_sender.finish().unwrap();
+
+                    // The memory budget is split evenly between the threads of the pool.
+                    const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
+                    let max_memory = TEN_GIB / rayon::current_num_threads();
+                    let grenad_parameters = GrenadParameters {
+                        max_memory: Some(max_memory),
+                        ..GrenadParameters::default()
+                    };
+
+                    {
+                        let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
+                        let _entered = span.enter();
+                        extract_and_send_docids::<
+                            FacetedDocidsExtractor,
+                            FacetDocids,
+                        >(
+                            index,
+                            &global_fields_ids_map,
+                            grenad_parameters,
+                            document_changes.clone(),
+                            &extractor_sender,
+                        )?;
+                    }
+
+                    {
+                        let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
+                        let _entered = span.enter();
+
+                        let WordDocidsMergers {
+                            word_fid_docids,
+                            word_docids,
+                            exact_word_docids,
+                            word_position_docids,
+                            fid_word_count_docids,
+                        } = WordDocidsExtractors::run_extraction(index, &global_fields_ids_map, grenad_parameters, document_changes.clone())?;
+                        extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
+                        extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
+                        extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
+                        extractor_sender.send_searchable::<WordPositionDocids>(word_position_docids).unwrap();
+                        extractor_sender.send_searchable::<FidWordCountDocids>(fid_word_count_docids).unwrap();
+                    }
+
+                    // {
+                    //     let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids");
+                    //     let _entered = span.enter();
+                    //     extract_and_send_docids::<ExactWordDocidsExtractor, ExactWordDocids>(
+                    //         index,
+                    //         &global_fields_ids_map,
+                    //         grenad_parameters,
+                    //         document_changes.clone(),
+                    //         &extractor_sender,
+                    //     )?;
+                    // }
+
+                    // {
+                    //     let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids");
+                    //     let _entered = span.enter();
+                    //     extract_and_send_docids::<WordPositionDocidsExtractor, WordPositionDocids>(
+                    //         index,
+                    //         &global_fields_ids_map,
+                    //         grenad_parameters,
+                    //         document_changes.clone(),
+                    //         &extractor_sender,
+                    //     )?;
+                    // }
+
+                    // {
+                    //     let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids");
+                    //     let _entered = span.enter();
+                    //     extract_and_send_docids::<FidWordCountDocidsExtractor, FidWordCountDocids>(
+                    //         index,
+                    //         &global_fields_ids_map,
+                    //         GrenadParameters::default(),
+                    //         document_changes.clone(),
+                    //         &extractor_sender,
+                    //     )?;
+                    // }
+
+                    {
+                        let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
+                        let _entered = span.enter();
+                        extract_and_send_docids::<
+                            WordPairProximityDocidsExtractor,
+                            WordPairProximityDocids,
+                        >(
+                            index,
+                            &global_fields_ids_map,
+                            grenad_parameters,
+                            document_changes.clone(),
+                            &extractor_sender,
+                        )?;
+                    }
+
+                    {
+                        let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH");
+                        let _entered = span.enter();
+                    }
+
+                    // TODO THIS IS TOO MUCH
+                    // - [ ] Extract fieldid docid facet number
+                    // - [ ] Extract fieldid docid facet string
+                    // - [ ] Extract facetid string fst
+                    // - [ ] Extract facetid normalized string strings
+
+                    // TODO Inverted Indexes again
+                    // - [x] Extract fieldid facet isempty docids
+                    // - [x] Extract fieldid facet isnull docids
+                    // - [x] Extract fieldid facet exists docids
+
+                    // TODO This is the normal system
+                    // - [x] Extract fieldid facet number docids
+                    // - [x] Extract fieldid facet string docids
+
+                    Ok(()) as Result<_>
+                })
+        })?;
+
+        // TODO manage the errors correctly
+        let current_span = tracing::Span::current();
+        let handle2 = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || {
+            let span =
+                tracing::trace_span!(target: "indexing::documents", parent: &current_span, "merge");
+            let _entered = span.enter();
+
+            let rtxn_pool = ItemsPool::new(|| index.read_txn().map_err(Into::into));
+            merge_grenad_entries(
+                merger_receiver,
+                merger_sender,
+                &rtxn_pool,
+                index,
+                global_fields_ids_map_clone,
+            )
+        })?;
+
+        // The current thread is the only writer: it applies
+        // the operations sent by the merger to the databases.
+        let mut entries_count = 0;
+        for operation in writer_receiver {
+            let database = operation.database(index);
+            match operation.entry() {
+                EntryOperation::Delete(e) => {
+                    if !database.delete(wtxn, e.entry())? {
+                        unreachable!("We tried to delete an unknown key")
+                    }
+                }
+                EntryOperation::Write(e) => {
+                    entries_count += 1;
+                    database.put(wtxn, e.key(), e.value())?
+                }
+            }
+        }
+
+        // Reported through tracing rather than printed on stderr.
+        tracing::debug!("We wrote {entries_count} entries");
+
+        // TODO handle the panicking threads
+        handle.join().unwrap()?;
+        handle2.join().unwrap()?;
+
+        Ok(()) as Result<_>
+    })?;
+
+    let fields_ids_map = fields_ids_map_lock.into_inner().unwrap();
+    index.put_fields_ids_map(wtxn, &fields_ids_map)?;
+
+    Ok(())
+}
+
+/// Runs the extractor `E` on the document changes and sends
+/// the resulting merger through the `sender` as a `D` operation.
+///
+/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
+/// TODO: manage the errors correctly
+/// TODO: we must have a single trait that also gives the extractor type
+fn extract_and_send_docids<E: DocidsExtractor, D: MergerOperationType>(
+    index: &Index,
+    fields_ids_map: &GlobalFieldsIdsMap,
+    indexer: GrenadParameters,
+    document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
+    sender: &ExtractorSender,
+) -> Result<()> {
+    let extracted = E::run_extraction(index, fields_ids_map, indexer, document_changes)?;
+    // Panics if the entries cannot be sent.
+    sender.send_searchable::<D>(extracted).unwrap();
+    Ok(())
+}
+
+/// Returns the primary key that has already been set for this index or the one
+/// we guess from the first document: the only key whose lowercased name ends
+/// with [`DEFAULT_PRIMARY_KEY`].
+///
+/// The inner result is an error when there is no first document, when no key
+/// or more than one key is a candidate, or when the guessed key cannot be
+/// inserted into the fields ids map.
+///
+/// TODO move this elsewhere
+pub fn retrieve_or_guess_primary_key<'a>(
+    rtxn: &'a RoTxn<'a>,
+    index: &Index,
+    fields_ids_map: &mut FieldsIdsMap,
+    first_document: Option<&'a TopLevelMap<'_>>,
+) -> Result<StdResult<PrimaryKey<'a>, UserError>> {
+    // The index already has a primary key: no need to guess anything.
+    if let Some(primary_key) = index.primary_key(rtxn)? {
+        return match PrimaryKey::new(primary_key, fields_ids_map) {
+            Some(primary_key) => Ok(Ok(primary_key)),
+            None => unreachable!("Why is the primary key not in the fidmap?"),
+        };
+    }
+
+    let first_document = match first_document {
+        Some(document) => document,
+        None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
+    };
+
+    let mut candidates: Vec<&str> = first_document
+        .keys()
+        .map(AsRef::as_ref)
+        .filter(|name| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
+        .collect();
+
+    // sort the keys in lexicographical order, so that fields are always in the same order.
+    candidates.sort_unstable();
+
+    match candidates.as_slice() {
+        [] => Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
+        [name] => {
+            tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
+            match fields_ids_map.insert(name) {
+                Some(field_id) => Ok(Ok(PrimaryKey::Flat { name, field_id })),
+                None => Ok(Err(UserError::AttributeLimitReached)),
+            }
+        }
+        several => {
+            let candidates = several.iter().map(|candidate| candidate.to_string()).collect();
+            Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound { candidates }))
+        }
+    }
+}
--- a/milli/src/update/new/indexer/partial_dump.rs
+++ b/milli/src/update/new/indexer/partial_dump.rs
@ -0,0 +1,73 @@
+use rayon::iter::{IndexedParallelIterator, ParallelBridge, ParallelIterator};
+
+use super::DocumentChanges;
+use crate::documents::{DocumentIdExtractionError, PrimaryKey};
+use crate::update::concurrent_available_ids::ConcurrentAvailableIds;
+use crate::update::new::{DocumentChange, Insertion, KvWriterFieldId};
+use crate::{all_obkv_to_json, Error, FieldsIdsMap, Object, Result, UserError};
+
+/// A source of document changes that turns every JSON object yielded by a
+/// parallel iterator into an insertion with a freshly allocated internal id.
+pub struct PartialDump<I> {
+    iter: I,
+}
+
+impl<I> PartialDump<I> {
+    /// Wraps the iterator of JSON objects to index.
+    pub fn new_from_jsonlines(iter: I) -> Self {
+        PartialDump { iter }
+    }
+}
+
+impl<'p, I> DocumentChanges<'p> for PartialDump<I>
+where
+    I: IndexedParallelIterator<Item = Object> + Clone + 'p,
+{
+    type Parameter = (&'p FieldsIdsMap, &'p ConcurrentAvailableIds, &'p PrimaryKey<'p>);
+
+    /// Maps every object of the iterator to a `DocumentChange::Insertion`.
+    ///
+    /// Each document gets the next id of the `ConcurrentAvailableIds` (a
+    /// `DocumentLimitReached` user error is returned when none is left) and its
+    /// external id is extracted with the given primary key.
+    ///
+    /// Note for future self:
+    ///   - the field ids map must already be valid so you must have to generate it beforehand.
+    ///   - We should probably expose another method that generates the fields ids map from an iterator of JSON objects.
+    ///   - We recommend sending chunks of documents in this `PartialDumpIndexer` we therefore need to create a custom take_while_size method (that doesn't drop items).
+    fn document_changes(
+        self,
+        _fields_ids_map: &mut FieldsIdsMap,
+        param: Self::Parameter,
+    ) -> Result<impl IndexedParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
+        let (fields_ids_map, concurrent_available_ids, primary_key) = param;
+
+        Ok(self.iter.map(|object| {
+            let docid = match concurrent_available_ids.next() {
+                Some(id) => id,
+                None => return Err(Error::UserError(UserError::DocumentLimitReached)),
+            };
+
+            // TODO better error management
+            let mut entries: Vec<_> = object
+                .iter()
+                .map(|(key, value)| {
+                    let field_id = fields_ids_map.id(key).unwrap();
+                    let value = serde_json::to_vec(&value).unwrap();
+                    (field_id, value)
+                })
+                .collect();
+
+            // The obkv writer only accepts keys in ascending order while the object
+            // is iterated in field *name* order: sort by field id before inserting,
+            // otherwise `insert` fails and the `unwrap` below panics.
+            entries.sort_unstable_by_key(|(field_id, _)| *field_id);
+
+            let mut writer = KvWriterFieldId::memory();
+            for (field_id, value) in entries {
+                writer.insert(field_id, value).unwrap();
+            }
+
+            let document = writer.into_boxed();
+            let external_docid = match primary_key.document_id(&document, fields_ids_map)? {
+                Ok(document_id) => Ok(document_id),
+                Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
+                Err(DocumentIdExtractionError::MissingDocumentId) => {
+                    Err(UserError::MissingDocumentId {
+                        primary_key: primary_key.name().to_string(),
+                        document: all_obkv_to_json(&document, fields_ids_map)?,
+                    })
+                }
+                Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
+                    Err(UserError::TooManyDocumentIds {
+                        primary_key: primary_key.name().to_string(),
+                        document: all_obkv_to_json(&document, fields_ids_map)?,
+                    })
+                }
+            }?;
+
+            let insertion = Insertion::create(docid, document);
+            Ok(DocumentChange::Insertion(insertion))
+        }))
+    }
+}
--- a/milli/src/update/new/indexer/update_by_function.rs
+++ b/milli/src/update/new/indexer/update_by_function.rs
@ -0,0 +1,19 @@
+use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
+
+use super::DocumentChanges;
+use crate::update::new::DocumentChange;
+use crate::{FieldsIdsMap, Result};
+
+/// Placeholder source of document changes for updates computed by a function.
+///
+/// NOTE(review): not implemented yet, see `document_changes`.
+pub struct UpdateByFunction;
+
+impl<'p> DocumentChanges<'p> for UpdateByFunction {
+    type Parameter = ();
+
+    /// Returns a parallel iterator over a fixed `0..100` range whose mapping
+    /// closure is `todo!()`: building the iterator succeeds, but consuming any
+    /// item panics.
+    fn document_changes(
+        self,
+        _fields_ids_map: &mut FieldsIdsMap,
+        _param: Self::Parameter,
+    ) -> Result<impl IndexedParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
+        Ok((0..100).into_par_iter().map(|_| todo!()))
+    }
+}
--- a/milli/src/update/new/items_pool.rs
+++ b/milli/src/update/new/items_pool.rs
@ -0,0 +1,54 @@
+use crossbeam_channel::{Receiver, Sender, TryRecvError};
+
+/// A pool of reusable items, backed by an unbounded channel, where missing
+/// items are created on demand by the `init` function.
+pub struct ItemsPool<F, T, E>
+where
+    F: Fn() -> Result<T, E>,
+{
+    init: F,
+    sender: Sender<T>,
+    receiver: Receiver<T>,
+}
+
+impl<F, T, E> ItemsPool<F, T, E>
+where
+    F: Fn() -> Result<T, E>,
+{
+    /// Builds an empty, unbounded pool.
+    ///
+    /// `init` is called by `with` every time no item is available in the pool.
+    pub fn new(init: F) -> Self {
+        let (sender, receiver) = crossbeam_channel::unbounded();
+        Self { init, sender, receiver }
+    }
+
+    /// Consumes the pool and returns an iterator over the items it still holds.
+    ///
+    /// Handy to clean up or inspect the items once the pool is no longer needed.
+    pub fn into_items(self) -> crossbeam_channel::IntoIter<T> {
+        self.receiver.into_iter()
+    }
+
+    /// Runs `f` on an item taken from the pool, creating a new item with `init`
+    /// when the pool is empty, then puts the item back into the pool.
+    ///
+    /// The item goes back into the pool whether `f` succeeded or not.
+    pub fn with<G, R>(&self, f: G) -> Result<R, E>
+    where
+        G: FnOnce(&mut T) -> Result<R, E>,
+    {
+        // Reuse a pooled item when there is one, otherwise generate a fresh one.
+        let mut item = self.receiver.try_recv().or_else(|error| match error {
+            TryRecvError::Empty => (self.init)(),
+            TryRecvError::Disconnected => unreachable!(),
+        })?;
+
+        let outcome = f(&mut item);
+
+        // The pool owns both ends of the channel, so sending back cannot fail.
+        self.sender
+            .send(item)
+            .unwrap_or_else(|e| unreachable!("error when sending into channel {e}"));
+
+        outcome
+    }
+}
--- a/milli/src/update/new/merger.rs
+++ b/milli/src/update/new/merger.rs
@ -0,0 +1,406 @@
+use std::io::{self};
+
+use bincode::ErrorKind;
+use heed::types::Bytes;
+use heed::{Database, RoTxn};
+use rayon::iter::{ParallelBridge, ParallelIterator as _};
+use roaring::RoaringBitmap;
+
+use super::channel::*;
+use super::extract::{FacetKind, HashMapMerger};
+use super::{Deletion, DocumentChange, Insertion, ItemsPool, KvReaderFieldId, Update};
+use crate::update::del_add::DelAdd;
+use crate::update::new::channel::MergerOperation;
+use crate::update::new::word_fst_builder::WordFstBuilder;
+use crate::{CboRoaringBitmapCodec, Error, GeoPoint, GlobalFieldsIdsMap, Index, Result};
+
+/// Consumes every `MergerOperation` received on `receiver` and forwards the
+/// resulting writes and deletions through `sender`.
+///
+/// - The `*Merger` operations are merged with the content of the matching index
+///   database by `merge_and_send_docids` / `merge_and_send_facet_docids`.
+/// - `InsertDocument` / `DeleteDocument` update the in-memory documents ids
+///   bitmap and forward the document change to `sender.documents()`.
+/// - Once the receiver is exhausted, the documents ids bitmap is serialized and sent.
+///
+/// Read transactions are borrowed from `rtxn_pool` whenever the index must be read.
+///
+/// TODO We must return some infos/stats
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")]
+pub fn merge_grenad_entries<'t>(
+    receiver: MergerReceiver,
+    sender: MergerSender,
+    rtxn_pool: &ItemsPool<impl Fn() -> Result<RoTxn<'t>> + Send + Sync, RoTxn<'t>, Error>,
+    index: &Index,
+    mut global_fields_ids_map: GlobalFieldsIdsMap<'_>,
+) -> Result<()> {
+    // Scratch buffer handed to the merge helpers and reused to serialize the final documents ids.
+    let mut buffer: Vec<u8> = Vec::new();
+    // Load the documents ids currently in the index and, only when `_geo` is
+    // sortable or filterable (see `GeoExtractor::new`), a geo extractor.
+    let (mut documents_ids, mut geo_extractor) = rtxn_pool.with(|rtxn| {
+        let documents_ids = index.documents_ids(rtxn)?;
+        let geo_extractor = GeoExtractor::new(rtxn, index)?;
+        Ok((documents_ids, geo_extractor))
+    })?;
+
+    for merger_operation in receiver {
+        match merger_operation {
+            MergerOperation::ExactWordDocidsMerger(merger) => {
+                let span =
+                    tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
+                let _entered = span.enter();
+                merge_and_send_docids(
+                    merger,
+                    /// TODO do a MergerOperation::database(&Index) -> Database<Bytes, Bytes>.
+                    index.exact_word_docids.remap_types(),
+                    rtxn_pool,
+                    &mut buffer,
+                    sender.docids::<ExactWordDocids>(),
+                    // |_, _key| Ok(()),
+                )?;
+            }
+            MergerOperation::FidWordCountDocidsMerger(merger) => {
+                let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
+                let _entered = span.enter();
+                merge_and_send_docids(
+                    merger,
+                    index.field_id_word_count_docids.remap_types(),
+                    rtxn_pool,
+                    &mut buffer,
+                    sender.docids::<FidWordCountDocids>(),
+                    // |_, _key| Ok(()),
+                )?;
+            }
+            MergerOperation::WordDocidsMerger(merger) => {
+                // The words FST update is disabled for now: only the docids are merged.
+                // let words_fst = index.words_fst(rtxn)?;
+                // let mut word_fst_builder = WordFstBuilder::new(&words_fst, 4)?;
+                {
+                    let span =
+                        tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
+                    let _entered = span.enter();
+
+                    merge_and_send_docids(
+                        merger,
+                        index.word_docids.remap_types(),
+                        rtxn_pool,
+                        &mut buffer,
+                        sender.docids::<WordDocids>(),
+                        // |deladd, key| word_fst_builder.register_word(deladd, key),
+                    )?;
+                }
+
+                {
+                    // This span is currently empty: the FST build below is commented out.
+                    let span =
+                        tracing::trace_span!(target: "indexing::documents::merge", "words_fst");
+                    let _entered = span.enter();
+
+                    // let (word_fst_mmap, prefix_fst_mmap) = word_fst_builder.build()?;
+                    // sender.main().write_words_fst(word_fst_mmap).unwrap();
+                }
+            }
+            MergerOperation::WordFidDocidsMerger(merger) => {
+                let span =
+                    tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
+                let _entered = span.enter();
+                merge_and_send_docids(
+                    merger,
+                    index.word_fid_docids.remap_types(),
+                    rtxn_pool,
+                    &mut buffer,
+                    sender.docids::<WordFidDocids>(),
+                    // |_, _key| Ok(()),
+                )?;
+            }
+            MergerOperation::WordPairProximityDocidsMerger(merger) => {
+                let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids");
+                let _entered = span.enter();
+                merge_and_send_docids(
+                    merger,
+                    index.word_pair_proximity_docids.remap_types(),
+                    rtxn_pool,
+                    &mut buffer,
+                    sender.docids::<WordPairProximityDocids>(),
+                    // |_, _key| Ok(()),
+                )?;
+            }
+            MergerOperation::WordPositionDocidsMerger(merger) => {
+                let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
+                let _entered = span.enter();
+                merge_and_send_docids(
+                    merger,
+                    index.word_position_docids.remap_types(),
+                    rtxn_pool,
+                    &mut buffer,
+                    sender.docids::<WordPositionDocids>(),
+                    // |_, _key| Ok(()),
+                )?;
+            }
+            MergerOperation::InsertDocument { docid, document } => {
+                let span =
+                    tracing::trace_span!(target: "indexing::documents::merge", "insert_document");
+                let _entered = span.enter();
+                // Track the new id and forward the document itself to the writer.
+                documents_ids.insert(docid);
+                sender.documents().uncompressed(docid, &document).unwrap();
+
+                if let Some(geo_extractor) = geo_extractor.as_mut() {
+                    rtxn_pool.with(|rtxn| {
+                        // The version stored in the index (if any) decides whether
+                        // this is an update or a plain insertion.
+                        let current =
+                            index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
+                        let current: Option<&KvReaderFieldId> = current.map(Into::into);
+                        let change = match current {
+                            Some(current) => DocumentChange::Update(Update::create(
+                                docid,
+                                current.boxed(),
+                                document,
+                            )),
+                            None => DocumentChange::Insertion(Insertion::create(docid, document)),
+                        };
+                        // NOTE(review): `GeoExtractor::manage_change` is `todo!()` for every
+                        // kind of change in this file, so this call panics when reached.
+                        geo_extractor.manage_change(&mut global_fields_ids_map, &change)?;
+                        Ok(())
+                    })?;
+                }
+            }
+            MergerOperation::DeleteDocument { docid } => {
+                let span =
+                    tracing::trace_span!(target: "indexing::documents::merge", "delete_document");
+                let _entered = span.enter();
+                // `remove` returns false when the id was not in the bitmap.
+                if !documents_ids.remove(docid) {
+                    unreachable!("Tried deleting a document that we do not know about");
+                }
+                sender.documents().delete(docid).unwrap();
+
+                if let Some(geo_extractor) = geo_extractor.as_mut() {
+                    rtxn_pool.with(|rtxn| {
+                        let current = index.document(rtxn, docid)?;
+                        let change =
+                            DocumentChange::Deletion(Deletion::create(docid, current.boxed()));
+                        // NOTE(review): same as above, `manage_change` is still `todo!()`.
+                        geo_extractor.manage_change(&mut global_fields_ids_map, &change)?;
+                        Ok(())
+                    })?;
+                }
+            }
+            MergerOperation::FinishedDocument => {
+                // TODO send the rtree (nothing is done for this operation yet)
+            }
+            MergerOperation::FacetDocidsMerger(merger) => {
+                let span =
+                    tracing::trace_span!(target: "indexing::documents::merge", "facet_docids");
+                let _entered = span.enter();
+                merge_and_send_facet_docids(
+                    merger,
+                    FacetDatabases::new(index),
+                    rtxn_pool,
+                    &mut buffer,
+                    sender.facet_docids(),
+                )?;
+            }
+        }
+    }
+
+    {
+        let span = tracing::trace_span!(target: "indexing::documents::merge", "documents_ids");
+        let _entered = span.enter();
+
+        // Send the documents ids unionized with the current one
+        /// TODO return the slice of bytes directly
+        serialize_bitmap_into_vec(&documents_ids, &mut buffer);
+        sender.send_documents_ids(&buffer).unwrap();
+    }
+
+    // ...
+
+    Ok(())
+}
+
+/// Holds the geo R-tree of an index while document changes are processed.
+///
+/// NOTE(review): work in progress, `manage_change` is not implemented yet.
+pub struct GeoExtractor {
+    // The value returned by `index.geo_rtree(rtxn)` when the extractor was created.
+    rtree: Option<rstar::RTree<GeoPoint>>,
+}
+
+impl GeoExtractor {
+    /// Creates an extractor only when `_geo` is part of the sortable or the
+    /// filterable fields of the index; returns `Ok(None)` otherwise.
+    pub fn new(rtxn: &RoTxn, index: &Index) -> Result<Option<Self>> {
+        let is_sortable = index.sortable_fields(rtxn)?.contains("_geo");
+        let is_filterable = index.filterable_fields(rtxn)?.contains("_geo");
+        if is_sortable || is_filterable {
+            Ok(Some(GeoExtractor { rtree: index.geo_rtree(rtxn)? }))
+        } else {
+            Ok(None)
+        }
+    }
+
+    /// Meant to apply a document change to the R-tree.
+    ///
+    /// Every branch is currently `todo!()`: calling this method always panics.
+    pub fn manage_change(
+        &mut self,
+        fidmap: &mut GlobalFieldsIdsMap,
+        change: &DocumentChange,
+    ) -> Result<()> {
+        match change {
+            DocumentChange::Deletion(_) => todo!(),
+            DocumentChange::Update(_) => todo!(),
+            DocumentChange::Insertion(_) => todo!(),
+        }
+    }
+
+    /// Serializes the R-tree into `writer` with bincode.
+    ///
+    /// Returns `Ok(true)` when an R-tree was written and `Ok(false)` when there
+    /// is none. Only bincode I/O errors are converted into an `Error`; every
+    /// other bincode error kind hits a `todo!()` and panics.
+    pub fn serialize_rtree<W: io::Write>(self, writer: &mut W) -> Result<bool> {
+        match self.rtree {
+            Some(rtree) => {
+                // TODO What should I do?
+                bincode::serialize_into(writer, &rtree).map(|_| true).map_err(|e| match *e {
+                    ErrorKind::Io(e) => Error::IoError(e),
+                    ErrorKind::InvalidUtf8Encoding(_) => todo!(),
+                    ErrorKind::InvalidBoolEncoding(_) => todo!(),
+                    ErrorKind::InvalidCharEncoding => todo!(),
+                    ErrorKind::InvalidTagEncoding(_) => todo!(),
+                    ErrorKind::DeserializeAnyNotSupported => todo!(),
+                    ErrorKind::SizeLimit => todo!(),
+                    ErrorKind::SequenceMustHaveLength => todo!(),
+                    ErrorKind::Custom(_) => todo!(),
+                })
+            }
+            None => Ok(false),
+        }
+    }
+}
+
+/// Merges every `(key, del/add)` entry of `merger` with the bitmap currently
+/// stored under that key in `database`, then sends the resulting write or
+/// deletion through `docids_sender`.
+///
+/// Entries are processed in parallel (`par_bridge`), each one with a read
+/// transaction borrowed from `rtxn_pool`. The `buffer` parameter is not used
+/// yet: every entry serializes into its own local buffer.
+///
+/// The first error met while reading or merging is returned.
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
+#[inline(never)]
+fn merge_and_send_docids<'t>(
+    merger: HashMapMerger,
+    database: Database<Bytes, Bytes>,
+    rtxn_pool: &ItemsPool<impl Fn() -> Result<RoTxn<'t>> + Send + Sync, RoTxn<'t>, Error>,
+    buffer: &mut Vec<u8>,
+    docids_sender: impl DocidsSender + Sync,
+    // mut register_key: impl FnMut(DelAdd, &[u8]) -> Result<()> + Send + Sync,
+) -> Result<()> {
+    let now = std::time::Instant::now();
+    merger.into_iter().par_bridge().try_for_each(|(key, deladd)| {
+        rtxn_pool.with(|rtxn| {
+            let mut buffer = Vec::new();
+            let current = database.get(rtxn, &key)?;
+            match merge_cbo_bitmaps(current, deladd.del, deladd.add)? {
+                Operation::Write(bitmap) => {
+                    let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer);
+                    docids_sender.write(&key, value).unwrap();
+                    // register_key(DelAdd::Addition, &key)?;
+                }
+                Operation::Delete => {
+                    docids_sender.delete(&key).unwrap();
+                    // register_key(DelAdd::Deletion, &key)?;
+                }
+                Operation::Ignore => (),
+            }
+            Ok(())
+        })
+    })?;
+
+    // Report the timing through `tracing`, like the rest of the indexer,
+    // instead of printing to stderr from library code.
+    tracing::trace!(target: "indexing::merge", elapsed = ?now.elapsed(), "merged and sent docids");
+
+    Ok(())
+}
+
+/// Merges every `(key, del/add)` entry of `merger` with the bitmap currently
+/// stored in the facet database designated by the key (see `FacetDatabases::get`),
+/// then sends the resulting write or deletion through `docids_sender`.
+///
+/// Entries are processed in parallel (`par_bridge`), each one with a read
+/// transaction borrowed from `rtxn_pool`. The `buffer` parameter is not used
+/// yet: every entry serializes into its own local buffer.
+///
+/// The first error met while reading or merging is returned.
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
+#[inline(never)]
+fn merge_and_send_facet_docids<'t>(
+    merger: HashMapMerger,
+    database: FacetDatabases,
+    rtxn_pool: &ItemsPool<impl Fn() -> Result<RoTxn<'t>> + Send + Sync, RoTxn<'t>, Error>,
+    buffer: &mut Vec<u8>,
+    docids_sender: impl DocidsSender + Sync,
+) -> Result<()> {
+    let now = std::time::Instant::now();
+    merger.into_iter().par_bridge().try_for_each(|(key, deladd)| {
+        rtxn_pool.with(|rtxn| {
+            let mut buffer = Vec::new();
+            let current = database.get(rtxn, &key)?;
+            match merge_cbo_bitmaps(current, deladd.del, deladd.add)? {
+                Operation::Write(bitmap) => {
+                    let value = cbo_bitmap_serialize_into_vec(&bitmap, &mut buffer);
+                    docids_sender.write(&key, value).unwrap();
+                }
+                Operation::Delete => {
+                    docids_sender.delete(&key).unwrap();
+                }
+                Operation::Ignore => (),
+            }
+            Ok(())
+        })
+    })?;
+
+    // Report the timing through `tracing`, like the rest of the indexer,
+    // instead of printing to stderr from library code.
+    tracing::trace!(target: "indexing::merge", elapsed = ?now.elapsed(), "merged and sent facet docids");
+
+    Ok(())
+}
+
+/// The facet databases of an index, all remapped to raw bytes so that they can
+/// be queried with the same prefixed key format.
+struct FacetDatabases {
+    /// Maps the facet field id and the docids for which this field exists
+    facet_id_exists_docids: Database<Bytes, Bytes>,
+    /// Maps the facet field id and the docids for which this field is set as null
+    facet_id_is_null_docids: Database<Bytes, Bytes>,
+    /// Maps the facet field id and the docids for which this field is considered empty
+    facet_id_is_empty_docids: Database<Bytes, Bytes>,
+    /// Maps the facet field id and ranges of numbers with the docids that corresponds to them.
+    facet_id_f64_docids: Database<Bytes, Bytes>,
+    /// Maps the facet field id and ranges of strings with the docids that corresponds to them.
+    facet_id_string_docids: Database<Bytes, Bytes>,
+}
+
+impl FacetDatabases {
+    /// Collects the facet databases of `index`, erasing their key and data types.
+    fn new(index: &Index) -> Self {
+        let facet_id_exists_docids = index.facet_id_exists_docids.remap_types();
+        let facet_id_is_null_docids = index.facet_id_is_null_docids.remap_types();
+        let facet_id_is_empty_docids = index.facet_id_is_empty_docids.remap_types();
+        let facet_id_f64_docids = index.facet_id_f64_docids.remap_types();
+        let facet_id_string_docids = index.facet_id_string_docids.remap_types();
+
+        FacetDatabases {
+            facet_id_exists_docids,
+            facet_id_is_null_docids,
+            facet_id_is_empty_docids,
+            facet_id_f64_docids,
+            facet_id_string_docids,
+        }
+    }
+
+    /// Fetches the value stored under `key` in the database designated by the
+    /// first byte of the key; the rest of the key is the actual database key.
+    fn get<'a>(&self, rtxn: &'a RoTxn<'_>, key: &[u8]) -> heed::Result<Option<&'a [u8]>> {
+        let (facet_kind, database_key) = self.extract_facet_kind(key);
+        let database = match facet_kind {
+            FacetKind::Exists => &self.facet_id_exists_docids,
+            FacetKind::Null => &self.facet_id_is_null_docids,
+            FacetKind::Empty => &self.facet_id_is_empty_docids,
+            FacetKind::Number => &self.facet_id_f64_docids,
+            FacetKind::String => &self.facet_id_string_docids,
+        };
+        database.get(rtxn, database_key)
+    }
+
+    /// Splits a prefixed key into its facet kind (first byte) and the remaining bytes.
+    ///
+    /// Panics when `key` is empty.
+    fn extract_facet_kind<'a>(&self, key: &'a [u8]) -> (FacetKind, &'a [u8]) {
+        let facet_kind = FacetKind::from(key[0]);
+        let remaining = &key[1..];
+        (facet_kind, remaining)
+    }
+}
+
+/// What to do with a database entry once `merge_cbo_bitmaps` combined the
+/// stored bitmap with the deletions and additions.
+enum Operation {
+    // Store this bitmap under the key.
+    Write(RoaringBitmap),
+    // Remove the key from the database.
+    Delete,
+    // Leave the entry untouched.
+    Ignore,
+}
+
+/// Combines the bitmap currently stored in the database (`current`, still
+/// CBO-encoded) with the docids to delete and to add, and returns the
+/// operation to apply to the entry.
+///
+/// Without a stored bitmap only the additions matter. With one, the result is
+/// `(current - del) | add`, and the entry is deleted when deletions leave it empty.
+fn merge_cbo_bitmaps(
+    current: Option<&[u8]>,
+    del: Option<RoaringBitmap>,
+    add: Option<RoaringBitmap>,
+) -> Result<Operation> {
+    let stored = match current {
+        Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes)?,
+        // Nothing stored yet: deletions cannot apply, write the additions if any.
+        // Having nothing to add here is strange, but simply ignored.
+        None => return Ok(add.map_or(Operation::Ignore, Operation::Write)),
+    };
+
+    match (del, add) {
+        // Neither deletions nor additions: strange, but there is nothing to do.
+        (None, None) => Ok(Operation::Ignore),
+        (None, Some(add)) => Ok(Operation::Write(stored | add)),
+        (Some(del), add) => {
+            let mut output = stored - del;
+            if let Some(add) = add {
+                output |= add;
+            }
+
+            if output.is_empty() {
+                Ok(Operation::Delete)
+            } else {
+                Ok(Operation::Write(output))
+            }
+        }
+    }
+}
+
+/// Clears `buffer`, serializes `bitmap` into it with the CBO codec and returns
+/// the serialized bytes.
+///
+/// TODO Return the slice directly from the serialize_into method
+fn cbo_bitmap_serialize_into_vec<'b>(bitmap: &RoaringBitmap, buffer: &'b mut Vec<u8>) -> &'b [u8] {
+    buffer.clear();
+    CboRoaringBitmapCodec::serialize_into(bitmap, buffer);
+    &buffer[..]
+}
+
+/// Clears `buffer` and serializes `bitmap` into it with the roaring
+/// `serialize_into` method; the bytes are left in `buffer` for the caller.
+///
+/// TODO Return the slice directly from the serialize_into method
+fn serialize_bitmap_into_vec(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) {
+    buffer.clear();
+    let written = bitmap.serialize_into(buffer);
+    written.unwrap();
+    // buffer.as_slice()
+}
--- a/milli/src/update/new/mod.rs
+++ b/milli/src/update/new/mod.rs
@ -0,0 +1,22 @@
+pub use document_change::{Deletion, DocumentChange, Insertion, Update};
+pub use items_pool::ItemsPool;
+pub use top_level_map::{CowStr, TopLevelMap};
+
+use super::del_add::DelAdd;
+use crate::FieldId;
+
+mod channel;
+mod document_change;
+mod extract;
+pub mod indexer;
+mod items_pool;
+mod merger;
+mod top_level_map;
+mod word_fst_builder;
+
+/// TODO move them elsewhere
+pub type StdResult<T, E> = std::result::Result<T, E>;
+pub type KvReaderDelAdd = obkv::KvReader<DelAdd>;
+pub type KvReaderFieldId = obkv::KvReader<FieldId>;
+pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
+pub type KvWriterFieldId<W> = obkv::KvWriter<W, FieldId>;
--- a/milli/src/update/new/top_level_map.rs
+++ b/milli/src/update/new/top_level_map.rs
@ -0,0 +1,66 @@
+use std::borrow::{Borrow, Cow};
+use std::collections::BTreeMap;
+use std::{fmt, ops};
+
+use serde::{Deserialize, Serialize};
+use serde_json::value::RawValue;
+use serde_json::{Map, Value};
+
+/// A JSON object whose top-level keys are deserialized while the values are
+/// kept as raw, unparsed JSON borrowed from the input.
+#[derive(Deserialize, Serialize)]
+pub struct TopLevelMap<'p>(#[serde(borrow)] pub BTreeMap<CowStr<'p>, &'p RawValue>);
+
+impl TryFrom<&'_ TopLevelMap<'_>> for Map<String, Value> {
+    type Error = serde_json::Error;
+
+    /// Parses every raw value and builds a regular JSON object out of the map.
+    fn try_from(tlmap: &TopLevelMap<'_>) -> Result<Self, Self::Error> {
+        tlmap
+            .0
+            .iter()
+            .map(|(key, raw)| -> Result<(String, Value), serde_json::Error> {
+                let value: Value = serde_json::from_str(raw.get())?;
+                Ok((key.to_string(), value))
+            })
+            .collect()
+    }
+}
+
+impl TryFrom<TopLevelMap<'_>> for Map<String, Value> {
+    type Error = serde_json::Error;
+
+    /// Same conversion as the by-reference one.
+    fn try_from(tlmap: TopLevelMap<'_>) -> Result<Self, Self::Error> {
+        <Self as TryFrom<&TopLevelMap<'_>>>::try_from(&tlmap)
+    }
+}
+
+impl<'p> ops::Deref for TopLevelMap<'p> {
+    type Target = BTreeMap<CowStr<'p>, &'p RawValue>;
+
+    fn deref(&self) -> &Self::Target {
+        let TopLevelMap(map) = self;
+        map
+    }
+}
+
+impl ops::DerefMut for TopLevelMap<'_> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        let TopLevelMap(map) = self;
+        map
+    }
+}
+
+/// A string that is borrowed from the deserialized input whenever possible.
+#[derive(Deserialize, Serialize, PartialEq, Eq, PartialOrd, Ord, Hash, Clone)]
+pub struct CowStr<'p>(#[serde(borrow)] pub Cow<'p, str>);
+
+impl fmt::Display for CowStr<'_> {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // Delegate to the inner string so that formatting options are honored.
+        <Cow<'_, str> as fmt::Display>::fmt(&self.0, f)
+    }
+}
+
+impl AsRef<str> for CowStr<'_> {
+    fn as_ref(&self) -> &str {
+        &self.0
+    }
+}
+
+impl Borrow<str> for CowStr<'_> {
+    fn borrow(&self) -> &str {
+        &self.0
+    }
+}
--- a/milli/src/update/new/word_fst_builder.rs
+++ b/milli/src/update/new/word_fst_builder.rs
@ -0,0 +1,187 @@
+use std::{fs::File, io::BufWriter};
+
+use fst::{Set, SetBuilder, Streamer};
+use memmap2::Mmap;
+use tempfile::tempfile;
+
+use crate::{update::del_add::DelAdd, Result, SmallString32};
+
+/// Builds the new words FST, and the prefixes FST, by merging the words of the
+/// previous FST with the words registered in lexicographic order.
+pub struct WordFstBuilder<'a> {
+    // Stream over the words of the previous FST; `None` once it is exhausted.
+    stream: Option<fst::set::Stream<'a>>,
+    word_fst_builder: SetBuilder<BufWriter<File>>,
+    /// TODO: Replace the full memory allocation
+    prefix_fst_builders: Vec<SetBuilder<Vec<u8>>>,
+    max_prefix_length: usize,
+    // Word pulled out of the stream that has not been inserted yet.
+    last_word: Option<Vec<u8>>,
+    current_prefix: Vec<SmallString32>,
+    current_prefix_count: Vec<u64>,
+    prefix_count_threshold: u64,
+}
+
+impl<'a> WordFstBuilder<'a> {
+    /// Creates a builder reading the previous words from `words_fst` and
+    /// tracking prefixes from 1 up to `max_prefix_length` bytes.
+    ///
+    /// The words FST is written into a temporary file.
+    pub fn new(
+        words_fst: &'a Set<std::borrow::Cow<'a, [u8]>>,
+        max_prefix_length: usize,
+    ) -> Result<Self> {
+        let mut prefix_fst_builders = Vec::new();
+        for _ in 0..max_prefix_length {
+            prefix_fst_builders.push(SetBuilder::memory());
+        }
+
+        Ok(Self {
+            stream: Some(words_fst.stream()),
+            word_fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?,
+            prefix_fst_builders,
+            max_prefix_length,
+            last_word: None,
+            current_prefix: vec![SmallString32::new(); max_prefix_length],
+            current_prefix_count: vec![0; max_prefix_length],
+            prefix_count_threshold: 100,
+        })
+    }
+
+    /// Registers the addition or the deletion of the word `right`.
+    ///
+    /// Words must be registered in lexicographic order: every word of the
+    /// previous FST that is smaller than `right` is inserted first.
+    pub fn register_word(&mut self, deladd: DelAdd, right: &[u8]) -> Result<()> {
+        if let Some(left) = self.last_word.take() {
+            let (left_inserted, right_inserted) =
+                self.compare_and_insert(deladd, left.as_slice(), right)?;
+
+            // left was not inserted, so we keep it for the next iteration
+            if !left_inserted {
+                self.last_word = Some(left);
+            }
+
+            // right was inserted, so we can stop
+            if right_inserted {
+                return Ok(());
+            }
+        }
+
+        if let Some(mut stream) = self.stream.take() {
+            while let Some(left) = stream.next() {
+                let (left_inserted, right_inserted) =
+                    self.compare_and_insert(deladd, left, right)?;
+
+                // left was not inserted, so we keep it for the next iteration
+                if !left_inserted {
+                    self.last_word = Some(left.to_vec());
+                }
+
+                // right was inserted, so we can stop
+                if right_inserted {
+                    self.stream = Some(stream);
+                    return Ok(());
+                }
+            }
+            // The stream is exhausted: it is dropped here and `self.stream` stays `None`.
+        }
+
+        // Every word of the previous FST is smaller than `right` (or the FST was
+        // empty), so `right` was never compared as equal or smaller: an added
+        // word must still be inserted, a deleted one has nothing to remove.
+        if deladd == DelAdd::Addition {
+            self.insert_word(right)?;
+        }
+
+        Ok(())
+    }
+
+    /// Compares `left`, a word of the previous FST, with `right`, the registered
+    /// word, and inserts whatever must come next in the new FST.
+    ///
+    /// Returns `(left_inserted, right_inserted)`, where "inserted" means the word
+    /// has been dealt with (a deleted word is dealt with by not inserting it).
+    pub fn compare_and_insert(
+        &mut self,
+        deladd: DelAdd,
+        left: &[u8],
+        right: &[u8],
+    ) -> Result<(bool, bool)> {
+        let inserted = match left.cmp(right) {
+            std::cmp::Ordering::Less => {
+                // We need to insert the last word from the current fst
+                self.insert_word(left)?;
+                (true, false)
+            }
+            std::cmp::Ordering::Equal => {
+                // Addition: We insert the word
+                // Deletion: We delete the word by not inserting it
+                if deladd == DelAdd::Addition {
+                    self.insert_word(right)?;
+                }
+                (true, true)
+            }
+            std::cmp::Ordering::Greater => {
+                // Addition: We insert the word and keep the last word
+                // Deletion: We keep the current word until the left word to delete is greater or equal
+                if deladd == DelAdd::Addition {
+                    self.insert_word(right)?;
+                }
+                (false, true)
+            }
+        };
+
+        Ok(inserted)
+    }
+
+    /// Inserts the word into the words FST and updates the prefix counters.
+    fn insert_word(&mut self, bytes: &[u8]) -> Result<()> {
+        self.word_fst_builder.insert(bytes)?;
+
+        for n in 0..self.max_prefix_length {
+            let current_prefix = &mut self.current_prefix[n];
+            let current_prefix_count = &mut self.current_prefix_count[n];
+            let builder = &mut self.prefix_fst_builders[n];
+
+            // We try to get the first n bytes out of this string but we only want
+            // to split at valid characters bounds. If we try to split in the middle of
+            // a character we ignore this word and go to the next one.
+            let word = std::str::from_utf8(bytes)?;
+            let prefix = match word.get(..=n) {
+                Some(prefix) => prefix,
+                None => continue,
+            };
+
+            // This is the first iteration of the loop,
+            // or the current word doesn't starts with the current prefix.
+            if *current_prefix_count == 0 || prefix != current_prefix.as_str() {
+                *current_prefix = SmallString32::from(prefix);
+                *current_prefix_count = 0;
+            }
+
+            *current_prefix_count += 1;
+
+            // There is enough words corresponding to this prefix to add it to the cache.
+            // TODO: (LEGACY) Replace this by `==` to avoid inserting several times the same prefix?
+            if *current_prefix_count >= self.prefix_count_threshold {
+                builder.insert(prefix)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Inserts every word of the previous FST that has not been inserted yet.
+    fn drain_stream(&mut self) -> Result<()> {
+        // The pending word was pulled out of the stream before the remaining
+        // ones, it must therefore be inserted first or it would be lost.
+        if let Some(last_word) = self.last_word.take() {
+            self.insert_word(last_word.as_slice())?;
+        }
+
+        if let Some(mut stream) = self.stream.take() {
+            while let Some(current) = stream.next() {
+                self.insert_word(current)?;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Finishes both FSTs and returns them memory-mapped, as
+    /// `(words_fst, prefix_fst)`.
+    pub fn build(mut self) -> Result<(Mmap, Mmap)> {
+        self.drain_stream()?;
+
+        // `BufWriter::into_inner` flushes the buffer: propagate the I/O error instead of panicking.
+        let words_fst_file =
+            self.word_fst_builder.into_inner()?.into_inner().map_err(|e| e.into_error())?;
+        let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? };
+
+        // We merge all of the previously computed prefixes into on final set.
+        let mut prefix_fsts = Vec::new();
+        for builder in self.prefix_fst_builders {
+            prefix_fsts.push(builder.into_set());
+        }
+        let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
+        let mut builder = SetBuilder::new(BufWriter::new(tempfile()?))?;
+        builder.extend_stream(op.r#union())?;
+        let prefix_fst_file = builder.into_inner()?.into_inner().map_err(|e| e.into_error())?;
+        let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? };
+
+        Ok((words_fst_mmap, prefix_fst_mmap))
+    }
+}
--- a/milli/src/update/word_prefix_docids.rs
+++ b/milli/src/update/word_prefix_docids.rs
@ -6,9 +6,8 @@ use heed::Database;

 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_deladd_cbo_roaring_bitmaps,
-    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
-    write_sorter_into_database, CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeDeladdCboRoaringBitmaps,
 };
 use crate::{CboRoaringBitmapCodec, Result};

@ -47,7 +46,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
    )]
    pub fn execute(
        self,
-        new_word_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
+        new_word_docids: grenad::Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>,
        new_prefix_fst_words: &[String],
        common_prefix_fst_words: &[&[String]],
        del_prefix_fst_words: &HashSet<Vec<u8>>,
@ -56,7 +55,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
        // and write into it at the same time, therefore we write into another file.
        let mut prefix_docids_sorter = create_sorter(
            grenad::SortAlgorithm::Unstable,
-            merge_deladd_cbo_roaring_bitmaps,
+            MergeDeladdCboRoaringBitmaps,
            self.chunk_compression_type,
            self.chunk_compression_level,
            self.max_nb_chunks,
@ -139,7 +138,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {

 fn write_prefixes_in_sorter(
    prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
-    sorter: &mut grenad::Sorter<MergeFn>,
+    sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
 ) -> Result<()> {
    for (key, data_slices) in prefixes.drain() {
        for data in data_slices {
--- a/milli/src/update/words_prefix_integer_docids.rs
+++ b/milli/src/update/words_prefix_integer_docids.rs
@ -11,9 +11,8 @@ use crate::heed_codec::StrBEU16Codec;
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_deladd_cbo_roaring_bitmaps,
-    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
-    write_sorter_into_database, CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeDeladdCboRoaringBitmaps,
 };
 use crate::{CboRoaringBitmapCodec, Result};

@ -52,7 +51,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
    )]
    pub fn execute(
        self,
-        new_word_integer_docids: grenad::Merger<CursorClonableMmap, MergeFn>,
+        new_word_integer_docids: grenad::Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>,
        new_prefix_fst_words: &[String],
        common_prefix_fst_words: &[&[String]],
        del_prefix_fst_words: &HashSet<Vec<u8>>,
@ -61,7 +60,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {

        let mut prefix_integer_docids_sorter = create_sorter(
            grenad::SortAlgorithm::Unstable,
-            merge_deladd_cbo_roaring_bitmaps,
+            MergeDeladdCboRoaringBitmaps,
            self.chunk_compression_type,
            self.chunk_compression_level,
            self.max_nb_chunks,
@ -173,7 +172,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {

 fn write_prefixes_in_sorter(
    prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
-    sorter: &mut grenad::Sorter<MergeFn>,
+    sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
 ) -> Result<()> {
    // TODO: Merge before insertion.
    for (key, data_slices) in prefixes.drain() {
--- a/milli/src/vector/parsed_vectors.rs
+++ b/milli/src/vector/parsed_vectors.rs
@ -109,14 +109,13 @@ impl ParsedVectorsDiff {
    pub fn new(
        docid: DocumentId,
        embedders_configs: &[IndexEmbeddingConfig],
-        documents_diff: KvReader<'_, FieldId>,
+        documents_diff: &KvReader<FieldId>,
        old_vectors_fid: Option<FieldId>,
        new_vectors_fid: Option<FieldId>,
    ) -> Result<Self, Error> {
        let mut old = match old_vectors_fid
            .and_then(|vectors_fid| documents_diff.get(vectors_fid))
-            .map(KvReaderDelAdd::new)
-            .map(|obkv| to_vector_map(obkv, DelAdd::Deletion))
+            .map(|bytes| to_vector_map(bytes.into(), DelAdd::Deletion))
            .transpose()
        {
            Ok(del) => del,
@ -143,8 +142,7 @@ impl ParsedVectorsDiff {
            let Some(bytes) = documents_diff.get(new_vectors_fid) else {
                break 'new VectorsState::NoVectorsFieldInDocument;
            };
-            let obkv = KvReaderDelAdd::new(bytes);
-            match to_vector_map(obkv, DelAdd::Addition)? {
+            match to_vector_map(bytes.into(), DelAdd::Addition)? {
                Some(new) => VectorsState::Vectors(new),
                None => VectorsState::NoVectorsFieldInDocument,
            }
@ -239,7 +237,7 @@ impl Error {
 }

 fn to_vector_map(
-    obkv: KvReaderDelAdd<'_>,
+    obkv: &KvReaderDelAdd,
    side: DelAdd,
 ) -> Result<Option<BTreeMap<String, Vectors>>, Error> {
    Ok(if let Some(value) = obkv.get(side) {
--- a/workloads/movies.json
+++ b/workloads/movies.json
@ -1,6 +1,6 @@
 {
  "name": "movies.json",
-  "run_count": 10,
+  "run_count": 1,
  "extra_cli_args": [],
  "assets": {
    "movies.json": {
Author	SHA1	Message	Date
Clément Renault	bdd363dd94	Add spans	2024-09-26 17:20:32 +02:00
Clément Renault	d6b3aae8a6	WIP add more logs	2024-09-26 16:37:38 +02:00
Clément Renault	ac2d54b27c	Make the merger multithreaded	2024-09-26 11:09:06 +02:00
Clément Renault	7d61697f19	Fix another iteration bug on hashmap entries	2024-09-25 22:42:41 +02:00
Clément Renault	97d2860998	Fix iterating on hashmap entries	2024-09-25 22:15:15 +02:00
Clément Renault	15bf556291	Write the inverted indexes in memory and never on disk	2024-09-25 18:13:19 +02:00
Clément Renault	3d244451df	Reduce the lru key size from 8 to 12 bytes	2024-09-25 16:14:13 +02:00
Clément Renault	5f53935c8a	Fix a bug in the Lru	2024-09-25 16:09:34 +02:00
Clément Renault	29a7623c3f	Fxi some logs	2024-09-25 15:57:50 +02:00
Clément Renault	e97041f7d0	Replace the Lru free list by a simple increment	2024-09-25 15:55:52 +02:00
Clément Renault	52d7f3ed1c	Reduce the lru key size from 20 to 8 bytes	2024-09-25 15:37:13 +02:00
Clément Renault	86d5e6d9ff	Use the new Lru	2024-09-25 14:54:56 +02:00
Clément Renault	759b9b1546	Introduce a new custom Lru	2024-09-25 14:49:12 +02:00
ManyTheFish	3f7a500f3b	Build prefix fst	2024-09-25 14:36:06 +02:00
ManyTheFish	974272f2e9	Merge branch 'main' into indexer-edition-2024	2024-09-25 07:41:16 +02:00
Clément Renault	7ad037841f	Move the tracing info to eprintln	2024-09-24 18:21:58 +02:00
Clément Renault	e0c7067355	Expose an IndexedParallelIterator to the index function	2024-09-24 17:24:59 +02:00
ManyTheFish	6e87332410	Change the way the FST is built	2024-09-24 16:28:31 +02:00
Clément Renault	2d1caf27df	Use eprintln to log	2024-09-24 15:59:50 +02:00
Clément Renault	92678383d6	Update charabia	2024-09-24 15:37:56 +02:00
Clément Renault	7f148c127c	Measure the SmallVec efficacity	2024-09-24 15:32:15 +02:00
Clément Renault	4ce5d3d66d	Do not check before pushing in bitmaps	2024-09-24 09:43:16 +02:00
Clément Renault	ff931edb55	Update roaring to inline max calls	2024-09-23 16:53:42 +02:00
Clément Renault	42b093687d	Introduce the new PushOptimizedBitmap	2024-09-23 16:38:21 +02:00
Clément Renault	835c5f98f9	Remove the debug symbols	2024-09-23 15:49:24 +02:00
Clément Renault	f00664247d	Add more stats about the channel message sent	2024-09-23 15:13:52 +02:00
Clément Renault	3c63d4a1e5	Fix charabia Zho	2024-09-23 14:50:17 +02:00
Clément Renault	4551abf6d4	Update roaring to the latest version	2024-09-23 14:35:33 +02:00
Clément Renault	193d7f5d34	Add the mutualized charabia normalization	2024-09-23 14:24:25 +02:00
Clément Renault	013acb3d93	Measure merger writer channel contention	2024-09-23 11:07:59 +02:00
Clément Renault	f4ab1f168e	Prefer using Rc<str> than String when cloning a lot	2024-09-16 15:41:29 +02:00
ManyTheFish	1a0e962299	Replace hashmap by vectors in wpp	2024-09-16 15:01:20 +02:00
ManyTheFish	f13e076b8a	Use hashmap instead of Btree in wpp extractor	2024-09-16 14:40:40 +02:00
ManyTheFish	7ba49b849e	Extract and write facet databases	2024-09-16 09:35:16 +02:00
Clément Renault	f7652186e1	WIP geo fields	2024-09-12 18:01:02 +02:00
Clément Renault	b2f4e67c9a	Do not store useless updates	2024-09-12 15:38:31 +02:00
Clément Renault	ff5d3b59f5	Move the document id extraction to the primary key code	2024-09-12 12:01:42 +02:00
ManyTheFish	aa69308e45	Use a bufWriter to build word FSTs	2024-09-12 11:48:00 +02:00
ManyTheFish	eb9a20ff0b	Fix fid_word_docids extraction	2024-09-12 11:08:18 +02:00
Clément Renault	0d868f36d7	Make sure we always use a BufWriter to write the update files	2024-09-11 18:38:04 +02:00
Clément Renault	e7d9db078f	Use the right key name when convertir from CSV to NDJSON	2024-09-11 18:27:00 +02:00
Clément Renault	3e9198ebaa	Support guessing primary key again	2024-09-11 17:25:40 +02:00
Clément Renault	2a0ad0982f	Fix the document counter	2024-09-11 15:59:36 +02:00
ManyTheFish	2b317c681b	Build mergers in parallel	2024-09-11 11:49:26 +02:00
ManyTheFish	39b5990f64	Mutualize tokenization	2024-09-11 10:22:38 +02:00
Clément Renault	3848adf5a2	Improve error management and simplify JSON read	2024-09-11 10:10:51 +02:00
Clément Renault	b4de06259e	Better CSV support	2024-09-11 10:02:00 +02:00
Clément Renault	8287c2644f	Support CSV again	2024-09-10 21:10:28 +01:00
Clément Renault	c1c44a0b81	Impl serialize on TopLevelMap	2024-09-10 19:32:03 +01:00
Clément Renault	04596f3616	Move the TopLevelMap into a dedicated module	2024-09-10 18:01:17 +01:00
Clément Renault	24cb5839ad	Move the document changes sorting logic to a new trait	2024-09-10 17:37:52 +01:00
Clément Renault	8d97b7b28c	Support JSON payloads again (not perfectly though)	2024-09-10 17:09:49 +01:00
ManyTheFish	f69688e8f7	Fix several warnings in extractors and remove unreachable macros	2024-09-09 14:52:50 +02:00
Clément Renault	8fd0afaaaa	Make sure we iterate over the payload documents in order	2024-09-06 08:09:08 +02:00
Clément Renault	72c6a21a30	Use raw JSON to read the payloads	2024-09-05 20:08:23 +02:00
Clément Renault	8412be4a7d	Cleanup CowStr and TopLevelMap struct	2024-09-05 18:32:55 +02:00
Louis Dureuil	10f09c531f	add some commented code to read from json with raw values	2024-09-05 18:22:16 +02:00
ManyTheFish	8fd99b111b	Add tracing timers logs	2024-09-05 18:00:22 +02:00
Clément Renault	f6b3d1f9a5	Increase some channel sizes	2024-09-05 15:12:07 +02:00
Clément Renault	73ce67862d	Use the word pair proximity and fid word count docids extractors Co-authored-by: ManyTheFish <many@meilisearch.com>	2024-09-05 10:56:22 +02:00
Clément Renault	0fc02f7351	Move the facet extraction to dedicated modules	2024-09-05 10:32:27 +02:00
ManyTheFish	34f11e3380	Implement word count and word pair proximity extractors	2024-09-05 10:30:39 +02:00
Clément Renault	27308eaab1	Import the facet extractors	2024-09-04 17:58:15 +02:00
Clément Renault	b33ec9ba3f	Introduce the FieldIdFacetIsNullDocidsExtractor	2024-09-04 17:50:08 +02:00
Clément Renault	9c0a1cd9fd	Introduce the FieldIdFacetExistsDocidsExtractor	2024-09-04 17:48:49 +02:00
Clément Renault	0b061f1e70	Introduce the FieldIdFacetIsEmptyDocidsExtractor	2024-09-04 17:40:24 +02:00
Clément Renault	19d937ab21	Introduce the facet extractors	2024-09-04 17:03:54 +02:00
Clément Renault	1d59c19cd2	Send the WordsFst by using an Mmap	2024-09-04 14:30:09 +02:00
Clément Renault	98e48371c3	Factorize some stuff	2024-09-04 12:17:13 +02:00
Clément Renault	6d74fb0229	Introduce the WordFidWordDocids database	2024-09-04 11:40:55 +02:00
ManyTheFish	1eb75a1040	remove milli/src/update/new/extract/tokenize_document.rs	2024-09-04 11:40:26 +02:00
Clément Renault	3b82d8b5b9	Fix the cache to serialize entries correctly	2024-09-04 10:55:36 +02:00
ManyTheFish	781a186f75	remove milli/src/update/new/extract/extract_word_docids.rs	2024-09-04 10:28:31 +02:00
ManyTheFish	6a399556b5	Implement more searchable extractor	2024-09-04 10:20:18 +02:00
Clément Renault	27b4cab857	Extract and write the documents and words fst in the database	2024-09-04 09:59:19 +02:00
Clément Renault	52d32b4ee9	Move the channel sender in the closure to stop the merger thread	2024-09-03 16:08:33 +02:00
ManyTheFish	da61408e52	Remove unimplemented from document changes	2024-09-03 15:14:16 +02:00
ManyTheFish	fe69385bd7	Fix tokenizer test	2024-09-03 14:24:37 +02:00
Clément Renault	c1557734dc	Use the GlobalFieldsIdsMap everywhere and write it to disk Co-authored-by: Dureuill <louis@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com>	2024-09-03 12:01:01 +02:00
ManyTheFish	c50d3edc4a	Integrate first searchable exctrator	2024-09-03 11:02:39 +02:00
Clément Renault	5369bf4a62	Change some lifetimes	2024-09-02 19:51:22 +02:00
Clément Renault	bcb1aa3d22	Find a temporary solution to par into iter on an HashMap Spoiler: Do not use an HashMap but drain it into a Vec	2024-09-02 19:39:48 +02:00
Clément Renault	9b7858fb90	Expose the new indexer	2024-09-02 15:21:59 +02:00
Clément Renault	ab01679a8f	Remove the useless option from the document changes	2024-09-02 15:21:00 +02:00
Clément Renault	521775f788	I push for Many	2024-09-02 15:10:21 +02:00
Clément Renault	72e7b7846e	Renaming the indexers	2024-09-02 14:42:27 +02:00
Clément Renault	6526ce1208	Fix the merging of documents	2024-09-02 14:41:20 +02:00
Clément Renault	e639ec79d1	Move the indexers into their own modules	2024-09-02 10:42:19 +02:00
Clément Renault	bb885a5810	Fix the merge for roaring bitmap	2024-09-01 23:20:19 +02:00
Clément Renault	b625d31c7d	Introduce the PartialDumpIndexer indexer that generates document ids in parallel	2024-08-30 15:07:21 +02:00
Clément Renault	6487a67f2b	Introduce the ConcurrentAvailableIds struct and rename the other to AvailableIds	2024-08-30 15:06:50 +02:00
Clément Renault	271ce91b3b	Add the rayon Threadpool to the index function parameter	2024-08-30 14:34:24 +02:00
Clément Renault	54f2eb4507	Remove duplication of grenad merger	2024-08-30 14:34:05 +02:00
Clément Renault	794ebcd582	Replace grenad with the new grenad various-improvement branch	2024-08-30 11:53:59 +02:00
Clément Renault	b7c77c7a39	Use the latest version of the obkv crate	2024-08-30 11:53:59 +02:00
Clément Renault	0c57cf7565	Replace obkv with the temporary new version of it	2024-08-30 11:53:58 +02:00
Clément Renault	27df9e6c73	Introduce the indexer::index function that runs the indexation	2024-08-30 11:53:58 +02:00
Clément Renault	45c060831e	Introduce typed channels and the merger loop	2024-08-30 11:53:58 +02:00
Clément Renault	874c1ac538	First channels types	2024-08-30 11:53:58 +02:00
Clément Renault	e6ffa4d454	Implement the document merge function for the replace method	2024-08-30 11:53:58 +02:00
Clément Renault	637a9c8bdd	Implement the document merge function for the update method	2024-08-30 11:53:58 +02:00
Louis Dureuil	c683fa98e6	WIP Co-authored-by: Kerollmops <clement@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com>	2024-08-30 11:53:57 +02:00