mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-10-26 21:46:27 +00:00
Merge branch 'indexer-edition-2024' into indexer-edition-2024-doc-chunks
This commit is contained in:
0
crates/milli/src/update/available_documents_ids.rs
Normal file
0
crates/milli/src/update/available_documents_ids.rs
Normal file
65
crates/milli/src/update/available_ids.rs
Normal file
65
crates/milli/src/update/available_ids.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
use std::iter::{Chain, FromIterator};
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use roaring::bitmap::{IntoIter, RoaringBitmap};
|
||||
|
||||
pub struct AvailableIds {
|
||||
iter: Chain<IntoIter, RangeInclusive<u32>>,
|
||||
}
|
||||
|
||||
impl AvailableIds {
|
||||
pub fn new(docids: &RoaringBitmap) -> AvailableIds {
|
||||
match docids.max() {
|
||||
Some(last_id) => {
|
||||
let mut available = RoaringBitmap::from_iter(0..last_id);
|
||||
available -= docids;
|
||||
|
||||
let iter = match last_id.checked_add(1) {
|
||||
Some(id) => id..=u32::MAX,
|
||||
#[allow(clippy::reversed_empty_ranges)]
|
||||
None => 1..=0, // empty range iterator
|
||||
};
|
||||
|
||||
AvailableIds { iter: available.into_iter().chain(iter) }
|
||||
}
|
||||
None => {
|
||||
let empty = RoaringBitmap::new().into_iter();
|
||||
AvailableIds { iter: empty.chain(0..=u32::MAX) }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for AvailableIds {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.iter.next()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn empty() {
|
||||
let base = RoaringBitmap::new();
|
||||
let left = AvailableIds::new(&base);
|
||||
let right = 0..=u32::MAX;
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scattered() {
|
||||
let mut base = RoaringBitmap::new();
|
||||
base.insert(0);
|
||||
base.insert(10);
|
||||
base.insert(100);
|
||||
base.insert(405);
|
||||
|
||||
let left = AvailableIds::new(&base);
|
||||
let right = (0..=u32::MAX).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
}
|
||||
149
crates/milli/src/update/clear_documents.rs
Normal file
149
crates/milli/src/update/clear_documents.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
use heed::RwTxn;
|
||||
use roaring::RoaringBitmap;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::{FieldDistribution, Index, Result};
|
||||
|
||||
pub struct ClearDocuments<'t, 'i> {
|
||||
wtxn: &'t mut RwTxn<'i>,
|
||||
index: &'i Index,
|
||||
}
|
||||
|
||||
impl<'t, 'i> ClearDocuments<'t, 'i> {
|
||||
pub fn new(wtxn: &'t mut RwTxn<'i>, index: &'i Index) -> ClearDocuments<'t, 'i> {
|
||||
ClearDocuments { wtxn, index }
|
||||
}
|
||||
|
||||
#[tracing::instrument(
|
||||
level = "trace",
|
||||
skip(self),
|
||||
target = "indexing::documents",
|
||||
name = "clear_documents"
|
||||
)]
|
||||
pub fn execute(self) -> Result<u64> {
|
||||
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
|
||||
let Index {
|
||||
env: _env,
|
||||
main: _main,
|
||||
external_documents_ids,
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_prefix_docids,
|
||||
exact_word_prefix_docids,
|
||||
word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
word_fid_docids,
|
||||
field_id_word_count_docids,
|
||||
word_prefix_position_docids,
|
||||
word_prefix_fid_docids,
|
||||
facet_id_f64_docids,
|
||||
facet_id_string_docids,
|
||||
facet_id_normalized_string_strings,
|
||||
facet_id_string_fst,
|
||||
facet_id_exists_docids,
|
||||
facet_id_is_null_docids,
|
||||
facet_id_is_empty_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
vector_arroy,
|
||||
embedder_category_id: _,
|
||||
documents,
|
||||
} = self.index;
|
||||
|
||||
let empty_roaring = RoaringBitmap::default();
|
||||
|
||||
// We retrieve the number of documents ids that we are deleting.
|
||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
|
||||
// We clean some of the main engine datastructures.
|
||||
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
|
||||
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
|
||||
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
|
||||
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
||||
self.index.delete_geo_rtree(self.wtxn)?;
|
||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||
|
||||
// Remove all user-provided bits from the configs
|
||||
let mut configs = self.index.embedding_configs(self.wtxn)?;
|
||||
for config in configs.iter_mut() {
|
||||
config.user_provided.clear();
|
||||
}
|
||||
self.index.put_embedding_configs(self.wtxn, configs)?;
|
||||
|
||||
// Clear the other databases.
|
||||
external_documents_ids.clear(self.wtxn)?;
|
||||
word_docids.clear(self.wtxn)?;
|
||||
exact_word_docids.clear(self.wtxn)?;
|
||||
word_prefix_docids.clear(self.wtxn)?;
|
||||
exact_word_prefix_docids.clear(self.wtxn)?;
|
||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_position_docids.clear(self.wtxn)?;
|
||||
word_fid_docids.clear(self.wtxn)?;
|
||||
field_id_word_count_docids.clear(self.wtxn)?;
|
||||
word_prefix_position_docids.clear(self.wtxn)?;
|
||||
word_prefix_fid_docids.clear(self.wtxn)?;
|
||||
facet_id_f64_docids.clear(self.wtxn)?;
|
||||
facet_id_normalized_string_strings.clear(self.wtxn)?;
|
||||
facet_id_string_fst.clear(self.wtxn)?;
|
||||
facet_id_exists_docids.clear(self.wtxn)?;
|
||||
facet_id_is_null_docids.clear(self.wtxn)?;
|
||||
facet_id_is_empty_docids.clear(self.wtxn)?;
|
||||
facet_id_string_docids.clear(self.wtxn)?;
|
||||
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
||||
field_id_docid_facet_strings.clear(self.wtxn)?;
|
||||
// vector
|
||||
vector_arroy.clear(self.wtxn)?;
|
||||
|
||||
documents.clear(self.wtxn)?;
|
||||
|
||||
Ok(number_of_documents)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::index::tests::TempIndex;
|
||||
|
||||
#[test]
|
||||
fn clear_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.add_documents_using_wtxn(&mut wtxn, documents!([
|
||||
{ "id": 0, "name": "kevin", "age": 20 },
|
||||
{ "id": 1, "name": "kevina" },
|
||||
{ "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } }
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
// Clear all documents from the database.
|
||||
let builder = ClearDocuments::new(&mut wtxn, &index);
|
||||
assert_eq!(builder.execute().unwrap(), 3);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
// the value is 7 because there is `[id, name, age, country, _geo, _geo.lng, _geo.lat]`
|
||||
assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 7);
|
||||
|
||||
assert!(index.words_fst(&rtxn).unwrap().is_empty());
|
||||
assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
|
||||
assert!(index.external_documents_ids().is_empty(&rtxn).unwrap());
|
||||
assert!(index.documents_ids(&rtxn).unwrap().is_empty());
|
||||
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
|
||||
assert!(index.geo_rtree(&rtxn).unwrap().is_none());
|
||||
assert!(index.geo_faceted_documents_ids(&rtxn).unwrap().is_empty());
|
||||
|
||||
assert!(index.word_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
|
||||
assert!(index.field_id_docid_facet_strings.is_empty(&rtxn).unwrap());
|
||||
assert!(index.documents.is_empty(&rtxn).unwrap());
|
||||
}
|
||||
}
|
||||
59
crates/milli/src/update/concurrent_available_ids.rs
Normal file
59
crates/milli/src/update/concurrent_available_ids.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
/// A concurrent ID generate that will never return the same ID twice.
|
||||
#[derive(Debug)]
|
||||
pub struct ConcurrentAvailableIds {
|
||||
/// The current tree node ID we should use if there is no other IDs available.
|
||||
current: AtomicU32,
|
||||
/// The total number of tree node IDs used.
|
||||
used: AtomicU64,
|
||||
|
||||
/// A list of IDs to exhaust before picking IDs from `current`.
|
||||
available: RoaringBitmap,
|
||||
/// The current Nth ID to select in the bitmap.
|
||||
select_in_bitmap: AtomicU32,
|
||||
/// Tells if you should look in the roaring bitmap or if all the IDs are already exhausted.
|
||||
look_into_bitmap: AtomicBool,
|
||||
}
|
||||
|
||||
impl ConcurrentAvailableIds {
|
||||
/// Creates an ID generator returning unique IDs, avoiding the specified used IDs.
|
||||
pub fn new(used: RoaringBitmap) -> ConcurrentAvailableIds {
|
||||
let last_id = used.max().map_or(0, |id| id + 1);
|
||||
let used_ids = used.len();
|
||||
let available = RoaringBitmap::from_sorted_iter(0..last_id).unwrap() - used;
|
||||
|
||||
ConcurrentAvailableIds {
|
||||
current: AtomicU32::new(last_id),
|
||||
used: AtomicU64::new(used_ids),
|
||||
select_in_bitmap: AtomicU32::new(0),
|
||||
look_into_bitmap: AtomicBool::new(!available.is_empty()),
|
||||
available,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a new unique ID and increase the count of IDs used.
|
||||
pub fn next(&self) -> Option<u32> {
|
||||
if self.used.fetch_add(1, Ordering::Relaxed) > u32::MAX as u64 {
|
||||
None
|
||||
} else if self.look_into_bitmap.load(Ordering::Relaxed) {
|
||||
let current = self.select_in_bitmap.fetch_add(1, Ordering::Relaxed);
|
||||
match self.available.select(current) {
|
||||
Some(id) => Some(id),
|
||||
None => {
|
||||
self.look_into_bitmap.store(false, Ordering::Relaxed);
|
||||
Some(self.current.fetch_add(1, Ordering::Relaxed))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Some(self.current.fetch_add(1, Ordering::Relaxed))
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of used ids in total.
|
||||
pub fn used(&self) -> u64 {
|
||||
self.used.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
140
crates/milli/src/update/del_add.rs
Normal file
140
crates/milli/src/update/del_add.rs
Normal file
@@ -0,0 +1,140 @@
|
||||
use obkv::Key;
|
||||
|
||||
pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
|
||||
pub type KvReaderDelAdd = obkv::KvReader<DelAdd>;
|
||||
|
||||
/// DelAdd defines the new value to add in the database and old value to delete from the database.
|
||||
///
|
||||
/// Its used in an OBKV to be serialized in grenad files.
|
||||
#[repr(u8)]
|
||||
#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)]
|
||||
pub enum DelAdd {
|
||||
Deletion = 0,
|
||||
Addition = 1,
|
||||
}
|
||||
|
||||
impl Key for DelAdd {
|
||||
const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
|
||||
type BYTES = [u8; Self::BYTES_SIZE];
|
||||
|
||||
fn to_be_bytes(&self) -> Self::BYTES {
|
||||
u8::to_be_bytes(*self as u8)
|
||||
}
|
||||
|
||||
fn from_be_bytes(array: Self::BYTES) -> Self {
|
||||
match u8::from_be_bytes(array) {
|
||||
0 => Self::Deletion,
|
||||
1 => Self::Addition,
|
||||
otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
|
||||
///
|
||||
/// Deletion: put all the values under DelAdd::Deletion
|
||||
/// Addition: put all the values under DelAdd::Addition,
|
||||
/// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition,
|
||||
pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
|
||||
reader: &obkv::KvReader<K>,
|
||||
operation: DelAddOperation,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
into_del_add_obkv_conditional_operation(reader, buffer, |_| operation)
|
||||
}
|
||||
|
||||
/// Akin to the [into_del_add_obkv] function but lets you
|
||||
/// conditionally define the `DelAdd` variant based on the obkv key.
|
||||
pub fn into_del_add_obkv_conditional_operation<K, F>(
|
||||
reader: &obkv::KvReader<K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
operation: F,
|
||||
) -> std::io::Result<()>
|
||||
where
|
||||
K: obkv::Key + PartialOrd,
|
||||
F: Fn(K) -> DelAddOperation,
|
||||
{
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
for (key, value) in reader.iter() {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let operation = operation(key);
|
||||
if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) {
|
||||
value_writer.insert(DelAdd::Deletion, value)?;
|
||||
}
|
||||
if matches!(operation, DelAddOperation::Addition | DelAddOperation::DeletionAndAddition) {
|
||||
value_writer.insert(DelAdd::Addition, value)?;
|
||||
}
|
||||
value_writer.finish()?;
|
||||
writer.insert(key, &value_buffer)?;
|
||||
}
|
||||
|
||||
writer.finish()
|
||||
}
|
||||
|
||||
/// Enum controlling the side of the DelAdd obkv in which the provided value will be written.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum DelAddOperation {
|
||||
Deletion,
|
||||
Addition,
|
||||
DeletionAndAddition,
|
||||
}
|
||||
|
||||
/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
|
||||
///
|
||||
/// putting each deletion obkv's keys under an DelAdd::Deletion
|
||||
/// and putting each addition obkv's keys under an DelAdd::Addition
|
||||
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
|
||||
deletion: &obkv::KvReader<K>,
|
||||
addition: &obkv::KvReader<K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
value_buffer.clear();
|
||||
match eob {
|
||||
Left((k, v)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, v).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
Right((k, v)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Addition, v).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
Both((k, deletion), (_, addition)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish()
|
||||
}
|
||||
|
||||
pub fn is_noop_del_add_obkv(del_add: &KvReaderDelAdd) -> bool {
|
||||
del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
|
||||
}
|
||||
|
||||
/// A function that extracts and returns the Add side of a DelAdd obkv.
|
||||
/// This is useful when there are no previous value in the database and
|
||||
/// therefore we don't need to do a diff with what's already there.
|
||||
///
|
||||
/// If there is no Add side we currently write an empty buffer
|
||||
/// which is a valid CboRoaringBitmap.
|
||||
#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
|
||||
pub fn deladd_serialize_add_side<'a>(
|
||||
obkv: &'a [u8],
|
||||
_buffer: &mut Vec<u8>,
|
||||
) -> crate::Result<&'a [u8]> {
|
||||
Ok(KvReaderDelAdd::from_slice(obkv).get(DelAdd::Addition).unwrap_or_default())
|
||||
}
|
||||
533
crates/milli/src/update/facet/bulk.rs
Normal file
533
crates/milli/src/update/facet/bulk.rs
Normal file
@@ -0,0 +1,533 @@
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::{CompressionType, Merger};
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
|
||||
|
||||
/// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases
|
||||
/// by rebuilding the database "from scratch".
|
||||
///
|
||||
/// First, the new elements are inserted into the level 0 of the database. Then, the
|
||||
/// higher levels are cleared and recomputed from the content of level 0.
|
||||
pub struct FacetsUpdateBulk<'i> {
|
||||
index: &'i Index,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
facet_type: FacetType,
|
||||
field_ids: Vec<FieldId>,
|
||||
// None if level 0 does not need to be updated
|
||||
delta_data: Option<Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateBulk<'i> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
FacetsUpdateBulk {
|
||||
index,
|
||||
field_ids,
|
||||
group_size,
|
||||
min_level_size,
|
||||
facet_type,
|
||||
delta_data: Some(delta_data),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_not_updating_level_0(
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
FacetsUpdateBulk {
|
||||
index,
|
||||
field_ids,
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
delta_data: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::bulk")]
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn<'_>) -> Result<()> {
|
||||
let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
|
||||
|
||||
let db = match facet_type {
|
||||
FacetType::String => {
|
||||
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
FacetType::Number => {
|
||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
};
|
||||
|
||||
let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
|
||||
|
||||
inner.update(wtxn, &field_ids)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
||||
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
||||
pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
pub delta_data: Option<Merger<R, MergeDeladdCboRoaringBitmaps>>,
|
||||
pub group_size: u8,
|
||||
pub min_level_size: u8,
|
||||
}
|
||||
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
pub fn update(mut self, wtxn: &mut RwTxn<'_>, field_ids: &[u16]) -> Result<()> {
|
||||
self.update_level0(wtxn)?;
|
||||
for &field_id in field_ids.iter() {
|
||||
self.clear_levels(wtxn, field_id)?;
|
||||
}
|
||||
|
||||
for &field_id in field_ids.iter() {
|
||||
let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
|
||||
|
||||
for level_reader in level_readers {
|
||||
let mut cursor = level_reader.into_cursor()?;
|
||||
while let Some((k, v)) = cursor.move_on_next()? {
|
||||
self.db.remap_types::<Bytes, Bytes>().put(wtxn, k, v)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn clear_levels(&self, wtxn: &mut heed::RwTxn<'_>, field_id: FieldId) -> Result<()> {
|
||||
let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] };
|
||||
let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] };
|
||||
let range = left..=right;
|
||||
self.db.delete_range(wtxn, &range).map(drop)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_level0(&mut self, wtxn: &mut RwTxn<'_>) -> Result<()> {
|
||||
let delta_data = match self.delta_data.take() {
|
||||
Some(x) => x,
|
||||
None => return Ok(()),
|
||||
};
|
||||
if self.db.is_empty(wtxn)? {
|
||||
let mut buffer = Vec::new();
|
||||
let mut database = self.db.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
let value = KvReaderDelAdd::from_slice(value);
|
||||
|
||||
// DB is empty, it is safe to ignore Del operations
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.clear();
|
||||
// the group size for level 0
|
||||
buffer.push(1);
|
||||
// then we extend the buffer with the docids bitmap
|
||||
buffer.extend_from_slice(value);
|
||||
unsafe {
|
||||
database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, &buffer)?
|
||||
};
|
||||
}
|
||||
} else {
|
||||
let mut buffer = Vec::new();
|
||||
let database = self.db.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut iter = delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = KvReaderDelAdd::from_slice(value);
|
||||
|
||||
// the value is a CboRoaringBitmap, but I still need to prepend the
|
||||
// group size for level 0 (= 1) to it
|
||||
buffer.clear();
|
||||
buffer.push(1);
|
||||
// then we extend the buffer with the docids bitmap
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => {
|
||||
// prev_value is the group size for level 0, followed by the previous bitmap.
|
||||
let old_bitmap = &prev_value[1..];
|
||||
CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
|
||||
}
|
||||
None => {
|
||||
// it is safe to ignore the del in that case.
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
// won't put the key in DB as the value would be empty
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.extend_from_slice(value);
|
||||
}
|
||||
};
|
||||
let new_bitmap = &buffer[1..];
|
||||
// if the new bitmap is empty, let's remove it
|
||||
if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
|
||||
database.delete(wtxn, key)?;
|
||||
} else {
|
||||
database.put(wtxn, key, &buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn compute_levels_for_field_id(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
txn: &RoTxn<'_>,
|
||||
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
|
||||
|
||||
Ok(subwriters)
|
||||
}
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn read_level_0<'t>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn<'t>,
|
||||
field_id: u16,
|
||||
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
// we read the elements one by one and
|
||||
// 1. keep track of the left bound
|
||||
// 2. fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read
|
||||
let mut bitmaps = vec![];
|
||||
|
||||
let mut level_0_prefix = vec![];
|
||||
level_0_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
level_0_prefix.push(0);
|
||||
|
||||
let level_0_iter = self
|
||||
.db
|
||||
.remap_types::<Bytes, Bytes>()
|
||||
.prefix_iter(rtxn, level_0_prefix.as_slice())?
|
||||
.remap_types::<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>();
|
||||
|
||||
let mut left_bound: &[u8] = &[];
|
||||
let mut first_iteration_for_new_group = true;
|
||||
for el in level_0_iter {
|
||||
let (key, value) = el?;
|
||||
let bound = key.left_bound;
|
||||
let docids = value.bitmap;
|
||||
|
||||
if first_iteration_for_new_group {
|
||||
left_bound = bound;
|
||||
first_iteration_for_new_group = false;
|
||||
}
|
||||
bitmaps.push(docids);
|
||||
|
||||
if bitmaps.len() == self.group_size as usize {
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
first_iteration_for_new_group = true;
|
||||
bitmaps.clear();
|
||||
}
|
||||
}
|
||||
// don't forget to give the leftover bitmaps as well
|
||||
if !bitmaps.is_empty() {
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
bitmaps.clear();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compute the content of the database levels from its level 0 for the given field id.
|
||||
///
|
||||
/// ## Returns:
|
||||
/// A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1`
|
||||
/// that must be inserted into the database.
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn compute_higher_levels<'t>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn<'t>,
|
||||
field_id: u16,
|
||||
level: u8,
|
||||
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
||||
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||
if level == 0 {
|
||||
self.read_level_0(rtxn, field_id, handle_group)?;
|
||||
// Level 0 is already in the database
|
||||
return Ok(vec![]);
|
||||
}
|
||||
// level >= 1
|
||||
// we compute each element of this level based on the elements of the level below it
|
||||
// once we have computed `level_group_size` elements, we give the left bound
|
||||
// of those elements, and their bitmaps, to the level above
|
||||
|
||||
let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
|
||||
let mut cur_writer_len: usize = 0;
|
||||
|
||||
let mut group_sizes = vec![];
|
||||
let mut left_bounds = vec![];
|
||||
let mut bitmaps = vec![];
|
||||
|
||||
// compute the levels below
|
||||
// in the callback, we fill `cur_writer` with the correct elements for this level
|
||||
let mut sub_writers = self.compute_higher_levels(
|
||||
rtxn,
|
||||
field_id,
|
||||
level - 1,
|
||||
&mut |sub_bitmaps, left_bound| {
|
||||
let mut combined_bitmap = RoaringBitmap::default();
|
||||
for bitmap in sub_bitmaps {
|
||||
combined_bitmap |= bitmap;
|
||||
}
|
||||
// The conversion of sub_bitmaps.len() to a u8 will always be correct
|
||||
// since its length is bounded by max_group_size, which is a u8.
|
||||
group_sizes.push(sub_bitmaps.len() as u8);
|
||||
left_bounds.push(left_bound);
|
||||
|
||||
bitmaps.push(combined_bitmap);
|
||||
if bitmaps.len() != self.group_size as usize {
|
||||
return Ok(());
|
||||
}
|
||||
let left_bound = left_bounds.first().unwrap();
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
|
||||
for ((bitmap, left_bound), group_size) in
|
||||
bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..))
|
||||
{
|
||||
let key = FacetGroupKey { field_id, level, left_bound };
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key)
|
||||
.map_err(Error::Encoding)?;
|
||||
let value = FacetGroupValue { size: group_size, bitmap };
|
||||
let value =
|
||||
FacetGroupValueCodec::bytes_encode(&value).map_err(Error::Encoding)?;
|
||||
cur_writer.insert(key, value)?;
|
||||
cur_writer_len += 1;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
)?;
|
||||
// don't forget to insert the leftover elements into the writer as well
|
||||
|
||||
// but only do so if the current number of elements to be inserted into this
|
||||
// levelcould grow to the minimum level size
|
||||
|
||||
if !bitmaps.is_empty() && (cur_writer_len >= self.min_level_size as usize - 1) {
|
||||
// the length of bitmaps is between 0 and group_size
|
||||
assert!(bitmaps.len() < self.group_size as usize);
|
||||
assert!(cur_writer_len > 0);
|
||||
|
||||
let left_bound = left_bounds.first().unwrap();
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
|
||||
// Note: how many bitmaps are there here?
|
||||
for ((bitmap, left_bound), group_size) in
|
||||
bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..))
|
||||
{
|
||||
let key = FacetGroupKey { field_id, level, left_bound };
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key)
|
||||
.map_err(Error::Encoding)?;
|
||||
let value = FacetGroupValue { size: group_size, bitmap };
|
||||
let value = FacetGroupValueCodec::bytes_encode(&value).map_err(Error::Encoding)?;
|
||||
cur_writer.insert(key, value)?;
|
||||
cur_writer_len += 1;
|
||||
}
|
||||
}
|
||||
// if we inserted enough elements to reach the minimum level size, then we push the writer
|
||||
if cur_writer_len >= self.min_level_size as usize {
|
||||
sub_writers.push(writer_into_reader(cur_writer)?);
|
||||
} else {
|
||||
// otherwise, if there are still leftover elements, we give them to the level above
|
||||
// this is necessary in order to get the union of all docids
|
||||
if !bitmaps.is_empty() {
|
||||
handle_group(&bitmaps, left_bounds.first().unwrap())?;
|
||||
}
|
||||
}
|
||||
Ok(sub_writers)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::iter::once;
|
||||
|
||||
use big_s::S;
|
||||
use maplit::hashset;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::documents::documents_batch_reader_from_objects;
|
||||
use crate::heed_codec::facet::OrderedF64Codec;
|
||||
use crate::heed_codec::StrRefCodec;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::update::facet::test_helpers::{ordered_string, FacetIndex};
|
||||
use crate::{db_snap, milli_snap};
|
||||
|
||||
#[test]
|
||||
fn insert() {
|
||||
let test = |name: &str, group_size: u8, min_level_size: u8| {
|
||||
let index =
|
||||
FacetIndex::<OrderedF64Codec>::new(group_size, 0 /*NA*/, min_level_size);
|
||||
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..1_000u32 {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, i as f64), once(i).collect()));
|
||||
}
|
||||
for i in 0..100u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, i as f64), once(i).collect()));
|
||||
}
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
milli_snap!(format!("{index}"), name);
|
||||
};
|
||||
|
||||
test("default", 4, 5);
|
||||
test("small_group_small_min_level", 2, 2);
|
||||
test("small_group_large_min_level", 2, 128);
|
||||
test("large_group_small_min_level", 16, 2);
|
||||
test("odd_group_odd_min_level", 7, 3);
|
||||
}
|
||||
#[test]
|
||||
fn insert_delete_field_insert() {
|
||||
let test = |name: &str, group_size: u8, min_level_size: u8| {
|
||||
let index =
|
||||
FacetIndex::<OrderedF64Codec>::new(group_size, 0 /*NA*/, min_level_size);
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..100u32 {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, i as f64), once(i).collect()));
|
||||
}
|
||||
for i in 0..100u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, i as f64), once(i).collect()));
|
||||
}
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
// delete all the elements for the facet id 0
|
||||
for i in 0..100u32 {
|
||||
index.delete_single_docid(&mut wtxn, 0, &(i as f64), i);
|
||||
}
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
// then add some elements again for the facet id 1
|
||||
for i in 0..110u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, i as f64), once(i).collect()));
|
||||
}
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
milli_snap!(format!("{index}"), name);
|
||||
};
|
||||
|
||||
test("default", 4, 5);
|
||||
test("small_group_small_min_level", 2, 2);
|
||||
test("small_group_large_min_level", 2, 128);
|
||||
test("large_group_small_min_level", 16, 2);
|
||||
test("odd_group_odd_min_level", 7, 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bug_3165() {
|
||||
// Indexing a number of facet values that falls within certains ranges (e.g. 22_540 qualifies)
|
||||
// would lead to a facet DB which was missing some levels.
|
||||
// That was because before writing a level into the database, we would
|
||||
// check that its size was higher than the minimum level size using
|
||||
// a lossy integer conversion: `level_size as u8 >= min_level_size`.
|
||||
//
|
||||
// This missing level in the facet DBs would make the incremental indexer
|
||||
// (and other search algorithms) crash.
|
||||
//
|
||||
// https://github.com/meilisearch/meilisearch/issues/3165
|
||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("id".to_owned());
|
||||
settings.set_filterable_fields(hashset! { S("id") });
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..=22_540 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i as u64,
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn insert_string() {
|
||||
let test = |name: &str, group_size: u8, min_level_size: u8| {
|
||||
let index = FacetIndex::<StrRefCodec>::new(group_size, 0 /*NA*/, min_level_size);
|
||||
|
||||
let strings = (0..1_000).map(|i| ordered_string(i as usize)).collect::<Vec<_>>();
|
||||
let mut elements = Vec::<((u16, &str), RoaringBitmap)>::new();
|
||||
for i in 0..1_000u32 {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, &strings[i as usize]), once(i).collect()));
|
||||
}
|
||||
for i in 0..100u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, &strings[i as usize]), once(i).collect()));
|
||||
}
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
milli_snap!(format!("{index}"), name);
|
||||
};
|
||||
|
||||
test("default", 4, 5);
|
||||
test("small_group_small_min_level", 2, 2);
|
||||
test("small_group_large_min_level", 2, 128);
|
||||
test("large_group_small_min_level", 16, 2);
|
||||
test("odd_group_odd_min_level", 7, 3);
|
||||
}
|
||||
}
|
||||
1266
crates/milli/src/update/facet/incremental.rs
Normal file
1266
crates/milli/src/update/facet/incremental.rs
Normal file
File diff suppressed because it is too large
Load Diff
640
crates/milli/src/update/facet/mod.rs
Normal file
640
crates/milli/src/update/facet/mod.rs
Normal file
@@ -0,0 +1,640 @@
|
||||
/*!
|
||||
This module implements two different algorithms for updating the `facet_id_string_docids`
|
||||
and `facet_id_f64_docids` databases. The first algorithm is a "bulk" algorithm, meaning that
|
||||
it recreates the database from scratch when new elements are added to it. The second algorithm
|
||||
is incremental: it modifies the database as little as possible.
|
||||
|
||||
The databases must be able to return results for queries such as:
|
||||
1. Filter : find all the document ids that have a facet value greater than X and/or smaller than Y
|
||||
2. Min/Max : find the minimum/maximum facet value among these document ids
|
||||
3. Sort : sort these document ids by increasing/decreasing facet values
|
||||
4. Distribution : given some document ids, make a list of each facet value
|
||||
found in these documents along with the number of documents that contain it
|
||||
|
||||
The algorithms that implement these queries are found in the `src/search/facet` folder.
|
||||
|
||||
To make these queries fast to compute, the database adopts a tree structure:
|
||||
```text
|
||||
┌───────────────────────────────┬───────────────────────────────┬───────────────┐
|
||||
┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │
|
||||
│Level 2│ │ │ │ │
|
||||
└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │
|
||||
├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤
|
||||
┌───────┐ │ "ab" (2) │ "ba" (2) │ "gaf" (2) │ "form" (2) │ "woz" (2) │
|
||||
│Level 1│ │ │ │ │ │ │
|
||||
└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │
|
||||
├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤
|
||||
┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │
|
||||
│Level 0│ │ │ │ │ │ │ │ │ │ │ │
|
||||
└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │
|
||||
└───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘
|
||||
```
|
||||
In the diagram above, each cell corresponds to a node in the tree. The first line of the cell
|
||||
contains the left bound of the range of facet values as well as the number of children of the node.
|
||||
The second line contains the document ids which have a facet value within the range of the node.
|
||||
The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range.
|
||||
|
||||
In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because
|
||||
`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`.
|
||||
These documents all contain a facet value that is contained within `ab .. gaf`.
|
||||
|
||||
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
|
||||
[`FacetGroupValue`], which have the following format:
|
||||
|
||||
```text
|
||||
FacetGroupKey:
|
||||
- field id : u16
|
||||
- level : u8
|
||||
- left bound: [u8] // the facet value encoded using either OrderedF64Codec or Str
|
||||
|
||||
FacetGroupValue:
|
||||
- #children : u8
|
||||
- docids : RoaringBitmap
|
||||
```
|
||||
|
||||
When the database is first created using the "bulk" method, each node has a fixed number of children
|
||||
(except for possibly the last one) given by the `group_size` parameter (default to `FACET_GROUP_SIZE`).
|
||||
The tree is also built such that the highest level has more than `min_level_size`
|
||||
(default to `FACET_MIN_LEVEL_SIZE`) elements in it.
|
||||
|
||||
When the database is incrementally updated, the number of children of a node can vary between
|
||||
1 and `max_group_size`. This is done so that most incremental operations do not need to change
|
||||
the structure of the tree. When the number of children of a node reaches `max_group_size`,
|
||||
we split the node in two and update the number of children of its parent.
|
||||
|
||||
When adding documents to the databases, it is important to determine which method to use to
|
||||
minimise indexing time. The incremental method is faster when adding few new facet values, but the
|
||||
bulk method is faster when a large part of the database is modified. Empirically, it seems that
|
||||
it takes 50x more time to incrementally add N facet values to an existing database than it is to
|
||||
construct a database of N facet values. This is the heuristic that is used to choose between the
|
||||
two methods.
|
||||
|
||||
Related PR: https://github.com/meilisearch/milli/pull/619
|
||||
*/
|
||||
|
||||
pub const FACET_MAX_GROUP_SIZE: u8 = 8;
|
||||
pub const FACET_GROUP_SIZE: u8 = 4;
|
||||
pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::Merger;
|
||||
use heed::types::{Bytes, DecodeIgnore};
|
||||
use time::OffsetDateTime;
|
||||
use tracing::debug;
|
||||
|
||||
use self::incremental::FacetsUpdateIncremental;
|
||||
use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::{try_split_array_at, FieldId, Index, Result};
|
||||
|
||||
pub mod bulk;
|
||||
pub mod incremental;
|
||||
|
||||
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
||||
///
|
||||
/// Depending on the number of new elements and the existing size of the database, we use either
|
||||
/// a bulk update method or an incremental update method.
|
||||
pub struct FacetsUpdate<'i> {
|
||||
index: &'i Index,
|
||||
database: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeDeladdBtreesetString>>,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
data_size: u64,
|
||||
}
|
||||
impl<'i> FacetsUpdate<'i> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeDeladdBtreesetString>>,
|
||||
data_size: u64,
|
||||
) -> Self {
|
||||
let database = match facet_type {
|
||||
FacetType::String => {
|
||||
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
FacetType::Number => {
|
||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
};
|
||||
Self {
|
||||
index,
|
||||
database,
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
max_group_size: FACET_MAX_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
delta_data,
|
||||
normalized_delta_data,
|
||||
data_size,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn<'_>) -> Result<()> {
|
||||
if self.data_size == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
// See self::comparison_bench::benchmark_facet_indexing
|
||||
if self.data_size >= (self.database.len(wtxn)? / 500) {
|
||||
let field_ids =
|
||||
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
|
||||
let bulk_update = FacetsUpdateBulk::new(
|
||||
self.index,
|
||||
field_ids,
|
||||
self.facet_type,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
);
|
||||
bulk_update.execute(wtxn)?;
|
||||
} else {
|
||||
let incremental_update = FacetsUpdateIncremental::new(
|
||||
self.index,
|
||||
self.facet_type,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
self.max_group_size,
|
||||
);
|
||||
incremental_update.execute(wtxn)?;
|
||||
}
|
||||
|
||||
match self.normalized_delta_data {
|
||||
Some(data) => index_facet_search(wtxn, data, self.index),
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn index_facet_search(
|
||||
wtxn: &mut heed::RwTxn<'_>,
|
||||
normalized_delta_data: Merger<BufReader<File>, MergeDeladdBtreesetString>,
|
||||
index: &Index,
|
||||
) -> Result<()> {
|
||||
let mut iter = normalized_delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key_bytes, delta_bytes)) = iter.next()? {
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(delta_bytes);
|
||||
|
||||
let database_set = index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.get(wtxn, key_bytes)?
|
||||
.unwrap_or_default();
|
||||
|
||||
let add_set = deladd_reader
|
||||
.get(DelAdd::Addition)
|
||||
.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
|
||||
.unwrap_or_default();
|
||||
|
||||
let del_set = match deladd_reader
|
||||
.get(DelAdd::Deletion)
|
||||
.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
|
||||
{
|
||||
Some(del_set) => {
|
||||
let (field_id_bytes, _) = try_split_array_at(key_bytes).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
let mut set = BTreeSet::new();
|
||||
for facet in del_set {
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() };
|
||||
// Check if the referenced value doesn't exist anymore before deleting it.
|
||||
if index
|
||||
.facet_id_string_docids
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(wtxn, &key)?
|
||||
.is_none()
|
||||
{
|
||||
set.insert(facet);
|
||||
}
|
||||
}
|
||||
set
|
||||
}
|
||||
None => BTreeSet::new(),
|
||||
};
|
||||
|
||||
let set: BTreeSet<_> =
|
||||
database_set.difference(&del_set).chain(add_set.iter()).cloned().collect();
|
||||
|
||||
if set.is_empty() {
|
||||
index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.delete(wtxn, key_bytes)?;
|
||||
} else {
|
||||
index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.put(wtxn, key_bytes, &set)?;
|
||||
}
|
||||
}
|
||||
|
||||
// We clear the FST of normalized-for-search to compute everything from scratch.
|
||||
index.facet_id_string_fst.clear(wtxn)?;
|
||||
// We compute one FST by string facet
|
||||
let mut text_fsts = vec![];
|
||||
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
|
||||
let database = index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
|
||||
for result in database.iter(wtxn)? {
|
||||
let ((field_id, normalized_facet), _) = result?;
|
||||
current_fst = match current_fst.take() {
|
||||
Some((fid, fst_builder)) if fid != field_id => {
|
||||
let fst = fst_builder.into_set();
|
||||
text_fsts.push((fid, fst));
|
||||
Some((field_id, fst::SetBuilder::memory()))
|
||||
}
|
||||
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
|
||||
None => Some((field_id, fst::SetBuilder::memory())),
|
||||
};
|
||||
|
||||
if let Some((_, fst_builder)) = current_fst.as_mut() {
|
||||
fst_builder.insert(normalized_facet)?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((field_id, fst_builder)) = current_fst {
|
||||
let fst = fst_builder.into_set();
|
||||
text_fsts.push((field_id, fst));
|
||||
}
|
||||
|
||||
// We write those FSTs in LMDB now
|
||||
for (field_id, fst) in text_fsts {
|
||||
index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test_helpers {
|
||||
use std::cell::Cell;
|
||||
use std::fmt::Display;
|
||||
use std::iter::FromIterator;
|
||||
use std::marker::PhantomData;
|
||||
use std::rc::Rc;
|
||||
|
||||
use grenad::MergerBuilder;
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::bulk::FacetsUpdateBulkInner;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::update::FacetsUpdateIncrementalInner;
|
||||
use crate::CboRoaringBitmapCodec;
|
||||
|
||||
/// Utility function to generate a string whose position in a lexicographically
|
||||
/// ordered list is `i`.
|
||||
pub fn ordered_string(mut i: usize) -> String {
|
||||
// The first string is empty
|
||||
if i == 0 {
|
||||
return String::new();
|
||||
}
|
||||
// The others are 5 char long, each between 'a' and 'z'
|
||||
let mut s = String::new();
|
||||
for _ in 0..5 {
|
||||
let (digit, next) = (i % 26, i / 26);
|
||||
s.insert(0, char::from_u32('a' as u32 + digit as u32).unwrap());
|
||||
i = next;
|
||||
}
|
||||
s
|
||||
}
|
||||
|
||||
/// A dummy index that only contains the facet database, used for testing
|
||||
pub struct FacetIndex<BoundCodec>
|
||||
where
|
||||
for<'a> BoundCodec:
|
||||
BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
|
||||
{
|
||||
pub env: Env,
|
||||
pub content: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
pub group_size: Cell<u8>,
|
||||
pub min_level_size: Cell<u8>,
|
||||
pub max_group_size: Cell<u8>,
|
||||
_tempdir: Rc<tempfile::TempDir>,
|
||||
_phantom: PhantomData<BoundCodec>,
|
||||
}
|
||||
|
||||
impl<BoundCodec> FacetIndex<BoundCodec>
|
||||
where
|
||||
for<'a> BoundCodec:
|
||||
BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
|
||||
{
|
||||
#[cfg(all(test, fuzzing))]
|
||||
pub fn open_from_tempdir(
|
||||
tempdir: Rc<tempfile::TempDir>,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetIndex<BoundCodec> {
|
||||
let group_size = std::cmp::min(16, std::cmp::max(group_size, 2)); // 2 <= x <= 16
|
||||
let max_group_size = std::cmp::min(16, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 16
|
||||
let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17
|
||||
|
||||
let mut options = heed::EnvOpenOptions::new();
|
||||
let options = options.map_size(4096 * 4 * 10 * 1000);
|
||||
unsafe {
|
||||
options.flag(heed::flags::Flags::MdbAlwaysFreePages);
|
||||
}
|
||||
let env = options.open(tempdir.path()).unwrap();
|
||||
let content = env.open_database(None).unwrap().unwrap();
|
||||
|
||||
FacetIndex {
|
||||
content,
|
||||
group_size: Cell::new(group_size),
|
||||
max_group_size: Cell::new(max_group_size),
|
||||
min_level_size: Cell::new(min_level_size),
|
||||
_tempdir: tempdir,
|
||||
env,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
pub fn new(
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetIndex<BoundCodec> {
|
||||
let group_size = group_size.clamp(2, 127);
|
||||
let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127
|
||||
let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf
|
||||
let mut options = heed::EnvOpenOptions::new();
|
||||
let options = options.map_size(4096 * 4 * 1000 * 100);
|
||||
let tempdir = tempfile::TempDir::new().unwrap();
|
||||
let env = unsafe { options.open(tempdir.path()) }.unwrap();
|
||||
let mut wtxn = env.write_txn().unwrap();
|
||||
let content = env.create_database(&mut wtxn, None).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
FacetIndex {
|
||||
content,
|
||||
group_size: Cell::new(group_size),
|
||||
max_group_size: Cell::new(max_group_size),
|
||||
min_level_size: Cell::new(min_level_size),
|
||||
_tempdir: Rc::new(tempdir),
|
||||
env,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, fuzzing))]
|
||||
pub fn set_group_size(&self, group_size: u8) {
|
||||
// 2 <= x <= 64
|
||||
self.group_size.set(std::cmp::min(64, std::cmp::max(group_size, 2)));
|
||||
}
|
||||
#[cfg(all(test, fuzzing))]
|
||||
pub fn set_max_group_size(&self, max_group_size: u8) {
|
||||
// 2*group_size <= x <= 128
|
||||
let max_group_size = std::cmp::max(4, std::cmp::min(128, max_group_size));
|
||||
self.max_group_size.set(max_group_size);
|
||||
if self.group_size.get() < max_group_size / 2 {
|
||||
self.group_size.set(max_group_size / 2);
|
||||
}
|
||||
}
|
||||
#[cfg(all(test, fuzzing))]
|
||||
pub fn set_min_level_size(&self, min_level_size: u8) {
|
||||
// 1 <= x <= inf
|
||||
self.min_level_size.set(std::cmp::max(1, min_level_size));
|
||||
}
|
||||
|
||||
pub fn insert<'a>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_id: u16,
|
||||
key: &'a <BoundCodec as BytesEncode<'a>>::EItem,
|
||||
docids: &RoaringBitmap,
|
||||
) {
|
||||
let update = FacetsUpdateIncrementalInner {
|
||||
db: self.content,
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
max_group_size: self.max_group_size.get(),
|
||||
};
|
||||
let key_bytes = BoundCodec::bytes_encode(key).unwrap();
|
||||
update.modify(wtxn, field_id, &key_bytes, Some(docids), None).unwrap();
|
||||
update.add_or_delete_level(wtxn, field_id).unwrap();
|
||||
}
|
||||
pub fn delete_single_docid<'a>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_id: u16,
|
||||
key: &'a <BoundCodec as BytesEncode<'a>>::EItem,
|
||||
docid: u32,
|
||||
) {
|
||||
self.delete(wtxn, field_id, key, &RoaringBitmap::from_iter(std::iter::once(docid)))
|
||||
}
|
||||
|
||||
pub fn delete<'a>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_id: u16,
|
||||
key: &'a <BoundCodec as BytesEncode<'a>>::EItem,
|
||||
docids: &RoaringBitmap,
|
||||
) {
|
||||
let update = FacetsUpdateIncrementalInner {
|
||||
db: self.content,
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
max_group_size: self.max_group_size.get(),
|
||||
};
|
||||
let key_bytes = BoundCodec::bytes_encode(key).unwrap();
|
||||
update.modify(wtxn, field_id, &key_bytes, None, Some(docids)).unwrap();
|
||||
update.add_or_delete_level(wtxn, field_id).unwrap();
|
||||
}
|
||||
|
||||
pub fn bulk_insert<'a, 'b>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_ids: &[u16],
|
||||
els: impl IntoIterator<
|
||||
Item = &'a ((u16, <BoundCodec as BytesEncode<'a>>::EItem), RoaringBitmap),
|
||||
>,
|
||||
) where
|
||||
for<'c> <BoundCodec as BytesEncode<'c>>::EItem: Sized,
|
||||
{
|
||||
let mut new_data = vec![];
|
||||
let mut writer = grenad::Writer::new(&mut new_data);
|
||||
for ((field_id, left_bound), docids) in els {
|
||||
let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned();
|
||||
let key: FacetGroupKey<&[u8]> =
|
||||
FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key).unwrap();
|
||||
let mut inner_writer = KvWriterDelAdd::memory();
|
||||
let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
|
||||
inner_writer.insert(DelAdd::Addition, value).unwrap();
|
||||
writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
|
||||
}
|
||||
writer.finish().unwrap();
|
||||
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
builder.push(reader.into_cursor().unwrap());
|
||||
let merger = builder.build();
|
||||
|
||||
let update = FacetsUpdateBulkInner {
|
||||
db: self.content,
|
||||
delta_data: Some(merger),
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
};
|
||||
|
||||
update.update(wtxn, field_ids).unwrap();
|
||||
}
|
||||
|
||||
pub fn verify_structure_validity(&self, txn: &RoTxn<'_>, field_id: u16) {
|
||||
let mut field_id_prefix = vec![];
|
||||
field_id_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
|
||||
let highest_level = get_highest_level(txn, self.content, field_id).unwrap();
|
||||
|
||||
for level_no in (1..=highest_level).rev() {
|
||||
let mut level_no_prefix = vec![];
|
||||
level_no_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
level_no_prefix.push(level_no);
|
||||
|
||||
let iter = self
|
||||
.content
|
||||
.remap_types::<Bytes, FacetGroupValueCodec>()
|
||||
.prefix_iter(txn, &level_no_prefix)
|
||||
.unwrap();
|
||||
for el in iter {
|
||||
let (key, value) = el.unwrap();
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key).unwrap();
|
||||
|
||||
let mut prefix_start_below = vec![];
|
||||
prefix_start_below.extend_from_slice(&field_id.to_be_bytes());
|
||||
prefix_start_below.push(level_no - 1);
|
||||
prefix_start_below.extend_from_slice(key.left_bound);
|
||||
|
||||
let start_below = {
|
||||
let mut start_below_iter = self
|
||||
.content
|
||||
.remap_types::<Bytes, FacetGroupValueCodec>()
|
||||
.prefix_iter(txn, &prefix_start_below)
|
||||
.unwrap();
|
||||
let (key_bytes, _) = start_below_iter.next().unwrap().unwrap();
|
||||
FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes).unwrap()
|
||||
};
|
||||
|
||||
assert!(value.size > 0);
|
||||
|
||||
let mut actual_size = 0;
|
||||
let mut values_below = RoaringBitmap::new();
|
||||
let iter_below = self
|
||||
.content
|
||||
.range(txn, &(start_below..))
|
||||
.unwrap()
|
||||
.take(value.size as usize);
|
||||
for el in iter_below {
|
||||
let (_, value) = el.unwrap();
|
||||
actual_size += 1;
|
||||
values_below |= value.bitmap;
|
||||
}
|
||||
assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}");
|
||||
|
||||
assert_eq!(value.bitmap, values_below);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<BoundCodec> Display for FacetIndex<BoundCodec>
|
||||
where
|
||||
for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Sized + Display,
|
||||
for<'a> BoundCodec:
|
||||
BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
|
||||
{
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let txn = self.env.read_txn().unwrap();
|
||||
let iter = self.content.iter(&txn).unwrap();
|
||||
for el in iter {
|
||||
let (key, value) = el.unwrap();
|
||||
let FacetGroupKey { field_id, level, left_bound: bound } = key;
|
||||
let bound = BoundCodec::bytes_decode(bound).unwrap();
|
||||
let FacetGroupValue { size, bitmap } = value;
|
||||
writeln!(
|
||||
f,
|
||||
"{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}",
|
||||
values = display_bitmap(&bitmap)
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
#[cfg(test)]
|
||||
mod comparison_bench {
|
||||
use std::iter::once;
|
||||
|
||||
use rand::Rng;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::test_helpers::FacetIndex;
|
||||
use crate::heed_codec::facet::OrderedF64Codec;
|
||||
|
||||
// This is a simple test to get an intuition on the relative speed
|
||||
// of the incremental vs. bulk indexer.
|
||||
//
|
||||
// The benchmark shows the worst-case scenario for the incremental indexer, since
|
||||
// each facet value contains only one document ID.
|
||||
//
|
||||
// In that scenario, it appears that the incremental indexer is about 50 times slower than the
|
||||
// bulk indexer.
|
||||
// #[test]
|
||||
fn benchmark_facet_indexing() {
|
||||
let mut facet_value = 0;
|
||||
|
||||
let mut r = rand::thread_rng();
|
||||
|
||||
for i in 1..=20 {
|
||||
let size = 50_000 * i;
|
||||
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
|
||||
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..size {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, facet_value as f64), once(i).collect()));
|
||||
facet_value += 1;
|
||||
}
|
||||
let timer = std::time::Instant::now();
|
||||
index.bulk_insert(&mut txn, &[0], elements.iter());
|
||||
let time_spent = timer.elapsed().as_millis();
|
||||
println!("bulk {size} : {time_spent}ms");
|
||||
|
||||
txn.commit().unwrap();
|
||||
|
||||
for nbr_doc in [1, 100, 1000, 10_000] {
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let timer = std::time::Instant::now();
|
||||
//
|
||||
// insert one document
|
||||
//
|
||||
for _ in 0..nbr_doc {
|
||||
index.insert(&mut txn, 0, &r.gen(), &once(1).collect());
|
||||
}
|
||||
let time_spent = timer.elapsed().as_millis();
|
||||
println!(" add {nbr_doc} : {time_spent}ms");
|
||||
txn.abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
b40dd31a65e033ffc6b35c027ce19506
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
7ee22d8e9387e72758f00918eb67e4c6
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
60f567359382507afdaf45fb075740c3
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
b986d6e6cbf425685f409a8b417010e1
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
ee10dd2ae2b5c6621a89a5d0a9aa8ccc
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
fa877559eef78b383b496c15a364a2dc
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
16a96353bc42f2ff3e91611ca4d5b184
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
be1b08073b9d9788d18080c1320151d7
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
16a96353bc42f2ff3e91611ca4d5b184
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
32a45d555df2e001420fea149818d376
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
353d70f52eea66e5031dca989ea8a037
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
52a093c909133d84023a4a7b83864808
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
9d86c72ddb241d0aeca2995d61a3648a
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
c0943177594534bfe5527cbf40fe388e
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
6ed86f234028ae3df5881bee5512f11e
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
5dbfa134cc44abeb3ab6242fc182e48e
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
6ed7bf5d440599b3b10b37549a271fdf
|
||||
@@ -0,0 +1,19 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k0 1 "[0, ]"
|
||||
0 0 k1 1 "[1, ]"
|
||||
0 0 k2 1 "[2, ]"
|
||||
0 0 k3 1 "[3, ]"
|
||||
0 0 k4 1 "[4, ]"
|
||||
0 0 k5 1 "[5, ]"
|
||||
0 0 k6 1 "[6, ]"
|
||||
0 0 k7 1 "[7, ]"
|
||||
0 0 k8 1 "[8, ]"
|
||||
0 0 k9 1 "[9, ]"
|
||||
0 0 k10 1 "[10, ]"
|
||||
0 0 k11 1 "[11, ]"
|
||||
0 0 k12 1 "[12, ]"
|
||||
0 0 k13 1 "[13, ]"
|
||||
0 0 k14 1 "[14, ]"
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b5203f0df0036ebaa133dd77d63a00eb
|
||||
@@ -0,0 +1,26 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k0 1 "[0, ]"
|
||||
0 0 k1 1 "[1, ]"
|
||||
0 0 k2 1 "[2, ]"
|
||||
0 0 k3 1 "[3, ]"
|
||||
0 0 k4 1 "[4, ]"
|
||||
0 0 k5 1 "[5, ]"
|
||||
0 0 k6 1 "[6, ]"
|
||||
0 0 k7 1 "[7, ]"
|
||||
0 0 k8 1 "[8, ]"
|
||||
0 0 k9 1 "[9, ]"
|
||||
0 0 k10 1 "[10, ]"
|
||||
0 0 k11 1 "[11, ]"
|
||||
0 0 k12 1 "[12, ]"
|
||||
0 0 k13 1 "[13, ]"
|
||||
0 0 k14 1 "[14, ]"
|
||||
0 0 k15 1 "[15, ]"
|
||||
0 0 k16 1 "[16, ]"
|
||||
0 1 k0 4 "[0, 1, 2, 3, ]"
|
||||
0 1 k4 4 "[4, 5, 6, 7, ]"
|
||||
0 1 k8 4 "[8, 9, 10, 11, ]"
|
||||
0 1 k12 4 "[12, 13, 14, 15, ]"
|
||||
0 1 k16 1 "[16, ]"
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
95497d8579740868ee0bfc655b0bf782
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
d565c2f7bbd9e13e12de40cfbbfba6bb
|
||||
@@ -0,0 +1,54 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k216 1 "[216, ]"
|
||||
0 0 k217 1 "[217, ]"
|
||||
0 0 k218 1 "[218, ]"
|
||||
0 0 k219 1 "[219, ]"
|
||||
0 0 k220 1 "[220, ]"
|
||||
0 0 k221 1 "[221, ]"
|
||||
0 0 k222 1 "[222, ]"
|
||||
0 0 k223 1 "[223, ]"
|
||||
0 0 k224 1 "[224, ]"
|
||||
0 0 k225 1 "[225, ]"
|
||||
0 0 k226 1 "[226, ]"
|
||||
0 0 k227 1 "[227, ]"
|
||||
0 0 k228 1 "[228, ]"
|
||||
0 0 k229 1 "[229, ]"
|
||||
0 0 k230 1 "[230, ]"
|
||||
0 0 k231 1 "[231, ]"
|
||||
0 0 k232 1 "[232, ]"
|
||||
0 0 k233 1 "[233, ]"
|
||||
0 0 k234 1 "[234, ]"
|
||||
0 0 k235 1 "[235, ]"
|
||||
0 0 k236 1 "[236, ]"
|
||||
0 0 k237 1 "[237, ]"
|
||||
0 0 k238 1 "[238, ]"
|
||||
0 0 k239 1 "[239, ]"
|
||||
0 0 k240 1 "[240, ]"
|
||||
0 0 k241 1 "[241, ]"
|
||||
0 0 k242 1 "[242, ]"
|
||||
0 0 k243 1 "[243, ]"
|
||||
0 0 k244 1 "[244, ]"
|
||||
0 0 k245 1 "[245, ]"
|
||||
0 0 k246 1 "[246, ]"
|
||||
0 0 k247 1 "[247, ]"
|
||||
0 0 k248 1 "[248, ]"
|
||||
0 0 k249 1 "[249, ]"
|
||||
0 0 k250 1 "[250, ]"
|
||||
0 0 k251 1 "[251, ]"
|
||||
0 0 k252 1 "[252, ]"
|
||||
0 0 k253 1 "[253, ]"
|
||||
0 0 k254 1 "[254, ]"
|
||||
0 0 k255 1 "[255, ]"
|
||||
0 1 k216 4 "[216, 217, 218, 219, ]"
|
||||
0 1 k220 4 "[220, 221, 222, 223, ]"
|
||||
0 1 k224 4 "[224, 225, 226, 227, ]"
|
||||
0 1 k228 4 "[228, 229, 230, 231, ]"
|
||||
0 1 k232 4 "[232, 233, 234, 235, ]"
|
||||
0 1 k236 4 "[236, 237, 238, 239, ]"
|
||||
0 1 k240 4 "[240, 241, 242, 243, ]"
|
||||
0 1 k244 4 "[244, 245, 246, 247, ]"
|
||||
0 1 k248 4 "[248, 249, 250, 251, ]"
|
||||
0 1 k252 4 "[252, 253, 254, 255, ]"
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
7cb503827ba17e9670296cc9531a1380
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b061f43e379e16f0617c05d3313d0078
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
81fc9489d6b163935b97433477dea63b
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b17b2c4ec87a778aae07854c96c08b48
|
||||
@@ -0,0 +1,20 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k0 1 "[3, 435, 583, 849, ]"
|
||||
0 0 k1 1 "[35, 494, 693, 796, ]"
|
||||
0 0 k2 1 "[76, 420, 526, 909, ]"
|
||||
0 0 k3 1 "[133, 451, 653, 806, ]"
|
||||
0 0 k4 1 "[131, 464, 656, 853, ]"
|
||||
0 0 k5 1 "[61, 308, 701, 903, ]"
|
||||
0 0 k6 1 "[144, 449, 674, 794, ]"
|
||||
0 0 k7 1 "[182, 451, 735, 941, ]"
|
||||
0 0 k8 1 "[6, 359, 679, 1003, ]"
|
||||
0 0 k9 1 "[197, 418, 659, 904, ]"
|
||||
0 0 k10 1 "[88, 297, 567, 800, ]"
|
||||
0 0 k11 1 "[150, 309, 530, 946, ]"
|
||||
0 0 k12 1 "[156, 466, 567, 892, ]"
|
||||
0 0 k13 1 "[46, 425, 610, 807, ]"
|
||||
0 0 k14 1 "[236, 433, 549, 891, ]"
|
||||
0 0 k15 1 "[207, 472, 603, 974, ]"
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
7f8aa18d2b3a6422d55c03bede0563db
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
7f8aa18d2b3a6422d55c03bede0563db
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b3e2de9020d9e0f3941bc3a179c795ba
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
5dbfa134cc44abeb3ab6242fc182e48e
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
9343355bf535ed4a0c956df2b229d5e6
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
4fc800f49201a336295af0542fdf01ab
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
fd65ce7d96a07aafb0ef6cfb5bf016b8
|
||||
263
crates/milli/src/update/index_documents/enrich.rs
Normal file
263
crates/milli/src/update/index_documents/enrich.rs
Normal file
@@ -0,0 +1,263 @@
|
||||
use std::fmt;
|
||||
use std::io::{BufWriter, Read, Seek};
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::documents::{
|
||||
DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader,
|
||||
EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY,
|
||||
};
|
||||
use crate::error::{GeoError, InternalError, UserError};
|
||||
use crate::update::index_documents::{obkv_to_object, writer_into_reader};
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
/// This function validates and enriches the documents by checking that:
/// - we can infer a primary key,
/// - all the document ids exist and are extracted,
/// - they are valid, and
/// - the `_geo` field is valid depending on the settings.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if reader.is_empty(), this function may panic in some cases
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
|
||||
pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
rtxn: &heed::RoTxn<'_>,
|
||||
index: &Index,
|
||||
autogenerate_docids: bool,
|
||||
reader: DocumentsBatchReader<R>,
|
||||
) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
|
||||
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||
|
||||
let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
|
||||
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
|
||||
|
||||
// The primary key *field id* that has already been set for this index or the one
// we will guess by searching for the keys whose name ends with "id".
|
||||
let primary_key = match index.primary_key(rtxn)? {
|
||||
Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) {
|
||||
Some(primary_key) => primary_key,
|
||||
None if autogenerate_docids => PrimaryKey::Flat {
|
||||
name: primary_key,
|
||||
field_id: documents_batch_index.insert(primary_key),
|
||||
},
|
||||
None => {
|
||||
return match cursor.next_document()? {
|
||||
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
|
||||
primary_key: primary_key.to_string(),
|
||||
document: obkv_to_object(first_document, &documents_batch_index)?,
|
||||
})),
|
||||
None => unreachable!("Called with reader.is_empty()"),
|
||||
};
|
||||
}
|
||||
},
|
||||
None => {
|
||||
let mut guesses: Vec<(u16, &str)> = documents_batch_index
|
||||
.iter()
|
||||
.filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
|
||||
.map(|(field_id, name)| (*field_id, name.as_str()))
|
||||
.collect();
|
||||
|
||||
// sort the keys in a deterministic, obvious way, so that fields are always in the same order.
|
||||
guesses.sort_by(|(_, left_name), (_, right_name)| {
|
||||
// shortest name first
|
||||
left_name.len().cmp(&right_name.len()).then_with(
|
||||
// then alphabetical order
|
||||
|| left_name.cmp(right_name),
|
||||
)
|
||||
});
|
||||
|
||||
match guesses.as_slice() {
|
||||
[] if autogenerate_docids => PrimaryKey::Flat {
|
||||
name: DEFAULT_PRIMARY_KEY,
|
||||
field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
|
||||
},
|
||||
[] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
||||
[(field_id, name)] => {
|
||||
tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
|
||||
PrimaryKey::Flat { name, field_id: *field_id }
|
||||
}
|
||||
multiple => {
|
||||
return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
||||
candidates: multiple
|
||||
.iter()
|
||||
.map(|(_, candidate)| candidate.to_string())
|
||||
.collect(),
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// If the settings specify that a _geo field must be used, we must check its
// validity in all the documents of this batch, and this is when we return `Some`.
|
||||
let geo_field_id = match documents_batch_index.id("_geo") {
|
||||
Some(geo_field_id)
|
||||
if index.sortable_fields(rtxn)?.contains("_geo")
|
||||
|| index.filterable_fields(rtxn)?.contains("_geo") =>
|
||||
{
|
||||
Some(geo_field_id)
|
||||
}
|
||||
_otherwise => None,
|
||||
};
|
||||
|
||||
let mut count = 0;
|
||||
while let Some(document) = cursor.next_document()? {
|
||||
let document_id = match fetch_or_generate_document_id(
|
||||
document,
|
||||
&documents_batch_index,
|
||||
primary_key,
|
||||
autogenerate_docids,
|
||||
&mut uuid_buffer,
|
||||
count,
|
||||
)? {
|
||||
Ok(document_id) => document_id,
|
||||
Err(user_error) => return Ok(Err(user_error)),
|
||||
};
|
||||
|
||||
if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
|
||||
if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? {
|
||||
return Ok(Err(UserError::from(user_error)));
|
||||
}
|
||||
}
|
||||
|
||||
let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?;
|
||||
external_ids.insert(count.to_be_bytes(), document_id)?;
|
||||
|
||||
count += 1;
|
||||
}
|
||||
|
||||
let external_ids = writer_into_reader(external_ids)?;
|
||||
let primary_key_name = primary_key.name().to_string();
|
||||
let reader = EnrichedDocumentsBatchReader::new(
|
||||
DocumentsBatchReader::new(cursor, documents_batch_index),
|
||||
primary_key_name,
|
||||
external_ids,
|
||||
)?;
|
||||
|
||||
Ok(Ok(reader))
|
||||
}
|
||||
|
||||
/// Retrieve the document id after validating it, returning a `UserError`
|
||||
/// if the id is invalid or can't be guessed.
|
||||
#[tracing::instrument(level = "trace", skip(uuid_buffer, documents_batch_index, document)
|
||||
target = "indexing::documents")]
|
||||
fn fetch_or_generate_document_id(
|
||||
document: &obkv::KvReader<FieldId>,
|
||||
documents_batch_index: &DocumentsBatchIndex,
|
||||
primary_key: PrimaryKey<'_>,
|
||||
autogenerate_docids: bool,
|
||||
uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
|
||||
count: u32,
|
||||
) -> Result<StdResult<DocumentId, UserError>> {
|
||||
Ok(match primary_key.document_id(document, documents_batch_index)? {
|
||||
Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }),
|
||||
Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
|
||||
Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => {
|
||||
let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
|
||||
Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count })
|
||||
}
|
||||
Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId {
|
||||
primary_key: primary_key.name().to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
}),
|
||||
Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
|
||||
Err(UserError::TooManyDocumentIds {
|
||||
primary_key: primary_key.name().to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
})
|
||||
}
|
||||
})
|
||||
}
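// Illustrative sketch of the auto-generation path above: generated ids are lowercase
// hyphenated v4 UUIDs encoded into a reusable stack buffer, avoiding one heap
// allocation per document.
#[test]
fn generated_id_sketch() {
    let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
    let uuid: &str = uuid::Uuid::new_v4().as_hyphenated().encode_lower(&mut uuid_buffer);
    assert_eq!(uuid.len(), uuid::fmt::Hyphenated::LENGTH);
}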
|
||||
|
||||
/// A type that represents a document id that has been retrieved from a document or auto-generated.
|
||||
///
|
||||
/// In case the document id has been auto-generated, the document nth is kept to help
|
||||
/// users debug if there is an issue with the document itself.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub enum DocumentId {
|
||||
Retrieved { value: String },
|
||||
Generated { value: String, document_nth: u32 },
|
||||
}
|
||||
|
||||
impl DocumentId {
|
||||
fn debug(&self) -> String {
|
||||
format!("{:?}", self)
|
||||
}
|
||||
|
||||
pub fn is_generated(&self) -> bool {
|
||||
matches!(self, DocumentId::Generated { .. })
|
||||
}
|
||||
|
||||
pub fn value(&self) -> &str {
|
||||
match self {
|
||||
DocumentId::Retrieved { value } => value,
|
||||
DocumentId::Generated { value, .. } => value,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for DocumentId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
DocumentId::Retrieved { value } => write!(f, "{:?}", value),
|
||||
DocumentId::Generated { value, document_nth } => {
|
||||
write!(f, "{{{:?}}} of the {}nth document", value, document_nth)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
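// Illustrative sketch of the Debug output of both variants, which is what ends up in
// the error messages reported to users.
#[test]
fn document_id_debug_sketch() {
    let retrieved = DocumentId::Retrieved { value: "movie-12".to_string() };
    assert_eq!(format!("{retrieved:?}"), "\"movie-12\"");

    let generated = DocumentId::Generated { value: "deadbeef".to_string(), document_nth: 3 };
    assert_eq!(format!("{generated:?}"), "{\"deadbeef\"} of the 3nth document");
}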
|
||||
|
||||
/// Try to extract an `f64` from a JSON `Value` and return the `Value`
|
||||
/// in the `Err` variant if it failed.
|
||||
pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
|
||||
let number = match value {
|
||||
Value::Number(ref n) => match n.as_f64() {
|
||||
Some(number) => number,
|
||||
None => return Err(value),
|
||||
},
|
||||
Value::String(ref s) => match s.parse::<f64>() {
|
||||
Ok(number) => number,
|
||||
Err(_) => return Err(value),
|
||||
},
|
||||
value => return Err(value),
|
||||
};
|
||||
|
||||
if number.is_finite() {
|
||||
Ok(number)
|
||||
} else {
|
||||
Err(value)
|
||||
}
|
||||
}
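// Illustrative sketch: numbers and numeric strings yield a finite f64, while non-finite
// values, non-numeric strings and other JSON types are handed back in the Err variant.
#[test]
fn finite_float_extraction_sketch() {
    use serde_json::json;

    assert_eq!(extract_finite_float_from_value(json!(3.5)), Ok(3.5));
    assert_eq!(extract_finite_float_from_value(json!("3.5")), Ok(3.5));
    assert!(extract_finite_float_from_value(json!("north")).is_err());
    assert!(extract_finite_float_from_value(json!(null)).is_err());
}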
|
||||
|
||||
pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
|
||||
use GeoError::*;
|
||||
let debug_id = || {
|
||||
serde_json::from_slice(id.value().as_bytes()).unwrap_or_else(|_| Value::from(id.debug()))
|
||||
};
|
||||
match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) {
|
||||
(Some(lat), Some(lng)) => {
|
||||
match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) {
|
||||
(Ok(_), Ok(_)) if !object.is_empty() => Ok(Err(UnexpectedExtraFields {
|
||||
document_id: debug_id(),
|
||||
value: object.into(),
|
||||
})),
|
||||
(Ok(_), Ok(_)) => Ok(Ok(())),
|
||||
(Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })),
|
||||
(Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })),
|
||||
(Err(lat), Err(lng)) => {
|
||||
Ok(Err(BadLatitudeAndLongitude { document_id: debug_id(), lat, lng }))
|
||||
}
|
||||
}
|
||||
}
|
||||
(None, Some(_)) => Ok(Err(MissingLatitude { document_id: debug_id() })),
|
||||
(Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })),
|
||||
(None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })),
|
||||
},
|
||||
Value::Null => Ok(Ok(())),
|
||||
value => Ok(Err(NotAnObject { document_id: debug_id(), value })),
|
||||
}
|
||||
}
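// Illustrative sketch of how a `_geo` payload is classified; the document id and the
// JSON samples are made up for the example.
#[test]
fn geo_validation_sketch() {
    let id = DocumentId::Retrieved { value: "42".to_string() };

    // a well-formed object containing only lat/lng passes validation.
    let ok = br#"{ "lat": 12.3, "lng": 4.56 }"#;
    assert!(matches!(validate_geo_from_json(&id, ok), Ok(Ok(()))));

    // a latitude that cannot be parsed as a finite float is reported as a user error.
    let bad = br#"{ "lat": "north", "lng": 4.56 }"#;
    assert!(matches!(validate_geo_from_json(&id, bad), Ok(Err(_))));
}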
|
||||
@@ -0,0 +1,319 @@
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{io, mem, str};
|
||||
|
||||
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
use obkv::{KvReader, KvWriterU16};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepLatestObkv};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||
|
||||
/// Extracts the words and the positions where they appear,
/// prefixed by the document id.
///
/// Returns a grenad reader with the list of extracted words and their positions
/// from the given chunk of documents.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let force_reindexing = settings_diff.reindex_searchable();
|
||||
|
||||
// initialize destination values.
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
let mut docid_word_positions_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
KeepLatestObkv,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
// initialize buffers.
|
||||
let mut del_buffers = Buffers::default();
|
||||
let mut add_buffers = Buffers::default();
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
// initialize tokenizer.
|
||||
let old_stop_words = settings_diff.old.stop_words.as_ref();
|
||||
let old_separators: Option<Vec<_>> = settings_diff
|
||||
.old
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let old_dictionary: Option<Vec<_>> =
|
||||
settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let del_builder =
|
||||
tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref());
|
||||
let del_tokenizer = del_builder.into_tokenizer();
|
||||
|
||||
let new_stop_words = settings_diff.new.stop_words.as_ref();
|
||||
let new_separators: Option<Vec<_>> = settings_diff
|
||||
.new
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let new_dictionary: Option<Vec<_>> =
|
||||
settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let add_builder =
|
||||
tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref());
|
||||
let add_tokenizer = add_builder.into_tokenizer();
|
||||
|
||||
// iterate over documents.
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let document_id = key
|
||||
.try_into()
|
||||
.map(u32::from_be_bytes)
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let obkv = KvReader::<FieldId>::from_slice(value);
|
||||
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !force_reindexing && !searchable_fields_changed(obkv, settings_diff) {
|
||||
continue;
|
||||
}
|
||||
|
||||
documents_ids.push(document_id);
|
||||
|
||||
// Update key buffer prefix.
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||
|
||||
// Tokenize deletions and additions in 2 different threads.
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
// deletions
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
&settings_diff.old,
|
||||
&del_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Deletion,
|
||||
&mut del_buffers,
|
||||
)
|
||||
},
|
||||
|| {
|
||||
// additions
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
&settings_diff.new,
|
||||
&add_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Addition,
|
||||
&mut add_buffers,
|
||||
)
|
||||
},
|
||||
);
|
||||
|
||||
let del_obkv = del?;
|
||||
let add_obkv = add?;
|
||||
|
||||
// merge deletions and additions.
|
||||
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||
value_buffer.clear();
|
||||
del_add_from_two_obkvs(
|
||||
KvReader::<FieldId>::from_slice(del_obkv),
|
||||
KvReader::<FieldId>::from_slice(add_obkv),
|
||||
&mut value_buffer,
|
||||
)?;
|
||||
|
||||
// write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
|
||||
let obkv = KvReader::<FieldId>::from_slice(&value_buffer);
|
||||
for (field_id, value) in obkv.iter() {
|
||||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
// the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
|
||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||
}
|
||||
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> bool {
|
||||
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.contains(&field_id) {
|
||||
let del_add = KvReaderDelAdd::from_slice(field_bytes);
|
||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
(None, None) => (),
|
||||
// if both contain a value and the values are the same, check the next field.
|
||||
(Some(del), Some(add)) if del == add => (),
|
||||
// otherwise the fields are different, return true.
|
||||
_otherwise => return true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
) -> TokenizerBuilder<'a, Vec<u8>> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
tokenizer_builder.words_dict(dictionary);
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
|
||||
tokenizer_builder
|
||||
}
|
||||
|
||||
/// Extract words mapped with their positions of a document.
|
||||
fn tokens_from_document<'a>(
|
||||
obkv: &'a KvReader<FieldId>,
|
||||
settings: &InnerIndexSettings,
|
||||
tokenizer: &Tokenizer<'_>,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
buffers: &'a mut Buffers,
|
||||
) -> Result<&'a [u8]> {
|
||||
buffers.obkv_buffer.clear();
|
||||
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
// if field is searchable.
|
||||
if settings.searchable_fields_ids.contains(&field_id) {
|
||||
// extract deletion or addition only.
|
||||
if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) {
|
||||
// parse json.
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
// prepare writing destination.
|
||||
buffers.obkv_positions_buffer.clear();
|
||||
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
|
||||
|
||||
// convert the json into a single string.
|
||||
buffers.field_buffer.clear();
|
||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||
// create an iterator of tokens with their positions.
|
||||
let locales = settings.localized_searchable_fields_ids.locales(field_id);
|
||||
let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales))
|
||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||
|
||||
for (index, token) in tokens {
|
||||
// keep a word only if it is not empty and fits in an LMDB key.
|
||||
let token = token.lemma().trim();
|
||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||
let position: u16 = index
|
||||
.try_into()
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
writer.insert(position, token.as_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
// write positions into document.
|
||||
let positions = writer.into_inner()?;
|
||||
document_writer.insert(field_id, positions)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// returns a KV<FieldId, KV<u16, String>>
|
||||
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
|
||||
}
|
||||
|
||||
/// Transform a JSON value into a string that can be indexed.
|
||||
fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> {
|
||||
fn inner(value: &Value, output: &mut String) -> bool {
|
||||
use std::fmt::Write;
|
||||
match value {
|
||||
Value::Null | Value::Object(_) => false,
|
||||
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
|
||||
Value::Number(number) => write!(output, "{}", number).is_ok(),
|
||||
Value::String(string) => write!(output, "{}", string).is_ok(),
|
||||
Value::Array(array) => {
|
||||
let mut count = 0;
|
||||
for value in array {
|
||||
if inner(value, output) {
|
||||
output.push_str(". ");
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Value::String(string) = value {
|
||||
Some(string)
|
||||
} else if inner(value, buffer) {
|
||||
Some(buffer)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
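// Illustrative sketch: an array is flattened into a single dot-separated string, while
// nulls and nested objects produce no indexable text.
#[test]
fn json_to_string_sketch() {
    use serde_json::json;

    let mut buffer = String::new();
    let value = json!([1, "hello", true, null]);
    assert_eq!(json_to_string(&value, &mut buffer), Some("1. hello. true. "));
}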
|
||||
|
||||
/// Takes an iterator over tokens and computes their relative positions depending on separator kinds:
/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
/// else we keep the standard proximity of 1 between words.
|
||||
fn process_tokens<'a>(
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator())
|
||||
.scan((0, None), |(offset, prev_kind), mut token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => token.kind = TokenKind::Unknown,
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct Buffers {
|
||||
// the field buffer for each field's deserialization; it must be cleared between each field.
|
||||
field_buffer: String,
|
||||
// buffer used to store the value data containing an obkv.
|
||||
obkv_buffer: Vec<u8>,
|
||||
// buffer used to store the value data containing an obkv of tokens with their positions.
|
||||
obkv_positions_buffer: Vec<u8>,
|
||||
}
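// Illustrative sketch of the sorter key layout described above: the big-endian document
// id followed by the big-endian field id.
#[test]
fn sorter_key_layout_sketch() {
    let document_id: u32 = 7;
    let field_id: u16 = 1;
    let mut key = Vec::with_capacity(6);
    key.extend_from_slice(&document_id.to_be_bytes());
    key.extend_from_slice(&field_id.to_be_bytes());
    assert_eq!(key, vec![0, 0, 0, 7, 0, 1]);
}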
|
||||
@@ -0,0 +1,58 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, sorter_into_reader, GrenadParameters, MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
||||
};
|
||||
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
/// Extracts the facet number and the documents ids where this facet number appears.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet numbers and
|
||||
/// documents ids from the given chunk of docid facet number positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
fid_docid_facet_number: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_number_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
||||
while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
|
||||
let (field_id, document_id, number) =
|
||||
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
|
||||
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in KvReaderDelAdd::from_slice(deladd_obkv_bytes).iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
facet_number_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(facet_number_docids_sorter, indexer)
|
||||
}
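// Illustrative sketch: level-0 facet number entries are keyed by (field_id, level,
// left_bound) through the OrderedF64Codec-backed key codec used above.
#[test]
fn facet_number_key_sketch() {
    let key = FacetGroupKey { field_id: 0u16, level: 0, left_bound: 42.5f64 };
    let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
    assert!(!key_bytes.is_empty());
}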
|
||||
@@ -0,0 +1,303 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::iter::FromIterator;
|
||||
use std::{io, str};
|
||||
|
||||
use charabia::normalizer::{Normalize, NormalizerOption};
|
||||
use charabia::{Language, StrDetection, Token};
|
||||
use heed::types::SerdeJson;
|
||||
use heed::BytesEncode;
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
|
||||
use crate::localized_attributes_rules::LocalizedFieldIds;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appears.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
if settings_diff.settings_update_only() {
|
||||
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
|
||||
} else {
|
||||
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
|
||||
extract_facet_string_docids_document_update(
|
||||
docid_fid_facet_string,
|
||||
indexer,
|
||||
localized_field_ids,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appears.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
localized_field_ids: &LocalizedFieldIds,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdBtreesetString,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let locales = localized_field_ids.locales(field_id);
|
||||
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// as the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
obkv.insert(deladd_key, val)?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
|
||||
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
|
||||
}
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appears.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdBtreesetString,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
|
||||
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
|
||||
|
||||
let are_same_locales = old_locales == new_locales;
|
||||
|
||||
if is_same_value && are_same_locales {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
|
||||
let new_hyper_normalized_value = if are_same_locales {
|
||||
&old_hyper_normalized_value
|
||||
} else {
|
||||
&normalize_facet_string(normalized_value, new_locales)
|
||||
};
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// if the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
if old_hyper_normalized_value == new_hyper_normalized_value.as_str() {
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
obkv.insert(deladd_key, val)?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref());
|
||||
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
} else {
|
||||
// if the facet string is different, we need to insert the deletion and addition in different obkvs because the related keys are different.
|
||||
// deletion
|
||||
if deladd_reader.get(DelAdd::Deletion).is_some() {
|
||||
// insert old value
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Deletion, val)?;
|
||||
obkv.finish()?;
|
||||
let key: (u16, &str) = (field_id, old_hyper_normalized_value.as_ref());
|
||||
let key_bytes =
|
||||
BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
// addition
|
||||
if deladd_reader.get(DelAdd::Addition).is_some() {
|
||||
// insert new value
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Addition, val)?;
|
||||
obkv.finish()?;
|
||||
let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref());
|
||||
let key_bytes =
|
||||
BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
|
||||
}
|
||||
|
||||
/// Normalizes the facet string and truncates it to the max length.
|
||||
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
|
||||
let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
|
||||
let mut detection = StrDetection::new(facet_string, locales);
|
||||
|
||||
let script = detection.script();
|
||||
// Detect the language of the facet string only if several locales are explicitly provided.
|
||||
let language = match locales {
|
||||
Some(&[language]) => Some(language),
|
||||
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let token = Token {
|
||||
lemma: std::borrow::Cow::Borrowed(facet_string),
|
||||
script,
|
||||
language,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// truncate the facet string to the max length
|
||||
token
|
||||
.normalize(&options)
|
||||
.lemma
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect()
|
||||
}
|
||||
@@ -0,0 +1,574 @@
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::mem::size_of;
|
||||
|
||||
use bytemuck::bytes_of;
|
||||
use grenad::Sorter;
|
||||
use heed::BytesEncode;
|
||||
use itertools::{merge_join_by, EitherOrBoth, Itertools};
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{from_slice, Value};
|
||||
use FilterableValues::{Empty, Null, Values};
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepFirst};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
|
||||
|
||||
/// The extracted facet values stored in grenad files by type.
|
||||
pub struct ExtractedFacetValues {
|
||||
pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
}
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid, the fid and the original value as key
|
||||
/// and the normalized value as value extracted from the given chunk of documents.
|
||||
/// We need the fid of the geo fields to correctly parse them as numbers if they were sent as strings initially.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<ExtractedFacetValues> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
KeepFirst,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut fid_docid_facet_strings_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
KeepFirst,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
// The tuples represent the Del and Add sides of a bitmap
|
||||
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
|
||||
// We create two buffers to avoid mutable ref issues with closures.
|
||||
let mut numbers_key_buffer = Vec::new();
|
||||
let mut strings_key_buffer = Vec::new();
|
||||
|
||||
let old_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.old.faceted_fields_ids.iter().copied().collect();
|
||||
let new_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.new.faceted_fields_ids.iter().copied().collect();
|
||||
|
||||
if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
let get_document_json_value = move |field_id, side| {
|
||||
obkv.get(field_id)
|
||||
.map(KvReaderDelAdd::from_slice)
|
||||
.and_then(|kv| kv.get(side))
|
||||
.map(from_slice)
|
||||
.transpose()
|
||||
.map_err(InternalError::SerdeJson)
|
||||
};
|
||||
// iterate over the faceted fields instead of over the whole document.
|
||||
for eob in
|
||||
merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| {
|
||||
old.cmp(new)
|
||||
})
|
||||
{
|
||||
let (field_id, del_value, add_value) = match eob {
|
||||
EitherOrBoth::Left(&field_id) => {
|
||||
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
|
||||
|
||||
// deletion only
|
||||
(field_id, del_value, None)
|
||||
}
|
||||
EitherOrBoth::Right(&field_id) => {
|
||||
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
|
||||
|
||||
// addition only
|
||||
(field_id, None, add_value)
|
||||
}
|
||||
EitherOrBoth::Both(&field_id, _) => {
|
||||
// during settings update, recompute the changing settings only.
|
||||
if settings_diff.settings_update_only {
|
||||
continue;
|
||||
}
|
||||
|
||||
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
|
||||
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
|
||||
|
||||
(field_id, del_value, add_value)
|
||||
}
|
||||
};
|
||||
|
||||
if del_value.is_some() || add_value.is_some() {
|
||||
numbers_key_buffer.clear();
|
||||
strings_key_buffer.clear();
|
||||
|
||||
// Set key to the field_id
|
||||
// Note: this encoding is consistent with FieldIdCodec
|
||||
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
|
||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||
let document = DocumentId::from_be_bytes(document);
|
||||
|
||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||
numbers_key_buffer.extend_from_slice(docid_bytes);
|
||||
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
// We insert the document id on the Del and the Add side if the field exists.
|
||||
let (ref mut del_exists, ref mut add_exists) =
|
||||
facet_exists_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_null, ref mut add_is_null) =
|
||||
facet_is_null_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_empty, ref mut add_is_empty) =
|
||||
facet_is_empty_docids.entry(field_id).or_default();
|
||||
|
||||
if del_value.is_some() {
|
||||
del_exists.insert(document);
|
||||
}
|
||||
if add_value.is_some() {
|
||||
add_exists.insert(document);
|
||||
}
|
||||
|
||||
let del_geo_support = settings_diff
|
||||
.old
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let add_geo_support = settings_diff
|
||||
.new
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let del_filterable_values =
|
||||
del_value.map(|value| extract_facet_values(&value, del_geo_support));
|
||||
let add_filterable_values =
|
||||
add_value.map(|value| extract_facet_values(&value, add_geo_support));
|
||||
|
||||
// Those closures are just here to simplify things a bit.
|
||||
let mut insert_numbers_diff = |del_numbers, add_numbers| {
|
||||
insert_numbers_diff(
|
||||
&mut fid_docid_facet_numbers_sorter,
|
||||
&mut numbers_key_buffer,
|
||||
del_numbers,
|
||||
add_numbers,
|
||||
)
|
||||
};
|
||||
let mut insert_strings_diff = |del_strings, add_strings| {
|
||||
insert_strings_diff(
|
||||
&mut fid_docid_facet_strings_sorter,
|
||||
&mut strings_key_buffer,
|
||||
del_strings,
|
||||
add_strings,
|
||||
)
|
||||
};
|
||||
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(None, None) => (),
|
||||
(Some(del_filterable_values), None) => match del_filterable_values {
|
||||
Null => {
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
},
|
||||
(None, Some(add_filterable_values)) => match add_filterable_values {
|
||||
Null => {
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
}
|
||||
},
|
||||
(Some(del_filterable_values), Some(add_filterable_values)) => {
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(Null, Null) | (Empty, Empty) => (),
|
||||
(Null, Empty) => {
|
||||
del_is_null.insert(document);
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
(Empty, Null) => {
|
||||
del_is_empty.insert(document);
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
(Null, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
(Empty, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
(Values { numbers, strings }, Null) => {
|
||||
add_is_null.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(Values { numbers, strings }, Empty) => {
|
||||
add_is_empty.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(
|
||||
Values { numbers: del_numbers, strings: del_strings },
|
||||
Values { numbers: add_numbers, strings: add_strings },
|
||||
) => {
|
||||
insert_numbers_diff(del_numbers, add_numbers)?;
|
||||
insert_strings_diff(del_strings, add_strings)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut facet_exists_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||
|
||||
let mut facet_is_null_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
||||
|
||||
let mut facet_is_empty_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
||||
|
||||
Ok(ExtractedFacetValues {
|
||||
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
||||
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
||||
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
||||
})
|
||||
}
|
||||
|
||||
/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
fn deladd_obkv_cbo_roaring_bitmaps(
    buffer: &mut Vec<u8>,
    del_bitmap: &RoaringBitmap,
    add_bitmap: &RoaringBitmap,
) -> io::Result<()> {
    buffer.clear();
    let mut obkv = KvWriterDelAdd::new(buffer);
    let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
    let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
    obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
    obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
    obkv.finish()
}
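
#[cfg(test)]
mod deladd_obkv_tests {
    // Minimal sketch, not part of the upstream sources: it encodes two bitmaps with
    // `deladd_obkv_cbo_roaring_bitmaps` and reads them back through `KvReaderDelAdd`
    // and `CboRoaringBitmapCodec`, assuming both codecs round-trip.
    use heed::BytesDecode;

    use super::*;

    #[test]
    fn roundtrips_both_sides() {
        let del: RoaringBitmap = (0..3).collect();
        let add: RoaringBitmap = (10..12).collect();

        let mut buffer = Vec::new();
        deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del, &add).unwrap();

        let obkv = KvReaderDelAdd::from_slice(&buffer);
        let del_bytes = obkv.get(DelAdd::Deletion).unwrap();
        let add_bytes = obkv.get(DelAdd::Addition).unwrap();
        assert_eq!(CboRoaringBitmapCodec::bytes_decode(del_bytes).unwrap(), del);
        assert_eq!(CboRoaringBitmapCodec::bytes_decode(add_bytes).unwrap(), add);
    }
}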
|
||||
|
||||
/// Truncates a string to the biggest valid LMDB key size.
fn truncate_str(s: &str) -> &str {
    let index = s
        .char_indices()
        .map(|(idx, _)| idx)
        .chain(std::iter::once(s.len()))
        .take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
        .last();

    &s[..index.unwrap_or(0)]
}
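
#[cfg(test)]
mod truncate_str_tests {
    // Minimal sketch, not part of the upstream sources: it assumes `MAX_FACET_VALUE_LENGTH`
    // is a `usize` of at least a few bytes and checks that `truncate_str` never cuts a
    // string in the middle of a multi-byte UTF-8 character (slicing would panic otherwise).
    use super::*;

    #[test]
    fn never_splits_a_char() {
        let s = "é".repeat(MAX_FACET_VALUE_LENGTH); // 2 bytes per character
        let truncated = truncate_str(&s);
        assert!(truncated.len() <= MAX_FACET_VALUE_LENGTH);
        assert!(s.is_char_boundary(truncated.len()));
    }
}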
|
||||
|
||||
/// Computes the diff between both Del and Add numbers and
/// only inserts the parts that differ in the sorter.
fn insert_numbers_diff(
    fid_docid_facet_numbers_sorter: &mut Sorter<KeepFirst>,
    key_buffer: &mut Vec<u8>,
    mut del_numbers: Vec<f64>,
    mut add_numbers: Vec<f64>,
) -> Result<()> {
    // We sort and dedup the float numbers
    del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
    add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
    del_numbers.dedup_by_key(|f| OrderedFloat(*f));
    add_numbers.dedup_by_key(|f| OrderedFloat(*f));

    let merged_numbers_iter = itertools::merge_join_by(
        del_numbers.into_iter().map(OrderedFloat),
        add_numbers.into_iter().map(OrderedFloat),
        |del, add| del.cmp(add),
    );

    // insert facet numbers in sorter
    for eob in merged_numbers_iter {
        key_buffer.truncate(TRUNCATE_SIZE);
        match eob {
            EitherOrBoth::Both(_, _) => (), // no need to touch anything
            EitherOrBoth::Left(OrderedFloat(number)) => {
                if let Some(value_bytes) = f64_into_bytes(number) {
                    key_buffer.extend_from_slice(&value_bytes);
                    key_buffer.extend_from_slice(&number.to_be_bytes());

                    // We insert only the Del part of the Obkv to inform
                    // that we only want to remove all those numbers.
                    let mut obkv = KvWriterDelAdd::memory();
                    obkv.insert(DelAdd::Deletion, bytes_of(&()))?;
                    let bytes = obkv.into_inner()?;
                    fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
                }
            }
            EitherOrBoth::Right(OrderedFloat(number)) => {
                if let Some(value_bytes) = f64_into_bytes(number) {
                    key_buffer.extend_from_slice(&value_bytes);
                    key_buffer.extend_from_slice(&number.to_be_bytes());

                    // We insert only the Add part of the Obkv to inform
                    // that we only want to add all those numbers.
                    let mut obkv = KvWriterDelAdd::memory();
                    obkv.insert(DelAdd::Addition, bytes_of(&()))?;
                    let bytes = obkv.into_inner()?;
                    fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
                }
            }
        }
    }

    Ok(())
}
|
||||
|
||||
/// Computes the diff between both Del and Add strings and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn insert_strings_diff(
|
||||
fid_docid_facet_strings_sorter: &mut Sorter<KeepFirst>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_strings: Vec<(String, String)>,
|
||||
mut add_strings: Vec<(String, String)>,
|
||||
) -> Result<()> {
|
||||
// We sort and dedup the normalized and original strings
|
||||
del_strings.sort_unstable();
|
||||
add_strings.sort_unstable();
|
||||
del_strings.dedup();
|
||||
add_strings.dedup();
|
||||
|
||||
let del_strings = del_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||
let add_strings = add_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||
|
||||
let merged_strings_iter = itertools::merge_join_by(
|
||||
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||
|(normalized_del, _), (normalized_add, _)| normalized_del.cmp(normalized_add),
|
||||
);
|
||||
|
||||
// insert normalized and original facet string in sorter
|
||||
for eob in merged_strings_iter {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let (side, normalized, original) = match eob {
|
||||
EitherOrBoth::Both((normalized, del), (_, add)) => {
|
||||
let merged_strings_iter =
|
||||
itertools::merge_join_by(del, add, |(_, original_del), (_, original_add)| {
|
||||
original_del.cmp(original_add)
|
||||
});
|
||||
|
||||
// FIXME: we're in a bit of a pickle here, because we're only saving **one** original value per side,
|
||||
// but we possibly have multiple original values that changed in the case where the field is an
|
||||
// array of multiple values that normalize to the same value.
|
||||
// (e.g. "foo" = ["bar", "Bar", "bAr", "baR"]. I'm not judging why you would do that ¯\_(ツ)_/¯)
|
||||
//
|
||||
// We'll work best effort by ignoring when the same value appears in both sides, deleting the first
|
||||
// value that is only in the old version, and adding the first value that is only in the new version
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
let mut del = None;
|
||||
let mut add = None;
|
||||
let mut both = None;
|
||||
|
||||
for eob in merged_strings_iter {
|
||||
match eob {
|
||||
EitherOrBoth::Both((_normalized, original), _) => {
|
||||
both = match both {
|
||||
Some(both) => Some(both),
|
||||
None => Some(original),
|
||||
}
|
||||
}
|
||||
EitherOrBoth::Left((_normalized, original)) => {
|
||||
del = match del {
|
||||
Some(del) => Some(del),
|
||||
None => Some(original),
|
||||
};
|
||||
}
|
||||
EitherOrBoth::Right((_normalized, original)) => {
|
||||
add = match add {
|
||||
Some(add) => Some(add),
|
||||
None => Some(original),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(del) = del {
|
||||
obkv.insert(DelAdd::Deletion, del)?;
|
||||
}
|
||||
if let Some(add) = add
|
||||
// prefer the newly added, but if there is none, keep a value in the list of values
|
||||
// since the normalized value appears both in old and new, we should never remove it.
|
||||
.or(both)
|
||||
{
|
||||
obkv.insert(DelAdd::Addition, add)?;
|
||||
}
|
||||
|
||||
let truncated = truncate_str(normalized);
|
||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||
continue;
|
||||
}
|
||||
EitherOrBoth::Left((_normalized, mut original)) => {
|
||||
// FIXME: we only consider the first value for the purpose of facet search
|
||||
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||
let Some((normalized, original)) = original.next() else {
|
||||
continue;
|
||||
};
|
||||
(DelAdd::Deletion, normalized, original)
|
||||
}
|
||||
EitherOrBoth::Right((_normalized, mut original)) => {
|
||||
// FIXME: we only consider the first value for the purpose of facet search
|
||||
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||
let Some((normalized, original)) = original.next() else {
|
||||
continue;
|
||||
};
|
||||
(DelAdd::Addition, normalized, original)
|
||||
}
|
||||
};
|
||||
let truncated = truncate_str(normalized);
|
||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(side, original)?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Represents what a document field contains.
enum FilterableValues {
    /// Corresponds to the JSON `null` value.
    Null,
    /// Corresponds to either an empty string `""`, an empty array `[]`, or an empty object `{}`.
    Empty,
    /// Represents all the number and string values found in this document field.
    Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}
|
||||
|
||||
/// Extracts the facet values of a JSON field.
|
||||
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
|
||||
fn inner_extract_facet_values(
|
||||
value: &Value,
|
||||
can_recurse: bool,
|
||||
output_numbers: &mut Vec<f64>,
|
||||
output_strings: &mut Vec<(String, String)>,
|
||||
geo_field: bool,
|
||||
) {
|
||||
match value {
|
||||
Value::Null => (),
|
||||
Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())),
|
||||
Value::Number(number) => {
|
||||
if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
}
|
||||
}
|
||||
Value::String(original) => {
|
||||
// if we're working on a geofield it MUST be something we can parse or else there was an internal error
|
||||
// in the enrich pipeline. But since the enrich pipeline worked, we want to avoid crashing at all costs.
|
||||
if geo_field {
|
||||
if let Ok(float) = original.parse() {
|
||||
output_numbers.push(float);
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"Internal error, could not parse a geofield that has been validated. Please open an issue."
|
||||
)
|
||||
}
|
||||
}
|
||||
let normalized = crate::normalize_facet(original);
|
||||
output_strings.push((normalized, original.clone()));
|
||||
}
|
||||
Value::Array(values) => {
|
||||
if can_recurse {
|
||||
for value in values {
|
||||
inner_extract_facet_values(
|
||||
value,
|
||||
false,
|
||||
output_numbers,
|
||||
output_strings,
|
||||
geo_field,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Value::Object(_) => (),
|
||||
}
|
||||
}
|
||||
|
||||
match value {
|
||||
Value::Null => FilterableValues::Null,
|
||||
Value::String(s) if s.is_empty() => FilterableValues::Empty,
|
||||
Value::Array(a) if a.is_empty() => FilterableValues::Empty,
|
||||
Value::Object(o) if o.is_empty() => FilterableValues::Empty,
|
||||
otherwise => {
|
||||
let mut numbers = Vec::new();
|
||||
let mut strings = Vec::new();
|
||||
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings, geo_field);
|
||||
FilterableValues::Values { numbers, strings }
|
||||
}
|
||||
}
|
||||
}
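
#[cfg(test)]
mod extract_facet_values_tests {
    // Minimal sketch, not part of the upstream sources: it only illustrates how
    // `extract_facet_values` classifies a few JSON values, with `geo_field` set to false.
    // The exact normalized form of strings depends on `crate::normalize_facet`, so the
    // test only checks the original side of the pair.
    use serde_json::json;

    use super::{extract_facet_values, FilterableValues};

    #[test]
    fn classifies_json_values() {
        assert!(matches!(extract_facet_values(&json!(null), false), FilterableValues::Null));
        assert!(matches!(extract_facet_values(&json!(""), false), FilterableValues::Empty));
        assert!(matches!(extract_facet_values(&json!([]), false), FilterableValues::Empty));

        match extract_facet_values(&json!(["Bar", 2]), false) {
            FilterableValues::Values { numbers, strings } => {
                assert_eq!(numbers, vec![2.0]);
                assert_eq!(strings.len(), 1);
                assert_eq!(strings[0].1, "Bar");
            }
            _ => panic!("expected FilterableValues::Values"),
        }
    }
}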
|
||||
@@ -0,0 +1,96 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
/// Extracts the field id word count and the documents ids where
|
||||
/// this field id with this amount of words appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted field id word counts
|
||||
/// and documents ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
let deletion = del_add_reader
|
||||
// get deleted words
|
||||
.get(DelAdd::Deletion)
|
||||
// count deleted words
|
||||
.map(|deletion| {
|
||||
KvReaderU16::from_slice(deletion).iter().take(MAX_COUNTED_WORDS + 1).count()
|
||||
})
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
let addition = del_add_reader
|
||||
// get added words
|
||||
.get(DelAdd::Addition)
|
||||
// count added words
|
||||
.map(|addition| {
|
||||
KvReaderU16::from_slice(addition).iter().take(MAX_COUNTED_WORDS + 1).count()
|
||||
})
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
|
||||
if deletion != addition {
|
||||
// Insert the deleted word count in the sorter if it exists.
|
||||
if let Some(word_count) = deletion {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
// Insert the added word count in the sorter if it exists.
|
||||
if let Some(word_count) = addition {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sorter_into_reader(fid_word_count_docids_sorter, indexer)
|
||||
}
|
||||
@@ -0,0 +1,103 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use concat_arrays::concat_arrays;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::GeoError;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::extract_finite_float_from_value;
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, InternalError, Result};
|
||||
|
||||
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_geo_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
primary_key_id: FieldId,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
// since we only need the primary key when we throw an error
|
||||
// we create this getter to lazily get it when needed
|
||||
let document_id = || -> Value {
|
||||
let reader = KvReaderDelAdd::from_slice(obkv.get(primary_key_id).unwrap());
|
||||
let document_id =
|
||||
reader.get(DelAdd::Deletion).or(reader.get(DelAdd::Addition)).unwrap();
|
||||
serde_json::from_slice(document_id).unwrap()
|
||||
};
|
||||
|
||||
// extract old version
|
||||
let del_lat_lng = extract_lat_lng(obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
|
||||
// extract new version
|
||||
let add_lat_lng = extract_lat_lng(obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
|
||||
|
||||
if del_lat_lng != add_lat_lng {
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
if let Some([lat, lng]) = del_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Deletion, bytes)?;
|
||||
}
|
||||
if let Some([lat, lng]) = add_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Addition, bytes)?;
|
||||
}
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(docid_bytes, bytes)?;
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
/// Extracts the finite float lat and lng from two byte slices.
fn extract_lat_lng(
    document: &obkv::KvReader<FieldId>,
    settings: &InnerIndexSettings,
    deladd: DelAdd,
    document_id: impl Fn() -> Value,
) -> Result<Option<[f64; 2]>> {
    match settings.geo_fields_ids {
        Some((lat_fid, lng_fid)) => {
            let lat =
                document.get(lat_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd));
            let lng =
                document.get(lng_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd));
            let (lat, lng) = match (lat, lng) {
                (Some(lat), Some(lng)) => (lat, lng),
                (Some(_), None) => {
                    return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
                }
                (None, Some(_)) => {
                    return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
                }
                (None, None) => return Ok(None),
            };
            let lat = extract_finite_float_from_value(
                serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
            )
            .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;

            let lng = extract_finite_float_from_value(
                serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
            )
            .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
            Ok(Some([lat, lng]))
        }
        None => Ok(None),
    }
}
|
||||
@@ -0,0 +1,841 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::mem::size_of;
|
||||
use std::str::from_utf8;
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytemuck::cast_slice;
|
||||
use grenad::Writer;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::FaultSource;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::prompt::{FieldsIdsMapWithMetadata, Prompt};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
|
||||
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME};
|
||||
use crate::vector::settings::ReindexAction;
|
||||
use crate::vector::{Embedder, Embedding};
|
||||
use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
|
||||
pub struct ExtractedVectorPoints {
|
||||
// docid, _index -> KvWriterDelAdd -> Vector
|
||||
pub manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> ()
|
||||
pub remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> prompt
|
||||
pub prompts: grenad::Reader<BufReader<File>>,
|
||||
|
||||
// embedder
|
||||
pub embedder_name: String,
|
||||
pub embedder: Arc<Embedder>,
|
||||
pub add_to_user_provided: RoaringBitmap,
|
||||
pub remove_from_user_provided: RoaringBitmap,
|
||||
}
|
||||
|
||||
enum VectorStateDelta {
|
||||
NoChange,
|
||||
// Remove all vectors, generated or manual, from this document
|
||||
NowRemoved,
|
||||
|
||||
NowManual(Vec<Vec<f32>>),
|
||||
|
||||
// Add the vector computed from the specified prompt
|
||||
// Remove any previous vector
|
||||
// Note: changing the value of the prompt **does require** recording this delta
|
||||
NowGenerated(String),
|
||||
}
|
||||
|
||||
impl VectorStateDelta {
|
||||
fn into_values(self) -> (bool, String, Vec<Vec<f32>>) {
|
||||
match self {
|
||||
VectorStateDelta::NoChange => Default::default(),
|
||||
VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
|
||||
// We always delete the previous vectors
|
||||
VectorStateDelta::NowManual(add) => (true, Default::default(), add),
|
||||
VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct EmbedderVectorExtractor {
|
||||
embedder_name: String,
|
||||
embedder: Arc<Embedder>,
|
||||
prompt: Arc<Prompt>,
|
||||
|
||||
// (docid) -> (prompt)
|
||||
prompts_writer: Writer<BufWriter<File>>,
|
||||
// (docid) -> ()
|
||||
remove_vectors_writer: Writer<BufWriter<File>>,
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
manual_vectors_writer: Writer<BufWriter<File>>,
|
||||
// The docids of the documents that contains a user defined embedding
|
||||
add_to_user_provided: RoaringBitmap,
|
||||
|
||||
action: ExtractionAction,
|
||||
}
|
||||
|
||||
struct DocumentOperation {
|
||||
// The docids of the documents that contains an auto-generated embedding
|
||||
remove_from_user_provided: RoaringBitmap,
|
||||
}
|
||||
|
||||
enum ExtractionAction {
|
||||
SettingsFullReindex,
|
||||
SettingsRegeneratePrompts { old_prompt: Arc<Prompt> },
|
||||
DocumentOperation(DocumentOperation),
|
||||
}
|
||||
|
||||
struct ManualEmbedderErrors {
|
||||
embedder_name: String,
|
||||
docid: String,
|
||||
other_docids: usize,
|
||||
}
|
||||
|
||||
impl ManualEmbedderErrors {
|
||||
pub fn push_error(
|
||||
errors: &mut Option<ManualEmbedderErrors>,
|
||||
embedder_name: &str,
|
||||
document_id: impl Fn() -> Value,
|
||||
) {
|
||||
match errors {
|
||||
Some(errors) => {
|
||||
if errors.embedder_name == embedder_name {
|
||||
errors.other_docids = errors.other_docids.saturating_add(1)
|
||||
}
|
||||
}
|
||||
None => {
|
||||
*errors = Some(Self {
|
||||
embedder_name: embedder_name.to_owned(),
|
||||
docid: document_id().to_string(),
|
||||
other_docids: 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_result(
|
||||
errors: Option<ManualEmbedderErrors>,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
) -> Result<()> {
|
||||
match errors {
|
||||
Some(errors) => {
|
||||
let embedder_name = &errors.embedder_name;
|
||||
let mut msg = format!(
|
||||
r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document {}{}",
|
||||
errors.docid,
|
||||
if errors.other_docids != 0 {
|
||||
format!(" and at least {} other document(s)", errors.other_docids)
|
||||
} else {
|
||||
"".to_string()
|
||||
}
|
||||
);
|
||||
|
||||
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||
|
||||
let mut hint_count = 0;
|
||||
|
||||
for (vector_misspelling, count) in
|
||||
possible_embedding_mistakes.vector_mistakes().take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||
.embedder_mistakes(embedder_name, unused_vectors_distribution)
|
||||
.take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
if hint_count == 0 {
|
||||
msg += &format!(
|
||||
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||
);
|
||||
}
|
||||
|
||||
Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
|
||||
}
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedders_configs: &[IndexEmbeddingConfig],
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
) -> Result<(Vec<ExtractedVectorPoints>, UnusedVectorsDistribution)> {
|
||||
let mut unused_vectors_distribution = UnusedVectorsDistribution::new();
|
||||
let mut manual_errors = None;
|
||||
let reindex_vectors = settings_diff.reindex_vectors();
|
||||
|
||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||
let old_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids);
|
||||
|
||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||
let new_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids);
|
||||
|
||||
// the vector field id may have changed
|
||||
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let mut extractors = Vec::new();
|
||||
|
||||
let mut configs = settings_diff.new.embedding_configs.clone().into_inner();
|
||||
let old_configs = &settings_diff.old.embedding_configs;
|
||||
|
||||
if reindex_vectors {
|
||||
for (name, action) in settings_diff.embedding_config_updates.iter() {
|
||||
if let Some(action) = action.reindex() {
|
||||
let Some((embedder_name, (embedder, prompt, _quantized))) =
|
||||
configs.remove_entry(name)
|
||||
else {
|
||||
tracing::error!(embedder = name, "Requested embedder config not found");
|
||||
continue;
|
||||
};
|
||||
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> (prompt)
|
||||
let prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let action = match action {
|
||||
ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex,
|
||||
ReindexAction::RegeneratePrompts => {
|
||||
let Some((_, old_prompt, _quantized)) = old_configs.get(name) else {
|
||||
tracing::error!(embedder = name, "Old embedder config not found");
|
||||
continue;
|
||||
};
|
||||
|
||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt }
|
||||
}
|
||||
};
|
||||
|
||||
extractors.push(EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided: RoaringBitmap::new(),
|
||||
action,
|
||||
});
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// document operation
|
||||
|
||||
for (embedder_name, (embedder, prompt, _quantized)) in configs.into_iter() {
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> (prompt)
|
||||
let prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
extractors.push(EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided: RoaringBitmap::new(),
|
||||
action: ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided: RoaringBitmap::new(),
|
||||
}),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
// this must always be serialized as (docid, external_docid);
|
||||
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
|
||||
let (docid_bytes, external_id_bytes) =
|
||||
try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
|
||||
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||
let docid = DocumentId::from_be_bytes(docid_bytes);
|
||||
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(docid_bytes.as_slice());
|
||||
|
||||
// since we only need the primary key when we throw an error we create this getter to
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
let mut parsed_vectors = ParsedVectorsDiff::new(
|
||||
docid,
|
||||
embedders_configs,
|
||||
obkv,
|
||||
old_vectors_fid,
|
||||
new_vectors_fid,
|
||||
)
|
||||
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
|
||||
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided,
|
||||
action,
|
||||
} in extractors.iter_mut()
|
||||
{
|
||||
let embedder_is_manual = matches!(**embedder, Embedder::UserProvided(_));
|
||||
|
||||
let (old, new) = parsed_vectors.remove(embedder_name);
|
||||
let delta = match action {
|
||||
ExtractionAction::SettingsFullReindex => match old {
|
||||
// A full reindex can be triggered either by:
|
||||
// 1. a new embedder
|
||||
// 2. an existing embedder changed so that it must regenerate all generated embeddings.
|
||||
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
|
||||
VectorState::Inline(vectors) => {
|
||||
if !vectors.must_regenerate() {
|
||||
add_to_user_provided.insert(docid);
|
||||
}
|
||||
|
||||
match vectors.into_array_of_vectors() {
|
||||
Some(add_vectors) => {
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(
|
||||
crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
),
|
||||
));
|
||||
}
|
||||
VectorStateDelta::NowManual(add_vectors)
|
||||
}
|
||||
None => VectorStateDelta::NoChange,
|
||||
}
|
||||
}
|
||||
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
|
||||
VectorState::Manual => VectorStateDelta::NoChange,
|
||||
// generated vectors must be regenerated
|
||||
VectorState::Generated => {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(
|
||||
&mut manual_errors,
|
||||
embedder_name.as_str(),
|
||||
document_id,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
regenerate_prompt(obkv, prompt, &new_fields_ids_map)?
|
||||
}
|
||||
},
|
||||
// prompt regeneration is only triggered for existing embedders
|
||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
|
||||
if old.must_regenerate() {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(
|
||||
&mut manual_errors,
|
||||
embedder_name.as_str(),
|
||||
document_id,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
regenerate_if_prompt_changed(
|
||||
obkv,
|
||||
(old_prompt, prompt),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
)?
|
||||
} else {
|
||||
// we can simply ignore user provided vectors as they are not regenerated and are
|
||||
// already in the DB since this is an existing embedder
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
}
|
||||
ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided,
|
||||
}) => extract_vector_document_diff(
|
||||
docid,
|
||||
obkv,
|
||||
prompt,
|
||||
(add_to_user_provided, remove_from_user_provided),
|
||||
(old, new),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
document_id,
|
||||
embedder_name,
|
||||
embedder_is_manual,
|
||||
&mut manual_errors,
|
||||
)?,
|
||||
};
|
||||
// and we finally push the unique vectors into the writer
|
||||
push_vectors_diff(
|
||||
remove_vectors_writer,
|
||||
prompts_writer,
|
||||
manual_vectors_writer,
|
||||
&mut key_buffer,
|
||||
delta,
|
||||
)?;
|
||||
}
|
||||
|
||||
unused_vectors_distribution.append(parsed_vectors);
|
||||
}
|
||||
|
||||
ManualEmbedderErrors::to_result(
|
||||
manual_errors,
|
||||
possible_embedding_mistakes,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
|
||||
let mut results = Vec::new();
|
||||
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt: _,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
action,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided,
|
||||
} in extractors
|
||||
{
|
||||
let remove_from_user_provided =
|
||||
if let ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided,
|
||||
}) = action
|
||||
{
|
||||
remove_from_user_provided
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
results.push(ExtractedVectorPoints {
|
||||
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
||||
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
||||
prompts: writer_into_reader(prompts_writer)?,
|
||||
embedder,
|
||||
embedder_name,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
})
|
||||
}
|
||||
|
||||
Ok((results, unused_vectors_distribution))
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)] // feel free to find efficient way to factor arguments
|
||||
fn extract_vector_document_diff(
|
||||
docid: DocumentId,
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
prompt: &Prompt,
|
||||
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
|
||||
(old, new): (VectorState, VectorState),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
document_id: impl Fn() -> Value,
|
||||
embedder_name: &str,
|
||||
embedder_is_manual: bool,
|
||||
manual_errors: &mut Option<ManualEmbedderErrors>,
|
||||
) -> Result<VectorStateDelta> {
|
||||
match (old.must_regenerate(), new.must_regenerate()) {
|
||||
(true, true) | (false, false) => {}
|
||||
(true, false) => {
|
||||
add_to_user_provided.insert(docid);
|
||||
}
|
||||
(false, true) => {
|
||||
remove_from_user_provided.insert(docid);
|
||||
}
|
||||
}
|
||||
|
||||
let delta = match (old, new) {
|
||||
// regardless of the previous state, if a document now contains inline _vectors, they must
|
||||
// be extracted manually
|
||||
(_old, VectorState::Inline(new)) => match new.into_array_of_vectors() {
|
||||
Some(add_vectors) => {
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
)));
|
||||
}
|
||||
|
||||
VectorStateDelta::NowManual(add_vectors)
|
||||
}
|
||||
None => VectorStateDelta::NoChange,
|
||||
},
|
||||
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
|
||||
// document changed
|
||||
(VectorState::Generated, VectorState::Generated) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id);
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt = Some(&prompt).map(|p| {
|
||||
p.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map)
|
||||
.unwrap_or_default()
|
||||
});
|
||||
let new_prompt =
|
||||
prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
if old_prompt.as_ref() != Some(&new_prompt) {
|
||||
let old_prompt = old_prompt.unwrap_or_default();
|
||||
tracing::trace!(
|
||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||
);
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
tracing::trace!("⏭️ Prompt unmodified, skipping");
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
// inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from
|
||||
// the previous version of the document.
|
||||
// Manual -> Generated is also not possible without an Inline to the right (which is handled above)
|
||||
// Generated -> Generated is handled above, so not possible
|
||||
// As a result, this code is unreachable
|
||||
(_not_generated, VectorState::Generated) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id);
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
// becomes autogenerated
|
||||
VectorStateDelta::NowGenerated(prompt.render_kvdeladd(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
new_fields_ids_map,
|
||||
)?)
|
||||
} else {
|
||||
// make sure the document is always removed from user provided on removal
|
||||
remove_from_user_provided.insert(docid);
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
// inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous
|
||||
// version of the document.
|
||||
// however the Rust type system cannot know that.
|
||||
(_manual, VectorState::Manual) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
// if the new version of documents has the vectors in the DB,
|
||||
// then they are user-provided and nothing possibly changed
|
||||
VectorStateDelta::NoChange
|
||||
} else {
|
||||
// make sure the document is always removed from user provided on removal
|
||||
remove_from_user_provided.insert(docid);
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(delta)
|
||||
}
|
||||
|
||||
fn regenerate_if_prompt_changed(
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
(old_prompt, new_prompt): (&Prompt, &Prompt),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
) -> Result<VectorStateDelta> {
|
||||
let old_prompt = old_prompt
|
||||
.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map)
|
||||
.unwrap_or(Default::default());
|
||||
let new_prompt = new_prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
|
||||
if new_prompt == old_prompt {
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
Ok(VectorStateDelta::NowGenerated(new_prompt))
|
||||
}
|
||||
|
||||
fn regenerate_prompt(
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
prompt: &Prompt,
|
||||
new_fields_ids_map: &FieldsIdsMapWithMetadata,
|
||||
) -> Result<VectorStateDelta> {
|
||||
let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
|
||||
Ok(VectorStateDelta::NowGenerated(prompt))
|
||||
}
|
||||
|
||||
/// We cannot compute the diff between both Del and Add vectors.
|
||||
/// We'll push every vector and compute the difference later in TypedChunk.
|
||||
fn push_vectors_diff(
|
||||
remove_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
prompts_writer: &mut Writer<BufWriter<File>>,
|
||||
manual_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
delta: VectorStateDelta,
|
||||
) -> Result<()> {
|
||||
let (must_remove, prompt, mut add_vectors) = delta.into_values();
|
||||
if must_remove {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
remove_vectors_writer.insert(&key_buffer, [])?;
|
||||
}
|
||||
if !prompt.is_empty() {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
prompts_writer.insert(&key_buffer, prompt.as_bytes())?;
|
||||
}
|
||||
|
||||
// We sort and dedup the vectors
|
||||
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||
|
||||
// insert vectors into the writer
|
||||
for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
|
||||
// Generate the key by extending the unique index to it.
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let index = u16::try_from(i).unwrap();
|
||||
key_buffer.extend_from_slice(&index.to_be_bytes());
|
||||
|
||||
// We insert only the Add part of the Obkv to inform
// that we only want to add all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compares two vectors by using the OrderedFloat helper.
fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
    a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
}
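
#[cfg(test)]
mod compare_vectors_tests {
    // Minimal sketch, not part of the upstream sources: `compare_vectors` gives a total,
    // lexicographic order over `f32` slices, including NaN, thanks to `OrderedFloat`.
    use std::cmp::Ordering;

    use super::compare_vectors;

    #[test]
    fn orders_vectors_lexicographically() {
        assert_eq!(compare_vectors(&[1.0, 2.0], &[1.0, 3.0]), Ordering::Less);
        // a strict prefix compares as smaller
        assert_eq!(compare_vectors(&[1.0], &[1.0, 0.0]), Ordering::Less);
        // NaN compares equal to itself under OrderedFloat
        assert_eq!(compare_vectors(&[f32::NAN], &[f32::NAN]), Ordering::Equal);
    }
}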
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
// docid, prompt
|
||||
prompt_reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedder: Arc<Embedder>,
|
||||
embedder_name: &str,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
request_threads: &ThreadPoolNoAbort,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
|
||||
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk
|
||||
|
||||
// docid, state with embedding
|
||||
let mut state_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut chunks = Vec::with_capacity(n_chunks);
|
||||
let mut current_chunk = Vec::with_capacity(n_vectors_per_chunk);
|
||||
let mut current_chunk_ids = Vec::with_capacity(n_vectors_per_chunk);
|
||||
let mut chunks_ids = Vec::with_capacity(n_chunks);
|
||||
let mut cursor = prompt_reader.into_cursor()?;
|
||||
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
// SAFETY: precondition, the grenad value was saved from a string
|
||||
let prompt = unsafe { std::str::from_utf8_unchecked(value) };
|
||||
if current_chunk.len() == current_chunk.capacity() {
|
||||
chunks.push(std::mem::replace(
|
||||
&mut current_chunk,
|
||||
Vec::with_capacity(n_vectors_per_chunk),
|
||||
));
|
||||
chunks_ids.push(std::mem::replace(
|
||||
&mut current_chunk_ids,
|
||||
Vec::with_capacity(n_vectors_per_chunk),
|
||||
));
|
||||
};
|
||||
current_chunk.push(prompt.to_owned());
|
||||
current_chunk_ids.push(docid);
|
||||
|
||||
if chunks.len() == chunks.capacity() {
|
||||
let chunked_embeds = embed_chunks(
|
||||
&embedder,
|
||||
std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)),
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
|
||||
for (docid, embeddings) in chunks_ids
|
||||
.iter()
|
||||
.flat_map(|docids| docids.iter())
|
||||
.zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter()))
|
||||
{
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?;
|
||||
}
|
||||
chunks_ids.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// send last chunk
|
||||
if !chunks.is_empty() {
|
||||
let chunked_embeds = embed_chunks(
|
||||
&embedder,
|
||||
std::mem::take(&mut chunks),
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
for (docid, embeddings) in chunks_ids
|
||||
.iter()
|
||||
.flat_map(|docids| docids.iter())
|
||||
.zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter()))
|
||||
{
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?;
|
||||
}
|
||||
}
|
||||
|
||||
if !current_chunk.is_empty() {
|
||||
let embeds = embed_chunks(
|
||||
&embedder,
|
||||
vec![std::mem::take(&mut current_chunk)],
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
|
||||
if let Some(embeds) = embeds.first() {
|
||||
for (docid, embeddings) in current_chunk_ids.iter().zip(embeds.iter()) {
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(state_writer)
|
||||
}
|
||||
|
||||
fn embed_chunks(
|
||||
embedder: &Embedder,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
embedder_name: &str,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
request_threads: &ThreadPoolNoAbort,
|
||||
) -> Result<Vec<Vec<Embedding>>> {
|
||||
match embedder.embed_chunks(text_chunks, request_threads) {
|
||||
Ok(chunks) => Ok(chunks),
|
||||
Err(error) => {
|
||||
if let FaultSource::Bug = error.fault {
|
||||
Err(crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(
|
||||
error.into(),
|
||||
)))
|
||||
} else {
|
||||
let mut msg =
|
||||
format!(r"While embedding documents for embedder `{embedder_name}`: {error}");
|
||||
|
||||
if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
|
||||
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||
}
|
||||
|
||||
let mut hint_count = 0;
|
||||
|
||||
for (vector_misspelling, count) in
|
||||
possible_embedding_mistakes.vector_mistakes().take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||
.embedder_mistakes(embedder_name, unused_vectors_distribution)
|
||||
.take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
if hint_count == 0 {
|
||||
if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
|
||||
msg += &format!(
|
||||
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use obkv::KvReaderU16;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::sorter_into_reader;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the word and the documents ids where this word appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted words and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
///
|
||||
/// The first returned reader is the one for normal word_docids, and the second one is for
|
||||
/// exact_word_docids
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_fid_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
true,
|
||||
);
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut del_words = BTreeSet::new();
|
||||
let mut add_words = BTreeSet::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let (fid_bytes, _) = try_split_array_at(fid_bytes)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
let fid = u16::from_be_bytes(fid_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (_pos, word) in KvReaderU16::from_slice(deletion).iter() {
|
||||
del_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (_pos, word) in KvReaderU16::from_slice(addition).iter() {
|
||||
add_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
words_into_sorter(
|
||||
document_id,
|
||||
fid,
|
||||
&mut key_buffer,
|
||||
&del_words,
|
||||
&add_words,
|
||||
&mut word_fid_docids_sorter,
|
||||
)?;
|
||||
|
||||
del_words.clear();
|
||||
add_words.clear();
|
||||
}
|
||||
|
||||
let mut word_fid_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut exact_word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
|
||||
let mut buffer = Vec::new();
|
||||
// NOTE: replacing sorters by bitmap merging is less efficient, so use sorters.
while let Some((key, value)) = iter.next()? {
// only keep the value if there is a change to apply in the DB.
|
||||
if !is_noop_del_add_obkv(KvReaderDelAdd::from_slice(value)) {
|
||||
word_fid_docids_writer.insert(key, value)?;
|
||||
}
|
||||
|
||||
let (w, fid) = StrBEU16Codec::bytes_decode(key)
|
||||
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
|
||||
// merge all deletions
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = obkv.get(DelAdd::Deletion) {
|
||||
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Deletion, value)?;
|
||||
if delete_from_exact {
|
||||
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
} else {
|
||||
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
// merge all additions
|
||||
if let Some(value) = obkv.get(DelAdd::Addition) {
|
||||
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Addition, value)?;
|
||||
if add_in_exact {
|
||||
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
} else {
|
||||
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
sorter_into_reader(word_docids_sorter, indexer)?,
|
||||
sorter_into_reader(exact_word_docids_sorter, indexer)?,
|
||||
writer_into_reader(word_fid_docids_writer)?,
|
||||
))
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn words_into_sorter(
|
||||
document_id: DocumentId,
|
||||
fid: FieldId,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_words: &BTreeSet<Vec<u8>>,
|
||||
add_words: &BTreeSet<Vec<u8>>,
|
||||
word_fid_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let word_bytes = match eob {
|
||||
Left(word_bytes) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
Right(word_bytes) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
Both(word_bytes, _) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
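// A minimal standalone sketch of the del/add classification done just above, using plain
// string words instead of the real BTreeSets, sorter, and obkv value writer; the
// `classify_words` helper and its string labels are illustrative assumptions, not crate APIs.
fn classify_words(del_words: &[&str], add_words: &[&str]) -> Vec<(String, &'static str)> {
    use itertools::merge_join_by;
    use itertools::EitherOrBoth::{Both, Left, Right};

    // both inputs must be sorted, exactly like the BTreeSets used by the extractor
    merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a))
        .map(|eob| match eob {
            // present only in the old version of the document: a pure deletion
            Left(w) => (w.to_string(), "deletion"),
            // present only in the new version: a pure addition
            Right(w) => (w.to_string(), "addition"),
            // present in both versions: the docid ends up on both sides of the DelAdd obkv
            Both(w, _) => (w.to_string(), "deletion+addition"),
        })
        .collect()
}
// classify_words(&["cat", "dog"], &["dog", "fox"])
// => [("cat", "deletion"), ("dog", "deletion+addition"), ("fox", "addition")]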
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn docids_into_writers<W>(
|
||||
word: &str,
|
||||
deletions: &RoaringBitmap,
|
||||
additions: &RoaringBitmap,
|
||||
writer: &mut grenad::Writer<W>,
|
||||
) -> Result<()>
|
||||
where
|
||||
W: std::io::Write,
|
||||
{
|
||||
if deletions == additions {
|
||||
// if the same value is deleted and added, do nothing.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Write each value in the same KvDelAdd before inserting it in the final writer.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
// deletions:
|
||||
if !deletions.is_empty() && !deletions.is_subset(additions) {
|
||||
obkv.insert(
|
||||
DelAdd::Deletion,
|
||||
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
)?;
|
||||
}
|
||||
// additions:
|
||||
if !additions.is_empty() {
|
||||
obkv.insert(
|
||||
DelAdd::Addition,
|
||||
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert everything in the same writer.
|
||||
writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -0,0 +1,261 @@
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{cmp, io};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// Extracts the best proximity between pairs of words and the document ids where each pair appears.
///
/// Returns a grenad reader with the list of extracted word pair proximities and
/// document ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
// early return if the data shouldn't be deleted nor created.
|
||||
if settings_diff.settings_update_only && !settings_diff.reindex_proximities() {
|
||||
let writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
return writer_into_reader(writer);
|
||||
}
|
||||
|
||||
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
||||
.map(|_| {
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / MAX_DISTANCE as usize),
|
||||
true,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut del_word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
let mut add_word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
let mut del_word_pair_proximity = BTreeMap::new();
|
||||
let mut add_word_pair_proximity = BTreeMap::new();
|
||||
let mut current_document_id = None;
|
||||
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
// if we change document, we fill the sorter
|
||||
if current_document_id.map_or(false, |id| id != document_id) {
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter");
|
||||
let _entered = span.enter();
|
||||
|
||||
document_word_positions_into_sorter(
|
||||
current_document_id.unwrap(),
|
||||
&del_word_pair_proximity,
|
||||
&add_word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorters,
|
||||
)?;
|
||||
del_word_pair_proximity.clear();
|
||||
add_word_pair_proximity.clear();
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
if !any_deletion {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// deletions
|
||||
if let Some(deletion) = KvReaderDelAdd::from_slice(value).get(DelAdd::Deletion) {
|
||||
for (position, word) in KvReaderU16::from_slice(deletion).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while del_word_positions.front().map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
}) {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut del_word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
let word = std::str::from_utf8(word)?;
|
||||
del_word_positions.push_back((word.to_string(), position));
|
||||
}
|
||||
|
||||
while !del_word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut del_word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
|| {
|
||||
if !any_addition {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// additions
|
||||
if let Some(addition) = KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
|
||||
for (position, word) in KvReaderU16::from_slice(addition).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while add_word_positions.front().map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
}) {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut add_word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
let word = std::str::from_utf8(word)?;
|
||||
add_word_positions.push_back((word.to_string(), position));
|
||||
}
|
||||
|
||||
while !add_word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut add_word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
del?;
|
||||
add?;
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "final_document_into_sorter");
|
||||
let _entered = span.enter();
|
||||
|
||||
document_word_positions_into_sorter(
|
||||
document_id,
|
||||
&del_word_pair_proximity,
|
||||
&add_word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorters,
|
||||
)?;
|
||||
}
|
||||
{
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "sorter_into_reader");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
for sorter in word_pair_proximity_docids_sorters {
|
||||
sorter.write_into_stream_writer(&mut writer)?;
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
||||
///
|
||||
/// This list is used by the engine to calculate the documents containing words that are
|
||||
/// close to each other.
|
||||
fn document_word_positions_into_sorter(
|
||||
document_id: DocumentId,
|
||||
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeDeladdCboRoaringBitmaps>],
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut key_buffer = Vec::new();
|
||||
for eob in
|
||||
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
|
||||
d.cmp(a)
|
||||
})
|
||||
{
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let ((w1, w2), prox) = match eob {
|
||||
Left(key_value) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
Right(key_value) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
Both(key_value, _) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.push(*prox);
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
|
||||
word_pair_proximity_docids_sorters[*prox as usize - 1]
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn word_positions_into_word_pair_proximity(
|
||||
word_positions: &mut VecDeque<(String, u16)>,
|
||||
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||
) -> Result<()> {
|
||||
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||
for (word, position) in word_positions.iter() {
|
||||
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||
word_pair_proximity
|
||||
.entry((head_word.clone(), word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
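// A compact standalone sketch of the sliding-window pairing implemented above, with a
// simplified symmetric `proximity` stand-in for `index_proximity` and a local MAX_WINDOW
// constant; both are illustrative assumptions, not the crate's real definitions.
fn sketch_pairs_with_min_proximity(words: &[(&str, u32)]) -> BTreeMap<(String, String), u8> {
    const MAX_WINDOW: u32 = 8; // assumed to mirror MAX_DISTANCE

    // assumed stand-in: plain absolute distance between token positions
    fn proximity(lhs: u32, rhs: u32) -> u32 {
        lhs.abs_diff(rhs)
    }

    // pair the head word with every word still in the window, keeping the minimum proximity
    fn drain_head(
        window: &mut VecDeque<(String, u32)>,
        pairs: &mut BTreeMap<(String, String), u8>,
        max: u32,
    ) {
        let (head, head_pos) = window.pop_front().unwrap();
        for (word, pos) in window.iter() {
            let prox = proximity(head_pos, *pos);
            if prox > 0 && prox < max {
                pairs
                    .entry((head.clone(), word.clone()))
                    .and_modify(|p| *p = (*p).min(prox as u8))
                    .or_insert(prox as u8);
            }
        }
    }

    let mut window: VecDeque<(String, u32)> = VecDeque::new();
    let mut pairs = BTreeMap::new();
    for &(word, position) in words {
        // drain the window until its head is considered close to the incoming word
        while window.front().map_or(false, |(_, p)| proximity(*p, position) >= MAX_WINDOW) {
            drain_head(&mut window, &mut pairs, MAX_WINDOW);
        }
        window.push_back((word.to_string(), position));
    }
    while !window.is_empty() {
        drain_head(&mut window, &mut pairs, MAX_WINDOW);
    }
    pairs
}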
|
||||
@@ -0,0 +1,138 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{bucketed_position, DocumentId, Result};
|
||||
|
||||
/// Extracts the word positions and the document ids where each word appears.
///
/// Returns a grenad reader with the list of extracted words at positions and
/// document ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_position_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||
let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||
let mut current_document_id: Option<u32> = None;
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||
|
||||
if current_document_id.map_or(false, |id| document_id != id) {
|
||||
words_position_into_sorter(
|
||||
current_document_id.unwrap(),
|
||||
&mut key_buffer,
|
||||
&del_word_positions,
|
||||
&add_word_positions,
|
||||
&mut word_position_docids_sorter,
|
||||
)?;
|
||||
del_word_positions.clear();
|
||||
add_word_positions.clear();
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (position, word_bytes) in KvReaderU16::from_slice(deletion).iter() {
|
||||
let position = bucketed_position(position);
|
||||
del_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (position, word_bytes) in KvReaderU16::from_slice(addition).iter() {
|
||||
let position = bucketed_position(position);
|
||||
add_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
words_position_into_sorter(
|
||||
document_id,
|
||||
&mut key_buffer,
|
||||
&del_word_positions,
|
||||
&add_word_positions,
|
||||
&mut word_position_docids_sorter,
|
||||
)?;
|
||||
}
|
||||
|
||||
// TODO remove noop DelAdd OBKV
|
||||
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
||||
|
||||
Ok(word_position_docids_reader)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn words_position_into_sorter(
|
||||
document_id: DocumentId,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
word_position_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
|
||||
{
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let (position, word_bytes) = match eob {
|
||||
Left(key) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
Right(key) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
Both(key, _) => {
|
||||
// both values need to be kept because they will be used in other extractors.
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||
word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
406
crates/milli/src/update/index_documents/extract/mod.rs
Normal file
406
crates/milli/src/update/index_documents/extract/mod.rs
Normal file
@@ -0,0 +1,406 @@
|
||||
mod extract_docid_word_positions;
|
||||
mod extract_facet_number_docids;
|
||||
mod extract_facet_string_docids;
|
||||
mod extract_fid_docid_facet_values;
|
||||
mod extract_fid_word_count_docids;
|
||||
mod extract_geo_points;
|
||||
mod extract_vector_points;
|
||||
mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod extract_word_position_docids;
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use rayon::prelude::*;
|
||||
|
||||
use self::extract_docid_word_positions::extract_docid_word_positions;
|
||||
use self::extract_facet_number_docids::extract_facet_number_docids;
|
||||
use self::extract_facet_string_docids::extract_facet_string_docids;
|
||||
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
|
||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_vector_points::{
|
||||
extract_embeddings, extract_vector_points, ExtractedVectorPoints,
|
||||
};
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::error::PossibleEmbeddingMistakes;
|
||||
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
|
||||
/// Extract data for each database from obkv documents in parallel.
/// Send data in grenad files over the provided Sender.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub(crate) fn data_from_obkv_documents(
|
||||
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
primary_key_id: FieldId,
|
||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||
) -> Result<()> {
|
||||
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
original_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|original_documents_chunk| {
|
||||
send_original_documents_data(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
embedders_configs.clone(),
|
||||
settings_diff.clone(),
|
||||
possible_embedding_mistakes.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
|| {
|
||||
flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
primary_key_id,
|
||||
settings_diff.clone(),
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
.map(|result| {
|
||||
if let Ok((
|
||||
ref docid_word_positions_chunk,
|
||||
(ref fid_docid_facet_numbers_chunk, ref fid_docid_facet_strings_chunk),
|
||||
)) = result
|
||||
{
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
);
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
),
|
||||
>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_docids,
|
||||
|(
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
TypedChunk::WordPositionDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
|
||||
>(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_number_docids,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
);
|
||||
|
||||
original_pipeline_result.and(flattened_pipeline_result)
|
||||
}
|
||||
|
||||
/// Spawn a new task to extract data for a specific DB using extract_fn.
|
||||
/// Generated grenad chunks are merged using the merge_fn.
|
||||
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
|
||||
/// and sent into lmdb_writer_sx.
|
||||
fn run_extraction_task<FE, FS, M>(
|
||||
chunk: grenad::Reader<CursorClonableMmap>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
extract_fn: FE,
|
||||
serialize_fn: FS,
|
||||
) where
|
||||
FE: Fn(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
GrenadParameters,
|
||||
&InnerIndexSettingsDiff,
|
||||
) -> Result<M>
|
||||
+ Sync
|
||||
+ Send
|
||||
+ 'static,
|
||||
FS: Fn(M) -> TypedChunk + Sync + Send + 'static,
|
||||
M: Send,
|
||||
{
|
||||
let current_span = tracing::Span::current();
|
||||
|
||||
rayon::spawn(move || {
|
||||
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks");
|
||||
let _entered = child_span.enter();
|
||||
|
||||
match extract_fn(chunk, indexer, &settings_diff) {
|
||||
Ok(chunk) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn request_threads() -> &'static ThreadPoolNoAbort {
|
||||
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
|
||||
|
||||
REQUEST_THREADS.get_or_init(|| {
|
||||
ThreadPoolNoAbortBuilder::new()
|
||||
.num_threads(crate::vector::REQUEST_PARALLELISM)
|
||||
.thread_name(|index| format!("embedding-request-{index}"))
|
||||
.build()
|
||||
.unwrap()
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
/// - documents
|
||||
fn send_original_documents_data(
|
||||
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||
) -> Result<()> {
|
||||
let original_documents_chunk =
|
||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
|
||||
let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
|
||||
// no point in indexing vectors without embedders
|
||||
&& (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
|
||||
|
||||
if index_vectors {
|
||||
let settings_diff = settings_diff.clone();
|
||||
let embedders_configs = embedders_configs.clone();
|
||||
|
||||
let original_documents_chunk = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
match extract_vector_points(
|
||||
original_documents_chunk.clone(),
|
||||
indexer,
|
||||
&embedders_configs,
|
||||
&settings_diff,
|
||||
&possible_embedding_mistakes,
|
||||
) {
|
||||
Ok((extracted_vectors, unused_vectors_distribution)) => {
|
||||
for ExtractedVectorPoints {
|
||||
manual_vectors,
|
||||
remove_vectors,
|
||||
prompts,
|
||||
embedder_name,
|
||||
embedder,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
} in extracted_vectors
|
||||
{
|
||||
let embeddings = match extract_embeddings(
|
||||
prompts,
|
||||
indexer,
|
||||
embedder.clone(),
|
||||
&embedder_name,
|
||||
&possible_embedding_mistakes,
|
||||
&unused_vectors_distribution,
|
||||
request_threads(),
|
||||
) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
None
|
||||
}
|
||||
};
|
||||
if !(remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
||||
{
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension: embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: create a custom internal error
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
/// - documents_ids
|
||||
/// - docid_word_positions
|
||||
/// - docid_fid_facet_numbers
|
||||
/// - docid_fid_facet_strings
|
||||
/// - docid_fid_facet_exists
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn send_and_extract_flattened_documents_data(
|
||||
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
primary_key_id: FieldId,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
||||
)> {
|
||||
let flattened_documents_chunk =
|
||||
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
|
||||
if settings_diff.run_geo_indexing() {
|
||||
let documents_chunk_cloned = flattened_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
let settings_diff = settings_diff.clone();
|
||||
rayon::spawn(move || {
|
||||
let result =
|
||||
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, &settings_diff);
|
||||
let _ = match result {
|
||||
Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
|
||||
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
rayon::join(
|
||||
|| {
|
||||
let docid_word_positions_chunk = extract_docid_word_positions(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
max_positions_per_attributes,
|
||||
)?;
|
||||
|
||||
// send docid_word_positions_chunk to DB writer
|
||||
let docid_word_positions_chunk =
|
||||
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
||||
|
||||
Ok(docid_word_positions_chunk)
|
||||
},
|
||||
|| {
|
||||
let ExtractedFacetValues {
|
||||
fid_docid_facet_numbers_chunk,
|
||||
fid_docid_facet_strings_chunk,
|
||||
fid_facet_is_null_docids_chunk,
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
fid_facet_exists_docids_chunk,
|
||||
} = extract_fid_docid_facet_values(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
)?;
|
||||
|
||||
// send fid_docid_facet_numbers_chunk to DB writer
|
||||
let fid_docid_facet_numbers_chunk =
|
||||
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
)));
|
||||
|
||||
// send fid_docid_facet_strings_chunk to DB writer
|
||||
let fid_docid_facet_strings_chunk =
|
||||
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
)));
|
||||
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk)));
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
)));
|
||||
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk)));
|
||||
|
||||
Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk))
|
||||
},
|
||||
);
|
||||
|
||||
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use memmap2::Mmap;
|
||||
|
||||
/// Wrapper around an Mmap that allows grenad chunks to be virtually cloned
/// in a parallel process like indexing.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ClonableMmap {
|
||||
inner: Arc<Mmap>,
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for ClonableMmap {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.inner.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Mmap> for ClonableMmap {
|
||||
fn from(inner: Mmap) -> ClonableMmap {
|
||||
ClonableMmap { inner: Arc::new(inner) }
|
||||
}
|
||||
}
|
||||
|
||||
pub type CursorClonableMmap = std::io::Cursor<ClonableMmap>;
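// A small usage sketch, assuming a hypothetical `chunk.grenad` file on disk; it only
// illustrates that cloning is cheap because the underlying Mmap is shared behind an Arc.
fn share_chunk() -> std::io::Result<()> {
    let file = std::fs::File::open("chunk.grenad")?; // hypothetical path
    // Safety: the file must not be truncated or modified while it is mapped.
    let mmap = unsafe { memmap2::Mmap::map(&file)? };
    let chunk = ClonableMmap::from(mmap);
    let (first, second) = (chunk.clone(), chunk.clone()); // both share the same mapping
    assert_eq!(first.as_ref().len(), second.as_ref().len());
    Ok(())
}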
|
||||
@@ -0,0 +1,217 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter, Seek};
|
||||
|
||||
use grenad::{CompressionType, MergeFunction, Sorter};
|
||||
use heed::types::Bytes;
|
||||
|
||||
use super::ClonableMmap;
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::Result;
|
||||
|
||||
/// This is a reasonable value given the fact
/// that there is one grenad sorter per thread.
|
||||
const MAX_GRENAD_SORTER_USAGE: usize = 500 * 1024 * 1024; // 500 MiB
|
||||
|
||||
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
|
||||
|
||||
pub fn create_writer<R: io::Write>(
|
||||
typ: grenad::CompressionType,
|
||||
level: Option<u32>,
|
||||
file: R,
|
||||
) -> grenad::Writer<BufWriter<R>> {
|
||||
let mut builder = grenad::Writer::builder();
|
||||
builder.compression_type(typ);
|
||||
if let Some(level) = level {
|
||||
builder.compression_level(level);
|
||||
}
|
||||
builder.build(BufWriter::new(file))
|
||||
}
|
||||
|
||||
/// A helper function that creates a grenad sorter
|
||||
/// with the given parameters. The max memory is
|
||||
/// clamped to something reasonable.
|
||||
pub fn create_sorter<MF: MergeFunction>(
|
||||
sort_algorithm: grenad::SortAlgorithm,
|
||||
merge: MF,
|
||||
chunk_compression_type: grenad::CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
max_nb_chunks: Option<usize>,
|
||||
max_memory: Option<usize>,
|
||||
sort_in_parallel: bool,
|
||||
) -> grenad::Sorter<MF> {
|
||||
let mut builder = grenad::Sorter::builder(merge);
|
||||
builder.chunk_compression_type(chunk_compression_type);
|
||||
if let Some(level) = chunk_compression_level {
|
||||
builder.chunk_compression_level(level);
|
||||
}
|
||||
if let Some(nb_chunks) = max_nb_chunks {
|
||||
builder.max_nb_chunks(nb_chunks);
|
||||
}
|
||||
if let Some(memory) = max_memory {
|
||||
builder.dump_threshold(memory.min(MAX_GRENAD_SORTER_USAGE));
|
||||
builder.allow_realloc(false);
|
||||
}
|
||||
builder.sort_algorithm(sort_algorithm);
|
||||
builder.sort_in_parallel(sort_in_parallel);
|
||||
builder.build()
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
||||
pub fn sorter_into_reader<MF>(
|
||||
sorter: grenad::Sorter<MF>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<BufReader<File>>>
|
||||
where
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
sorter.write_into_stream_writer(&mut writer)?;
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
pub fn writer_into_reader(
|
||||
writer: grenad::Writer<BufWriter<File>>,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
|
||||
file.rewind()?;
|
||||
grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
|
||||
}
|
||||
|
||||
/// # Safety
|
||||
/// We use memory mapping inside. So, according to the Rust community, it's unsafe.
|
||||
pub unsafe fn as_cloneable_grenad(
|
||||
reader: &grenad::Reader<BufReader<File>>,
|
||||
) -> Result<grenad::Reader<CursorClonableMmap>> {
|
||||
let file = reader.get_ref().get_ref();
|
||||
let mmap = memmap2::Mmap::map(file)?;
|
||||
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
|
||||
let reader = grenad::Reader::new(cursor)?;
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct GrenadParameters {
|
||||
pub chunk_compression_type: CompressionType,
|
||||
pub chunk_compression_level: Option<u32>,
|
||||
pub max_memory: Option<usize>,
|
||||
pub max_nb_chunks: Option<usize>,
|
||||
}
|
||||
|
||||
impl Default for GrenadParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
max_memory: None,
|
||||
max_nb_chunks: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GrenadParameters {
|
||||
/// This function uses the number of threads in the current threadpool to compute the value.
///
/// This should be called inside of a rayon thread pool;
/// otherwise, it will use the global number of threads.
///
/// The max memory cannot exceed a given reasonable value.
|
||||
pub fn max_memory_by_thread(&self) -> Option<usize> {
|
||||
self.max_memory.map(|max_memory| {
|
||||
(max_memory / rayon::current_num_threads()).min(MAX_GRENAD_SORTER_USAGE)
|
||||
})
|
||||
}
|
||||
}
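// A small numeric sketch of the clamping above, assuming a 16 GiB indexing budget shared
// by a 16-thread rayon pool; the figures are illustrative only.
fn sketch_per_thread_budget() -> usize {
    let max_memory: usize = 16 * 1024 * 1024 * 1024; // 16 GiB for the whole pool
    let threads: usize = 16;
    // 16 GiB / 16 threads = 1 GiB per thread, clamped down to the 500 MiB sorter cap
    (max_memory / threads).min(MAX_GRENAD_SORTER_USAGE)
}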
|
||||
|
||||
/// Returns an iterator that outputs grenad readers of obkv documents
/// with a maximum size of approximately `documents_chunk_size`.
///
/// The grenad obkv entries are composed of an incremental document id, big-endian
/// encoded as the key, and an obkv object with a `u8` for the field as the key
/// and a simple UTF-8 encoded string as the value.
|
||||
pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
||||
reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
documents_chunk_size: usize,
|
||||
) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
|
||||
let mut continue_reading = true;
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
|
||||
let mut transposer = move || {
|
||||
if !continue_reading {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut current_chunk_size = 0u64;
|
||||
let mut obkv_documents = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
while let Some((document_id, obkv)) = cursor.move_on_next()? {
|
||||
if !obkv.is_empty() {
|
||||
obkv_documents.insert(document_id, obkv)?;
|
||||
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
|
||||
|
||||
if current_chunk_size >= documents_chunk_size as u64 {
|
||||
return writer_into_reader(obkv_documents).map(Some);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
continue_reading = false;
|
||||
writer_into_reader(obkv_documents).map(Some)
|
||||
};
|
||||
|
||||
Ok(std::iter::from_fn(move || transposer().transpose()))
|
||||
}
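// A hedged usage sketch: split a grenad file of obkv documents into ~4 MiB chunks and
// count them; `documents.grenad` is a hypothetical input file and the error type is
// boxed only to keep the example short.
fn count_document_chunks() -> std::result::Result<usize, Box<dyn std::error::Error>> {
    let file = std::fs::File::open("documents.grenad")?; // hypothetical path
    let reader = grenad::Reader::new(std::io::BufReader::new(file))?;

    let mut count = 0;
    for chunk in grenad_obkv_into_chunks(reader, GrenadParameters::default(), 4 * 1024 * 1024)? {
        let _chunk = chunk?; // each item is a grenad::Reader over one chunk of documents
        count += 1;
    }
    Ok(count)
}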
|
||||
|
||||
/// Writes the provided sorter into the database using the serialize_value function.
/// The merge_values function is used if an entry already exists in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
||||
pub fn write_sorter_into_database<K, V, FS, FM, MF>(
|
||||
sorter: Sorter<MF>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut heed::RwTxn<'_>,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut merger_iter = sorter.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = merger_iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = if index_is_empty {
|
||||
Some(serialize_value(value, &mut buffer)?)
|
||||
} else {
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
}
|
||||
};
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
None => {
|
||||
database.delete(wtxn, key)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
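// A hedged sketch of the two callbacks this helper expects, wiring a DelAdd sorter of
// CboRoaringBitmaps into a word docids database; the database handle and the sorter are
// assumed to already exist, and the merge helper is the one defined further below in
// this diff. The `sketch_` names are illustrative, not crate APIs.
use crate::update::index_documents::helpers::{
    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, MergeDeladdCboRoaringBitmaps,
};

// on an empty index there is nothing to merge: keep the addition side of the obkv as-is
fn sketch_keep_addition_side<'a>(value: &'a [u8], _buffer: &'a mut Vec<u8>) -> Result<&'a [u8]> {
    use crate::update::del_add::{DelAdd, KvReaderDelAdd};
    Ok(KvReaderDelAdd::from_slice(value).get(DelAdd::Addition).unwrap_or_default())
}

fn sketch_flush_word_docids(
    wtxn: &mut heed::RwTxn<'_>,
    database: heed::Database<heed::types::Str, crate::CboRoaringBitmapCodec>,
    sorter: Sorter<MergeDeladdCboRoaringBitmaps>,
    index_is_empty: bool,
) -> Result<()> {
    write_sorter_into_database(
        sorter,
        &database,
        wtxn,
        index_is_empty,
        sketch_keep_addition_side,
        // otherwise the DelAdd obkv is applied on top of the previously stored bitmap
        merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
    )
}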
|
||||
@@ -0,0 +1,314 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeSet;
|
||||
use std::io;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use either::Either;
|
||||
use grenad::MergeFunction;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::transform::Operation;
|
||||
use crate::Result;
|
||||
|
||||
pub type EitherObkvMerge =
|
||||
Either<ObkvsKeepLastAdditionMergeDeletions, ObkvsMergeAdditionsAndDeletions>;
|
||||
|
||||
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||
buffer.clear();
|
||||
buffer.reserve(bitmap.serialized_size());
|
||||
bitmap.serialize_into(buffer)
|
||||
}
|
||||
|
||||
pub struct MergeRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let merged = values
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.map(RoaringBitmap::deserialize_from)
|
||||
.map(StdResult::unwrap)
|
||||
.reduce(|a, b| a | b)
|
||||
.unwrap();
|
||||
let mut buffer = Vec::new();
|
||||
serialize_roaring_bitmap(&merged, &mut buffer)?;
|
||||
Ok(Cow::Owned(buffer))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct KeepFirst;
|
||||
|
||||
impl MergeFunction for KeepFirst {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(values[0].clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// Only the last value associated with an id is kept.
|
||||
pub struct KeepLatestObkv;
|
||||
|
||||
impl MergeFunction for KeepLatestObkv {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(obkvs.last().unwrap().clone())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn merge_two_del_add_obkvs(
|
||||
base: &obkv::KvReaderU16,
|
||||
update: &obkv::KvReaderU16,
|
||||
merge_additions: bool,
|
||||
buffer: &mut Vec<u8>,
|
||||
) {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
buffer.clear();
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
match eob {
|
||||
Left((k, v)) => {
|
||||
if merge_additions {
|
||||
writer.insert(k, v).unwrap()
|
||||
} else {
|
||||
// If merge_additions is false, recreate an obkv keeping the deletions only.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::from_slice(v);
|
||||
|
||||
if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
value_writer.finish().unwrap();
|
||||
writer.insert(k, &value_buffer).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
Right((k, v)) => writer.insert(k, v).unwrap(),
|
||||
Both((k, base), (_, update)) => {
|
||||
// merge deletions and additions.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::from_slice(base);
|
||||
let update_reader = KvReaderDelAdd::from_slice(update);
|
||||
|
||||
// keep newest deletion.
|
||||
if let Some(deletion) = update_reader
|
||||
.get(DelAdd::Deletion)
|
||||
.or_else(|| base_reader.get(DelAdd::Deletion))
|
||||
{
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
}
|
||||
|
||||
// keep base addition only if merge_additions is true.
|
||||
let base_addition =
|
||||
merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
|
||||
// keep newest addition.
|
||||
// TODO use or_else
|
||||
if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
|
||||
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||
}
|
||||
|
||||
value_writer.finish().unwrap();
|
||||
writer.insert(k, &value_buffer).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
fn inner_merge_del_add_obkvs<'a>(
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
merge_additions: bool,
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
// pop the newest operation from the list.
|
||||
let (newest, obkvs) = obkvs.split_last().unwrap();
|
||||
// keep the operation type for the returned value.
|
||||
let newest_operation_type = newest[0];
|
||||
|
||||
// treat the newest obkv as the starting point of the merge.
|
||||
let mut acc_operation_type = newest_operation_type;
|
||||
let mut acc = newest[1..].to_vec();
|
||||
let mut buffer = Vec::new();
|
||||
// reverse iter from the most recent to the oldest.
|
||||
for current in obkvs.iter().rev() {
|
||||
// if in the previous iteration there was a complete deletion,
|
||||
// stop the merge process.
|
||||
if acc_operation_type == Operation::Deletion as u8 {
|
||||
break;
|
||||
}
|
||||
|
||||
let newest = obkv::KvReader::from_slice(&acc);
|
||||
let oldest = obkv::KvReader::from_slice(¤t[1..]);
|
||||
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
|
||||
|
||||
// we want the result of the merge into our accumulator.
|
||||
std::mem::swap(&mut acc, &mut buffer);
|
||||
acc_operation_type = current[0];
|
||||
}
|
||||
|
||||
acc.insert(0, newest_operation_type);
|
||||
Ok(Cow::from(acc))
|
||||
}
|
||||
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ObkvsMergeAdditionsAndDeletions;
|
||||
|
||||
impl MergeFunction for ObkvsMergeAdditionsAndDeletions {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, true)
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ObkvsKeepLastAdditionMergeDeletions;
|
||||
|
||||
impl MergeFunction for ObkvsKeepLastAdditionMergeDeletions {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, false)
|
||||
}
|
||||
}
|
||||
|
||||
/// Does a union of all the CboRoaringBitmaps in the values.
|
||||
pub struct MergeCboRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeCboRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let mut vec = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
|
||||
Ok(Cow::from(vec))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Does a union of CboRoaringBitmaps on both sides of a DelAdd obkv
/// separately and outputs a new DelAdd with both unions.
|
||||
pub struct MergeDeladdCboRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeDeladdCboRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_bitmaps_bytes = Vec::new();
|
||||
let mut add_bitmaps_bytes = Vec::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
|
||||
del_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
|
||||
add_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let mut buffer = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
///
/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
/// the second one is the CboRoaringBitmap to merge into.
|
||||
pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
|
||||
deladd_obkv: &[u8],
|
||||
previous: &[u8],
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> Result<Option<&'a [u8]>> {
|
||||
Ok(CboRoaringBitmapCodec::merge_deladd_into(
|
||||
KvReaderDelAdd::from_slice(deladd_obkv),
|
||||
previous,
|
||||
buffer,
|
||||
)?)
|
||||
}
|
||||
|
||||
/// Does a union of BTreeSets on both sides of a DelAdd obkv
/// separately and outputs a new DelAdd with both unions.
|
||||
pub struct MergeDeladdBtreesetString;
|
||||
|
||||
impl MergeFunction for MergeDeladdBtreesetString {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_set = BTreeSet::new();
|
||||
let mut add_set = BTreeSet::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(bytes) = obkv.get(DelAdd::Deletion) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
del_set.insert(value);
|
||||
}
|
||||
}
|
||||
if let Some(bytes) = obkv.get(DelAdd::Addition) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
add_set.insert(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let del = serde_json::to_vec(&del_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
|
||||
let add = serde_json::to_vec(&add_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &add)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Used when trying to merge readers, but you don't actually care about the values.
|
||||
pub struct MergeIgnoreValues;
|
||||
|
||||
impl MergeFunction for MergeIgnoreValues {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(
|
||||
&self,
|
||||
_key: &[u8],
|
||||
_values: &[Cow<'a, [u8]>],
|
||||
) -> std::result::Result<Cow<'a, [u8]>, Self::Error> {
|
||||
Ok(Cow::Owned(Vec::new()))
|
||||
}
|
||||
}
|
||||
66
crates/milli/src/update/index_documents/helpers/mod.rs
Normal file
@@ -0,0 +1,66 @@
mod clonable_mmap;
mod grenad_helpers;
mod merge_functions;

use std::collections::HashSet;
use std::convert::{TryFrom, TryInto};

pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
use fst::{IntoStreamer, Streamer};
pub use grenad_helpers::*;
pub use merge_functions::*;

use crate::MAX_WORD_LENGTH;

pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
    key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()
}

/// Divides one slice into two at an index, returns `None` if mid is out of bounds.
pub fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
    if mid <= slice.len() {
        Some(slice.split_at(mid))
    } else {
        None
    }
}

/// Divides one slice into an array and the tail at an index,
/// returns `None` if `N` is out of bounds.
pub fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
where
    [T; N]: for<'a> TryFrom<&'a [T]>,
{
    let (head, tail) = try_split_at(slice, N)?;
    let head = head.try_into().ok()?;
    Some((head, tail))
}

/// Converts an fst Stream into a HashSet of byte strings.
pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>>
where
    I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
    S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
{
    let mut hashset = HashSet::new();
    let mut stream = stream.into_stream();
    while let Some(value) = stream.next() {
        hashset.insert(value.to_owned());
    }
    hashset
}

/// Converts an fst Stream into a Vec of Strings.
pub fn fst_stream_into_vec<'f, I, S>(stream: I) -> Vec<String>
where
    I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
    S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
{
    let mut strings = Vec::new();
    let mut stream = stream.into_stream();
    while let Some(word) = stream.next() {
        let s = std::str::from_utf8(word).unwrap();
        strings.push(s.to_owned());
    }
    strings
}
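// Illustrative sketch of how `try_split_array_at` is typically used by the
// indexing code: keys are often a 4-byte big-endian document id followed by
// a variable-length suffix. The helper name `split_docid_key` is hypothetical.
fn split_docid_key(key: &[u8]) -> Option<(u32, &[u8])> {
    let (docid_bytes, tail) = try_split_array_at::<u8, 4>(key)?;
    Some((u32::from_be_bytes(docid_bytes), tail))
}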
3453
crates/milli/src/update/index_documents/mod.rs
Normal file
File diff suppressed because it is too large
86
crates/milli/src/update/index_documents/parallel.rs
Normal file
@@ -0,0 +1,86 @@
use heed::types::Bytes;
use heed::{Database, RoTxn};
use obkv::KvReaderU16;
use roaring::RoaringBitmap;

use crate::{all_obkv_to_json, DocumentId, FieldsIdsMap, Object, ObkvCodec, Result, BEU32};

pub struct ImmutableObkvs<'t> {
    ids: RoaringBitmap,
    fields_ids_map: FieldsIdsMap,
    slices: Vec<&'t [u8]>,
}

impl<'t> ImmutableObkvs<'t> {
    /// Creates the structure by fetching all the OBKVs
    /// and keeping the transaction that makes the pointers valid.
    pub fn new(
        rtxn: &'t RoTxn,
        documents_database: Database<BEU32, ObkvCodec>,
        fields_ids_map: FieldsIdsMap,
        subset: RoaringBitmap,
    ) -> heed::Result<Self> {
        let mut slices = Vec::new();
        let documents_database = documents_database.remap_data_type::<Bytes>();
        for docid in &subset {
            let slice = documents_database.get(rtxn, &docid)?.unwrap();
            slices.push(slice);
        }

        Ok(ImmutableObkvs { ids: subset, fields_ids_map, slices })
    }

    /// Returns the OBKV identified by the given ID.
    pub fn obkv(&self, docid: DocumentId) -> heed::Result<Option<&'t KvReaderU16>> {
        match self
            .ids
            .rank(docid)
            .checked_sub(1)
            .and_then(|offset| self.slices.get(offset as usize))
        {
            Some(&bytes) => Ok(Some(bytes.into())),
            None => Ok(None),
        }
    }

    /// Returns the owned rhai::Map identified by the given ID.
    pub fn rhai_map(&self, docid: DocumentId) -> Result<Option<rhai::Map>> {
        let obkv = match self.obkv(docid) {
            Ok(Some(obkv)) => obkv,
            Ok(None) => return Ok(None),
            Err(e) => return Err(e.into()),
        };

        let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
        let map: Result<rhai::Map> = all_keys
            .iter()
            .copied()
            .flat_map(|id| obkv.get(id).map(|value| (id, value)))
            .map(|(id, value)| {
                let name = self.fields_ids_map.name(id).ok_or(
                    crate::error::FieldIdMapMissingEntry::FieldId {
                        field_id: id,
                        process: "all_obkv_to_rhaimap",
                    },
                )?;
                let value = serde_json::from_slice(value)
                    .map_err(crate::error::InternalError::SerdeJson)?;
                Ok((name.into(), value))
            })
            .collect();

        map.map(Some)
    }

    pub fn json_map(&self, docid: DocumentId) -> Result<Option<Object>> {
        let obkv = match self.obkv(docid) {
            Ok(Some(obkv)) => obkv,
            Ok(None) => return Ok(None),
            Err(e) => return Err(e.into()),
        };

        all_obkv_to_json(obkv, &self.fields_ids_map).map(Some)
    }
}

unsafe impl Sync for ImmutableObkvs<'_> {}
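// Illustrative sketch of the intended usage, assuming an open `Index` and a
// read transaction; the helper name `read_first_document` is hypothetical.
fn read_first_document(index: &Index, rtxn: &heed::RoTxn) -> Result<Option<Object>> {
    let fields_ids_map = index.fields_ids_map(rtxn)?;
    let subset = index.documents_ids(rtxn)?;
    let obkvs = ImmutableObkvs::new(rtxn, index.documents, fields_ids_map, subset.clone())?;
    match subset.min() {
        // The slices borrow from `rtxn`, so the view must not outlive it.
        Some(docid) => obkvs.json_map(docid),
        None => Ok(None),
    }
}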
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
[]
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
[2, ]
|
||||
@@ -0,0 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
benoit [2, ]
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
|
||||
2 [21, ]
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
2 0 2.2 1 [21, ]
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ]
|
||||
1 0 aquarium 1 [5, ]
|
||||
1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ]
|
||||
1 0 cartoon 1 [2, 7, 15, 17, ]
|
||||
1 0 colorfulness 1 [13, ]
|
||||
1 0 design 1 [2, 18, ]
|
||||
1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ]
|
||||
1 0 geometry 1 [19, ]
|
||||
1 0 letter 1 [1, ]
|
||||
1 0 outdoor 1 [4, ]
|
||||
1 0 painting 1 [3, ]
|
||||
1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ]
|
||||
2 0 design 1 [21, ]
|
||||
|
||||
@@ -0,0 +1,38 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
|
||||
2 [21, ]
|
||||
36 [3, ]
|
||||
37 [4, ]
|
||||
38 [5, ]
|
||||
39 [6, ]
|
||||
40 [7, ]
|
||||
41 [8, ]
|
||||
42 [9, ]
|
||||
43 [10, ]
|
||||
44 [11, ]
|
||||
45 [12, ]
|
||||
46 [13, ]
|
||||
47 [14, ]
|
||||
5 [1, ]
|
||||
52 [15, ]
|
||||
57 [16, ]
|
||||
58 [17, ]
|
||||
68 [18, ]
|
||||
69 [19, ]
|
||||
7 [2, ]
|
||||
71 [21, ]
|
||||
abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
|
||||
aquarium [5, ]
|
||||
art [4, 5, 8, 9, 10, 12, 17, ]
|
||||
cartoon [2, 7, 15, 17, ]
|
||||
colorfulness [13, ]
|
||||
design [2, 18, 21, ]
|
||||
drawing [3, 4, 5, 8, 10, 11, 16, ]
|
||||
geometry [19, ]
|
||||
letter [1, ]
|
||||
outdoor [4, ]
|
||||
painting [3, ]
|
||||
pattern [2, 3, 9, 10, 13, 14, 16, ]
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 1 36 [3, ]
|
||||
1 1 37 [4, ]
|
||||
1 1 38 [5, ]
|
||||
1 1 39 [6, ]
|
||||
1 1 40 [7, ]
|
||||
1 1 41 [8, ]
|
||||
1 1 42 [9, ]
|
||||
1 1 43 [10, ]
|
||||
1 1 44 [11, ]
|
||||
1 1 45 [12, ]
|
||||
1 1 46 [13, ]
|
||||
1 1 47 [14, ]
|
||||
1 1 5 [1, ]
|
||||
1 1 52 [15, ]
|
||||
1 1 57 [16, ]
|
||||
1 1 58 [17, ]
|
||||
1 1 68 [18, ]
|
||||
1 1 69 [19, ]
|
||||
1 1 7 [2, ]
|
||||
1 1 71 [21, ]
|
||||
1 2 2 [21, ]
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
3 0 48.9021 1 [19, ]
|
||||
3 0 49.9314 1 [17, ]
|
||||
3 0 50.1793 1 [15, ]
|
||||
3 0 50.2844 1 [14, ]
|
||||
3 0 50.3518 1 [13, ]
|
||||
3 0 50.4502 1 [12, ]
|
||||
3 0 50.6053 1 [8, ]
|
||||
3 0 50.6224 1 [3, ]
|
||||
3 0 50.6299 1 [0, ]
|
||||
3 0 50.6312 1 [2, ]
|
||||
3 0 50.6415 1 [1, ]
|
||||
3 0 50.7453 1 [7, ]
|
||||
3 0 50.8466 1 [10, ]
|
||||
3 0 51.0537 1 [9, ]
|
||||
4 0 2.271 1 [17, ]
|
||||
4 0 2.3708 1 [19, ]
|
||||
4 0 2.7637 1 [14, ]
|
||||
4 0 3.0569 1 [0, ]
|
||||
4 0 3.1106 1 [1, 2, ]
|
||||
4 0 3.1476 1 [3, ]
|
||||
4 0 3.2189 1 [15, ]
|
||||
4 0 3.2206 1 [7, ]
|
||||
4 0 3.3758 1 [8, ]
|
||||
4 0 3.5326 1 [13, ]
|
||||
4 0 3.6957 1 [9, ]
|
||||
4 0 3.9623 1 [12, ]
|
||||
4 0 4.337 1 [10, ]
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
0 [1, ]
|
||||
1 [2, ]
|
||||
10 [1, ]
|
||||
12 [0, ]
|
||||
1344 [3, ]
|
||||
2 [0, ]
|
||||
23 [5, ]
|
||||
25 [2, ]
|
||||
3 [0, ]
|
||||
35 [5, ]
|
||||
4 [4, ]
|
||||
42 [0, 5, ]
|
||||
456 [1, ]
|
||||
5 [0, ]
|
||||
99 [2, ]
|
||||
adams [5, ]
|
||||
adventure [1, ]
|
||||
alice [2, ]
|
||||
and [0, 4, ]
|
||||
antoine [1, ]
|
||||
austin [0, ]
|
||||
blood [4, ]
|
||||
carroll [2, ]
|
||||
de [1, ]
|
||||
douglas [5, ]
|
||||
exupery [1, ]
|
||||
fantasy [2, 3, 4, ]
|
||||
galaxy [5, ]
|
||||
guide [5, ]
|
||||
half [4, ]
|
||||
harry [4, ]
|
||||
hitchhiker [5, ]
|
||||
hobbit [3, ]
|
||||
in [2, ]
|
||||
j [3, 4, ]
|
||||
jane [0, ]
|
||||
k [4, ]
|
||||
le [1, ]
|
||||
lewis [2, ]
|
||||
petit [1, ]
|
||||
potter [4, ]
|
||||
prejudice [0, ]
|
||||
pride [0, ]
|
||||
prince [1, 4, ]
|
||||
r [3, ]
|
||||
romance [0, ]
|
||||
rowling [4, ]
|
||||
s [5, ]
|
||||
saint [1, ]
|
||||
the [3, 4, 5, ]
|
||||
to [5, ]
|
||||
tolkien [3, ]
|
||||
wonderland [2, ]
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
0 [1, ]
|
||||
1 [2, ]
|
||||
10 [1, ]
|
||||
12 [0, ]
|
||||
1344 [3, ]
|
||||
1813 [0, ]
|
||||
2 [0, ]
|
||||
23 [5, ]
|
||||
25 [2, ]
|
||||
3 [0, ]
|
||||
35 [5, ]
|
||||
4 [4, ]
|
||||
42 [0, 5, ]
|
||||
456 [1, ]
|
||||
5 [0, ]
|
||||
99 [2, ]
|
||||
adams [5, ]
|
||||
adventure [1, ]
|
||||
alice [2, ]
|
||||
and [0, 4, ]
|
||||
antoine [1, ]
|
||||
austen [0, ]
|
||||
blood [4, ]
|
||||
carroll [2, ]
|
||||
de [1, ]
|
||||
douglas [5, ]
|
||||
exupery [1, ]
|
||||
fantasy [2, 3, 4, ]
|
||||
galaxy [5, ]
|
||||
guide [5, ]
|
||||
half [4, ]
|
||||
harry [4, ]
|
||||
hitchhiker [5, ]
|
||||
hobbit [3, ]
|
||||
in [2, ]
|
||||
j [0, 3, 4, ]
|
||||
k [4, ]
|
||||
lewis [2, ]
|
||||
little [1, ]
|
||||
potter [4, ]
|
||||
prejudice [0, ]
|
||||
pride [0, ]
|
||||
prince [1, ]
|
||||
princess [4, ]
|
||||
r [3, ]
|
||||
romance [0, ]
|
||||
rowling [4, ]
|
||||
s [5, ]
|
||||
saint [1, ]
|
||||
the [1, 3, 4, 5, ]
|
||||
to [5, ]
|
||||
tolkien [3, ]
|
||||
wonderland [2, ]
|
||||
|
||||
1270
crates/milli/src/update/index_documents/transform.rs
Normal file
File diff suppressed because it is too large
852
crates/milli/src/update/index_documents/typed_chunk.rs
Normal file
@@ -0,0 +1,852 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use bytemuck::allocation::pod_collect_to_vec;
|
||||
use grenad::{MergeFunction, Merger, MergerBuilder};
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, RwTxn};
|
||||
use obkv::{KvReader, KvWriter};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
|
||||
CursorClonableMmap, KeepFirst, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
|
||||
MergeIgnoreValues,
|
||||
};
|
||||
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||
use crate::facet::FacetType;
|
||||
use crate::index::db_name::DOCUMENTS;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
use crate::update::index_documents::helpers::{
|
||||
as_cloneable_grenad, try_split_array_at, KeepLatestObkv,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::ArroyWrapper;
|
||||
use crate::{
|
||||
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
|
||||
Result, SerializationError, U8StrStrCodec,
|
||||
};
|
||||
|
||||
/// This struct accumulates and groups the TypedChunks
/// and is able to give the biggest accumulated group to index them all together
/// with a merger.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct ChunkAccumulator {
|
||||
inner: Vec<Vec<TypedChunk>>,
|
||||
}
|
||||
|
||||
impl ChunkAccumulator {
|
||||
pub fn pop_longest(&mut self) -> Option<Vec<TypedChunk>> {
|
||||
match self.inner.iter().max_by_key(|v| v.len()) {
|
||||
Some(left) => {
|
||||
let position = self.inner.iter().position(|right| left.len() == right.len());
|
||||
position.map(|p| self.inner.remove(p)).filter(|v| !v.is_empty())
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, chunk: TypedChunk) {
|
||||
match self
|
||||
.inner
|
||||
.iter()
|
||||
.position(|right| right.first().map_or(false, |right| chunk.mergeable_with(right)))
|
||||
{
|
||||
Some(position) => {
|
||||
let v = self.inner.get_mut(position).unwrap();
|
||||
v.push(chunk);
|
||||
}
|
||||
None => self.inner.push(vec![chunk]),
|
||||
}
|
||||
}
|
||||
}
|
||||
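// Illustrative sketch of the accumulator's call pattern: chunks of the same
// kind are grouped together and the largest group is drained first so that it
// can be merged and written in a single pass. The helper name is hypothetical.
fn drain_largest_groups(chunks: Vec<TypedChunk>) -> Vec<Vec<TypedChunk>> {
    let mut accumulator = ChunkAccumulator::default();
    for chunk in chunks {
        accumulator.insert(chunk);
    }
    let mut groups = Vec::new();
    while let Some(group) = accumulator.pop_longest() {
        groups.push(group);
    }
    groups
}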
|
||||
pub(crate) enum TypedChunk {
|
||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
||||
Documents(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdWordCountDocids(grenad::Reader<BufReader<File>>),
|
||||
WordDocids {
|
||||
word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
exact_word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
word_fid_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
},
|
||||
WordPositionDocids(grenad::Reader<BufReader<File>>),
|
||||
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetStringDocids((grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)),
|
||||
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
||||
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||
VectorPoints {
|
||||
remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
embeddings: Option<grenad::Reader<BufReader<File>>>,
|
||||
expected_dimension: usize,
|
||||
manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
embedder_name: String,
|
||||
add_to_user_provided: RoaringBitmap,
|
||||
remove_from_user_provided: RoaringBitmap,
|
||||
},
|
||||
}
|
||||
|
||||
impl TypedChunk {
|
||||
fn mergeable_with(&self, other: &Self) -> bool {
|
||||
use TypedChunk::*;
|
||||
match (self, other) {
|
||||
(FieldIdDocidFacetStrings(_), FieldIdDocidFacetStrings(_))
|
||||
| (FieldIdDocidFacetNumbers(_), FieldIdDocidFacetNumbers(_))
|
||||
| (Documents(_), Documents(_))
|
||||
| (FieldIdWordCountDocids(_), FieldIdWordCountDocids(_))
|
||||
| (WordDocids { .. }, WordDocids { .. })
|
||||
| (WordPositionDocids(_), WordPositionDocids(_))
|
||||
| (WordPairProximityDocids(_), WordPairProximityDocids(_))
|
||||
| (FieldIdFacetStringDocids(_), FieldIdFacetStringDocids(_))
|
||||
| (FieldIdFacetNumberDocids(_), FieldIdFacetNumberDocids(_))
|
||||
| (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_))
|
||||
| (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_))
|
||||
| (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_))
|
||||
| (GeoPoints(_), GeoPoints(_)) => true,
|
||||
(
|
||||
VectorPoints { embedder_name: left, expected_dimension: left_dim, .. },
|
||||
VectorPoints { embedder_name: right, expected_dimension: right_dim, .. },
|
||||
) => left == right && left_dim == right_dim,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes the typed chunks into the corresponding LMDB databases of the provided index.
/// Returns the new documents seen.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
pub(crate) fn write_typed_chunk_into_index(
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
index: &Index,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
typed_chunks: Vec<TypedChunk>,
|
||||
) -> Result<(RoaringBitmap, bool)> {
|
||||
let mut is_merged_database = false;
|
||||
match typed_chunks[0] {
|
||||
TypedChunk::Documents(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "documents");
|
||||
let _entered = span.enter();
|
||||
|
||||
let fields_ids_map = index.fields_ids_map(wtxn)?;
|
||||
let vectors_fid =
|
||||
fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let mut builder = MergerBuilder::new(KeepLatestObkv);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::Documents(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
let mut operations: Vec<DocumentOperation> = Default::default();
|
||||
|
||||
let mut docids = index.documents_ids(wtxn)?;
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
|
||||
let embedders: BTreeSet<_> = index
|
||||
.embedding_configs(wtxn)?
|
||||
.into_iter()
|
||||
.map(|IndexEmbeddingConfig { name, .. }| name)
|
||||
.collect();
|
||||
let mut vectors_buffer = Vec::new();
|
||||
while let Some((key, reader)) = iter.next()? {
|
||||
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||
let reader: &KvReader<FieldId> = reader.into();
|
||||
|
||||
let (document_id_bytes, external_id_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?;
|
||||
let docid = DocumentId::from_be_bytes(document_id_bytes);
|
||||
let external_id = std::str::from_utf8(external_id_bytes)?;
|
||||
|
||||
for (field_id, value) in reader.iter() {
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
let addition = if vectors_fid == Some(field_id) {
|
||||
'vectors: {
|
||||
vectors_buffer.clear();
|
||||
let Ok(mut vectors) =
|
||||
crate::vector::parsed_vectors::ParsedVectors::from_bytes(
|
||||
addition,
|
||||
)
|
||||
else {
|
||||
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
|
||||
break 'vectors Some(addition);
|
||||
};
|
||||
vectors.retain_not_embedded_vectors(&embedders);
|
||||
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
|
||||
if vectors.is_empty() {
|
||||
// skip writing empty `_vectors` map
|
||||
break 'vectors None;
|
||||
}
|
||||
|
||||
serde_json::to_writer(&mut vectors_buffer, &vectors)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
Some(vectors_buffer.as_slice())
|
||||
}
|
||||
} else {
|
||||
Some(addition)
|
||||
};
|
||||
|
||||
if let Some(addition) = addition {
|
||||
writer.insert(field_id, addition)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let db = index.documents.remap_data_type::<Bytes>();
|
||||
|
||||
if !writer.is_empty() {
|
||||
db.put(wtxn, &docid, &writer.into_inner().unwrap())?;
|
||||
operations.push(DocumentOperation {
|
||||
external_id: external_id.to_string(),
|
||||
internal_id: docid,
|
||||
kind: DocumentOperationKind::Create,
|
||||
});
|
||||
docids.insert(docid);
|
||||
} else {
|
||||
db.delete(wtxn, &docid)?;
|
||||
operations.push(DocumentOperation {
|
||||
external_id: external_id.to_string(),
|
||||
internal_id: docid,
|
||||
kind: DocumentOperationKind::Delete,
|
||||
});
|
||||
docids.remove(docid);
|
||||
}
|
||||
}
|
||||
let external_documents_docids = index.external_documents_ids();
|
||||
external_documents_docids.apply(wtxn, operations)?;
|
||||
index.put_documents_ids(wtxn, &docids)?;
|
||||
}
|
||||
TypedChunk::FieldIdWordCountDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.field_id_word_count_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordDocids { .. } => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut exact_word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut word_fid_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut fst_merger_builder = MergerBuilder::new(MergeIgnoreValues);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||
let clonable_exact_word_docids =
|
||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||
|
||||
word_docids_builder.push(word_docids_reader.into_cursor()?);
|
||||
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
|
||||
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
|
||||
}
|
||||
|
||||
let word_docids_merger = word_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
word_docids_merger,
|
||||
&index.word_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let exact_word_docids_merger = exact_word_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
exact_word_docids_merger,
|
||||
&index.exact_word_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let word_fid_docids_merger = word_fid_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
word_fid_docids_merger,
|
||||
&index.word_fid_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
// create fst from word docids
|
||||
let fst_merger = fst_merger_builder.build();
|
||||
let fst = merge_word_docids_reader_into_fst(fst_merger)?;
|
||||
let db_fst = index.words_fst(wtxn)?;
|
||||
|
||||
// merge new fst with database fst
|
||||
let union_stream = fst.op().add(db_fst.stream()).union();
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.extend_stream(union_stream)?;
|
||||
let fst = builder.into_set();
|
||||
index.put_words_fst(wtxn, &fst)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPositionDocids(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordPositionDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.word_position_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetNumberDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut data_size = 0;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
data_size += facet_id_number_docids.len();
|
||||
builder.push(facet_id_number_docids.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size);
|
||||
indexer.execute(wtxn)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetStringDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut facet_id_string_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut normalized_facet_id_string_builder =
|
||||
MergerBuilder::new(MergeDeladdBtreesetString);
|
||||
let mut data_size = 0;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetStringDocids((
|
||||
facet_id_string_docids,
|
||||
normalized_facet_id_string_docids,
|
||||
)) = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
data_size += facet_id_string_docids.len();
|
||||
facet_id_string_builder.push(facet_id_string_docids.into_cursor()?);
|
||||
normalized_facet_id_string_builder
|
||||
.push(normalized_facet_id_string_docids.into_cursor()?);
|
||||
}
|
||||
let facet_id_string_merger = facet_id_string_builder.build();
|
||||
let normalized_facet_id_string_merger = normalized_facet_id_string_builder.build();
|
||||
|
||||
let indexer = FacetsUpdate::new(
|
||||
index,
|
||||
FacetType::String,
|
||||
facet_id_string_merger,
|
||||
Some(normalized_facet_id_string_merger),
|
||||
data_size,
|
||||
);
|
||||
indexer.execute(wtxn)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetExistsDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_exists_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsNullDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_is_null_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsEmptyDocids(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_is_empty_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
if settings_diff.only_additional_fields.is_some() {
|
||||
write_proximity_entries_into_database_additional_searchables(
|
||||
merger,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
)?;
|
||||
} else {
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
}
|
||||
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetNumbers(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(KeepFirst);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let index_fid_docid_facet_numbers =
|
||||
index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let reader = KvReaderDelAdd::from_slice(value);
|
||||
if valid_lmdb_key(key) {
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
(None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?,
|
||||
(Some(_), None) => {
|
||||
index_fid_docid_facet_numbers.delete(wtxn, key)?;
|
||||
}
|
||||
(Some(_), Some(new)) => {
|
||||
index_fid_docid_facet_numbers.put(wtxn, key, new)?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetStrings(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(KeepFirst);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let index_fid_docid_facet_strings =
|
||||
index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let reader = KvReaderDelAdd::from_slice(value);
|
||||
if valid_lmdb_key(key) {
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
(None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?,
|
||||
(Some(_), None) => {
|
||||
index_fid_docid_facet_strings.delete(wtxn, key)?;
|
||||
}
|
||||
(Some(_), Some(new)) => {
|
||||
index_fid_docid_facet_strings.put(wtxn, key, new)?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
TypedChunk::GeoPoints(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "geo_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(KeepFirst);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::GeoPoints(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
|
||||
let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
|
||||
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
let deladd_obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = deladd_obkv.get(DelAdd::Deletion) {
|
||||
let geopoint = extract_geo_point(value, docid);
|
||||
rtree.remove(&geopoint);
|
||||
geo_faceted_docids.remove(docid);
|
||||
}
|
||||
if let Some(value) = deladd_obkv.get(DelAdd::Addition) {
|
||||
let geopoint = extract_geo_point(value, docid);
|
||||
rtree.insert(geopoint);
|
||||
geo_faceted_docids.insert(docid);
|
||||
}
|
||||
}
|
||||
index.put_geo_rtree(wtxn, &rtree)?;
|
||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||
}
|
||||
TypedChunk::VectorPoints { .. } => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut remove_vectors_builder = MergerBuilder::new(KeepFirst);
|
||||
let mut manual_vectors_builder = MergerBuilder::new(KeepFirst);
|
||||
let mut embeddings_builder = MergerBuilder::new(KeepFirst);
|
||||
let mut add_to_user_provided = RoaringBitmap::new();
|
||||
let mut remove_from_user_provided = RoaringBitmap::new();
|
||||
let mut params = None;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
manual_vectors,
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
embedder_name,
|
||||
add_to_user_provided: aud,
|
||||
remove_from_user_provided: rud,
|
||||
} = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
params = Some((expected_dimension, embedder_name));
|
||||
|
||||
remove_vectors_builder.push(remove_vectors.into_cursor()?);
|
||||
manual_vectors_builder.push(manual_vectors.into_cursor()?);
|
||||
if let Some(embeddings) = embeddings {
|
||||
embeddings_builder.push(embeddings.into_cursor()?);
|
||||
}
|
||||
add_to_user_provided |= aud;
|
||||
remove_from_user_provided |= rud;
|
||||
}
|
||||
|
||||
// typed_chunks always contains at least 1 chunk.
|
||||
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
|
||||
|
||||
let mut embedding_configs = index.embedding_configs(wtxn)?;
|
||||
let index_embedder_config = embedding_configs
|
||||
.iter_mut()
|
||||
.find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name)
|
||||
.unwrap();
|
||||
index_embedder_config.user_provided -= remove_from_user_provided;
|
||||
index_embedder_config.user_provided |= add_to_user_provided;
|
||||
|
||||
index.put_embedding_configs(wtxn, embedding_configs)?;
|
||||
|
||||
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
||||
)?;
|
||||
let binary_quantized = settings_diff
|
||||
.old
|
||||
.embedding_configs
|
||||
.get(&embedder_name)
|
||||
.map_or(false, |conf| conf.2);
|
||||
// FIXME: allow customizing distance
|
||||
let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized);
|
||||
|
||||
// remove all the vectors of the docids flagged for vector removal
|
||||
let merger = remove_vectors_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, _)) = iter.next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
writer.del_items(wtxn, expected_dimension, docid)?;
|
||||
}
|
||||
|
||||
// add generated embeddings
|
||||
let merger = embeddings_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
let data = pod_collect_to_vec(value);
|
||||
// it is a code error to have embeddings and not expected_dimension
|
||||
let embeddings = crate::vector::Embeddings::from_inner(data, expected_dimension)
|
||||
// code error if we somehow got the wrong dimension
|
||||
.unwrap();
|
||||
|
||||
if embeddings.embedding_count() > usize::from(u8::MAX) {
|
||||
let external_docid = if let Ok(Some(Ok(index))) = index
|
||||
.external_id_of(wtxn, std::iter::once(docid))
|
||||
.map(|it| it.into_iter().next())
|
||||
{
|
||||
index
|
||||
} else {
|
||||
format!("internal docid={docid}")
|
||||
};
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
external_docid,
|
||||
embeddings.embedding_count(),
|
||||
)));
|
||||
}
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
}
|
||||
|
||||
// perform the manual diff
|
||||
let merger = manual_vectors_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let (left, _index) = try_split_array_at(key).unwrap();
|
||||
let docid = DocumentId::from_be_bytes(left);
|
||||
|
||||
let vector_deladd_obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
|
||||
let vector: Vec<f32> = pod_collect_to_vec(value);
|
||||
|
||||
writer.del_item(wtxn, docid, &vector)?;
|
||||
}
|
||||
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
||||
let vector = pod_collect_to_vec(value);
|
||||
|
||||
// overflow was detected during vector extraction.
|
||||
writer.add_item(wtxn, docid, &vector)?;
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!("Finished vector chunk for {}", embedder_name);
|
||||
}
|
||||
}
|
||||
|
||||
Ok((RoaringBitmap::new(), is_merged_database))
|
||||
}
|
||||
|
||||
/// Converts the latitude and longitude back to an xyz GeoPoint.
|
||||
fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
|
||||
let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
|
||||
let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
|
||||
let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
|
||||
let xyz_point = lat_lng_to_xyz(&point);
|
||||
GeoPoint::new(xyz_point, (docid, point))
|
||||
}
|
||||
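// Illustrative sketch of the byte layout `extract_geo_point` expects: two
// native-endian `f64` values, latitude then longitude. Hypothetical helper name.
fn encode_lat_lng_sketch(lat: f64, lng: f64) -> [u8; 16] {
    let mut buffer = [0u8; 16];
    buffer[..8].copy_from_slice(&lat.to_ne_bytes());
    buffer[8..].copy_from_slice(&lng.to_ne_bytes());
    buffer
}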
|
||||
fn merge_word_docids_reader_into_fst<MF>(
|
||||
merger: Merger<CursorClonableMmap, MF>,
|
||||
) -> Result<fst::Set<Vec<u8>>>
|
||||
where
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
||||
while let Some((k, _)) = iter.next()? {
|
||||
builder.insert(k)?;
|
||||
}
|
||||
|
||||
Ok(builder.into_set())
|
||||
}
|
||||
|
||||
/// Writes the provided entries into the database using the serialize_value function.
/// The merge_values function is used if an entry already exists in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn write_entries_into_database<R, K, V, FS, FM, MF>(
|
||||
merger: Merger<R, MF>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
};
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
None => {
|
||||
database.delete(wtxn, key)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Akin to the `write_entries_into_database` function but specialized
/// for the case when we only index additional searchable fields.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn write_proximity_entries_into_database_additional_searchables<R, MF>(
|
||||
merger: Merger<R, MF>,
|
||||
database: &heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
let (proximity_to_insert, word1, word2) =
|
||||
U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?;
|
||||
let data_to_insert = match KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
|
||||
Some(value) => {
|
||||
CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
|
||||
}
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let mut data_to_remove = RoaringBitmap::new();
|
||||
for prox in 1..(MAX_DISTANCE as u8) {
|
||||
let key = (prox, word1, word2);
|
||||
let database_value = database.get(wtxn, &key)?.unwrap_or_default();
|
||||
let value = if prox == proximity_to_insert {
|
||||
// Proximity that should be changed.
|
||||
// Union values and remove lower proximity data
|
||||
(&database_value | &data_to_insert) - &data_to_remove
|
||||
} else {
|
||||
// Remove lower proximity data
|
||||
&database_value - &data_to_remove
|
||||
};
|
||||
|
||||
// add the current data in data_to_remove for the next proximities
|
||||
data_to_remove |= &value;
|
||||
|
||||
if database_value != value {
|
||||
database.put(wtxn, &key, &value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
32
crates/milli/src/update/indexer_config.rs
Normal file
@@ -0,0 +1,32 @@
use grenad::CompressionType;

use crate::thread_pool_no_abort::ThreadPoolNoAbort;

#[derive(Debug)]
pub struct IndexerConfig {
    pub log_every_n: Option<usize>,
    pub max_nb_chunks: Option<usize>,
    pub documents_chunk_size: Option<usize>,
    pub max_memory: Option<usize>,
    pub chunk_compression_type: CompressionType,
    pub chunk_compression_level: Option<u32>,
    pub thread_pool: Option<ThreadPoolNoAbort>,
    pub max_positions_per_attributes: Option<u32>,
    pub skip_index_budget: bool,
}

impl Default for IndexerConfig {
    fn default() -> Self {
        Self {
            log_every_n: None,
            max_nb_chunks: None,
            documents_chunk_size: None,
            max_memory: None,
            chunk_compression_type: CompressionType::None,
            chunk_compression_level: None,
            thread_pool: None,
            max_positions_per_attributes: None,
            skip_index_budget: false,
        }
    }
}
26
crates/milli/src/update/mod.rs
Normal file
@@ -0,0 +1,26 @@
pub use self::available_ids::AvailableIds;
pub use self::clear_documents::ClearDocuments;
pub use self::concurrent_available_ids::ConcurrentAvailableIds;
pub use self::facet::bulk::FacetsUpdateBulk;
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
pub use self::index_documents::*;
pub use self::indexer_config::IndexerConfig;
pub use self::settings::{validate_embedding_settings, Setting, Settings};
pub use self::update_step::UpdateIndexingStep;
pub use self::word_prefix_docids::WordPrefixDocids;
pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids;
pub use self::words_prefixes_fst::WordsPrefixesFst;

mod available_ids;
mod clear_documents;
mod concurrent_available_ids;
pub(crate) mod del_add;
pub(crate) mod facet;
mod index_documents;
mod indexer_config;
pub mod new;
mod settings;
mod update_step;
mod word_prefix_docids;
mod words_prefix_integer_docids;
mod words_prefixes_fst;
437
crates/milli/src/update/new/channel.rs
Normal file
@@ -0,0 +1,437 @@
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
|
||||
use grenad::Merger;
|
||||
use hashbrown::HashMap;
|
||||
use heed::types::Bytes;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::extract::FacetKind;
|
||||
use super::StdResult;
|
||||
use crate::index::main_key::DOCUMENTS_IDS_KEY;
|
||||
use crate::update::new::KvReaderFieldId;
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::vector::Embedding;
|
||||
use crate::{DocumentId, Index};
|
||||
|
||||
/// The capacity of the channel is currently expressed as a number of messages.
|
||||
pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) {
|
||||
let (sender, receiver) = crossbeam_channel::bounded(cap);
|
||||
(
|
||||
ExtractorSender {
|
||||
sender,
|
||||
send_count: Default::default(),
|
||||
writer_contentious_count: Default::default(),
|
||||
extractor_contentious_count: Default::default(),
|
||||
},
|
||||
WriterReceiver(receiver),
|
||||
)
|
||||
}
|
||||
|
||||
pub struct KeyValueEntry {
|
||||
pub key_length: usize,
|
||||
pub data: Box<[u8]>,
|
||||
}
|
||||
|
||||
impl KeyValueEntry {
|
||||
pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self {
|
||||
let mut data = Vec::with_capacity(key.len() + value.len());
|
||||
data.extend_from_slice(key);
|
||||
data.extend_from_slice(value);
|
||||
KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() }
|
||||
}
|
||||
|
||||
pub fn from_small_key_bitmap(key: &[u8], bitmap: RoaringBitmap) -> Self {
|
||||
let mut data = Vec::with_capacity(key.len() + bitmap.serialized_size());
|
||||
data.extend_from_slice(key);
|
||||
bitmap.serialize_into(&mut data).unwrap();
|
||||
KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() }
|
||||
}
|
||||
|
||||
pub fn key(&self) -> &[u8] {
|
||||
&self.data[..self.key_length]
|
||||
}
|
||||
|
||||
pub fn value(&self) -> &[u8] {
|
||||
&self.data[self.key_length..]
|
||||
}
|
||||
}
|
||||
|
||||
pub struct KeyEntry {
|
||||
data: Box<[u8]>,
|
||||
}
|
||||
|
||||
impl KeyEntry {
|
||||
pub fn from_key(key: &[u8]) -> Self {
|
||||
KeyEntry { data: key.to_vec().into_boxed_slice() }
|
||||
}
|
||||
|
||||
pub fn entry(&self) -> &[u8] {
|
||||
self.data.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
pub enum EntryOperation {
|
||||
Delete(KeyEntry),
|
||||
Write(KeyValueEntry),
|
||||
}
|
||||
|
||||
pub enum WriterOperation {
|
||||
DbOperation(DbOperation),
|
||||
ArroyOperation(ArroyOperation),
|
||||
}
|
||||
|
||||
pub enum ArroyOperation {
|
||||
/// TODO: call when deleting regular documents
|
||||
DeleteVectors {
|
||||
docid: DocumentId,
|
||||
},
|
||||
SetVectors {
|
||||
docid: DocumentId,
|
||||
embedder_id: u8,
|
||||
embeddings: Vec<Embedding>,
|
||||
},
|
||||
SetVector {
|
||||
docid: DocumentId,
|
||||
embedder_id: u8,
|
||||
embedding: Embedding,
|
||||
},
|
||||
Finish {
|
||||
user_provided: HashMap<String, RoaringBitmap>,
|
||||
},
|
||||
}
|
||||
|
||||
pub struct DbOperation {
|
||||
database: Database,
|
||||
entry: EntryOperation,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Database {
|
||||
Documents,
|
||||
ExternalDocumentsIds,
|
||||
ExactWordDocids,
|
||||
FidWordCountDocids,
|
||||
Main,
|
||||
WordDocids,
|
||||
WordFidDocids,
|
||||
WordPairProximityDocids,
|
||||
WordPositionDocids,
|
||||
FacetIdIsNullDocids,
|
||||
FacetIdIsEmptyDocids,
|
||||
FacetIdExistsDocids,
|
||||
FacetIdF64NumberDocids,
|
||||
FacetIdStringDocids,
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
|
||||
match self {
|
||||
Database::Documents => index.documents.remap_types(),
|
||||
Database::ExternalDocumentsIds => index.external_documents_ids.remap_types(),
|
||||
Database::ExactWordDocids => index.exact_word_docids.remap_types(),
|
||||
Database::Main => index.main.remap_types(),
|
||||
Database::WordDocids => index.word_docids.remap_types(),
|
||||
Database::WordFidDocids => index.word_fid_docids.remap_types(),
|
||||
Database::WordPositionDocids => index.word_position_docids.remap_types(),
|
||||
Database::FidWordCountDocids => index.field_id_word_count_docids.remap_types(),
|
||||
Database::WordPairProximityDocids => index.word_pair_proximity_docids.remap_types(),
|
||||
Database::FacetIdIsNullDocids => index.facet_id_is_null_docids.remap_types(),
|
||||
Database::FacetIdIsEmptyDocids => index.facet_id_is_empty_docids.remap_types(),
|
||||
Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(),
|
||||
Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(),
|
||||
Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FacetKind> for Database {
|
||||
fn from(value: FacetKind) -> Self {
|
||||
match value {
|
||||
FacetKind::Number => Database::FacetIdF64NumberDocids,
|
||||
FacetKind::String => Database::FacetIdStringDocids,
|
||||
FacetKind::Null => Database::FacetIdIsNullDocids,
|
||||
FacetKind::Empty => Database::FacetIdIsEmptyDocids,
|
||||
FacetKind::Exists => Database::FacetIdExistsDocids,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DbOperation {
|
||||
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
|
||||
self.database.database(index)
|
||||
}
|
||||
|
||||
pub fn entry(self) -> EntryOperation {
|
||||
self.entry
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WriterReceiver(Receiver<WriterOperation>);
|
||||
|
||||
impl IntoIterator for WriterReceiver {
|
||||
type Item = WriterOperation;
|
||||
type IntoIter = IntoIter<Self::Item>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.0.into_iter()
|
||||
}
|
||||
}
|
||||
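// Illustrative sketch of the intended wiring between the extractor side and
// the writer side, with error handling and the vector branch elided. The
// helper name `drain_into_index` and the capacity value are hypothetical.
fn drain_into_index(
    index: &Index,
    wtxn: &mut heed::RwTxn,
    documents_ids: RoaringBitmap,
) -> crate::Result<()> {
    let (extractor, writer) = extractor_writer_channel(100);
    extractor.send_documents_ids(documents_ids).unwrap();
    // Dropping the sender closes the channel so the writer loop terminates.
    drop(extractor);
    for operation in writer {
        match operation {
            WriterOperation::DbOperation(op) => {
                let database = op.database(index);
                match op.entry() {
                    EntryOperation::Write(entry) => {
                        database.put(wtxn, entry.key(), entry.value())?
                    }
                    EntryOperation::Delete(entry) => {
                        database.delete(wtxn, entry.entry()).map(drop)?
                    }
                }
            }
            // Vector (arroy) operations are handled by a dedicated writer.
            WriterOperation::ArroyOperation(_) => {}
        }
    }
    Ok(())
}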
|
||||
pub struct ExtractorSender {
|
||||
sender: Sender<WriterOperation>,
|
||||
/// The number of messages we sent in total in the channel.
|
||||
send_count: AtomicUsize,
|
||||
/// The number of times we sent something in a channel that was full.
|
||||
writer_contentious_count: AtomicUsize,
|
||||
/// The number of times we sent something in a channel that was empty.
|
||||
extractor_contentious_count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl Drop for ExtractorSender {
|
||||
fn drop(&mut self) {
|
||||
let send_count = *self.send_count.get_mut();
|
||||
let writer_contentious_count = *self.writer_contentious_count.get_mut();
|
||||
let extractor_contentious_count = *self.extractor_contentious_count.get_mut();
|
||||
eprintln!(
|
||||
"Extractor channel stats: {send_count} sends, \
|
||||
{writer_contentious_count} writer contentions ({}%), \
|
||||
{extractor_contentious_count} extractor contentions ({}%)",
|
||||
(writer_contentious_count as f32 / send_count as f32) * 100.0,
|
||||
(extractor_contentious_count as f32 / send_count as f32) * 100.0
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl ExtractorSender {
|
||||
pub fn docids<D: DatabaseType>(&self) -> WordDocidsSender<'_, D> {
|
||||
WordDocidsSender { sender: self, _marker: PhantomData }
|
||||
}
|
||||
|
||||
pub fn facet_docids(&self) -> FacetDocidsSender<'_> {
|
||||
FacetDocidsSender { sender: self }
|
||||
}
|
||||
|
||||
pub fn documents(&self) -> DocumentsSender<'_> {
|
||||
DocumentsSender(self)
|
||||
}
|
||||
|
||||
pub fn send_documents_ids(&self, documents_ids: RoaringBitmap) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_bitmap(
|
||||
DOCUMENTS_IDS_KEY.as_bytes(),
|
||||
documents_ids,
|
||||
));
|
||||
match self.send_db_operation(DbOperation { database: Database::Main, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
fn send_db_operation(&self, op: DbOperation) -> StdResult<(), SendError<()>> {
|
||||
if self.sender.is_full() {
|
||||
self.writer_contentious_count.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
if self.sender.is_empty() {
|
||||
self.extractor_contentious_count.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
self.send_count.fetch_add(1, Ordering::SeqCst);
|
||||
match self.sender.send(WriterOperation::DbOperation(op)) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ExactWordDocids {}
|
||||
pub enum FidWordCountDocids {}
|
||||
pub enum WordDocids {}
|
||||
pub enum WordFidDocids {}
|
||||
pub enum WordPairProximityDocids {}
|
||||
pub enum WordPositionDocids {}
|
||||
|
||||
pub trait DatabaseType {
|
||||
const DATABASE: Database;
|
||||
}
|
||||
|
||||
impl DatabaseType for ExactWordDocids {
|
||||
const DATABASE: Database = Database::ExactWordDocids;
|
||||
}
|
||||
|
||||
impl DatabaseType for FidWordCountDocids {
|
||||
const DATABASE: Database = Database::FidWordCountDocids;
|
||||
}
|
||||
|
||||
impl DatabaseType for WordDocids {
|
||||
const DATABASE: Database = Database::WordDocids;
|
||||
}
|
||||
|
||||
impl DatabaseType for WordFidDocids {
|
||||
const DATABASE: Database = Database::WordFidDocids;
|
||||
}
|
||||
|
||||
impl DatabaseType for WordPairProximityDocids {
|
||||
const DATABASE: Database = Database::WordPairProximityDocids;
|
||||
}
|
||||
|
||||
impl DatabaseType for WordPositionDocids {
|
||||
const DATABASE: Database = Database::WordPositionDocids;
|
||||
}
|
||||
|
||||
pub trait DocidsSender {
|
||||
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>;
|
||||
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>;
|
||||
}
|
||||
|
||||
pub struct WordDocidsSender<'a, D> {
|
||||
sender: &'a ExtractorSender,
|
||||
_marker: PhantomData<D>,
|
||||
}
|
||||
|
||||
impl<D: DatabaseType> DocidsSender for WordDocidsSender<'_, D> {
|
||||
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
|
||||
match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
||||
match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetDocidsSender<'a> {
|
||||
sender: &'a ExtractorSender,
|
||||
}
|
||||
|
||||
impl DocidsSender for FacetDocidsSender<'_> {
|
||||
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let (facet_kind, key) = FacetKind::extract_from_key(key);
|
||||
let database = Database::from(facet_kind);
|
||||
// let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
|
||||
let entry = match facet_kind {
|
||||
// skip level group size
|
||||
FacetKind::String | FacetKind::Number => {
|
||||
// add facet group size
|
||||
let value = [&[1], value].concat();
|
||||
EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &value))
|
||||
}
|
||||
_ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)),
|
||||
};
|
||||
match self.sender.send_db_operation(DbOperation { database, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let (facet_kind, key) = FacetKind::extract_from_key(key);
|
||||
let database = Database::from(facet_kind);
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
||||
match self.sender.send_db_operation(DbOperation { database, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentsSender<'a>(&'a ExtractorSender);
|
||||
|
||||
impl DocumentsSender<'_> {
|
||||
/// TODO do that efficiently
|
||||
pub fn uncompressed(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
external_id: String,
|
||||
document: &KvReaderFieldId,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(
|
||||
&docid.to_be_bytes(),
|
||||
document.as_bytes(),
|
||||
));
|
||||
match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}?;
|
||||
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(
|
||||
external_id.as_bytes(),
|
||||
&docid.to_be_bytes(),
|
||||
));
|
||||
match self
|
||||
.0
|
||||
.send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry })
|
||||
{
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes()));
|
||||
match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}?;
|
||||
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes()));
|
||||
match self
|
||||
.0
|
||||
.send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry })
|
||||
{
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EmbeddingSender<'a>(&'a Sender<WriterOperation>);
|
||||
|
||||
impl EmbeddingSender<'_> {
|
||||
pub fn set_vectors(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
embedder_id: u8,
|
||||
embeddings: Vec<Embedding>,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
self.0
|
||||
.send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors {
|
||||
docid,
|
||||
embedder_id,
|
||||
embeddings,
|
||||
}))
|
||||
.map_err(|_| SendError(()))
|
||||
}
|
||||
|
||||
pub fn set_vector(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
embedder_id: u8,
|
||||
embedding: Embedding,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
self.0
|
||||
.send(WriterOperation::ArroyOperation(ArroyOperation::SetVector {
|
||||
docid,
|
||||
embedder_id,
|
||||
embedding,
|
||||
}))
|
||||
.map_err(|_| SendError(()))
|
||||
}
|
||||
|
||||
/// Marks all embedders as "to be built"
|
||||
pub fn finish(
|
||||
self,
|
||||
user_provided: HashMap<String, RoaringBitmap>,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
self.0
|
||||
.send(WriterOperation::ArroyOperation(ArroyOperation::Finish { user_provided }))
|
||||
.map_err(|_| SendError(()))
|
||||
}
|
||||
}
|
||||
398
crates/milli/src/update/new/document.rs
Normal file
@@ -0,0 +1,398 @@
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
|
||||
use heed::RoTxn;
|
||||
use raw_collections::RawMap;
|
||||
use serde_json::value::RawValue;
|
||||
|
||||
use super::vector_document::VectorDocument;
|
||||
use super::{KvReaderFieldId, KvWriterFieldId};
|
||||
use crate::documents::FieldIdMapper;
|
||||
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
|
||||
use crate::{DocumentId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError};
|
||||
|
||||
/// A view into a document that can represent either the current version from the DB,
|
||||
/// the update data from payload or other means, or the merged updated version.
|
||||
///
|
||||
/// The 'doc lifetime is meant to live long enough for the document to be handled by the extractors.
|
||||
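///
/// A minimal sketch of how an extractor could consume this trait; the
/// `print_fields` helper below is hypothetical and only illustrates the API:
///
/// ```ignore
/// fn print_fields<'doc>(doc: &impl Document<'doc>) -> crate::Result<()> {
///     for field in doc.iter_top_level_fields() {
///         let (name, raw) = field?;
///         println!("{name} = {}", raw.get());
///     }
///     Ok(())
/// }
/// ```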
pub trait Document<'doc> {
|
||||
/// Iterate over all **top-level** fields of the document, returning their name and raw JSON value.
|
||||
///
|
||||
/// - The returned values *may* contain nested fields.
|
||||
/// - The `_vectors` and `_geo` fields are **ignored**, meaning they are **not returned** by this method.
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>>;
|
||||
|
||||
fn len(&self) -> usize;
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>>;
|
||||
|
||||
/// Returns the unparsed value of the `_vectors` field from the document data.
|
||||
///
|
||||
/// This field alone is insufficient to retrieve vectors, as they may be stored in a dedicated location in the database.
|
||||
/// Use a [`super::vector_document::VectorDocument`] to access the vector.
|
||||
///
|
||||
/// This method is meant as a convenience for implementors of [`super::vector_document::VectorDocument`].
|
||||
fn vectors_field(&self) -> Result<Option<&'doc RawValue>>;
|
||||
|
||||
/// Returns the unparsed value of the `_geo` field from the document data.
|
||||
///
|
||||
/// This field alone is insufficient to retrieve geo data, as they may be stored in a dedicated location in the database.
|
||||
/// Use a [`super::geo_document::GeoDocument`] to access the geo data.
|
||||
///
|
||||
/// This method is meant as a convenience for implementors of [`super::geo_document::GeoDocument`].
|
||||
fn geo_field(&self) -> Result<Option<&'doc RawValue>>;
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DocumentFromDb<'t, Mapper: FieldIdMapper>
|
||||
where
|
||||
Mapper: FieldIdMapper,
|
||||
{
|
||||
fields_ids_map: &'t Mapper,
|
||||
content: &'t KvReaderFieldId,
|
||||
}
|
||||
|
||||
impl<'t, Mapper: FieldIdMapper> Clone for DocumentFromDb<'t, Mapper> {
|
||||
#[inline]
|
||||
fn clone(&self) -> Self {
|
||||
*self
|
||||
}
|
||||
}
|
||||
impl<'t, Mapper: FieldIdMapper> Copy for DocumentFromDb<'t, Mapper> {}
|
||||
|
||||
impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> {
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'t str, &'t RawValue)>> {
|
||||
let mut it = self.content.iter();
|
||||
|
||||
std::iter::from_fn(move || {
|
||||
let (fid, value) = it.next()?;
|
||||
|
||||
let res = (|| loop {
|
||||
let name = self.fields_ids_map.name(fid).ok_or(
|
||||
InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId {
|
||||
field_id: fid,
|
||||
process: "getting current document",
|
||||
}),
|
||||
)?;
|
||||
|
||||
if name == RESERVED_VECTORS_FIELD_NAME || name == "_geo" {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value =
|
||||
serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?;
|
||||
|
||||
return Ok((name, value));
|
||||
})();
|
||||
|
||||
Some(res)
|
||||
})
|
||||
}
|
||||
|
||||
fn vectors_field(&self) -> Result<Option<&'t RawValue>> {
|
||||
self.field(RESERVED_VECTORS_FIELD_NAME)
|
||||
}
|
||||
|
||||
fn geo_field(&self) -> Result<Option<&'t RawValue>> {
|
||||
self.field("_geo")
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.content.iter().count()
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'t RawValue>> {
|
||||
self.field(k)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> {
|
||||
pub fn new(
|
||||
docid: DocumentId,
|
||||
rtxn: &'t RoTxn,
|
||||
index: &'t Index,
|
||||
db_fields_ids_map: &'t Mapper,
|
||||
) -> Result<Option<Self>> {
|
||||
index.documents.get(rtxn, &docid).map_err(crate::Error::from).map(|reader| {
|
||||
reader.map(|reader| Self { fields_ids_map: db_fields_ids_map, content: reader })
|
||||
})
|
||||
}
|
||||
|
||||
pub fn field(&self, name: &str) -> Result<Option<&'t RawValue>> {
|
||||
let Some(fid) = self.fields_ids_map.id(name) else {
|
||||
return Ok(None);
|
||||
};
|
||||
let Some(value) = self.content.get(fid) else { return Ok(None) };
|
||||
Ok(Some(serde_json::from_slice(value).map_err(InternalError::SerdeJson)?))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DocumentFromVersions<'a, 'doc> {
|
||||
versions: &'a Versions<'doc>,
|
||||
}
|
||||
|
||||
impl<'a, 'doc> DocumentFromVersions<'a, 'doc> {
|
||||
pub fn new(versions: &'a Versions<'doc>) -> Self {
|
||||
Self { versions }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'doc> Document<'doc> for DocumentFromVersions<'a, 'doc> {
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>> {
|
||||
self.versions.iter_top_level_fields().map(Ok)
|
||||
}
|
||||
|
||||
fn vectors_field(&self) -> Result<Option<&'doc RawValue>> {
|
||||
Ok(self.versions.vectors_field())
|
||||
}
|
||||
|
||||
fn geo_field(&self) -> Result<Option<&'doc RawValue>> {
|
||||
Ok(self.versions.geo_field())
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.versions.len()
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>> {
|
||||
Ok(self.versions.top_level_field(k))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MergedDocument<'a, 'doc, 't, Mapper: FieldIdMapper> {
|
||||
new_doc: DocumentFromVersions<'a, 'doc>,
|
||||
db: Option<DocumentFromDb<'t, Mapper>>,
|
||||
}
|
||||
|
||||
impl<'a, 'doc, 't, Mapper: FieldIdMapper> MergedDocument<'a, 'doc, 't, Mapper> {
|
||||
pub fn with_db(
|
||||
docid: DocumentId,
|
||||
rtxn: &'t RoTxn,
|
||||
index: &'t Index,
|
||||
db_fields_ids_map: &'t Mapper,
|
||||
new_doc: DocumentFromVersions<'a, 'doc>,
|
||||
) -> Result<Self> {
|
||||
let db = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)?;
|
||||
Ok(Self { new_doc, db })
|
||||
}
|
||||
|
||||
pub fn without_db(new_doc: DocumentFromVersions<'a, 'doc>) -> Self {
|
||||
Self { new_doc, db: None }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d>
|
||||
for MergedDocument<'d, 'doc, 't, Mapper>
|
||||
{
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'d str, &'d RawValue)>> {
|
||||
let mut new_doc_it = self.new_doc.iter_top_level_fields();
|
||||
let mut db_it = self.db.iter().flat_map(|db| db.iter_top_level_fields());
|
||||
let mut seen_fields = BTreeSet::new();
|
||||
|
||||
std::iter::from_fn(move || {
|
||||
if let Some(next) = new_doc_it.next() {
|
||||
if let Ok((name, _)) = next {
|
||||
seen_fields.insert(name);
|
||||
}
|
||||
return Some(next);
|
||||
}
|
||||
loop {
|
||||
match db_it.next()? {
|
||||
Ok((name, value)) => {
|
||||
if seen_fields.contains(name) {
|
||||
continue;
|
||||
}
|
||||
return Some(Ok((name, value)));
|
||||
}
|
||||
Err(err) => return Some(Err(err)),
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn vectors_field(&self) -> Result<Option<&'d RawValue>> {
|
||||
if let Some(vectors) = self.new_doc.vectors_field()? {
|
||||
return Ok(Some(vectors));
|
||||
}
|
||||
|
||||
let Some(db) = self.db else { return Ok(None) };
|
||||
|
||||
db.vectors_field()
|
||||
}
|
||||
|
||||
fn geo_field(&self) -> Result<Option<&'d RawValue>> {
|
||||
if let Some(geo) = self.new_doc.geo_field()? {
|
||||
return Ok(Some(geo));
|
||||
}
|
||||
|
||||
let Some(db) = self.db else { return Ok(None) };
|
||||
|
||||
db.geo_field()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.iter_top_level_fields().count()
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'d RawValue>> {
|
||||
if let Some(f) = self.new_doc.top_level_field(k)? {
|
||||
return Ok(Some(f));
|
||||
}
|
||||
if let Some(db) = self.db {
|
||||
return db.field(k);
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'doc, D> Document<'doc> for &D
|
||||
where
|
||||
D: Document<'doc>,
|
||||
{
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>> {
|
||||
D::iter_top_level_fields(self)
|
||||
}
|
||||
|
||||
fn vectors_field(&self) -> Result<Option<&'doc RawValue>> {
|
||||
D::vectors_field(self)
|
||||
}
|
||||
|
||||
fn geo_field(&self) -> Result<Option<&'doc RawValue>> {
|
||||
D::geo_field(self)
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
D::len(self)
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>> {
|
||||
D::top_level_field(self, k)
|
||||
}
|
||||
}
|
||||
|
||||
/// Turn this document into an obkv, whose fields are indexed by the provided `FieldIdMapper`.
|
||||
///
|
||||
/// The produced obkv is suitable for storing into the documents DB, meaning:
|
||||
///
|
||||
/// - It contains the `_vectors` entries that do not correspond to a configured embedder
|
||||
/// - It contains all the top-level fields of the document, with their raw JSON value as value.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - If the document contains a top-level field that is not present in `fields_ids_map`.
|
||||
///
|
||||
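/// A minimal usage sketch; the surrounding variable names are assumptions:
///
/// ```ignore
/// let mut buffer = Vec::new();
/// let obkv = write_to_obkv(&merged_document, None, &mut global_fields_ids_map, &mut buffer)?;
/// // `obkv` borrows `buffer` and can be written to the documents database as-is.
/// ```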
pub fn write_to_obkv<'s, 'a, 'map>(
|
||||
document: &'s impl Document<'s>,
|
||||
vector_document: Option<&'s impl VectorDocument<'s>>,
|
||||
fields_ids_map: &'a mut GlobalFieldsIdsMap<'map>,
|
||||
mut document_buffer: &'a mut Vec<u8>,
|
||||
) -> Result<&'a KvReaderFieldId>
|
||||
where
|
||||
's: 'a,
|
||||
{
|
||||
// will be used in 'inject_vectors
|
||||
let vectors_value: Box<RawValue>;
|
||||
|
||||
document_buffer.clear();
|
||||
let mut unordered_field_buffer = Vec::new();
|
||||
unordered_field_buffer.clear();
|
||||
|
||||
let mut writer = KvWriterFieldId::new(&mut document_buffer);
|
||||
|
||||
for res in document.iter_top_level_fields() {
|
||||
let (field_name, value) = res?;
|
||||
let field_id =
|
||||
fields_ids_map.id_or_insert(field_name).ok_or(UserError::AttributeLimitReached)?;
|
||||
unordered_field_buffer.push((field_id, value));
|
||||
}
|
||||
|
||||
'inject_vectors: {
|
||||
let Some(vector_document) = vector_document else { break 'inject_vectors };
|
||||
|
||||
let vectors_fid = fields_ids_map
|
||||
.id_or_insert(RESERVED_VECTORS_FIELD_NAME)
|
||||
.ok_or(UserError::AttributeLimitReached)?;
|
||||
|
||||
let mut vectors = BTreeMap::new();
|
||||
for res in vector_document.iter_vectors() {
|
||||
let (name, entry) = res?;
|
||||
if entry.has_configured_embedder {
|
||||
continue; // we don't write vectors with configured embedder in documents
|
||||
}
|
||||
vectors.insert(
|
||||
name,
|
||||
serde_json::json!({
|
||||
"regenerate": entry.regenerate,
|
||||
// TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object
|
||||
"embeddings": entry.embeddings,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
vectors_value = serde_json::value::to_raw_value(&vectors).unwrap();
|
||||
unordered_field_buffer.push((vectors_fid, &vectors_value));
|
||||
}
|
||||
|
||||
unordered_field_buffer.sort_by_key(|(fid, _)| *fid);
|
||||
for (fid, value) in unordered_field_buffer.iter() {
|
||||
writer.insert(*fid, value.get().as_bytes()).unwrap();
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
Ok(KvReaderFieldId::from_slice(document_buffer))
|
||||
}
|
||||
|
||||
pub type Entry<'doc> = (&'doc str, &'doc RawValue);
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Versions<'doc> {
|
||||
data: RawMap<'doc>,
|
||||
}
|
||||
|
||||
impl<'doc> Versions<'doc> {
|
||||
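/// Merges several raw versions of the same document into a single view;
/// when a field appears in more than one version, the later version wins.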
pub fn multiple(
|
||||
mut versions: impl Iterator<Item = Result<RawMap<'doc>>>,
|
||||
) -> Result<Option<Self>> {
|
||||
let Some(data) = versions.next() else { return Ok(None) };
|
||||
let mut data = data?;
|
||||
for future_version in versions {
|
||||
let future_version = future_version?;
|
||||
for (field, value) in future_version {
|
||||
data.insert(field, value);
|
||||
}
|
||||
}
|
||||
Ok(Some(Self::single(data)))
|
||||
}
|
||||
|
||||
pub fn single(version: RawMap<'doc>) -> Self {
|
||||
Self { data: version }
|
||||
}
|
||||
|
||||
pub fn iter_top_level_fields(&self) -> impl Iterator<Item = (&'doc str, &'doc RawValue)> + '_ {
|
||||
self.data.iter().filter(|(k, _)| *k != RESERVED_VECTORS_FIELD_NAME && *k != "_geo")
|
||||
}
|
||||
|
||||
pub fn vectors_field(&self) -> Option<&'doc RawValue> {
|
||||
self.data.get(RESERVED_VECTORS_FIELD_NAME)
|
||||
}
|
||||
|
||||
pub fn geo_field(&self) -> Option<&'doc RawValue> {
|
||||
self.data.get("_geo")
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.data.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.data.is_empty()
|
||||
}
|
||||
pub fn top_level_field(&self, k: &str) -> Option<&'doc RawValue> {
|
||||
self.data.get(k)
|
||||
}
|
||||
}
|
||||
191
crates/milli/src/update/new/document_change.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
use bumpalo::Bump;
|
||||
use heed::RoTxn;
|
||||
|
||||
use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions};
|
||||
use super::vector_document::{
|
||||
MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions,
|
||||
};
|
||||
use crate::documents::FieldIdMapper;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::{DocumentId, Index, Result};
|
||||
|
||||
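/// A change applied to a single document: the document is deleted, an existing
/// document is updated, or a new document is inserted.
///
/// A minimal dispatch sketch, assuming a `change: DocumentChange` in scope:
///
/// ```ignore
/// match change {
///     DocumentChange::Deletion(deletion) => println!("delete {}", deletion.docid()),
///     DocumentChange::Update(update) => println!("update {}", update.docid()),
///     DocumentChange::Insertion(insertion) => println!("insert {}", insertion.docid()),
/// }
/// ```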
pub enum DocumentChange<'doc> {
|
||||
Deletion(Deletion<'doc>),
|
||||
Update(Update<'doc>),
|
||||
Insertion(Insertion<'doc>),
|
||||
}
|
||||
|
||||
pub struct Deletion<'doc> {
|
||||
docid: DocumentId,
|
||||
external_document_id: &'doc str,
|
||||
}
|
||||
|
||||
pub struct Update<'doc> {
|
||||
docid: DocumentId,
|
||||
external_document_id: &'doc str,
|
||||
new: Versions<'doc>,
|
||||
has_deletion: bool,
|
||||
}
|
||||
|
||||
pub struct Insertion<'doc> {
|
||||
docid: DocumentId,
|
||||
external_document_id: &'doc str,
|
||||
new: Versions<'doc>,
|
||||
}
|
||||
|
||||
impl<'doc> DocumentChange<'doc> {
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
match &self {
|
||||
Self::Deletion(inner) => inner.docid(),
|
||||
Self::Update(inner) => inner.docid(),
|
||||
Self::Insertion(inner) => inner.docid(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn external_docid(&self) -> &'doc str {
|
||||
match self {
|
||||
DocumentChange::Deletion(deletion) => deletion.external_document_id(),
|
||||
DocumentChange::Update(update) => update.external_document_id(),
|
||||
DocumentChange::Insertion(insertion) => insertion.external_document_id(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'doc> Deletion<'doc> {
|
||||
pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self {
|
||||
Self { docid, external_document_id }
|
||||
}
|
||||
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
self.docid
|
||||
}
|
||||
|
||||
pub fn external_document_id(&self) -> &'doc str {
|
||||
self.external_document_id
|
||||
}
|
||||
|
||||
pub fn current<'a, Mapper: FieldIdMapper>(
|
||||
&self,
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
mapper: &'a Mapper,
|
||||
) -> Result<DocumentFromDb<'a, Mapper>> {
|
||||
Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
|
||||
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
||||
)?)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'doc> Insertion<'doc> {
|
||||
pub fn create(docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>) -> Self {
|
||||
Insertion { docid, external_document_id, new }
|
||||
}
|
||||
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
self.docid
|
||||
}
|
||||
|
||||
pub fn external_document_id(&self) -> &'doc str {
|
||||
self.external_document_id
|
||||
}
|
||||
pub fn inserted(&self) -> DocumentFromVersions<'_, 'doc> {
|
||||
DocumentFromVersions::new(&self.new)
|
||||
}
|
||||
|
||||
pub fn inserted_vectors(
|
||||
&self,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
|
||||
VectorDocumentFromVersions::new(&self.new, doc_alloc, embedders)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'doc> Update<'doc> {
|
||||
pub fn create(
|
||||
docid: DocumentId,
|
||||
external_document_id: &'doc str,
|
||||
new: Versions<'doc>,
|
||||
has_deletion: bool,
|
||||
) -> Self {
|
||||
Update { docid, new, external_document_id, has_deletion }
|
||||
}
|
||||
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
self.docid
|
||||
}
|
||||
|
||||
pub fn external_document_id(&self) -> &'doc str {
|
||||
self.external_document_id
|
||||
}
|
||||
pub fn current<'a, Mapper: FieldIdMapper>(
|
||||
&self,
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
mapper: &'a Mapper,
|
||||
) -> Result<DocumentFromDb<'a, Mapper>> {
|
||||
Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
|
||||
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
||||
)?)
|
||||
}
|
||||
|
||||
pub fn current_vectors<'a, Mapper: FieldIdMapper>(
|
||||
&self,
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
mapper: &'a Mapper,
|
||||
doc_alloc: &'a Bump,
|
||||
) -> Result<VectorDocumentFromDb<'a>> {
|
||||
Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or(
|
||||
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
||||
)?)
|
||||
}
|
||||
|
||||
pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> {
|
||||
DocumentFromVersions::new(&self.new)
|
||||
}
|
||||
|
||||
pub fn merged<'t, Mapper: FieldIdMapper>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn,
|
||||
index: &'t Index,
|
||||
mapper: &'t Mapper,
|
||||
) -> Result<MergedDocument<'_, 'doc, 't, Mapper>> {
|
||||
if self.has_deletion {
|
||||
Ok(MergedDocument::without_db(DocumentFromVersions::new(&self.new)))
|
||||
} else {
|
||||
MergedDocument::with_db(
|
||||
self.docid,
|
||||
rtxn,
|
||||
index,
|
||||
mapper,
|
||||
DocumentFromVersions::new(&self.new),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn updated_vectors(
|
||||
&self,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
|
||||
VectorDocumentFromVersions::new(&self.new, doc_alloc, embedders)
|
||||
}
|
||||
|
||||
pub fn merged_vectors<Mapper: FieldIdMapper>(
|
||||
&self,
|
||||
rtxn: &'doc RoTxn,
|
||||
index: &'doc Index,
|
||||
mapper: &'doc Mapper,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
) -> Result<Option<MergedVectorDocument<'doc>>> {
|
||||
if self.has_deletion {
|
||||
MergedVectorDocument::without_db(&self.new, doc_alloc, embedders)
|
||||
} else {
|
||||
MergedVectorDocument::with_db(
|
||||
self.docid, index, rtxn, mapper, &self.new, doc_alloc, embedders,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
729
crates/milli/src/update/new/extract/cache.rs
Normal file
@@ -0,0 +1,729 @@
|
||||
//! # How the Merge Algorithm works
|
||||
//!
|
||||
//! Each extractor creates #Threads caches and balances the entries
|
||||
//! based on the hash of the keys. To do that we can use the
|
||||
//! hashbrown::hash_map::RawEntryBuilderMut::from_key_hashed_nocheck.
|
||||
//! This way we can compute the hash on our own, decide on the cache to
|
||||
//! target, and insert it into the right HashMap.
|
||||
//!
|
||||
//! #Thread -> caches
|
||||
//! t1 -> [t1c1, t1c2, t1c3]
|
||||
//! t2 -> [t2c1, t2c2, t2c3]
|
||||
//! t3 -> [t3c1, t3c2, t3c3]
|
||||
//!
|
||||
//! When the extractors are done filling the caches, we want to merge
|
||||
//! the content of all the caches. We do a transpose and each thread is
|
||||
//! assigned the associated cache. By doing that we know that every key
|
||||
//! is put in a known cache and will collide with keys in the other
|
||||
//! caches of the other threads.
|
||||
//!
|
||||
//! #Thread -> caches
|
||||
//! t1 -> [t1c1, t2c1, t3c1]
|
||||
//! t2 -> [t1c2, t2c2, t3c2]
|
||||
//! t3 -> [t1c3, t2c3, t3c3]
|
||||
//!
|
||||
//! When we encounter a miss in the other caches we must still try
|
||||
//! to find it in the spilled entries. This is the reason why we use
|
||||
//! a grenad sorter/reader so that we can seek "efficiently" for a key.
|
||||
//!
|
||||
//! ## More Detailed Algorithm
|
||||
//!
|
||||
//! Each sub-cache has an in-memory HashMap and some spilled
|
||||
//! lexicographically ordered entries on disk (grenad). We first iterate
|
||||
//! over the spilled entries of all the caches at once by using a merge
|
||||
//! join algorithm. This algorithm will merge the entries by using its
|
||||
//! merge function.
|
||||
//!
|
||||
//! Every time a merged entry is emitted by the merge join algorithm we also
|
||||
//! fetch the value from the other in-memory caches (HashMaps) to finish
|
||||
//! the merge. Every time we retrieve an entry from the in-memory caches
|
||||
//! we mark it with a tombstone for later.
|
||||
//!
|
||||
//! Once we are done with the spilled entries we iterate over the in-memory
|
||||
//! HashMaps. We iterate over the first one, retrieve the content from the
|
||||
//! other ones and mark them with a tombstone again. We also make sure
|
||||
//! to ignore the dead (tombstoned) ones.
|
||||
//!
|
||||
//! ## Memory Control
|
||||
//!
|
||||
//! We can detect that there is no more memory available when the
|
||||
//! bump allocator reaches a threshold. When this is the case we
|
||||
//! freeze the cache. There is one bump allocator per thread and the
|
||||
//! memory must be well balanced as we manage one type of extraction
|
||||
//! at a time with well-balanced documents.
|
||||
//!
|
||||
//! It means that the unknown new keys added to the
|
||||
//! cache are directly spilled to disk: basically a key followed by a
|
||||
//! del/add bitmap. For the known keys we can keep modifying them in
|
||||
//! the materialized version in the cache: update the del/add bitmaps.
|
||||
//!
|
||||
//! For now we can use a grenad sorter for spilling even though I think
|
||||
//! it's not the most efficient way (too many files open, sorting entries).
|
||||
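//!
//! A minimal sketch of the bucket assignment described above; it mirrors the
//! `compute_bucket_from_hash` helper defined further down in this module:
//!
//! ```ignore
//! use std::hash::BuildHasher;
//! use rustc_hash::FxBuildHasher;
//!
//! let buckets = 3; // e.g. one bucket per thread
//! let hash = FxBuildHasher.hash_one(b"some key".as_slice());
//! let bucket = hash as usize % buckets; // every thread targets the same bucket for this key
//! ```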
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::binary_heap::PeekMut;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fs::File;
|
||||
use std::hash::BuildHasher;
|
||||
use std::io::BufReader;
|
||||
use std::{io, iter, mem};
|
||||
|
||||
use bumpalo::Bump;
|
||||
use grenad::ReaderCursor;
|
||||
use hashbrown::hash_map::RawEntryMut;
|
||||
use hashbrown::HashMap;
|
||||
use raw_collections::bbbul::{BitPacker, BitPacker4x};
|
||||
use raw_collections::map::FrozenMap;
|
||||
use raw_collections::{Bbbul, FrozenBbbul};
|
||||
use roaring::RoaringBitmap;
|
||||
use rustc_hash::FxBuildHasher;
|
||||
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::new::indexer::document_changes::MostlySend;
|
||||
use crate::update::new::KvReaderDelAdd;
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::{CboRoaringBitmapCodec, Result};
|
||||
|
||||
/// A cache that stores bytes keys associated to CboDelAddRoaringBitmaps.
|
||||
///
|
||||
/// Internally balances the content over `N` buckets for future merging.
|
||||
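///
/// A minimal usage sketch; the bucket count and memory limit below are arbitrary:
///
/// ```ignore
/// let alloc = bumpalo::Bump::new();
/// let mut caches = BalancedCaches::new_in(4, Some(100 * 1024 * 1024), &alloc);
/// caches.insert_add_u32(b"some key", 42)?;
/// caches.insert_del_u32(b"some key", 7)?;
/// let frozen = caches.freeze()?; // one FrozenCache per bucket
/// ```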
pub struct BalancedCaches<'extractor> {
|
||||
hasher: FxBuildHasher,
|
||||
alloc: &'extractor Bump,
|
||||
max_memory: Option<usize>,
|
||||
caches: InnerCaches<'extractor>,
|
||||
}
|
||||
|
||||
enum InnerCaches<'extractor> {
|
||||
Normal(NormalCaches<'extractor>),
|
||||
Spilling(SpillingCaches<'extractor>),
|
||||
}
|
||||
|
||||
impl<'extractor> BalancedCaches<'extractor> {
|
||||
pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
|
||||
Self {
|
||||
hasher: FxBuildHasher,
|
||||
max_memory,
|
||||
caches: InnerCaches::Normal(NormalCaches {
|
||||
caches: iter::repeat_with(|| HashMap::with_hasher_in(FxBuildHasher, alloc))
|
||||
.take(buckets)
|
||||
.collect(),
|
||||
}),
|
||||
alloc,
|
||||
}
|
||||
}
|
||||
|
||||
fn buckets(&self) -> usize {
|
||||
match &self.caches {
|
||||
InnerCaches::Normal(caches) => caches.caches.len(),
|
||||
InnerCaches::Spilling(caches) => caches.caches.len(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> Result<()> {
|
||||
if self.max_memory.map_or(false, |mm| self.alloc.allocated_bytes() >= mm) {
|
||||
self.start_spilling()?;
|
||||
}
|
||||
|
||||
let buckets = self.buckets();
|
||||
match &mut self.caches {
|
||||
InnerCaches::Normal(normal) => {
|
||||
normal.insert_del_u32(&self.hasher, self.alloc, buckets, key, n);
|
||||
Ok(())
|
||||
}
|
||||
InnerCaches::Spilling(spilling) => {
|
||||
spilling.insert_del_u32(&self.hasher, self.alloc, buckets, key, n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> Result<()> {
|
||||
if self.max_memory.map_or(false, |mm| self.alloc.allocated_bytes() >= mm) {
|
||||
self.start_spilling()?;
|
||||
}
|
||||
|
||||
let buckets = self.buckets();
|
||||
match &mut self.caches {
|
||||
InnerCaches::Normal(normal) => {
|
||||
normal.insert_add_u32(&self.hasher, self.alloc, buckets, key, n);
|
||||
Ok(())
|
||||
}
|
||||
InnerCaches::Spilling(spilling) => {
|
||||
spilling.insert_add_u32(&self.hasher, self.alloc, buckets, key, n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Make sure the cache is no longer allocating data
|
||||
/// and writes every new and unknown entry to disk.
|
||||
fn start_spilling(&mut self) -> Result<()> {
|
||||
let BalancedCaches { hasher: _, alloc, max_memory: _, caches } = self;
|
||||
|
||||
if let InnerCaches::Normal(normal_caches) = caches {
|
||||
eprintln!(
|
||||
"We are spilling after we allocated {} bytes on thread #{}",
|
||||
alloc.allocated_bytes(),
|
||||
rayon::current_thread_index().unwrap_or(0)
|
||||
);
|
||||
|
||||
let allocated: usize = normal_caches.caches.iter().map(|m| m.allocation_size()).sum();
|
||||
eprintln!("The last allocated HashMap took {allocated} bytes");
|
||||
|
||||
let dummy = NormalCaches { caches: Vec::new() };
|
||||
let NormalCaches { caches: cache_maps } = mem::replace(normal_caches, dummy);
|
||||
*caches = InnerCaches::Spilling(SpillingCaches::from_cache_maps(cache_maps));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn freeze(&mut self) -> Result<Vec<FrozenCache<'_, 'extractor>>> {
|
||||
match &mut self.caches {
|
||||
InnerCaches::Normal(NormalCaches { caches }) => caches
|
||||
.iter_mut()
|
||||
.enumerate()
|
||||
.map(|(bucket, map)| {
|
||||
// safety: we are transmuting the Bbbul into a FrozenBbbul
|
||||
// that are the same size.
|
||||
let map = unsafe {
|
||||
std::mem::transmute::<
|
||||
&mut HashMap<
|
||||
&[u8],
|
||||
DelAddBbbul<BitPacker4x>, // from this
|
||||
FxBuildHasher,
|
||||
&Bump,
|
||||
>,
|
||||
&mut HashMap<
|
||||
&[u8],
|
||||
FrozenDelAddBbbul<BitPacker4x>, // to that
|
||||
FxBuildHasher,
|
||||
&Bump,
|
||||
>,
|
||||
>(map)
|
||||
};
|
||||
Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() })
|
||||
})
|
||||
.collect(),
|
||||
InnerCaches::Spilling(SpillingCaches { caches, spilled_entries, .. }) => caches
|
||||
.iter_mut()
|
||||
.zip(mem::take(spilled_entries))
|
||||
.enumerate()
|
||||
.map(|(bucket, (map, sorter))| {
|
||||
let spilled = sorter
|
||||
.into_reader_cursors()?
|
||||
.into_iter()
|
||||
.map(ReaderCursor::into_inner)
|
||||
.map(BufReader::new)
|
||||
.map(|bufreader| grenad::Reader::new(bufreader).map_err(Into::into))
|
||||
.collect::<Result<_>>()?;
|
||||
// safety: we are transmuting the Bbbul into a FrozenBbbul
|
||||
// that are the same size.
|
||||
let map = unsafe {
|
||||
std::mem::transmute::<
|
||||
&mut HashMap<
|
||||
&[u8],
|
||||
DelAddBbbul<BitPacker4x>, // from this
|
||||
FxBuildHasher,
|
||||
&Bump,
|
||||
>,
|
||||
&mut HashMap<
|
||||
&[u8],
|
||||
FrozenDelAddBbbul<BitPacker4x>, // to that
|
||||
FxBuildHasher,
|
||||
&Bump,
|
||||
>,
|
||||
>(map)
|
||||
};
|
||||
Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled })
|
||||
})
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl MostlySend for BalancedCaches<'_> {}
|
||||
|
||||
struct NormalCaches<'extractor> {
|
||||
caches: Vec<
|
||||
HashMap<
|
||||
&'extractor [u8],
|
||||
DelAddBbbul<'extractor, BitPacker4x>,
|
||||
FxBuildHasher,
|
||||
&'extractor Bump,
|
||||
>,
|
||||
>,
|
||||
}
|
||||
|
||||
impl<'extractor> NormalCaches<'extractor> {
|
||||
pub fn insert_del_u32(
|
||||
&mut self,
|
||||
hasher: &FxBuildHasher,
|
||||
alloc: &'extractor Bump,
|
||||
buckets: usize,
|
||||
key: &[u8],
|
||||
n: u32,
|
||||
) {
|
||||
let hash = hasher.hash_one(key);
|
||||
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||
|
||||
match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
|
||||
RawEntryMut::Occupied(mut entry) => {
|
||||
entry.get_mut().del.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n);
|
||||
}
|
||||
RawEntryMut::Vacant(entry) => {
|
||||
entry.insert_hashed_nocheck(
|
||||
hash,
|
||||
alloc.alloc_slice_copy(key),
|
||||
DelAddBbbul::new_del_u32_in(n, alloc),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_add_u32(
|
||||
&mut self,
|
||||
hasher: &FxBuildHasher,
|
||||
alloc: &'extractor Bump,
|
||||
buckets: usize,
|
||||
key: &[u8],
|
||||
n: u32,
|
||||
) {
|
||||
let hash = hasher.hash_one(key);
|
||||
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||
match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
|
||||
RawEntryMut::Occupied(mut entry) => {
|
||||
entry.get_mut().add.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n);
|
||||
}
|
||||
RawEntryMut::Vacant(entry) => {
|
||||
entry.insert_hashed_nocheck(
|
||||
hash,
|
||||
alloc.alloc_slice_copy(key),
|
||||
DelAddBbbul::new_add_u32_in(n, alloc),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct SpillingCaches<'extractor> {
|
||||
caches: Vec<
|
||||
HashMap<
|
||||
&'extractor [u8],
|
||||
DelAddBbbul<'extractor, BitPacker4x>,
|
||||
FxBuildHasher,
|
||||
&'extractor Bump,
|
||||
>,
|
||||
>,
|
||||
spilled_entries: Vec<grenad::Sorter<MergeDeladdCboRoaringBitmaps>>,
|
||||
deladd_buffer: Vec<u8>,
|
||||
cbo_buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
impl<'extractor> SpillingCaches<'extractor> {
|
||||
fn from_cache_maps(
|
||||
caches: Vec<
|
||||
HashMap<
|
||||
&'extractor [u8],
|
||||
DelAddBbbul<'extractor, BitPacker4x>,
|
||||
FxBuildHasher,
|
||||
&'extractor Bump,
|
||||
>,
|
||||
>,
|
||||
) -> SpillingCaches<'extractor> {
|
||||
SpillingCaches {
|
||||
spilled_entries: iter::repeat_with(|| {
|
||||
let mut builder = grenad::SorterBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
builder.dump_threshold(0);
|
||||
builder.allow_realloc(false);
|
||||
builder.build()
|
||||
})
|
||||
.take(caches.len())
|
||||
.collect(),
|
||||
caches,
|
||||
deladd_buffer: Vec::new(),
|
||||
cbo_buffer: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_del_u32(
|
||||
&mut self,
|
||||
hasher: &FxBuildHasher,
|
||||
alloc: &'extractor Bump,
|
||||
buckets: usize,
|
||||
key: &[u8],
|
||||
n: u32,
|
||||
) -> Result<()> {
|
||||
let hash = hasher.hash_one(key);
|
||||
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||
match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
|
||||
RawEntryMut::Occupied(mut entry) => {
|
||||
entry.get_mut().del.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n);
|
||||
Ok(())
|
||||
}
|
||||
RawEntryMut::Vacant(_entry) => spill_entry_to_sorter(
|
||||
&mut self.spilled_entries[bucket],
|
||||
&mut self.deladd_buffer,
|
||||
&mut self.cbo_buffer,
|
||||
key,
|
||||
DelAddRoaringBitmap::new_del_u32(n),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_add_u32(
|
||||
&mut self,
|
||||
hasher: &FxBuildHasher,
|
||||
alloc: &'extractor Bump,
|
||||
buckets: usize,
|
||||
key: &[u8],
|
||||
n: u32,
|
||||
) -> Result<()> {
|
||||
let hash = hasher.hash_one(key);
|
||||
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||
match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
|
||||
RawEntryMut::Occupied(mut entry) => {
|
||||
entry.get_mut().add.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n);
|
||||
Ok(())
|
||||
}
|
||||
RawEntryMut::Vacant(_entry) => spill_entry_to_sorter(
|
||||
&mut self.spilled_entries[bucket],
|
||||
&mut self.deladd_buffer,
|
||||
&mut self.cbo_buffer,
|
||||
key,
|
||||
DelAddRoaringBitmap::new_add_u32(n),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn compute_bucket_from_hash(buckets: usize, hash: u64) -> usize {
|
||||
hash as usize % buckets
|
||||
}
|
||||
|
||||
fn spill_entry_to_sorter(
|
||||
spilled_entries: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
|
||||
deladd_buffer: &mut Vec<u8>,
|
||||
cbo_buffer: &mut Vec<u8>,
|
||||
key: &[u8],
|
||||
deladd: DelAddRoaringBitmap,
|
||||
) -> Result<()> {
|
||||
deladd_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(deladd_buffer);
|
||||
|
||||
match deladd {
|
||||
DelAddRoaringBitmap { del: Some(del), add: None } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: None, add: Some(add) } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: Some(del), add: Some(add) } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
|
||||
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: None, add: None } => return Ok(()),
|
||||
}
|
||||
|
||||
let bytes = value_writer.into_inner().unwrap();
|
||||
spilled_entries.insert(key, bytes).map_err(Into::into)
|
||||
}
|
||||
|
||||
pub struct FrozenCache<'a, 'extractor> {
|
||||
bucket: usize,
|
||||
cache: FrozenMap<
|
||||
'a,
|
||||
'extractor,
|
||||
&'extractor [u8],
|
||||
FrozenDelAddBbbul<'extractor, BitPacker4x>,
|
||||
FxBuildHasher,
|
||||
>,
|
||||
spilled: Vec<grenad::Reader<BufReader<File>>>,
|
||||
}
|
||||
|
||||
pub fn transpose_and_freeze_caches<'a, 'extractor>(
|
||||
caches: &'a mut [BalancedCaches<'extractor>],
|
||||
) -> Result<Vec<Vec<FrozenCache<'a, 'extractor>>>> {
|
||||
let width = caches.first().map(BalancedCaches::buckets).unwrap_or(0);
|
||||
let mut bucket_caches: Vec<_> = iter::repeat_with(Vec::new).take(width).collect();
|
||||
|
||||
for thread_cache in caches {
|
||||
for frozen in thread_cache.freeze()? {
|
||||
bucket_caches[frozen.bucket].push(frozen);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(bucket_caches)
|
||||
}
|
||||
|
||||
/// Merges the caches, which must all be associated with the same bucket.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - If the bucket IDs in these frozen caches are not exactly the same.
|
||||
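///
/// A minimal usage sketch, assuming `caches` holds the per-thread `BalancedCaches`:
///
/// ```ignore
/// for bucket in transpose_and_freeze_caches(&mut caches)? {
///     merge_caches(bucket, |key, deladd| {
///         // `key` is the raw entry key, `deladd` the merged del/add bitmaps.
///         Ok(())
///     })?;
/// }
/// ```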
pub fn merge_caches<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()>
|
||||
where
|
||||
F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>,
|
||||
{
|
||||
let mut maps = Vec::new();
|
||||
let mut readers = Vec::new();
|
||||
let mut current_bucket = None;
|
||||
for FrozenCache { bucket, cache, ref mut spilled } in frozen {
|
||||
assert_eq!(*current_bucket.get_or_insert(bucket), bucket);
|
||||
maps.push(cache);
|
||||
readers.append(spilled);
|
||||
}
|
||||
|
||||
// First manage the spilled entries by looking into the HashMaps,
|
||||
// merge them and mark them as dummy.
|
||||
let mut heap = BinaryHeap::new();
|
||||
for (source_index, source) in readers.into_iter().enumerate() {
|
||||
let mut cursor = source.into_cursor()?;
|
||||
if cursor.move_on_next()?.is_some() {
|
||||
heap.push(Entry { cursor, source_index });
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
let mut first_entry = match heap.pop() {
|
||||
Some(entry) => entry,
|
||||
None => break,
|
||||
};
|
||||
|
||||
let (first_key, first_value) = match first_entry.cursor.current() {
|
||||
Some((key, value)) => (key, value),
|
||||
None => break,
|
||||
};
|
||||
|
||||
let mut output = DelAddRoaringBitmap::from_bytes(first_value)?;
|
||||
while let Some(mut entry) = heap.peek_mut() {
|
||||
if let Some((key, value)) = entry.cursor.current() {
|
||||
if first_key == key {
|
||||
let new = DelAddRoaringBitmap::from_bytes(value)?;
|
||||
output = output.merge(new);
|
||||
// When we are done with the current value of this entry, move
|
||||
// it forward and let the heap reorganize itself (on drop).
|
||||
if entry.cursor.move_on_next()?.is_none() {
|
||||
PeekMut::pop(entry);
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Once we have merged all of the spilled bitmaps we must also
|
||||
// fetch the entries from the non-spilled entries (the HashMaps).
|
||||
for (map_index, map) in maps.iter_mut().enumerate() {
|
||||
if first_entry.source_index != map_index {
|
||||
if let Some(new) = map.get_mut(first_key) {
|
||||
output.append_and_clear_bbbul(new);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We send the merged entry outside.
|
||||
(f)(first_key, output)?;
|
||||
|
||||
// Don't forget to put the first entry back into the heap.
|
||||
if first_entry.cursor.move_on_next()?.is_some() {
|
||||
heap.push(first_entry)
|
||||
}
|
||||
}
|
||||
|
||||
// Then manage the content of the HashMap entries that weren't taken (mem::take).
|
||||
while let Some(mut map) = maps.pop() {
|
||||
for (key, bbbul) in map.iter_mut() {
|
||||
let mut output = DelAddRoaringBitmap::empty();
|
||||
output.append_and_clear_bbbul(bbbul);
|
||||
|
||||
// Make sure we don't try to work with entries already managed by the spilled
|
||||
if !bbbul.is_empty() {
|
||||
for rhs in maps.iter_mut() {
|
||||
if let Some(new) = rhs.get_mut(key) {
|
||||
output.append_and_clear_bbbul(new);
|
||||
}
|
||||
}
|
||||
|
||||
// We send the merged entry outside.
|
||||
(f)(key, output)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct Entry<R> {
|
||||
cursor: ReaderCursor<R>,
|
||||
source_index: usize,
|
||||
}
|
||||
|
||||
impl<R> Ord for Entry<R> {
|
||||
fn cmp(&self, other: &Entry<R>) -> Ordering {
|
||||
let skey = self.cursor.current().map(|(k, _)| k);
|
||||
let okey = other.cursor.current().map(|(k, _)| k);
|
||||
skey.cmp(&okey).then(self.source_index.cmp(&other.source_index)).reverse()
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> Eq for Entry<R> {}
|
||||
|
||||
impl<R> PartialEq for Entry<R> {
|
||||
fn eq(&self, other: &Entry<R>) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> PartialOrd for Entry<R> {
|
||||
fn partial_cmp(&self, other: &Entry<R>) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DelAddBbbul<'bump, B> {
|
||||
pub del: Option<Bbbul<'bump, B>>,
|
||||
pub add: Option<Bbbul<'bump, B>>,
|
||||
}
|
||||
|
||||
impl<'bump, B: BitPacker> DelAddBbbul<'bump, B> {
|
||||
pub fn insert_del_u32_in(&mut self, n: u32, bump: &'bump Bump) {
|
||||
self.del.get_or_insert_with(|| Bbbul::new_in(bump)).insert(n);
|
||||
}
|
||||
|
||||
pub fn insert_add_u32_in(&mut self, n: u32, bump: &'bump Bump) {
|
||||
self.add.get_or_insert_with(|| Bbbul::new_in(bump)).insert(n);
|
||||
}
|
||||
|
||||
pub fn new_del_u32_in(n: u32, bump: &'bump Bump) -> Self {
|
||||
let mut bbbul = Bbbul::new_in(bump);
|
||||
bbbul.insert(n);
|
||||
DelAddBbbul { del: Some(bbbul), add: None }
|
||||
}
|
||||
|
||||
pub fn new_add_u32_in(n: u32, bump: &'bump Bump) -> Self {
|
||||
let mut bbbul = Bbbul::new_in(bump);
|
||||
bbbul.insert(n);
|
||||
DelAddBbbul { del: None, add: Some(bbbul) }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FrozenDelAddBbbul<'bump, B> {
|
||||
pub del: Option<FrozenBbbul<'bump, B>>,
|
||||
pub add: Option<FrozenBbbul<'bump, B>>,
|
||||
}
|
||||
|
||||
impl<'bump, B> FrozenDelAddBbbul<'bump, B> {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.del.is_none() && self.add.is_none()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct DelAddRoaringBitmap {
|
||||
pub del: Option<RoaringBitmap>,
|
||||
pub add: Option<RoaringBitmap>,
|
||||
}
|
||||
|
||||
impl DelAddRoaringBitmap {
|
||||
fn from_bytes(bytes: &[u8]) -> io::Result<DelAddRoaringBitmap> {
|
||||
let reader = KvReaderDelAdd::from_slice(bytes);
|
||||
|
||||
let del = match reader.get(DelAdd::Deletion) {
|
||||
Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
|
||||
None => None,
|
||||
};
|
||||
|
||||
let add = match reader.get(DelAdd::Addition) {
|
||||
Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
|
||||
None => None,
|
||||
};
|
||||
|
||||
Ok(DelAddRoaringBitmap { del, add })
|
||||
}
|
||||
|
||||
pub fn empty() -> DelAddRoaringBitmap {
|
||||
DelAddRoaringBitmap { del: None, add: None }
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
let DelAddRoaringBitmap { del, add } = self;
|
||||
del.is_none() && add.is_none()
|
||||
}
|
||||
|
||||
pub fn insert_del_u32(&mut self, n: u32) {
|
||||
self.del.get_or_insert_with(RoaringBitmap::new).insert(n);
|
||||
}
|
||||
|
||||
pub fn insert_add_u32(&mut self, n: u32) {
|
||||
self.add.get_or_insert_with(RoaringBitmap::new).insert(n);
|
||||
}
|
||||
|
||||
pub fn new_del_u32(n: u32) -> Self {
|
||||
DelAddRoaringBitmap { del: Some(RoaringBitmap::from([n])), add: None }
|
||||
}
|
||||
|
||||
pub fn new_add_u32(n: u32) -> Self {
|
||||
DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) }
|
||||
}
|
||||
|
||||
pub fn append_and_clear_bbbul<B: BitPacker>(&mut self, bbbul: &mut FrozenDelAddBbbul<'_, B>) {
|
||||
let FrozenDelAddBbbul { del, add } = bbbul;
|
||||
|
||||
if let Some(ref mut bbbul) = del.take() {
|
||||
let del = self.del.get_or_insert_with(RoaringBitmap::new);
|
||||
let mut iter = bbbul.iter_and_clear();
|
||||
while let Some(block) = iter.next_block() {
|
||||
del.append(block.iter().copied());
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(ref mut bbbul) = add.take() {
|
||||
let add = self.add.get_or_insert_with(RoaringBitmap::new);
|
||||
let mut iter = bbbul.iter_and_clear();
|
||||
while let Some(block) = iter.next_block() {
|
||||
add.append(block.iter().copied());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn merge(self, rhs: DelAddRoaringBitmap) -> DelAddRoaringBitmap {
|
||||
let DelAddRoaringBitmap { del, add } = self;
|
||||
let DelAddRoaringBitmap { del: ndel, add: nadd } = rhs;
|
||||
|
||||
let del = match (del, ndel) {
|
||||
(None, None) => None,
|
||||
(None, Some(del)) | (Some(del), None) => Some(del),
|
||||
(Some(del), Some(ndel)) => Some(del | ndel),
|
||||
};
|
||||
|
||||
let add = match (add, nadd) {
|
||||
(None, None) => None,
|
||||
(None, Some(add)) | (Some(add), None) => Some(add),
|
||||
(Some(add), Some(nadd)) => Some(add | nadd),
|
||||
};
|
||||
|
||||
DelAddRoaringBitmap { del, add }
|
||||
}
|
||||
|
||||
pub fn apply_to(&self, documents_ids: &mut RoaringBitmap) {
|
||||
let DelAddRoaringBitmap { del, add } = self;
|
||||
|
||||
if let Some(del) = del {
|
||||
*documents_ids -= del;
|
||||
}
|
||||
|
||||
if let Some(add) = add {
|
||||
*documents_ids |= add;
|
||||
}
|
||||
}
|
||||
}
|
||||
73
crates/milli/src/update/new/extract/documents.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use std::cell::RefCell;
|
||||
|
||||
use bumpalo::Bump;
|
||||
|
||||
use super::DelAddRoaringBitmap;
|
||||
use crate::update::new::channel::DocumentsSender;
|
||||
use crate::update::new::document::write_to_obkv;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
DocumentChangeContext, Extractor, FullySend, RefCellExt as _,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::Result;
|
||||
|
||||
pub struct DocumentsExtractor<'a> {
|
||||
documents_sender: &'a DocumentsSender<'a>,
|
||||
}
|
||||
|
||||
impl<'a> DocumentsExtractor<'a> {
|
||||
pub fn new(documents_sender: &'a DocumentsSender<'a>) -> Self {
|
||||
Self { documents_sender }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
|
||||
type Data = FullySend<RefCell<DelAddRoaringBitmap>>;
|
||||
|
||||
fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(FullySend(RefCell::new(DelAddRoaringBitmap::empty())))
|
||||
}
|
||||
|
||||
fn process(
|
||||
&self,
|
||||
change: DocumentChange,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
let mut document_buffer = Vec::new();
|
||||
let mut delta_documents_ids = context.data.0.borrow_mut_or_yield();
|
||||
|
||||
let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield();
|
||||
let new_fields_ids_map = &*new_fields_ids_map;
|
||||
let new_fields_ids_map = new_fields_ids_map.local_map();
|
||||
|
||||
let external_docid = change.external_docid().to_owned();
|
||||
|
||||
// TODO: we need to create a function that collects and compresses documents.
|
||||
match change {
|
||||
DocumentChange::Deletion(deletion) => {
|
||||
let docid = deletion.docid();
|
||||
self.documents_sender.delete(docid, external_docid).unwrap();
|
||||
delta_documents_ids.insert_del_u32(docid);
|
||||
}
|
||||
// TODO: replace None with Some(vector) when implemented
|
||||
DocumentChange::Update(update) => {
|
||||
let docid = update.docid();
|
||||
let content =
|
||||
update.new(&context.txn, context.index, &context.db_fields_ids_map)?;
|
||||
let content =
|
||||
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
|
||||
self.documents_sender.uncompressed(docid, external_docid, content).unwrap();
|
||||
}
|
||||
DocumentChange::Insertion(insertion) => {
|
||||
let docid = insertion.docid();
|
||||
let content = insertion.inserted();
|
||||
let content =
|
||||
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
|
||||
self.documents_sender.uncompressed(docid, external_docid, content).unwrap();
|
||||
delta_documents_ids.insert_add_u32(docid);
|
||||
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
271
crates/milli/src/update/new/extract/faceted/extract_facets.rs
Normal file
@@ -0,0 +1,271 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashSet;
|
||||
use std::ops::DerefMut as _;
|
||||
|
||||
use bumpalo::Bump;
|
||||
use heed::RoTxn;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::super::cache::BalancedCaches;
|
||||
use super::facet_document::extract_document_facets;
|
||||
use super::FacetKind;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::new::extract::DocidsExtractor;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext,
|
||||
Progress, RefCellExt, ThreadLocal,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
pub struct FacetedExtractorData<'a> {
|
||||
attributes_to_extract: &'a [&'a str],
|
||||
grenad_parameters: GrenadParameters,
|
||||
buckets: usize,
|
||||
}
|
||||
|
||||
impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> {
|
||||
type Data = RefCell<BalancedCaches<'extractor>>;
|
||||
|
||||
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(BalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.grenad_parameters.max_memory,
|
||||
extractor_alloc,
|
||||
)))
|
||||
}
|
||||
|
||||
fn process<'doc>(
|
||||
&self,
|
||||
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
for change in changes {
|
||||
let change = change?;
|
||||
FacetedDocidsExtractor::extract_document_change(
|
||||
context,
|
||||
self.attributes_to_extract,
|
||||
change,
|
||||
)?
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetedDocidsExtractor;
|
||||
|
||||
impl FacetedDocidsExtractor {
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<BalancedCaches>>,
|
||||
attributes_to_extract: &[&str],
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
let index = &context.index;
|
||||
let rtxn = &context.txn;
|
||||
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
|
||||
let mut cached_sorter = context.data.borrow_mut_or_yield();
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map.deref_mut(),
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
&context.doc_alloc,
|
||||
cached_sorter.deref_mut(),
|
||||
BalancedCaches::insert_del_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
),
|
||||
DocumentChange::Update(inner) => {
|
||||
extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map.deref_mut(),
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
&context.doc_alloc,
|
||||
cached_sorter.deref_mut(),
|
||||
BalancedCaches::insert_del_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.merged(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map.deref_mut(),
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
&context.doc_alloc,
|
||||
cached_sorter.deref_mut(),
|
||||
BalancedCaches::insert_add_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)
|
||||
}
|
||||
DocumentChange::Insertion(inner) => extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.inserted(),
|
||||
new_fields_ids_map.deref_mut(),
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
&context.doc_alloc,
|
||||
cached_sorter.deref_mut(),
|
||||
BalancedCaches::insert_add_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn facet_fn_with_options<'extractor>(
|
||||
doc_alloc: &Bump,
|
||||
cached_sorter: &mut BalancedCaches<'extractor>,
|
||||
cache_fn: impl Fn(&mut BalancedCaches<'extractor>, &[u8], u32) -> Result<()>,
|
||||
docid: DocumentId,
|
||||
fid: FieldId,
|
||||
value: &Value,
|
||||
) -> Result<()> {
|
||||
let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||
// Exists
|
||||
// key: fid
|
||||
buffer.push(FacetKind::Exists as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)?;
|
||||
|
||||
match value {
|
||||
// Number
|
||||
// key: fid - level - orderedf64 - originalf64
|
||||
Value::Number(number) => {
|
||||
if let Some((n, ordered)) =
|
||||
number.as_f64().and_then(|n| f64_into_bytes(n).map(|ordered| (n, ordered)))
|
||||
{
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Number as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(0); // level 0
|
||||
buffer.extend_from_slice(&ordered);
|
||||
buffer.extend_from_slice(&n.to_be_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
// String
|
||||
// key: fid - level - truncated_string
|
||||
Value::String(s) => {
|
||||
let normalized = crate::normalize_facet(s);
|
||||
let truncated = truncate_str(&normalized);
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::String as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(0); // level 0
|
||||
buffer.extend_from_slice(truncated.as_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)
|
||||
}
|
||||
// Null
|
||||
// key: fid
|
||||
Value::Null => {
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Null as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)
|
||||
}
|
||||
// Empty
|
||||
// key: fid
|
||||
Value::Array(a) if a.is_empty() => {
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Empty as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)
|
||||
}
|
||||
Value::Object(o) if o.is_empty() => {
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Empty as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)
|
||||
}
|
||||
// Otherwise, do nothing
|
||||
// TODO: What about Value::Bool?
|
||||
_ => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
|
||||
index.user_defined_faceted_fields(rtxn)
|
||||
}
|
||||
}
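// Illustrative sketch (added for this review, not part of the original diff) of the
// level-0 number key layout built in `facet_fn_with_options` above: kind byte,
// big-endian field id, level byte, an order-preserving f64 encoding, then the raw
// f64 bytes. `ordered_f64` below is an assumed stand-in for `f64_into_bytes`.
#[cfg(test)]
mod number_key_layout_illustration {
    fn ordered_f64(n: f64) -> [u8; 8] {
        let bits = n.to_bits();
        // flip bits so that lexicographic byte order matches numeric order
        let ordered = if bits >> 63 == 0 { bits ^ (1u64 << 63) } else { !bits };
        ordered.to_be_bytes()
    }

    fn number_facet_key(fid: u16, n: f64) -> Vec<u8> {
        let mut key = Vec::new();
        key.push(0); // FacetKind::Number
        key.extend_from_slice(&fid.to_be_bytes());
        key.push(0); // level 0
        key.extend_from_slice(&ordered_f64(n));
        key.extend_from_slice(&n.to_be_bytes());
        key
    }

    #[test]
    fn keys_sort_like_numbers() {
        // keys for increasing numbers compare in increasing byte order
        assert!(number_facet_key(1, -2.5) < number_facet_key(1, 3.0));
        assert!(number_facet_key(1, 3.0) < number_facet_key(1, 10.0));
    }
}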
|
||||
|
||||
/// Truncates a string to the biggest valid LMDB key size.
|
||||
fn truncate_str(s: &str) -> &str {
|
||||
let index = s
|
||||
.char_indices()
|
||||
.map(|(idx, _)| idx)
|
||||
.chain(std::iter::once(s.len()))
|
||||
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
|
||||
.last();
|
||||
|
||||
&s[..index.unwrap_or(0)]
|
||||
}
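// Illustration (added for this review, not part of the original diff): the cut index
// computed above is always a char boundary at most MAX_FACET_VALUE_LENGTH bytes in,
// so truncation never splits a multi-byte character.
#[cfg(test)]
mod truncate_str_illustration {
    use super::truncate_str;

    #[test]
    fn never_splits_a_char() {
        // a long string made only of 2-byte characters
        let s = "é".repeat(1_000);
        let t = truncate_str(&s);
        // slicing would have panicked if the cut were not on a char boundary
        assert!(s.starts_with(t));
        assert!(t.len() <= s.len());
    }
}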
|
||||
|
||||
impl DocidsExtractor for FacetedDocidsExtractor {
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_steps: u16,
|
||||
total_steps: u16,
|
||||
step_name: &'static str,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
let index = indexing_context.index;
|
||||
let rtxn = index.read_txn()?;
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
||||
let attributes_to_extract: Vec<_> =
|
||||
attributes_to_extract.iter().map(|s| s.as_ref()).collect();
|
||||
let datastore = ThreadLocal::new();
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
|
||||
let extractor = FacetedExtractorData {
|
||||
attributes_to_extract: &attributes_to_extract,
|
||||
grenad_parameters,
|
||||
buckets: rayon::current_num_threads(),
|
||||
};
|
||||
extract(
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
&datastore,
|
||||
finished_steps,
|
||||
total_steps,
|
||||
step_name,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(datastore.into_iter().map(RefCell::into_inner).collect())
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,45 @@
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::update::new::document::Document;
|
||||
use crate::update::new::extract::perm_json_p;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError};
|
||||
|
||||
pub fn extract_document_facets<'doc>(
|
||||
attributes_to_extract: &[&str],
|
||||
document: impl Document<'doc>,
|
||||
field_id_map: &mut GlobalFieldsIdsMap,
|
||||
facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
for res in document.iter_top_level_fields() {
|
||||
let (field_name, value) = res?;
|
||||
|
||||
let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {
|
||||
Some(field_id) => facet_fn(field_id, value),
|
||||
None => Err(UserError::AttributeLimitReached.into()),
|
||||
};
|
||||
|
||||
// if the current field is faceted or contains a faceted attribute
|
||||
if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
|
||||
// parse json.
|
||||
match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
|
||||
&object,
|
||||
Some(attributes_to_extract),
|
||||
&[], // skip no attributes
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
|
||||
&array,
|
||||
Some(attributes_to_extract),
|
||||
&[], // skip no attributes
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
value => tokenize_field(field_name, &value)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
34
crates/milli/src/update/new/extract/faceted/mod.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
mod extract_facets;
|
||||
mod facet_document;
|
||||
|
||||
pub use extract_facets::FacetedDocidsExtractor;
|
||||
|
||||
#[repr(u8)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum FacetKind {
|
||||
Number = 0,
|
||||
String = 1,
|
||||
Null = 2,
|
||||
Empty = 3,
|
||||
Exists,
|
||||
}
|
||||
|
||||
impl From<u8> for FacetKind {
|
||||
fn from(value: u8) -> Self {
|
||||
match value {
|
||||
0 => Self::Number,
|
||||
1 => Self::String,
|
||||
2 => Self::Null,
|
||||
3 => Self::Empty,
|
||||
4 => Self::Exists,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FacetKind {
|
||||
pub fn extract_from_key(key: &[u8]) -> (FacetKind, &[u8]) {
|
||||
debug_assert!(key.len() > 3);
|
||||
(FacetKind::from(key[0]), &key[1..])
|
||||
}
|
||||
}
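// Small round-trip illustration (added for this review, not part of the original
// diff): the `From<u8>` impl above mirrors the discriminants assigned to `FacetKind`.
#[cfg(test)]
mod facet_kind_illustration {
    use super::FacetKind;

    #[test]
    fn u8_round_trip() {
        let kinds =
            [FacetKind::Number, FacetKind::String, FacetKind::Null, FacetKind::Empty, FacetKind::Exists];
        for kind in kinds {
            // `FacetKind` is `Copy`, so casting to `u8` is enough to compare variants
            assert_eq!(FacetKind::from(kind as u8) as u8, kind as u8);
        }
    }
}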
|
||||
146
crates/milli/src/update/new/extract/mod.rs
Normal file
@@ -0,0 +1,146 @@
|
||||
mod cache;
|
||||
mod documents;
|
||||
mod faceted;
|
||||
mod searchable;
|
||||
mod vectors;
|
||||
|
||||
use bumpalo::Bump;
|
||||
pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap};
|
||||
pub use documents::*;
|
||||
pub use faceted::*;
|
||||
pub use searchable::*;
|
||||
pub use vectors::EmbeddingExtractor;
|
||||
|
||||
use super::indexer::document_changes::{
|
||||
DocumentChanges, FullySend, IndexingContext, Progress, ThreadLocal,
|
||||
};
|
||||
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::Result;
|
||||
|
||||
pub trait DocidsExtractor {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_steps: u16,
|
||||
total_steps: u16,
|
||||
step_name: &'static str,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync;
|
||||
}
|
||||
|
||||
/// TODO: move this into the permissive json pointer module
|
||||
pub mod perm_json_p {
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
use crate::Result;
|
||||
const SPLIT_SYMBOL: char = '.';
|
||||
|
||||
/// Returns `true` if the `selector` matches the `key`.
|
||||
///
|
||||
/// ```text
|
||||
/// Example:
|
||||
/// `animaux` match `animaux`
|
||||
/// `animaux.chien` match `animaux`
|
||||
/// `animaux.chien` match `animaux`
|
||||
/// `animaux.chien.nom` match `animaux`
|
||||
/// `animaux.chien.nom` match `animaux.chien`
|
||||
/// -----------------------------------------
|
||||
/// `animaux` doesn't match `animaux.chien`
|
||||
/// `animaux.` doesn't match `animaux`
|
||||
/// `animaux.ch` doesn't match `animaux.chien`
|
||||
/// `animau` doesn't match `animaux`
|
||||
/// ```
|
||||
pub fn contained_in(selector: &str, key: &str) -> bool {
|
||||
selector.starts_with(key)
|
||||
&& selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
|
||||
}
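// Illustrative check (added for this review, not part of the original diff) of the
// matching rules documented above, reusing the examples from the doc comment.
#[cfg(test)]
mod contained_in_illustration {
    use super::contained_in;

    #[test]
    fn doc_examples() {
        assert!(contained_in("animaux", "animaux"));
        assert!(contained_in("animaux.chien.nom", "animaux"));
        assert!(contained_in("animaux.chien.nom", "animaux.chien"));
        assert!(!contained_in("animaux", "animaux.chien"));
        assert!(!contained_in("animaux.ch", "animaux.chien"));
        assert!(!contained_in("animau", "animaux"));
    }
}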
|
||||
|
||||
pub fn seek_leaf_values_in_object(
|
||||
value: &Map<String, Value>,
|
||||
selectors: Option<&[&str]>,
|
||||
skip_selectors: &[&str],
|
||||
base_key: &str,
|
||||
seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
if value.is_empty() {
|
||||
seeker(base_key, &Value::Object(Map::with_capacity(0)))?;
|
||||
}
|
||||
|
||||
for (key, value) in value.iter() {
|
||||
let base_key = if base_key.is_empty() {
|
||||
key.to_string()
|
||||
} else {
|
||||
format!("{}{}{}", base_key, SPLIT_SYMBOL, key)
|
||||
};
|
||||
|
||||
// here, if the user only specified `doggo`, we need to iterate over all the fields of `doggo`,
|
||||
// so we check contained_in in both directions
|
||||
let should_continue = select_field(&base_key, selectors, skip_selectors);
|
||||
if should_continue {
|
||||
match value {
|
||||
Value::Object(object) => seek_leaf_values_in_object(
|
||||
object,
|
||||
selectors,
|
||||
skip_selectors,
|
||||
&base_key,
|
||||
seeker,
|
||||
),
|
||||
Value::Array(array) => seek_leaf_values_in_array(
|
||||
array,
|
||||
selectors,
|
||||
skip_selectors,
|
||||
&base_key,
|
||||
seeker,
|
||||
),
|
||||
value => seeker(&base_key, value),
|
||||
}?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn seek_leaf_values_in_array(
|
||||
values: &[Value],
|
||||
selectors: Option<&[&str]>,
|
||||
skip_selectors: &[&str],
|
||||
base_key: &str,
|
||||
seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
if values.is_empty() {
|
||||
seeker(base_key, &Value::Array(vec![]))?;
|
||||
}
|
||||
|
||||
for value in values {
|
||||
match value {
|
||||
Value::Object(object) => {
|
||||
seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker)
|
||||
}
|
||||
Value::Array(array) => {
|
||||
seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker)
|
||||
}
|
||||
value => seeker(base_key, value),
|
||||
}?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn select_field(
|
||||
field_name: &str,
|
||||
selectors: Option<&[&str]>,
|
||||
skip_selectors: &[&str],
|
||||
) -> bool {
|
||||
selectors.map_or(true, |selectors| {
|
||||
selectors.iter().any(|selector| {
|
||||
contained_in(selector, field_name) || contained_in(field_name, selector)
|
||||
})
|
||||
}) && !skip_selectors.iter().any(|skip_selector| {
|
||||
contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,400 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use std::mem::size_of;
|
||||
use std::ops::DerefMut as _;
|
||||
|
||||
use bumpalo::collections::vec::Vec as BumpVec;
|
||||
use bumpalo::Bump;
|
||||
use heed::RoTxn;
|
||||
|
||||
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||
use crate::update::new::extract::cache::BalancedCaches;
|
||||
use crate::update::new::extract::perm_json_p::contained_in;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
|
||||
IndexingContext, MostlySend, RefCellExt, ThreadLocal,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
pub struct WordDocidsBalancedCaches<'extractor> {
|
||||
word_fid_docids: BalancedCaches<'extractor>,
|
||||
word_docids: BalancedCaches<'extractor>,
|
||||
exact_word_docids: BalancedCaches<'extractor>,
|
||||
word_position_docids: BalancedCaches<'extractor>,
|
||||
fid_word_count_docids: BalancedCaches<'extractor>,
|
||||
fid_word_count: HashMap<FieldId, (usize, usize)>,
|
||||
current_docid: Option<DocumentId>,
|
||||
}
|
||||
|
||||
unsafe impl<'extractor> MostlySend for WordDocidsBalancedCaches<'extractor> {}
|
||||
|
||||
impl<'extractor> WordDocidsBalancedCaches<'extractor> {
|
||||
/// TODO Make sure to give the same max_memory to all of them, without splitting it
|
||||
pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
|
||||
Self {
|
||||
word_fid_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
|
||||
word_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
|
||||
exact_word_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
|
||||
word_position_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
|
||||
fid_word_count_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
|
||||
fid_word_count: HashMap::new(),
|
||||
current_docid: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_add_u32(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
position: u16,
|
||||
word: &str,
|
||||
exact: bool,
|
||||
docid: u32,
|
||||
bump: &Bump,
|
||||
) -> Result<()> {
|
||||
let word_bytes = word.as_bytes();
|
||||
if exact {
|
||||
self.exact_word_docids.insert_add_u32(word_bytes, docid)?;
|
||||
} else {
|
||||
self.word_docids.insert_add_u32(word_bytes, docid)?;
|
||||
}
|
||||
|
||||
let buffer_size = word_bytes.len() + 1 + size_of::<FieldId>();
|
||||
let mut buffer = BumpVec::with_capacity_in(buffer_size, bump);
|
||||
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
self.word_fid_docids.insert_add_u32(&buffer, docid)?;
|
||||
|
||||
let position = bucketed_position(position);
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&position.to_be_bytes());
|
||||
self.word_position_docids.insert_add_u32(&buffer, docid)?;
|
||||
|
||||
if self.current_docid.map_or(false, |id| docid != id) {
|
||||
self.flush_fid_word_count(&mut buffer)?;
|
||||
}
|
||||
|
||||
self.fid_word_count
|
||||
.entry(field_id)
|
||||
.and_modify(|(_current_count, new_count)| *new_count += 1)
|
||||
.or_insert((0, 1));
|
||||
self.current_docid = Some(docid);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_del_u32(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
position: u16,
|
||||
word: &str,
|
||||
exact: bool,
|
||||
docid: u32,
|
||||
bump: &Bump,
|
||||
) -> Result<()> {
|
||||
let word_bytes = word.as_bytes();
|
||||
if exact {
|
||||
self.exact_word_docids.insert_del_u32(word_bytes, docid)?;
|
||||
} else {
|
||||
self.word_docids.insert_del_u32(word_bytes, docid)?;
|
||||
}
|
||||
|
||||
let buffer_size = word_bytes.len() + 1 + size_of::<FieldId>();
|
||||
let mut buffer = BumpVec::with_capacity_in(buffer_size, bump);
|
||||
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
self.word_fid_docids.insert_del_u32(&buffer, docid)?;
|
||||
|
||||
let position = bucketed_position(position);
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&position.to_be_bytes());
|
||||
self.word_position_docids.insert_del_u32(&buffer, docid)?;
|
||||
|
||||
if self.current_docid.map_or(false, |id| docid != id) {
|
||||
self.flush_fid_word_count(&mut buffer)?;
|
||||
}
|
||||
|
||||
self.fid_word_count
|
||||
.entry(field_id)
|
||||
.and_modify(|(current_count, _new_count)| *current_count += 1)
|
||||
.or_insert((1, 0));
|
||||
|
||||
self.current_docid = Some(docid);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush_fid_word_count(&mut self, buffer: &mut BumpVec<u8>) -> Result<()> {
|
||||
for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
|
||||
if current_count != new_count {
|
||||
if current_count <= MAX_COUNTED_WORDS {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(current_count as u8);
|
||||
self.fid_word_count_docids
|
||||
.insert_del_u32(buffer, self.current_docid.unwrap())?;
|
||||
}
|
||||
if new_count <= MAX_COUNTED_WORDS {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(new_count as u8);
|
||||
self.fid_word_count_docids
|
||||
.insert_add_u32(buffer, self.current_docid.unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordDocidsCaches<'extractor> {
|
||||
pub word_docids: Vec<BalancedCaches<'extractor>>,
|
||||
pub word_fid_docids: Vec<BalancedCaches<'extractor>>,
|
||||
pub exact_word_docids: Vec<BalancedCaches<'extractor>>,
|
||||
pub word_position_docids: Vec<BalancedCaches<'extractor>>,
|
||||
pub fid_word_count_docids: Vec<BalancedCaches<'extractor>>,
|
||||
}
|
||||
|
||||
impl<'extractor> WordDocidsCaches<'extractor> {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
word_docids: Vec::new(),
|
||||
word_fid_docids: Vec::new(),
|
||||
exact_word_docids: Vec::new(),
|
||||
word_position_docids: Vec::new(),
|
||||
fid_word_count_docids: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn push(&mut self, other: WordDocidsBalancedCaches<'extractor>) -> Result<()> {
|
||||
let WordDocidsBalancedCaches {
|
||||
word_docids,
|
||||
word_fid_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
fid_word_count_docids,
|
||||
fid_word_count: _,
|
||||
current_docid: _,
|
||||
} = other;
|
||||
|
||||
self.word_docids.push(word_docids);
|
||||
self.word_fid_docids.push(word_fid_docids);
|
||||
self.exact_word_docids.push(exact_word_docids);
|
||||
self.word_position_docids.push(word_position_docids);
|
||||
self.fid_word_count_docids.push(fid_word_count_docids);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordDocidsExtractorData<'a> {
|
||||
tokenizer: &'a DocumentTokenizer<'a>,
|
||||
grenad_parameters: GrenadParameters,
|
||||
buckets: usize,
|
||||
}
|
||||
|
||||
impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
|
||||
type Data = RefCell<Option<WordDocidsBalancedCaches<'extractor>>>;
|
||||
|
||||
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.grenad_parameters.max_memory,
|
||||
extractor_alloc,
|
||||
))))
|
||||
}
|
||||
|
||||
fn process(
|
||||
&self,
|
||||
change: DocumentChange,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordDocidsExtractors;
|
||||
|
||||
impl WordDocidsExtractors {
|
||||
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
) -> Result<WordDocidsCaches<'extractor>> {
|
||||
let index = indexing_context.index;
|
||||
let rtxn = index.read_txn()?;
|
||||
|
||||
let stop_words = index.stop_words(&rtxn)?;
|
||||
let allowed_separators = index.allowed_separators(&rtxn)?;
|
||||
let allowed_separators: Option<Vec<_>> =
|
||||
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary = index.dictionary(&rtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let builder = tokenizer_builder(
|
||||
stop_words.as_ref(),
|
||||
allowed_separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
);
|
||||
let tokenizer = builder.into_tokenizer();
|
||||
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
||||
let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
|
||||
let localized_attributes_rules =
|
||||
index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tokenizer,
|
||||
attribute_to_extract: attributes_to_extract.as_deref(),
|
||||
attribute_to_skip: attributes_to_skip.as_slice(),
|
||||
localized_attributes_rules: &localized_attributes_rules,
|
||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
let datastore = ThreadLocal::new();
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
|
||||
let extractor = WordDocidsExtractorData {
|
||||
tokenizer: &document_tokenizer,
|
||||
grenad_parameters,
|
||||
buckets: rayon::current_num_threads(),
|
||||
};
|
||||
|
||||
for_each_document_change(
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
&datastore,
|
||||
)?;
|
||||
}
|
||||
|
||||
let mut merger = WordDocidsCaches::new();
|
||||
for cache in datastore.into_iter().flat_map(RefCell::into_inner) {
|
||||
merger.push(cache)?;
|
||||
}
|
||||
|
||||
Ok(merger)
|
||||
}
|
||||
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
let index = &context.index;
|
||||
let rtxn = &context.txn;
|
||||
let mut cached_sorter_ref = context.data.borrow_mut_or_yield();
|
||||
let cached_sorter = cached_sorter_ref.as_mut().unwrap();
|
||||
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
|
||||
let new_fields_ids_map = new_fields_ids_map.deref_mut();
|
||||
let doc_alloc = &context.doc_alloc;
|
||||
|
||||
let exact_attributes = index.exact_attributes(rtxn)?;
|
||||
let is_exact_attribute =
|
||||
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter.insert_del_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Update(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter.insert_del_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter.insert_add_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.new(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Insertion(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter.insert_add_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.new(),
|
||||
new_fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
let buffer_size = size_of::<FieldId>();
|
||||
let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc);
|
||||
cached_sorter.flush_fid_word_count(&mut buffer)
|
||||
}
|
||||
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,178 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::VecDeque;
|
||||
use std::rc::Rc;
|
||||
|
||||
use heed::RoTxn;
|
||||
|
||||
use super::tokenize_document::DocumentTokenizer;
|
||||
use super::SearchableExtractor;
|
||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||
use crate::update::new::document::Document;
|
||||
use crate::update::new::extract::cache::BalancedCaches;
|
||||
use crate::update::new::indexer::document_changes::{DocumentChangeContext, RefCellExt};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
|
||||
|
||||
pub struct WordPairProximityDocidsExtractor;
|
||||
|
||||
impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
// This method extracts the word pairs in proximity from both the current and the
|
||||
// updated versions of the document, and stores the corresponding del/add docids
|
||||
// under `(proximity, word1, word2)` keys.
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<BalancedCaches>>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
let doc_alloc = &context.doc_alloc;
|
||||
|
||||
let index = context.index;
|
||||
let rtxn = &context.txn;
|
||||
|
||||
let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||
let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||
let mut add_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||
|
||||
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
|
||||
let new_fields_ids_map = &mut *new_fields_ids_map;
|
||||
|
||||
let mut cached_sorter = context.data.borrow_mut_or_yield();
|
||||
let cached_sorter = &mut *cached_sorter;
|
||||
|
||||
// this is a VecDeque and will stay small, so it can remain on the heap for now
|
||||
let mut word_positions: VecDeque<(Rc<str>, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
|
||||
let docid = document_change.docid();
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let document = inner.current(rtxn, index, context.db_fields_ids_map)?;
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
new_fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
del_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Update(inner) => {
|
||||
let document = inner.current(rtxn, index, context.db_fields_ids_map)?;
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
new_fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
del_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
let document = inner.merged(rtxn, index, context.db_fields_ids_map)?;
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
new_fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
add_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Insertion(inner) => {
|
||||
let document = inner.inserted();
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
new_fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
add_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
del_word_pair_proximity.sort_unstable();
|
||||
del_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
|
||||
for ((w1, w2), prox) in del_word_pair_proximity.iter() {
|
||||
let key = build_key(*prox, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_del_u32(key, docid)?;
|
||||
}
|
||||
|
||||
add_word_pair_proximity.sort_unstable();
|
||||
add_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
|
||||
for ((w1, w2), prox) in add_word_pair_proximity.iter() {
|
||||
let key = build_key(*prox, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_add_u32(key, docid)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn build_key<'a>(
|
||||
prox: u8,
|
||||
w1: &str,
|
||||
w2: &str,
|
||||
key_buffer: &'a mut bumpalo::collections::Vec<u8>,
|
||||
) -> &'a [u8] {
|
||||
key_buffer.clear();
|
||||
key_buffer.push(prox);
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
key_buffer.as_slice()
|
||||
}
|
||||
|
||||
fn word_positions_into_word_pair_proximity(
|
||||
word_positions: &mut VecDeque<(Rc<str>, u16)>,
|
||||
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
|
||||
) {
|
||||
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||
for (word, position) in word_positions.iter() {
|
||||
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||
word_pair_proximity((head_word.clone(), word.clone()), prox);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn process_document_tokens<'doc>(
|
||||
document: impl Document<'doc>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
word_positions: &mut VecDeque<(Rc<str>, u16)>,
|
||||
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
|
||||
) -> Result<()> {
|
||||
let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while word_positions
|
||||
.front()
|
||||
.map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
|
||||
{
|
||||
word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
word_positions.push_back((Rc::from(word), pos));
|
||||
Ok(())
|
||||
};
|
||||
document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
|
||||
|
||||
while !word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
163
crates/milli/src/update/new/extract/searchable/mod.rs
Normal file
@@ -0,0 +1,163 @@
|
||||
mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod tokenize_document;
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use bumpalo::Bump;
|
||||
pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
|
||||
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
|
||||
use heed::RoTxn;
|
||||
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||
|
||||
use super::cache::BalancedCaches;
|
||||
use super::DocidsExtractor;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext,
|
||||
Progress, ThreadLocal,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
|
||||
pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
|
||||
tokenizer: &'a DocumentTokenizer<'a>,
|
||||
grenad_parameters: GrenadParameters,
|
||||
buckets: usize,
|
||||
_ex: PhantomData<EX>,
|
||||
}
|
||||
|
||||
impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
|
||||
for SearchableExtractorData<'a, EX>
|
||||
{
|
||||
type Data = RefCell<BalancedCaches<'extractor>>;
|
||||
|
||||
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(BalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.grenad_parameters.max_memory,
|
||||
extractor_alloc,
|
||||
)))
|
||||
}
|
||||
|
||||
fn process<'doc>(
|
||||
&self,
|
||||
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
for change in changes {
|
||||
let change = change?;
|
||||
EX::extract_document_change(context, self.tokenizer, change)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub trait SearchableExtractor: Sized + Sync {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_steps: u16,
|
||||
total_steps: u16,
|
||||
step_name: &'static str,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
let rtxn = indexing_context.index.read_txn()?;
|
||||
let stop_words = indexing_context.index.stop_words(&rtxn)?;
|
||||
let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
|
||||
let allowed_separators: Option<Vec<_>> =
|
||||
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary = indexing_context.index.dictionary(&rtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let builder = tokenizer_builder(
|
||||
stop_words.as_ref(),
|
||||
allowed_separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
);
|
||||
let tokenizer = builder.into_tokenizer();
|
||||
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
|
||||
let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
|
||||
let localized_attributes_rules =
|
||||
indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tokenizer,
|
||||
attribute_to_extract: attributes_to_extract.as_deref(),
|
||||
attribute_to_skip: attributes_to_skip.as_slice(),
|
||||
localized_attributes_rules: &localized_attributes_rules,
|
||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
|
||||
tokenizer: &document_tokenizer,
|
||||
grenad_parameters,
|
||||
buckets: rayon::current_num_threads(),
|
||||
_ex: PhantomData,
|
||||
};
|
||||
|
||||
let datastore = ThreadLocal::new();
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
extract(
|
||||
document_changes,
|
||||
&extractor_data,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
&datastore,
|
||||
finished_steps,
|
||||
total_steps,
|
||||
step_name,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(datastore.into_iter().map(RefCell::into_inner).collect())
|
||||
}
|
||||
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<BalancedCaches>>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()>;
|
||||
|
||||
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
|
||||
-> Result<Option<Vec<&'a str>>>;
|
||||
|
||||
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
|
||||
}
|
||||
|
||||
impl<T: SearchableExtractor> DocidsExtractor for T {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_steps: u16,
|
||||
total_steps: u16,
|
||||
step_name: &'static str,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
Self::run_extraction(
|
||||
grenad_parameters,
|
||||
document_changes,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
finished_steps,
|
||||
total_steps,
|
||||
step_name,
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,275 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::update::new::document::Document;
|
||||
use crate::update::new::extract::perm_json_p::{
|
||||
seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
|
||||
};
|
||||
use crate::{
|
||||
FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
|
||||
MAX_WORD_LENGTH,
|
||||
};
|
||||
|
||||
pub struct DocumentTokenizer<'a> {
|
||||
pub tokenizer: &'a Tokenizer<'a>,
|
||||
pub attribute_to_extract: Option<&'a [&'a str]>,
|
||||
pub attribute_to_skip: &'a [&'a str],
|
||||
pub localized_attributes_rules: &'a [LocalizedAttributesRule],
|
||||
pub max_positions_per_attributes: u32,
|
||||
}
|
||||
|
||||
impl<'a> DocumentTokenizer<'a> {
|
||||
pub fn tokenize_document<'doc>(
|
||||
&self,
|
||||
document: impl Document<'doc>,
|
||||
field_id_map: &mut GlobalFieldsIdsMap,
|
||||
token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
let mut field_position = HashMap::new();
|
||||
|
||||
for entry in document.iter_top_level_fields() {
|
||||
let (field_name, value) = entry?;
|
||||
|
||||
let mut tokenize_field = |name: &str, value: &Value| {
|
||||
let Some(field_id) = field_id_map.id_or_insert(name) else {
|
||||
return Err(UserError::AttributeLimitReached.into());
|
||||
};
|
||||
|
||||
let position = field_position
|
||||
.entry(field_id)
|
||||
.and_modify(|counter| *counter += MAX_DISTANCE)
|
||||
.or_insert(0);
|
||||
if *position >= self.max_positions_per_attributes {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
match value {
|
||||
Value::Number(n) => {
|
||||
let token = n.to_string();
|
||||
if let Ok(position) = (*position).try_into() {
|
||||
token_fn(name, field_id, position, token.as_str())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Value::String(text) => {
|
||||
// create an iterator of tokens with their positions.
|
||||
let locales = self
|
||||
.localized_attributes_rules
|
||||
.iter()
|
||||
.find(|rule| rule.match_str(field_name))
|
||||
.map(|rule| rule.locales());
|
||||
let tokens = process_tokens(
|
||||
*position,
|
||||
self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
|
||||
)
|
||||
.take_while(|(p, _)| *p < self.max_positions_per_attributes);
|
||||
|
||||
for (index, token) in tokens {
|
||||
// keep a word only if it is not empty and fits in an LMDB key.
|
||||
let token = token.lemma().trim();
|
||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||
*position = index;
|
||||
if let Ok(position) = (*position).try_into() {
|
||||
token_fn(name, field_id, position, token)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
_ => Ok(()),
|
||||
}
|
||||
};
|
||||
|
||||
// if the current field is searchable or contains a searchable attribute
|
||||
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) {
|
||||
// parse json.
|
||||
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(object) => seek_leaf_values_in_object(
|
||||
&object,
|
||||
self.attribute_to_extract,
|
||||
self.attribute_to_skip,
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
Value::Array(array) => seek_leaf_values_in_array(
|
||||
&array,
|
||||
self.attribute_to_extract,
|
||||
self.attribute_to_skip,
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
value => tokenize_field(field_name, &value)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Takes an iterator over tokens and computes their relative positions depending on separator kinds:
|
||||
/// if it's a `Hard` separator we add an additional relative proximity of MAX_DISTANCE between words,
|
||||
/// else we keep the standard proximity of 1 between words.
|
||||
fn process_tokens<'a>(
|
||||
start_offset: u32,
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (u32, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator())
|
||||
.scan((start_offset, None), |(offset, prev_kind), mut token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => MAX_DISTANCE,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => token.kind = TokenKind::Unknown,
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
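// Illustrative sketch (added for this review, not part of the original diff) of the
// position rule documented above, using a stand-in token type instead of charabia:
// a word following a hard separator jumps by MAX_DISTANCE, otherwise it advances by 1.
#[cfg(test)]
mod position_rule_illustration {
    use crate::proximity::MAX_DISTANCE;

    enum Tok<'a> {
        Word(&'a str),
        SoftSep,
        HardSep,
    }

    fn positions<'a>(tokens: &[Tok<'a>]) -> Vec<(u32, &'a str)> {
        let mut out = Vec::new();
        let mut offset = 0;
        let mut prev_was_hard = false;
        let mut seen_word = false;
        for token in tokens {
            match token {
                Tok::Word(word) => {
                    if seen_word {
                        offset += if prev_was_hard { MAX_DISTANCE } else { 1 };
                    }
                    out.push((offset, *word));
                    seen_word = true;
                    prev_was_hard = false;
                }
                Tok::HardSep => prev_was_hard = true,
                // a soft separator never downgrades a pending hard separator
                Tok::SoftSep => {}
            }
        }
        out
    }

    #[test]
    fn hard_separators_bump_positions() {
        use Tok::*;
        let tokens = [Word("the"), SoftSep, Word("quick"), HardSep, Word("fox")];
        // "quick" sits 1 after "the"; "fox" jumps by MAX_DISTANCE because of the hard separator
        assert_eq!(positions(&tokens), vec![(0, "the"), (1, "quick"), (1 + MAX_DISTANCE, "fox")]);
    }
}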
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
pub fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<&'a [u8]>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
) -> TokenizerBuilder<'a, &'a [u8]> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
tokenizer_builder.words_dict(dictionary);
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
|
||||
tokenizer_builder
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use bumpalo::Bump;
|
||||
use charabia::TokenizerBuilder;
|
||||
use meili_snap::snapshot;
|
||||
use raw_collections::RawMap;
|
||||
use serde_json::json;
|
||||
use serde_json::value::RawValue;
|
||||
|
||||
use super::*;
|
||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||
use crate::update::new::document::{DocumentFromVersions, Versions};
|
||||
use crate::FieldsIdsMap;
|
||||
|
||||
#[test]
|
||||
fn test_tokenize_document() {
|
||||
let mut fields_ids_map = FieldsIdsMap::new();
|
||||
|
||||
let document = json!({
|
||||
"doggo": { "name": "doggo",
|
||||
"age": 10,},
|
||||
"catto": {
|
||||
"catto": {
|
||||
"name": "pesti",
|
||||
"age": 23,
|
||||
}
|
||||
},
|
||||
"doggo.name": ["doggo", "catto"],
|
||||
"not-me": "UNSEARCHABLE",
|
||||
"me-nether": {"nope": "unsearchable"}
|
||||
});
|
||||
|
||||
let _field_1_id = fields_ids_map.insert("doggo").unwrap();
|
||||
let _field_2_id = fields_ids_map.insert("catto").unwrap();
|
||||
let _field_3_id = fields_ids_map.insert("doggo.name").unwrap();
|
||||
let _field_4_id = fields_ids_map.insert("not-me").unwrap();
|
||||
let _field_5_id = fields_ids_map.insert("me-nether").unwrap();
|
||||
|
||||
let mut tb = TokenizerBuilder::default();
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tb.build(),
|
||||
attribute_to_extract: None,
|
||||
attribute_to_skip: &["not-me", "me-nether.nope"],
|
||||
localized_attributes_rules: &[],
|
||||
max_positions_per_attributes: 1000,
|
||||
};
|
||||
|
||||
let fields_ids_map = FieldIdMapWithMetadata::new(
|
||||
fields_ids_map,
|
||||
MetadataBuilder::new(Default::default(), Default::default(), Default::default(), None),
|
||||
);
|
||||
|
||||
let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map);
|
||||
let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
|
||||
|
||||
let mut words = std::collections::BTreeMap::new();
|
||||
|
||||
let document = document.to_string();
|
||||
|
||||
let bump = Bump::new();
|
||||
let document: &RawValue = serde_json::from_str(&document).unwrap();
|
||||
let document = RawMap::from_raw_value(document, &bump).unwrap();
|
||||
|
||||
let document = Versions::single(document);
|
||||
let document = DocumentFromVersions::new(&document);
|
||||
|
||||
document_tokenizer
|
||||
.tokenize_document(
|
||||
document,
|
||||
&mut global_fields_ids_map,
|
||||
&mut |_fname, fid, pos, word| {
|
||||
words.insert([fid, pos], word.to_string());
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
snapshot!(format!("{:#?}", words), @r###"
|
||||
{
|
||||
[
|
||||
2,
|
||||
0,
|
||||
]: "doggo",
|
||||
[
|
||||
2,
|
||||
8,
|
||||
]: "doggo",
|
||||
[
|
||||
2,
|
||||
16,
|
||||
]: "catto",
|
||||
[
|
||||
3,
|
||||
0,
|
||||
]: "10",
|
||||
[
|
||||
4,
|
||||
0,
|
||||
]: "pesti",
|
||||
[
|
||||
5,
|
||||
0,
|
||||
]: "23",
|
||||
}
|
||||
"###);
|
||||
}
|
||||
}
|
||||
432
crates/milli/src/update/new/extract/vectors/mod.rs
Normal file
@@ -0,0 +1,432 @@
|
||||
use std::cell::RefCell;
|
||||
|
||||
use bumpalo::collections::Vec as BVec;
|
||||
use bumpalo::Bump;
|
||||
use hashbrown::HashMap;
|
||||
|
||||
use super::cache::DelAddRoaringBitmap;
|
||||
use crate::error::FaultSource;
|
||||
use crate::prompt::Prompt;
|
||||
use crate::update::new::channel::EmbeddingSender;
|
||||
use crate::update::new::indexer::document_changes::{Extractor, FullySend};
|
||||
use crate::update::new::vector_document::VectorDocument;
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::vector::error::{
|
||||
EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistributionBump,
|
||||
};
|
||||
use crate::vector::{Embedder, Embedding, EmbeddingConfigs};
|
||||
use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError};
|
||||
|
||||
pub struct EmbeddingExtractor<'a> {
|
||||
embedders: &'a EmbeddingConfigs,
|
||||
sender: &'a EmbeddingSender<'a>,
|
||||
possible_embedding_mistakes: PossibleEmbeddingMistakes,
|
||||
threads: &'a ThreadPoolNoAbort,
|
||||
}
|
||||
|
||||
impl<'a> EmbeddingExtractor<'a> {
|
||||
pub fn new(
|
||||
embedders: &'a EmbeddingConfigs,
|
||||
sender: &'a EmbeddingSender<'a>,
|
||||
field_distribution: &'a FieldDistribution,
|
||||
threads: &'a ThreadPoolNoAbort,
|
||||
) -> Self {
|
||||
let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution);
|
||||
Self { embedders, sender, threads, possible_embedding_mistakes }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
||||
type Data = FullySend<RefCell<HashMap<String, DelAddRoaringBitmap>>>;
|
||||
|
||||
fn init_data<'doc>(
|
||||
&'doc self,
|
||||
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
|
||||
) -> crate::Result<Self::Data> {
|
||||
// TODO: use the extractor_alloc in the hashbrown once you merge the branch where it is no longer a RefBump
|
||||
Ok(FullySend(Default::default()))
|
||||
}
|
||||
|
||||
fn process<'doc>(
|
||||
&'doc self,
|
||||
changes: impl Iterator<Item = crate::Result<DocumentChange<'doc>>>,
|
||||
context: &'doc crate::update::new::indexer::document_changes::DocumentChangeContext<
|
||||
Self::Data,
|
||||
>,
|
||||
) -> crate::Result<()> {
|
||||
let embedders = self.embedders.inner_as_ref();
|
||||
let mut unused_vectors_distribution =
|
||||
UnusedVectorsDistributionBump::new_in(&context.doc_alloc);
|
||||
|
||||
let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc);
|
||||
for (embedder_name, (embedder, prompt, _is_quantized)) in embedders {
|
||||
let embedder_id =
|
||||
context.index.embedder_category_id.get(&context.txn, embedder_name)?.ok_or_else(
|
||||
|| InternalError::DatabaseMissingEntry {
|
||||
db_name: "embedder_category_id",
|
||||
key: None,
|
||||
},
|
||||
)?;
|
||||
all_chunks.push(Chunks::new(
|
||||
embedder,
|
||||
embedder_id,
|
||||
embedder_name,
|
||||
prompt,
|
||||
&context.data.0,
|
||||
&self.possible_embedding_mistakes,
|
||||
self.threads,
|
||||
self.sender,
|
||||
&context.doc_alloc,
|
||||
))
|
||||
}
|
||||
|
||||
for change in changes {
|
||||
let change = change?;
|
||||
match change {
|
||||
DocumentChange::Deletion(_deletion) => {
|
||||
// handled by document sender
|
||||
}
|
||||
DocumentChange::Update(update) => {
|
||||
let old_vectors = update.current_vectors(
|
||||
&context.txn,
|
||||
context.index,
|
||||
context.db_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
let new_vectors = update.updated_vectors(&context.doc_alloc, self.embedders)?;
|
||||
|
||||
if let Some(new_vectors) = &new_vectors {
|
||||
unused_vectors_distribution.append(new_vectors);
|
||||
}
|
||||
|
||||
for chunks in &mut all_chunks {
|
||||
let embedder_name = chunks.embedder_name();
|
||||
let prompt = chunks.prompt();
|
||||
|
||||
let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap();
|
||||
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
|
||||
new_vectors.vectors_for_key(embedder_name).transpose()
|
||||
}) {
|
||||
let new_vectors = new_vectors?;
|
||||
match (old_vectors.regenerate, new_vectors.regenerate) {
|
||||
(true, true) | (false, false) => todo!(),
|
||||
_ => {
|
||||
chunks.set_regenerate(update.docid(), new_vectors.regenerate);
|
||||
}
|
||||
}
|
||||
// do we have set embeddings?
|
||||
if let Some(embeddings) = new_vectors.embeddings {
|
||||
chunks.set_vectors(
|
||||
update.docid(),
|
||||
embeddings
|
||||
.into_vec(&context.doc_alloc, embedder_name)
|
||||
.map_err(|error| UserError::InvalidVectorsEmbedderConf {
|
||||
document_id: update.external_document_id().to_string(),
|
||||
error,
|
||||
})?,
|
||||
);
|
||||
} else if new_vectors.regenerate {
|
||||
let old_rendered = prompt.render_document(
|
||||
update.current(
|
||||
&context.txn,
|
||||
context.index,
|
||||
context.db_fields_ids_map,
|
||||
)?,
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
let new_rendered = prompt.render_document(
|
||||
update.merged(
|
||||
&context.txn,
|
||||
context.index,
|
||||
context.db_fields_ids_map,
|
||||
)?,
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
if new_rendered != old_rendered {
|
||||
chunks.set_autogenerated(
|
||||
update.docid(),
|
||||
new_rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
} else if old_vectors.regenerate {
|
||||
let old_rendered = prompt.render_document(
|
||||
update.current(
|
||||
&context.txn,
|
||||
context.index,
|
||||
context.db_fields_ids_map,
|
||||
)?,
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
let new_rendered = prompt.render_document(
|
||||
update.merged(
|
||||
&context.txn,
|
||||
context.index,
|
||||
context.db_fields_ids_map,
|
||||
)?,
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
if new_rendered != old_rendered {
|
||||
chunks.set_autogenerated(
|
||||
update.docid(),
|
||||
new_rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
DocumentChange::Insertion(insertion) => {
|
||||
let new_vectors =
|
||||
insertion.inserted_vectors(&context.doc_alloc, self.embedders)?;
|
||||
if let Some(new_vectors) = &new_vectors {
|
||||
unused_vectors_distribution.append(new_vectors);
|
||||
}
|
||||
|
||||
for chunks in &mut all_chunks {
|
||||
let embedder_name = chunks.embedder_name();
|
||||
let prompt = chunks.prompt();
|
||||
// if no inserted vectors, then regenerate: true + no embeddings => autogenerate
|
||||
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
|
||||
new_vectors.vectors_for_key(embedder_name).transpose()
|
||||
}) {
|
||||
let new_vectors = new_vectors?;
|
||||
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
|
||||
if let Some(embeddings) = new_vectors.embeddings {
|
||||
chunks.set_vectors(
|
||||
insertion.docid(),
|
||||
embeddings
|
||||
.into_vec(&context.doc_alloc, embedder_name)
|
||||
.map_err(|error| UserError::InvalidVectorsEmbedderConf {
|
||||
document_id: insertion
|
||||
.external_document_id()
|
||||
.to_string(),
|
||||
error,
|
||||
})?,
|
||||
);
|
||||
} else if new_vectors.regenerate {
|
||||
let rendered = prompt.render_document(
|
||||
insertion.inserted(),
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
chunks.set_autogenerated(
|
||||
insertion.docid(),
|
||||
rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
}
|
||||
} else {
|
||||
let rendered = prompt.render_document(
|
||||
insertion.inserted(),
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
chunks.set_autogenerated(
|
||||
insertion.docid(),
|
||||
rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for chunk in all_chunks {
|
||||
chunk.drain(&unused_vectors_distribution)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// **Warning**: the destructor of this struct is not normally run, make sure that all its fields:
|
||||
// 1. don't have side effects tied to their destructors
|
||||
// 2. if allocated, are allocated inside of the bumpalo
|
||||
//
|
||||
// Currently this is the case as:
|
||||
// 1. the BVecs are allocated inside the bumpalo
|
||||
// 2. All other fields are either trivial (u8) or references.
|
struct Chunks<'a> {
    texts: BVec<'a, &'a str>,
    ids: BVec<'a, DocumentId>,

    embedder: &'a Embedder,
    embedder_id: u8,
    embedder_name: &'a str,
    prompt: &'a Prompt,
    possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
    user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
    threads: &'a ThreadPoolNoAbort,
    sender: &'a EmbeddingSender<'a>,
}

impl<'a> Chunks<'a> {
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        embedder: &'a Embedder,
        embedder_id: u8,
        embedder_name: &'a str,
        prompt: &'a Prompt,
        user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
        possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
        threads: &'a ThreadPoolNoAbort,
        sender: &'a EmbeddingSender<'a>,
        doc_alloc: &'a Bump,
    ) -> Self {
        let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
        let texts = BVec::with_capacity_in(capacity, doc_alloc);
        let ids = BVec::with_capacity_in(capacity, doc_alloc);
        Self {
            texts,
            ids,
            embedder,
            prompt,
            possible_embedding_mistakes,
            threads,
            sender,
            embedder_id,
            embedder_name,
            user_provided,
        }
    }

    pub fn set_autogenerated(
        &mut self,
        docid: DocumentId,
        rendered: &'a str,
        unused_vectors_distribution: &UnusedVectorsDistributionBump,
    ) -> Result<()> {
        if self.texts.len() < self.texts.capacity() {
            self.texts.push(rendered);
            self.ids.push(docid);
            return Ok(());
        }

        Self::embed_chunks(
            &mut self.texts,
            &mut self.ids,
            self.embedder,
            self.embedder_id,
            self.embedder_name,
            self.possible_embedding_mistakes,
            unused_vectors_distribution,
            self.threads,
            self.sender,
        )
    }

    pub fn drain(
        mut self,
        unused_vectors_distribution: &UnusedVectorsDistributionBump,
    ) -> Result<()> {
        let res = Self::embed_chunks(
            &mut self.texts,
            &mut self.ids,
            self.embedder,
            self.embedder_id,
            self.embedder_name,
            self.possible_embedding_mistakes,
            unused_vectors_distribution,
            self.threads,
            self.sender,
        );
        // optimization: don't run bvec dtors as they only contain bumpalo allocated stuff
        std::mem::forget(self);
        res
    }

    pub fn embed_chunks(
        texts: &mut BVec<'a, &'a str>,
        ids: &mut BVec<'a, DocumentId>,
        embedder: &Embedder,
        embedder_id: u8,
        embedder_name: &str,
        possible_embedding_mistakes: &PossibleEmbeddingMistakes,
        unused_vectors_distribution: &UnusedVectorsDistributionBump,
        threads: &ThreadPoolNoAbort,
        sender: &EmbeddingSender<'a>,
    ) -> Result<()> {
        let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) {
            Ok(embeddings) => {
                for (docid, embedding) in ids.into_iter().zip(embeddings) {
                    sender.set_vector(*docid, embedder_id, embedding).unwrap();
                }
                Ok(())
            }
            Err(error) => {
                if let FaultSource::Bug = error.fault {
                    Err(crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(
                        error.into(),
                    )))
                } else {
                    let mut msg = format!(
                        r"While embedding documents for embedder `{embedder_name}`: {error}"
                    );

                    if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
                        msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
                    }

                    let mut hint_count = 0;

                    for (vector_misspelling, count) in
                        possible_embedding_mistakes.vector_mistakes().take(2)
                    {
                        msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
                        hint_count += 1;
                    }

                    for (embedder_misspelling, count) in possible_embedding_mistakes
                        .embedder_mistakes_bump(embedder_name, unused_vectors_distribution)
                        .take(2)
                    {
                        msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
                        hint_count += 1;
                    }

                    if hint_count == 0 {
                        if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
                            msg += &format!(
                                "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
                            );
                        }
                    }

                    Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
                }
            }
        };
        texts.clear();
        ids.clear();
        res
    }

    pub fn prompt(&self) -> &'a Prompt {
        self.prompt
    }

    pub fn embedder_name(&self) -> &'a str {
        self.embedder_name
    }

    fn set_regenerate(&self, docid: DocumentId, regenerate: bool) {
        let mut user_provided = self.user_provided.borrow_mut();
        let user_provided = user_provided.entry_ref(self.embedder_name).or_default();
        if regenerate {
            // regenerate == !user_provided
            user_provided.del.get_or_insert(Default::default()).insert(docid);
        } else {
            user_provided.add.get_or_insert(Default::default()).insert(docid);
        }
    }

    fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
        self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
    }
}
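Aside: the `Chunks` type above buffers prompts up to an embedder-provided capacity, flushes a full batch through `embed_chunks`, and uses `std::mem::forget` in `drain` because its buffers live in the bumpalo arena. The following minimal sketch shows the same buffer-until-full-then-flush pattern; the `Batcher` type and its use of plain `std` vectors instead of `BVec`/`Bump` are hypothetical and not part of the diff.

// Hypothetical standalone sketch of the buffering pattern; not part of the meilisearch diff above.
struct Batcher {
    texts: Vec<String>,
    ids: Vec<u32>,
    capacity: usize,
}

impl Batcher {
    fn new(capacity: usize) -> Self {
        Self { texts: Vec::with_capacity(capacity), ids: Vec::with_capacity(capacity), capacity }
    }

    /// Buffer one item; flush the whole batch only once the buffer is full,
    /// mirroring the role of `Chunks::set_autogenerated`.
    fn push(&mut self, id: u32, text: &str, flush: &mut impl FnMut(&[u32], &[String])) {
        if self.texts.len() == self.capacity {
            flush(&self.ids, &self.texts);
            self.ids.clear();
            self.texts.clear();
        }
        self.ids.push(id);
        self.texts.push(text.to_owned());
    }

    /// Flush whatever is left, mirroring the role of `Chunks::drain`.
    fn drain(self, flush: &mut impl FnMut(&[u32], &[String])) {
        if !self.ids.is_empty() {
            flush(&self.ids, &self.texts);
        }
    }
}

fn main() {
    let mut embedded = Vec::new();
    let mut flush = |ids: &[u32], texts: &[String]| {
        // A real implementation would call the embedder here and send the
        // resulting vectors to the writer channel.
        embedded.extend(ids.iter().copied().zip(texts.iter().cloned()));
    };

    let mut batcher = Batcher::new(2);
    for (id, text) in [(0, "a"), (1, "b"), (2, "c")] {
        batcher.push(id, text, &mut flush);
    }
    batcher.drain(&mut flush);
    assert_eq!(embedded.len(), 3);
}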
264
crates/milli/src/update/new/facet_search_builder.rs
Normal file
@@ -0,0 +1,264 @@
use std::collections::{BTreeSet, HashMap};

use charabia::normalizer::NormalizerOption;
use charabia::{Language, Normalize, StrDetection, Token};
use grenad::Sorter;
use heed::types::{Bytes, SerdeJson};
use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn};

use super::extract::FacetKind;
use super::fst_merger_builder::FstMergerBuilder;
use super::KvReaderDelAdd;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::StrRefCodec;
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::{create_sorter, MergeDeladdBtreesetString};
use crate::{
    BEU16StrCodec, FieldId, GlobalFieldsIdsMap, Index, LocalizedAttributesRule, Result,
    MAX_FACET_VALUE_LENGTH,
};

pub struct FacetSearchBuilder<'indexer> {
    registered_facets: HashMap<FieldId, usize>,
    normalized_facet_string_docids_sorter: Sorter<MergeDeladdBtreesetString>,
    global_fields_ids_map: GlobalFieldsIdsMap<'indexer>,
    localized_attributes_rules: Vec<LocalizedAttributesRule>,
    // Buffered data below
    buffer: Vec<u8>,
    localized_field_ids: HashMap<FieldId, Option<Vec<Language>>>,
}

impl<'indexer> FacetSearchBuilder<'indexer> {
    pub fn new(
        global_fields_ids_map: GlobalFieldsIdsMap<'indexer>,
        localized_attributes_rules: Vec<LocalizedAttributesRule>,
    ) -> Self {
        let registered_facets = HashMap::new();
        let normalized_facet_string_docids_sorter = create_sorter(
            grenad::SortAlgorithm::Stable,
            MergeDeladdBtreesetString,
            grenad::CompressionType::None,
            None,
            None,
            Some(0),
            true,
        );

        Self {
            registered_facets,
            normalized_facet_string_docids_sorter,
            buffer: Vec::new(),
            global_fields_ids_map,
            localized_attributes_rules,
            localized_field_ids: HashMap::new(),
        }
    }

    fn extract_key_data<'k>(&self, key: &'k [u8]) -> Result<Option<FacetGroupKey<&'k str>>> {
        match FacetKind::from(key[0]) {
            // Only strings are searchable
            FacetKind::String => Ok(Some(
                FacetGroupKeyCodec::<StrRefCodec>::bytes_decode(&key[1..])
                    .map_err(heed::Error::Encoding)?,
            )),
            _ => Ok(None),
        }
    }

    pub fn register_from_key(&mut self, deladd: DelAdd, facet_key: &[u8]) -> Result<()> {
        let Some(FacetGroupKey { field_id, level: _level, left_bound }) =
            self.extract_key_data(facet_key)?
        else {
            return Ok(());
        };

        if deladd == DelAdd::Addition {
            self.registered_facets.entry(field_id).and_modify(|count| *count += 1).or_insert(1);
        }

        let locales = self.locales(field_id);
        let hyper_normalized_value = normalize_facet_string(left_bound, locales);

        let set = BTreeSet::from_iter(std::iter::once(left_bound));

        // as the facet string is the same, we can put the deletion and addition in the same obkv.
        self.buffer.clear();
        let mut obkv = KvWriterDelAdd::new(&mut self.buffer);
        let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
        obkv.insert(deladd, val)?;
        obkv.finish()?;

        let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
        let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
        self.normalized_facet_string_docids_sorter.insert(key_bytes, &self.buffer)?;

        Ok(())
    }

    fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> {
        if !self.localized_field_ids.contains_key(&field_id) {
            let Some(field_name) = self.global_fields_ids_map.name(field_id) else {
                unreachable!("Field id {} not found in the global fields ids map", field_id);
            };

            let locales = self
                .localized_attributes_rules
                .iter()
                .find(|rule| rule.match_str(field_name))
                .map(|rule| rule.locales.clone());

            self.localized_field_ids.insert(field_id, locales);
        }

        self.localized_field_ids.get(&field_id).unwrap().as_deref()
    }

    #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")]
    pub fn merge_and_write(self, index: &Index, wtxn: &mut RwTxn, rtxn: &RoTxn) -> Result<()> {
        let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?;
        let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString);
        builder.extend(reader);

        let database = index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>();

        let mut merger_iter = builder.build().into_stream_merger_iter()?;
        let mut current_field_id = None;
        let mut fst;
        let mut fst_merger_builder: Option<FstMergerBuilder> = None;
        while let Some((key, deladd)) = merger_iter.next()? {
            let (field_id, normalized_facet_string) =
                BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?;

            if current_field_id != Some(field_id) {
                if let Some(fst_merger_builder) = fst_merger_builder {
                    let mmap = fst_merger_builder.build(&mut callback)?;
                    index
                        .facet_id_string_fst
                        .remap_data_type::<Bytes>()
                        .put(wtxn, &field_id, &mmap)?;
                }

                fst = index.facet_id_string_fst.get(rtxn, &field_id)?;
                fst_merger_builder = Some(FstMergerBuilder::new(fst.as_ref())?);
                current_field_id = Some(field_id);
            }

            let previous = database.get(rtxn, key)?;
            let deladd: &KvReaderDelAdd = deladd.into();
            let del = deladd.get(DelAdd::Deletion);
            let add = deladd.get(DelAdd::Addition);

            match merge_btreesets(previous, del, add)? {
                Operation::Write(value) => {
                    match fst_merger_builder.as_mut() {
                        Some(fst_merger_builder) => {
                            fst_merger_builder.register(
                                DelAdd::Addition,
                                normalized_facet_string.as_bytes(),
                                &mut callback,
                            )?;
                        }
                        None => unreachable!(),
                    }
                    let key = (field_id, normalized_facet_string);
                    let key_bytes =
                        BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
                    database.put(wtxn, &key_bytes, &value)?;
                }
                Operation::Delete => {
                    match fst_merger_builder.as_mut() {
                        Some(fst_merger_builder) => {
                            fst_merger_builder.register(
                                DelAdd::Deletion,
                                normalized_facet_string.as_bytes(),
                                &mut callback,
                            )?;
                        }
                        None => unreachable!(),
                    }
                    let key = (field_id, normalized_facet_string);
                    let key_bytes =
                        BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
                    database.delete(wtxn, &key_bytes)?;
                }
                Operation::Ignore => (),
            }
        }

        if let (Some(field_id), Some(fst_merger_builder)) = (current_field_id, fst_merger_builder) {
            let mmap = fst_merger_builder.build(&mut callback)?;
            index.facet_id_string_fst.remap_data_type::<Bytes>().put(wtxn, &field_id, &mmap)?;
        }

        Ok(())
    }
}

fn callback(_bytes: &[u8], _deladd: DelAdd, _is_modified: bool) -> Result<()> {
    Ok(())
}

fn merge_btreesets(
    current: Option<&[u8]>,
    del: Option<&[u8]>,
    add: Option<&[u8]>,
) -> Result<Operation> {
    let mut result: BTreeSet<String> = match current {
        Some(current) => SerdeJson::bytes_decode(current).map_err(heed::Error::Encoding)?,
        None => BTreeSet::new(),
    };
    if let Some(del) = del {
        let del: BTreeSet<String> = SerdeJson::bytes_decode(del).map_err(heed::Error::Encoding)?;
        result = result.difference(&del).cloned().collect();
    }
    if let Some(add) = add {
        let add: BTreeSet<String> = SerdeJson::bytes_decode(add).map_err(heed::Error::Encoding)?;
        result.extend(add);
    }

    /// TODO remove allocation
    let result = SerdeJson::bytes_encode(&result).map_err(heed::Error::Encoding)?.into_owned();
    if Some(result.as_ref()) == current {
        Ok(Operation::Ignore)
    } else if result.is_empty() {
        Ok(Operation::Delete)
    } else {
        Ok(Operation::Write(result))
    }
}

/// Normalizes the facet string and truncates it to the max length.
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
    let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
    let mut detection = StrDetection::new(facet_string, locales);

    let script = detection.script();
    // Detect the language of the facet string only if several locales are explicitly provided.
    let language = match locales {
        Some(&[language]) => Some(language),
        Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
        _ => None,
    };

    let token = Token {
        lemma: std::borrow::Cow::Borrowed(facet_string),
        script,
        language,
        ..Default::default()
    };

    // truncate the facet string to the max length
    token
        .normalize(&options)
        .lemma
        .char_indices()
        .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
        .map(|(_, c)| c)
        .collect()
}

enum Operation {
    Write(Vec<u8>),
    Delete,
    Ignore,
}
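Aside: `merge_btreesets` above applies the deletion set, then the addition set, to the previously stored set, and only rewrites the database entry when the result actually changes. Below is a minimal in-memory sketch of that del/add merge semantics; the `merge_sets`/`MergeOutcome` names are hypothetical, there is no heed encoding involved, and the block is not part of the diff.

// Hypothetical standalone sketch of the del/add set merge; not part of the meilisearch diff above.
use std::collections::BTreeSet;

/// Outcome of merging a del/add pair into the previously stored set
/// (plays the role of the `Operation` enum above).
#[derive(Debug, PartialEq)]
enum MergeOutcome {
    Write(BTreeSet<String>),
    Delete,
    Ignore,
}

fn merge_sets(
    current: Option<&BTreeSet<String>>,
    del: Option<&BTreeSet<String>>,
    add: Option<&BTreeSet<String>>,
) -> MergeOutcome {
    let mut result = current.cloned().unwrap_or_default();
    if let Some(del) = del {
        // Remove every deleted value first...
        result = result.difference(del).cloned().collect();
    }
    if let Some(add) = add {
        // ...then apply the additions.
        result.extend(add.iter().cloned());
    }

    if Some(&result) == current {
        MergeOutcome::Ignore
    } else if result.is_empty() {
        MergeOutcome::Delete
    } else {
        MergeOutcome::Write(result)
    }
}

fn main() {
    let current: BTreeSet<String> = ["Blue".to_string(), "blue".to_string()].into();
    let del: BTreeSet<String> = ["Blue".to_string()].into();
    let add: BTreeSet<String> = ["BLUE".to_string()].into();

    match merge_sets(Some(&current), Some(&del), Some(&add)) {
        MergeOutcome::Write(set) => assert_eq!(set.len(), 2), // "BLUE" and "blue"
        other => panic!("unexpected outcome: {other:?}"),
    }
}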
155
crates/milli/src/update/new/fst_merger_builder.rs
Normal file
@@ -0,0 +1,155 @@
use std::{fs::File, io::BufWriter};

use fst::{Set, SetBuilder, Streamer};
use memmap2::Mmap;
use tempfile::tempfile;

use crate::{update::del_add::DelAdd, InternalError, Result};

pub struct FstMergerBuilder<'a> {
    stream: Option<fst::set::Stream<'a>>,
    fst_builder: SetBuilder<BufWriter<File>>,
    last: Option<Vec<u8>>,
    inserted_words: usize,
}

impl<'a> FstMergerBuilder<'a> {
    pub fn new<D: AsRef<[u8]>>(fst: Option<&'a Set<D>>) -> Result<Self> {
        Ok(Self {
            stream: fst.map(|fst| fst.stream()),
            fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?,
            last: None,
            inserted_words: 0,
        })
    }

    pub fn register(
        &mut self,
        deladd: DelAdd,
        right: &[u8],
        insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
    ) -> Result<()> {
        if let Some(left) = self.last.take() {
            let (left_inserted, right_inserted) =
                self.compare_and_insert(deladd, left.as_slice(), right, insertion_callback)?;

            // left was not inserted, so we keep it for the next iteration
            if !left_inserted {
                self.last = Some(left);
            }

            // right was inserted, so we can stop
            if right_inserted {
                return Ok(());
            }
        }

        if let Some(mut stream) = self.stream.take() {
            while let Some(left) = stream.next() {
                let (left_inserted, right_inserted) =
                    self.compare_and_insert(deladd, left, right, insertion_callback)?;

                // left was not inserted, so we keep it for the next iteration
                if !left_inserted {
                    self.last = Some(left.to_vec());
                }

                // right was inserted, so we can stop
                if right_inserted {
                    self.stream = Some(stream);
                    return Ok(());
                }
            }
        }

        // If we reach this point, it means that the stream is empty
        // and we need to insert the incoming word
        self.insert(right, deladd, true, insertion_callback)?;

        Ok(())
    }

    fn compare_and_insert(
        &mut self,
        deladd: DelAdd,
        left: &[u8],
        right: &[u8],
        insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
    ) -> Result<(bool, bool)> {
        let mut left_inserted = false;
        let mut right_inserted = false;
        match left.cmp(right) {
            std::cmp::Ordering::Less => {
                // We need to insert the last word from the current fst
                self.insert(left, DelAdd::Addition, false, insertion_callback)?;

                left_inserted = true;
            }
            std::cmp::Ordering::Equal => {
                self.insert(right, deladd, true, insertion_callback)?;

                left_inserted = true;
                right_inserted = true;
            }
            std::cmp::Ordering::Greater => {
                self.insert(right, deladd, true, insertion_callback)?;

                right_inserted = true;
            }
        }

        Ok((left_inserted, right_inserted))
    }

    fn insert(
        &mut self,
        bytes: &[u8],
        deladd: DelAdd,
        is_modified: bool,
        insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
    ) -> Result<()> {
        // Addition: We insert the word
        // Deletion: We delete the word by not inserting it
        if deladd == DelAdd::Addition {
            self.inserted_words += 1;
            self.fst_builder.insert(bytes)?;
        }

        insertion_callback(bytes, deladd, is_modified)?;

        Ok(())
    }

    fn drain_stream(
        &mut self,
        insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
    ) -> Result<()> {
        if let Some(last) = self.last.take() {
            self.insert(last.as_slice(), DelAdd::Addition, false, insertion_callback)?;
        }

        if let Some(mut stream) = self.stream.take() {
            while let Some(current) = stream.next() {
                self.insert(current, DelAdd::Addition, false, insertion_callback)?;
            }
        }

        Ok(())
    }

    pub fn build(
        mut self,
        insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
    ) -> Result<Mmap> {
        self.drain_stream(insertion_callback)?;

        let fst_file = self
            .fst_builder
            .into_inner()?
            .into_inner()
            .map_err(|_| InternalError::IndexingMergingKeys { process: "building-fst" })?;
        let fst_mmap = unsafe { Mmap::map(&fst_file)? };

        Ok(fst_mmap)
    }
}
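Aside: `FstMergerBuilder` performs a two-way merge between the lexicographically sorted stream of the existing FST and incoming sorted keys, keeping additions and skipping deletions. The sketch below shows the same merge over plain sorted byte slices; the names are hypothetical, the `fst` crate is not used, and the block is not part of the diff.

// Hypothetical standalone sketch of the sorted two-way merge; not part of the meilisearch diff above.

/// Whether an incoming key is an addition or a deletion (stands in for `DelAdd`).
#[derive(Clone, Copy, PartialEq)]
enum Edit {
    Addition,
    Deletion,
}

/// Merge an existing sorted key list with sorted incoming edits:
/// untouched existing keys are kept, additions are inserted in order,
/// and deletions drop the matching existing key.
fn merge_sorted(existing: &[&[u8]], incoming: &[(Edit, &[u8])]) -> Vec<Vec<u8>> {
    let mut out = Vec::new();
    let mut left = existing.iter().peekable();

    for &(edit, right) in incoming {
        // Flush every existing key that sorts before the incoming one.
        while let Some(&&l) = left.peek() {
            if l < right {
                out.push(l.to_vec());
                left.next();
            } else {
                break;
            }
        }
        // An equal existing key is replaced by the incoming decision.
        if left.peek().map_or(false, |&&l| l == right) {
            left.next();
        }
        if edit == Edit::Addition {
            out.push(right.to_vec());
        }
    }

    // Drain whatever is left of the existing keys, as `drain_stream` does.
    out.extend(left.map(|l| l.to_vec()));
    out
}

fn main() {
    let existing: Vec<&[u8]> = vec![&b"apple"[..], &b"blue"[..], &b"cherry"[..]];
    let incoming: Vec<(Edit, &[u8])> =
        vec![(Edit::Deletion, &b"blue"[..]), (Edit::Addition, &b"mango"[..])];

    let merged = merge_sorted(&existing, &incoming);
    let merged: Vec<&[u8]> = merged.iter().map(|v| v.as_slice()).collect();
    assert_eq!(merged, vec![&b"apple"[..], &b"cherry"[..], &b"mango"[..]]);
}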
Some files were not shown because too many files have changed in this diff