Merge branch 'main' into indexer-edition-2024

2025-09-10 06:36:29 +00:00 · 2024-11-06 15:19:18 +01:00
parent a9ecbf0b64 6b67f9fc4c
commit 10feeb88f2
1122 changed files with 6265 additions and 5265 deletions
--- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
@ -0,0 +1,400 @@
+use std::cell::RefCell;
+use std::collections::HashMap;
+use std::mem::size_of;
+use std::ops::DerefMut as _;
+
+use bumpalo::collections::vec::Vec as BumpVec;
+use bumpalo::Bump;
+use heed::RoTxn;
+
+use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
+use crate::update::new::extract::cache::BalancedCaches;
+use crate::update::new::extract::perm_json_p::contained_in;
+use crate::update::new::indexer::document_changes::{
+    for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
+    IndexingContext, MostlySend, RefCellExt, ThreadLocal,
+};
+use crate::update::new::DocumentChange;
+use crate::update::GrenadParameters;
+use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
+
+/// Largest per-field word count that `flush_fid_word_count` records in
+/// `fid_word_count_docids`; fields holding more words than this are not recorded.
+const MAX_COUNTED_WORDS: usize = 30;
+
+/// Set of caches filled while extracting word-related docids.
+///
+/// Every token handed to `insert_add_u32` / `insert_del_u32` is recorded in the word,
+/// word+field and word+position caches. Per-field word counts are accumulated in
+/// `fid_word_count` and written to `fid_word_count_docids` by `flush_fid_word_count`.
+pub struct WordDocidsBalancedCaches<'extractor> {
+    /// Keyed by the word bytes, a `0` byte, then the big-endian field id.
+    word_fid_docids: BalancedCaches<'extractor>,
+    /// Keyed by the word bytes; used for tokens that are not flagged as exact.
+    word_docids: BalancedCaches<'extractor>,
+    /// Keyed by the word bytes; used for tokens flagged as exact.
+    exact_word_docids: BalancedCaches<'extractor>,
+    /// Keyed by the word bytes, a `0` byte, then the big-endian bucketed position.
+    word_position_docids: BalancedCaches<'extractor>,
+    /// Keyed by the big-endian field id followed by the word count as a single byte.
+    fid_word_count_docids: BalancedCaches<'extractor>,
+    /// Pending counts for the current document: field id -> (number of tokens recorded
+    /// through `insert_del_u32`, number of tokens recorded through `insert_add_u32`).
+    fid_word_count: HashMap<FieldId, (usize, usize)>,
+    /// Docid passed to the most recent insertion; `None` before the first one.
+    current_docid: Option<DocumentId>,
+}
+
+// NOTE(review): no SAFETY justification is given here. Presumably the caches are only
+// ever used by the thread that created them — confirm against the contract of `MostlySend`.
+unsafe impl<'extractor> MostlySend for WordDocidsBalancedCaches<'extractor> {}
+
+impl<'extractor> WordDocidsBalancedCaches<'extractor> {
+    /// Creates the five caches in `alloc`, forwarding `buckets` and `max_memory` to each.
+    ///
+    /// TODO Make sure to give the same max_memory to all of them, without splitting it
+    /// (every cache currently receives the full `max_memory` value).
+    pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
+        let new_cache = || BalancedCaches::new_in(buckets, max_memory, alloc);
+
+        Self {
+            word_fid_docids: new_cache(),
+            word_docids: new_cache(),
+            exact_word_docids: new_cache(),
+            word_position_docids: new_cache(),
+            fid_word_count_docids: new_cache(),
+            fid_word_count: HashMap::new(),
+            current_docid: None,
+        }
+    }
+
+    /// Records that `word` is present in document `docid`, at `position` of field `field_id`.
+    ///
+    /// The docid is added under the word key (exact or regular cache depending on `exact`),
+    /// under the `word, 0, field id` key and under the `word, 0, bucketed position` key.
+    /// Pending word counts are flushed first when `docid` differs from the previously
+    /// seen docid, then the "new" word count of `field_id` is incremented.
+    fn insert_add_u32(
+        &mut self,
+        field_id: FieldId,
+        position: u16,
+        word: &str,
+        exact: bool,
+        docid: u32,
+        bump: &Bump,
+    ) -> Result<()> {
+        let word_bytes = word.as_bytes();
+        let word_cache = if exact { &mut self.exact_word_docids } else { &mut self.word_docids };
+        word_cache.insert_add_u32(word_bytes, docid)?;
+
+        // One scratch key serves both composite keys.
+        let key_len = word_bytes.len() + 1 + size_of::<FieldId>();
+        let mut key = BumpVec::with_capacity_in(key_len, bump);
+
+        key.extend_from_slice(word_bytes);
+        key.push(0);
+        key.extend_from_slice(&field_id.to_be_bytes());
+        self.word_fid_docids.insert_add_u32(&key, docid)?;
+
+        let bucket = bucketed_position(position);
+        key.clear();
+        key.extend_from_slice(word_bytes);
+        key.push(0);
+        key.extend_from_slice(&bucket.to_be_bytes());
+        self.word_position_docids.insert_add_u32(&key, docid)?;
+
+        // Counts still pending for another document must be written out before mixing
+        // them with the counts of this one.
+        if matches!(self.current_docid, Some(previous) if previous != docid) {
+            self.flush_fid_word_count(&mut key)?;
+        }
+
+        let counts = self.fid_word_count.entry(field_id).or_insert((0, 0));
+        counts.1 += 1;
+        self.current_docid = Some(docid);
+
+        Ok(())
+    }
+
+    /// Records that `word` was present in the previous version of document `docid`, at
+    /// `position` of field `field_id`.
+    ///
+    /// The docid is marked as deleted under the word key (exact or regular cache depending
+    /// on `exact`), under the `word, 0, field id` key and under the
+    /// `word, 0, bucketed position` key. Pending word counts are flushed first when `docid`
+    /// differs from the previously seen docid, then the "current" word count of `field_id`
+    /// is incremented.
+    fn insert_del_u32(
+        &mut self,
+        field_id: FieldId,
+        position: u16,
+        word: &str,
+        exact: bool,
+        docid: u32,
+        bump: &Bump,
+    ) -> Result<()> {
+        let word_bytes = word.as_bytes();
+        let word_cache = if exact { &mut self.exact_word_docids } else { &mut self.word_docids };
+        word_cache.insert_del_u32(word_bytes, docid)?;
+
+        // One scratch key serves both composite keys.
+        let key_len = word_bytes.len() + 1 + size_of::<FieldId>();
+        let mut key = BumpVec::with_capacity_in(key_len, bump);
+
+        key.extend_from_slice(word_bytes);
+        key.push(0);
+        key.extend_from_slice(&field_id.to_be_bytes());
+        self.word_fid_docids.insert_del_u32(&key, docid)?;
+
+        let bucket = bucketed_position(position);
+        key.clear();
+        key.extend_from_slice(word_bytes);
+        key.push(0);
+        key.extend_from_slice(&bucket.to_be_bytes());
+        self.word_position_docids.insert_del_u32(&key, docid)?;
+
+        // Counts still pending for another document must be written out before mixing
+        // them with the counts of this one.
+        if matches!(self.current_docid, Some(previous) if previous != docid) {
+            self.flush_fid_word_count(&mut key)?;
+        }
+
+        let counts = self.fid_word_count.entry(field_id).or_insert((0, 0));
+        counts.0 += 1;
+        self.current_docid = Some(docid);
+
+        Ok(())
+    }
+
+    /// Writes the pending per-field word counts of the current document to
+    /// `fid_word_count_docids` and empties `fid_word_count`.
+    ///
+    /// For every field whose count changed, the docid is deleted from the key of the
+    /// previous count and added to the key of the new count. A key is the big-endian
+    /// field id followed by the count as one byte; `buffer` is scratch space for it.
+    ///
+    /// A count is only recorded when it lies in `1..=MAX_COUNTED_WORDS`. A count of `0`
+    /// means no token was seen on that side (the document is being inserted or deleted,
+    /// or the field appeared/disappeared), so there is no entry to remove or to create.
+    fn flush_fid_word_count(&mut self, buffer: &mut BumpVec<u8>) -> Result<()> {
+        for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
+            if current_count == new_count {
+                continue;
+            }
+
+            // Entries only exist once `insert_add_u32`/`insert_del_u32` ran, and both
+            // record the docid.
+            let docid =
+                self.current_docid.expect("a docid is recorded whenever a word is counted");
+
+            if (1..=MAX_COUNTED_WORDS).contains(&current_count) {
+                buffer.clear();
+                buffer.extend_from_slice(&fid.to_be_bytes());
+                // Cannot truncate: the count is at most MAX_COUNTED_WORDS.
+                buffer.push(current_count as u8);
+                self.fid_word_count_docids.insert_del_u32(buffer, docid)?;
+            }
+            if (1..=MAX_COUNTED_WORDS).contains(&new_count) {
+                buffer.clear();
+                buffer.extend_from_slice(&fid.to_be_bytes());
+                // Cannot truncate: the count is at most MAX_COUNTED_WORDS.
+                buffer.push(new_count as u8);
+                self.fid_word_count_docids.insert_add_u32(buffer, docid)?;
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Caches gathered from several `WordDocidsBalancedCaches`, grouped by kind.
+///
+/// `push` appends one cache to each list, so the entries at a given index of all five
+/// lists come from the same `WordDocidsBalancedCaches`.
+pub struct WordDocidsCaches<'extractor> {
+    pub word_docids: Vec<BalancedCaches<'extractor>>,
+    pub word_fid_docids: Vec<BalancedCaches<'extractor>>,
+    pub exact_word_docids: Vec<BalancedCaches<'extractor>>,
+    pub word_position_docids: Vec<BalancedCaches<'extractor>>,
+    pub fid_word_count_docids: Vec<BalancedCaches<'extractor>>,
+}
+
+impl<'extractor> WordDocidsCaches<'extractor> {
+    /// Creates a collection in which every list is empty.
+    fn new() -> Self {
+        Self {
+            word_docids: vec![],
+            word_fid_docids: vec![],
+            exact_word_docids: vec![],
+            word_position_docids: vec![],
+            fid_word_count_docids: vec![],
+        }
+    }
+
+    /// Moves each cache of `other` to the end of the list of the same name.
+    ///
+    /// The per-document bookkeeping of `other` (`fid_word_count` and `current_docid`)
+    /// is discarded. This currently always returns `Ok(())`.
+    fn push(&mut self, other: WordDocidsBalancedCaches<'extractor>) -> Result<()> {
+        self.word_docids.push(other.word_docids);
+        self.word_fid_docids.push(other.word_fid_docids);
+        self.exact_word_docids.push(other.exact_word_docids);
+        self.word_position_docids.push(other.word_position_docids);
+        self.fid_word_count_docids.push(other.fid_word_count_docids);
+
+        Ok(())
+    }
+}
+
+/// Shared, read-only inputs of the word docids extraction.
+pub struct WordDocidsExtractorData<'a> {
+    /// Tokenizer applied to every document version.
+    tokenizer: &'a DocumentTokenizer<'a>,
+    /// Only `max_memory` is read from these parameters in this file.
+    grenad_parameters: GrenadParameters,
+    /// Bucket count forwarded to every `BalancedCaches`.
+    buckets: usize,
+}
+
+impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
+    type Data = RefCell<Option<WordDocidsBalancedCaches<'extractor>>>;
+
+    /// Builds a fresh set of caches in `extractor_alloc`, wrapped in `Some`.
+    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
+        let max_memory = self.grenad_parameters.max_memory;
+        let caches = WordDocidsBalancedCaches::new_in(self.buckets, max_memory, extractor_alloc);
+        Ok(RefCell::new(Some(caches)))
+    }
+
+    /// Delegates one document change to `WordDocidsExtractors::extract_document_change`.
+    fn process(
+        &self,
+        change: DocumentChange,
+        context: &DocumentChangeContext<Self::Data>,
+    ) -> Result<()> {
+        let tokenizer = self.tokenizer;
+        WordDocidsExtractors::extract_document_change(context, tokenizer, change)
+    }
+}
+
+/// Namespace for the word docids extraction entry points; it holds no data.
+pub struct WordDocidsExtractors;
+
+impl WordDocidsExtractors {
+    /// Runs the word docids extraction over every change of `document_changes` and
+    /// returns the caches that were filled, grouped by kind.
+    ///
+    /// The tokenizer is configured from the index settings read through a fresh read
+    /// transaction: stop words, allowed separators, dictionary, searchable attributes
+    /// and localized attributes rules.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error from reading the index settings or from
+    /// `for_each_document_change`.
+    pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
+        grenad_parameters: GrenadParameters,
+        document_changes: &DC,
+        indexing_context: IndexingContext<'fid, 'indexer, 'index>,
+        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+    ) -> Result<WordDocidsCaches<'extractor>> {
+        let index = indexing_context.index;
+        let rtxn = index.read_txn()?;
+
+        // The settings are returned as owned strings; borrow them as `&str` slices for
+        // the tokenizer builder.
+        let stop_words = index.stop_words(&rtxn)?;
+        let allowed_separators = index.allowed_separators(&rtxn)?;
+        let allowed_separators: Option<Vec<_>> =
+            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let dictionary = index.dictionary(&rtxn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let builder = tokenizer_builder(
+            stop_words.as_ref(),
+            allowed_separators.as_deref(),
+            dictionary.as_deref(),
+        );
+        let tokenizer = builder.into_tokenizer();
+
+        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
+        let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
+        let localized_attributes_rules =
+            index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
+
+        let document_tokenizer = DocumentTokenizer {
+            tokenizer: &tokenizer,
+            attribute_to_extract: attributes_to_extract.as_deref(),
+            attribute_to_skip: attributes_to_skip.as_slice(),
+            localized_attributes_rules: &localized_attributes_rules,
+            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
+        };
+
+        // Receives the data created by `init_data`.
+        let datastore = ThreadLocal::new();
+
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
+            let _entered = span.enter();
+
+            let extractor = WordDocidsExtractorData {
+                tokenizer: &document_tokenizer,
+                grenad_parameters,
+                buckets: rayon::current_num_threads(),
+            };
+
+            for_each_document_change(
+                document_changes,
+                &extractor,
+                indexing_context,
+                extractor_allocs,
+                &datastore,
+            )?;
+        }
+
+        // Unwrap each `RefCell<Option<_>>`; `flat_map` skips entries that hold `None`.
+        let mut merger = WordDocidsCaches::new();
+        for cache in datastore.into_iter().flat_map(RefCell::into_inner) {
+            merger.push(cache)?;
+        }
+
+        Ok(merger)
+    }
+
+    /// Tokenizes the document versions touched by `document_change` and records every
+    /// token in the caches held by `context.data`.
+    ///
+    /// - `Deletion`: tokens of the current version are recorded as deletions.
+    /// - `Update`: tokens of the current version are recorded as deletions, then tokens
+    ///   of the new version are recorded as additions.
+    /// - `Insertion`: tokens of the new version are recorded as additions.
+    ///
+    /// A token is treated as exact when its field name matches one of the index's exact
+    /// attributes according to `contained_in`. The per-field word counts accumulated for
+    /// the document are flushed before returning.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `context.data` holds `None`.
+    fn extract_document_change(
+        context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
+        document_tokenizer: &DocumentTokenizer,
+        document_change: DocumentChange,
+    ) -> Result<()> {
+        let index = &context.index;
+        let rtxn = &context.txn;
+        let mut cached_sorter_ref = context.data.borrow_mut_or_yield();
+        let cached_sorter = cached_sorter_ref.as_mut().unwrap();
+        let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
+        let new_fields_ids_map = new_fields_ids_map.deref_mut();
+        let doc_alloc = &context.doc_alloc;
+
+        let exact_attributes = index.exact_attributes(rtxn)?;
+        let is_exact_attribute =
+            |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
+        match document_change {
+            DocumentChange::Deletion(inner) => {
+                let mut token_fn = |fname: &str, fid, pos, word: &str| {
+                    cached_sorter.insert_del_u32(
+                        fid,
+                        pos,
+                        word,
+                        is_exact_attribute(fname),
+                        inner.docid(),
+                        doc_alloc,
+                    )
+                };
+                document_tokenizer.tokenize_document(
+                    inner.current(rtxn, index, context.db_fields_ids_map)?,
+                    new_fields_ids_map,
+                    &mut token_fn,
+                )?;
+            }
+            DocumentChange::Update(inner) => {
+                // Remove what the stored version contributed...
+                let mut token_fn = |fname: &str, fid, pos, word: &str| {
+                    cached_sorter.insert_del_u32(
+                        fid,
+                        pos,
+                        word,
+                        is_exact_attribute(fname),
+                        inner.docid(),
+                        doc_alloc,
+                    )
+                };
+                document_tokenizer.tokenize_document(
+                    inner.current(rtxn, index, context.db_fields_ids_map)?,
+                    new_fields_ids_map,
+                    &mut token_fn,
+                )?;
+
+                // ...then add what the new version contributes.
+                let mut token_fn = |fname: &str, fid, pos, word: &str| {
+                    cached_sorter.insert_add_u32(
+                        fid,
+                        pos,
+                        word,
+                        is_exact_attribute(fname),
+                        inner.docid(),
+                        doc_alloc,
+                    )
+                };
+                document_tokenizer.tokenize_document(
+                    inner.new(rtxn, index, context.db_fields_ids_map)?,
+                    new_fields_ids_map,
+                    &mut token_fn,
+                )?;
+            }
+            DocumentChange::Insertion(inner) => {
+                let mut token_fn = |fname: &str, fid, pos, word: &str| {
+                    cached_sorter.insert_add_u32(
+                        fid,
+                        pos,
+                        word,
+                        is_exact_attribute(fname),
+                        inner.docid(),
+                        doc_alloc,
+                    )
+                };
+                document_tokenizer.tokenize_document(
+                    inner.new(),
+                    new_fields_ids_map,
+                    &mut token_fn,
+                )?;
+            }
+        }
+
+        // A flushed key is the field id followed by one byte holding the word count, so
+        // reserve room for both and avoid regrowing the buffer on the first flush.
+        let buffer_size = size_of::<FieldId>() + 1;
+        let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc);
+        cached_sorter.flush_fid_word_count(&mut buffer)
+    }
+
+    /// Returns the user-defined searchable fields of `index`, or `None` when the index
+    /// reports none.
+    fn attributes_to_extract<'a>(
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>> {
+        let searchable_fields = index.user_defined_searchable_fields(rtxn)?;
+        Ok(searchable_fields)
+    }
+
+    /// No attribute is skipped by this extractor: always returns an empty list.
+    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
+        Ok(Vec::new())
+    }
+}
--- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
@ -0,0 +1,178 @@
+use std::cell::RefCell;
+use std::collections::VecDeque;
+use std::rc::Rc;
+
+use heed::RoTxn;
+
+use super::tokenize_document::DocumentTokenizer;
+use super::SearchableExtractor;
+use crate::proximity::{index_proximity, MAX_DISTANCE};
+use crate::update::new::document::Document;
+use crate::update::new::extract::cache::BalancedCaches;
+use crate::update::new::indexer::document_changes::{DocumentChangeContext, RefCellExt};
+use crate::update::new::DocumentChange;
+use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
+
+/// Extractor recording, for pairs of nearby words, the documents that contain them;
+/// it holds no data.
+pub struct WordPairProximityDocidsExtractor;
+
+impl SearchableExtractor for WordPairProximityDocidsExtractor {
+    /// Returns the user-defined searchable fields of `index`, or `None` when the index
+    /// reports none.
+    fn attributes_to_extract<'a>(
+        rtxn: &'a RoTxn,
+        index: &'a Index,
+    ) -> Result<Option<Vec<&'a str>>> {
+        let searchable_fields = index.user_defined_searchable_fields(rtxn)?;
+        Ok(searchable_fields)
+    }
+
+    /// No attribute is skipped by this extractor: always returns an empty list.
+    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
+        Ok(Vec::new())
+    }
+
+    /// Collects the word pairs of the document versions touched by `document_change`
+    /// and records them in the cache held by `context.data`.
+    ///
+    /// Pairs found in the current (stored) version are recorded as deletions and pairs
+    /// found in the new version as additions. Each pair is recorded once per side, with
+    /// the smallest proximity at which it was seen. Keys are produced by `build_key`.
+    fn extract_document_change(
+        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
+        document_tokenizer: &DocumentTokenizer,
+        document_change: DocumentChange,
+    ) -> Result<()> {
+        let doc_alloc = &context.doc_alloc;
+
+        let index = context.index;
+        let rtxn = &context.txn;
+
+        let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc);
+        let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
+        let mut add_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
+
+        let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
+        let new_fields_ids_map = &mut *new_fields_ids_map;
+
+        let mut cached_sorter = context.data.borrow_mut_or_yield();
+        let cached_sorter = &mut *cached_sorter;
+
+        // The proximity window is a small `VecDeque`, so it can stay on the regular heap
+        // for now. `process_document_tokens` leaves it empty, which makes it reusable
+        // across the two versions of an updated document.
+        let mut word_positions: VecDeque<(Rc<str>, u16)> =
+            VecDeque::with_capacity(MAX_DISTANCE as usize);
+
+        let docid = document_change.docid();
+        match document_change {
+            DocumentChange::Deletion(inner) => {
+                let document = inner.current(rtxn, index, context.db_fields_ids_map)?;
+                process_document_tokens(
+                    document,
+                    document_tokenizer,
+                    new_fields_ids_map,
+                    &mut word_positions,
+                    &mut |(w1, w2), prox| {
+                        del_word_pair_proximity.push(((w1, w2), prox));
+                    },
+                )?;
+            }
+            DocumentChange::Update(inner) => {
+                let document = inner.current(rtxn, index, context.db_fields_ids_map)?;
+                process_document_tokens(
+                    document,
+                    document_tokenizer,
+                    new_fields_ids_map,
+                    &mut word_positions,
+                    &mut |(w1, w2), prox| {
+                        del_word_pair_proximity.push(((w1, w2), prox));
+                    },
+                )?;
+                let document = inner.new(rtxn, index, context.db_fields_ids_map)?;
+                process_document_tokens(
+                    document,
+                    document_tokenizer,
+                    new_fields_ids_map,
+                    &mut word_positions,
+                    &mut |(w1, w2), prox| {
+                        add_word_pair_proximity.push(((w1, w2), prox));
+                    },
+                )?;
+            }
+            DocumentChange::Insertion(inner) => {
+                let document = inner.new();
+                process_document_tokens(
+                    document,
+                    document_tokenizer,
+                    new_fields_ids_map,
+                    &mut word_positions,
+                    &mut |(w1, w2), prox| {
+                        add_word_pair_proximity.push(((w1, w2), prox));
+                    },
+                )?;
+            }
+        }
+
+        // Sorting orders equal pairs by increasing proximity, so deduplicating on the
+        // pair keeps the smallest proximity of each pair.
+        del_word_pair_proximity.sort_unstable();
+        del_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
+        for ((w1, w2), prox) in del_word_pair_proximity.iter() {
+            let key = build_key(*prox, w1, w2, &mut key_buffer);
+            cached_sorter.insert_del_u32(key, docid)?;
+        }
+
+        // Same treatment for the pairs of the new version.
+        add_word_pair_proximity.sort_unstable();
+        add_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
+        for ((w1, w2), prox) in add_word_pair_proximity.iter() {
+            let key = build_key(*prox, w1, w2, &mut key_buffer);
+            cached_sorter.insert_add_u32(key, docid)?;
+        }
+        Ok(())
+    }
+}
+
+/// Serializes a word pair key into `key_buffer` and returns the written bytes.
+///
+/// Layout: the proximity byte, the bytes of `w1`, a `0` separator, the bytes of `w2`.
+/// Any previous content of `key_buffer` is discarded.
+fn build_key<'a>(
+    prox: u8,
+    w1: &str,
+    w2: &str,
+    key_buffer: &'a mut bumpalo::collections::Vec<u8>,
+) -> &'a [u8] {
+    key_buffer.clear();
+    key_buffer.push(prox);
+    for part in [w1.as_bytes(), &[0u8][..], w2.as_bytes()] {
+        key_buffer.extend_from_slice(part);
+    }
+    key_buffer.as_slice()
+}
+
+/// Pops the oldest word of the window and reports its pairing with every word still in it.
+///
+/// `word_pair_proximity` is called with `(head word, other word)` and their proximity,
+/// but only when that proximity is strictly between `0` and `MAX_DISTANCE`.
+///
+/// Panics if `word_positions` is empty.
+fn word_positions_into_word_pair_proximity(
+    word_positions: &mut VecDeque<(Rc<str>, u16)>,
+    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
+) {
+    let (head_word, head_position) = word_positions.pop_front().unwrap();
+    let head_position = u32::from(head_position);
+
+    for (word, position) in word_positions.iter() {
+        let prox = index_proximity(head_position, u32::from(*position)) as u8;
+        if (1..MAX_DISTANCE as u8).contains(&prox) {
+            word_pair_proximity((Rc::clone(&head_word), Rc::clone(word)), prox);
+        }
+    }
+}
+
+/// Tokenizes `document` and reports the pairs of nearby words through `word_pair_proximity`.
+///
+/// `word_positions` is a sliding window of the most recent words. Before a word is
+/// pushed, head words whose proximity to it is `MAX_DISTANCE` or more are popped and
+/// paired with the rest of the window. Once tokenization is done the window is emptied
+/// the same way, so it is always empty on a successful return.
+fn process_document_tokens<'doc>(
+    document: impl Document<'doc>,
+    document_tokenizer: &DocumentTokenizer,
+    fields_ids_map: &mut GlobalFieldsIdsMap,
+    word_positions: &mut VecDeque<(Rc<str>, u16)>,
+    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
+) -> Result<()> {
+    let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| {
+        // Evict head words until the head is considered close to the incoming word.
+        while let Some(&(_, head_pos)) = word_positions.front() {
+            if index_proximity(u32::from(head_pos), u32::from(pos)) < MAX_DISTANCE {
+                break;
+            }
+            word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
+        }
+
+        // The incoming word joins the window.
+        word_positions.push_back((Rc::from(word), pos));
+        Ok(())
+    };
+    document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
+
+    // Pair up whatever is left in the window.
+    while word_positions.front().is_some() {
+        word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
+    }
+
+    Ok(())
+}
--- a/crates/milli/src/update/new/extract/searchable/mod.rs
+++ b/crates/milli/src/update/new/extract/searchable/mod.rs
@ -0,0 +1,139 @@
+mod extract_word_docids;
+mod extract_word_pair_proximity_docids;
+mod tokenize_document;
+
+use std::cell::RefCell;
+use std::marker::PhantomData;
+
+use bumpalo::Bump;
+pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
+pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
+use heed::RoTxn;
+use tokenize_document::{tokenizer_builder, DocumentTokenizer};
+
+use super::cache::BalancedCaches;
+use super::DocidsExtractor;
+use crate::update::new::indexer::document_changes::{
+    for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
+    IndexingContext, ThreadLocal,
+};
+use crate::update::new::DocumentChange;
+use crate::update::GrenadParameters;
+use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
+
+/// Shared, read-only inputs of a searchable extraction driven by the extractor `EX`.
+pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
+    /// Tokenizer handed to `EX::extract_document_change`.
+    tokenizer: &'a DocumentTokenizer<'a>,
+    /// Only `max_memory` is read from these parameters in this file.
+    grenad_parameters: GrenadParameters,
+    /// Bucket count forwarded to `BalancedCaches::new_in`.
+    buckets: usize,
+    /// Ties the data to `EX` without storing a value of that type.
+    _ex: PhantomData<EX>,
+}
+
+impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
+    for SearchableExtractorData<'a, EX>
+{
+    type Data = RefCell<BalancedCaches<'extractor>>;
+
+    /// Builds a fresh cache in `extractor_alloc`.
+    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
+        let max_memory = self.grenad_parameters.max_memory;
+        let caches = BalancedCaches::new_in(self.buckets, max_memory, extractor_alloc);
+        Ok(RefCell::new(caches))
+    }
+
+    /// Delegates one document change to `EX::extract_document_change`.
+    fn process(
+        &self,
+        change: DocumentChange,
+        context: &DocumentChangeContext<Self::Data>,
+    ) -> Result<()> {
+        let tokenizer = self.tokenizer;
+        EX::extract_document_change(context, tokenizer, change)
+    }
+}
+
+/// Extractor working on the tokens of the searchable attributes of documents.
+///
+/// Implementors provide `extract_document_change` and the attribute selection; the
+/// provided `run_extraction` builds the tokenizer and drives the extraction.
+pub trait SearchableExtractor: Sized + Sync {
+    /// Runs the extraction over every change of `document_changes` and returns the
+    /// caches that were filled.
+    ///
+    /// The tokenizer is configured from the index settings read through a fresh read
+    /// transaction: stop words, allowed separators, dictionary, the attributes selected
+    /// by `attributes_to_extract` / `attributes_to_skip` and localized attributes rules.
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error from reading the index settings or from
+    /// `for_each_document_change`.
+    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
+        grenad_parameters: GrenadParameters,
+        document_changes: &DC,
+        indexing_context: IndexingContext<'fid, 'indexer, 'index>,
+        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+    ) -> Result<Vec<BalancedCaches<'extractor>>> {
+        let rtxn = indexing_context.index.read_txn()?;
+        // The settings are returned as owned strings; borrow them as `&str` slices for
+        // the tokenizer builder.
+        let stop_words = indexing_context.index.stop_words(&rtxn)?;
+        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
+        let allowed_separators: Option<Vec<_>> =
+            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let dictionary = indexing_context.index.dictionary(&rtxn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+        let builder = tokenizer_builder(
+            stop_words.as_ref(),
+            allowed_separators.as_deref(),
+            dictionary.as_deref(),
+        );
+        let tokenizer = builder.into_tokenizer();
+
+        let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
+        let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
+        let localized_attributes_rules =
+            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
+
+        let document_tokenizer = DocumentTokenizer {
+            tokenizer: &tokenizer,
+            attribute_to_extract: attributes_to_extract.as_deref(),
+            attribute_to_skip: attributes_to_skip.as_slice(),
+            localized_attributes_rules: &localized_attributes_rules,
+            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
+        };
+
+        let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
+            tokenizer: &document_tokenizer,
+            grenad_parameters,
+            buckets: rayon::current_num_threads(),
+            _ex: PhantomData,
+        };
+
+        // Receives the data created by `init_data`.
+        let datastore = ThreadLocal::new();
+
+        {
+            let span =
+                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
+            let _entered = span.enter();
+            for_each_document_change(
+                document_changes,
+                &extractor_data,
+                indexing_context,
+                extractor_allocs,
+                &datastore,
+            )?;
+        }
+
+        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
+    }
+
+    /// Processes one document change, recording its contribution in the cache held by
+    /// `context.data`.
+    fn extract_document_change(
+        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
+        document_tokenizer: &DocumentTokenizer,
+        document_change: DocumentChange,
+    ) -> Result<()>;
+
+    /// Attributes passed to the tokenizer as `attribute_to_extract`; `None` leaves the
+    /// selection unset.
+    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
+        -> Result<Option<Vec<&'a str>>>;
+
+    /// Attributes passed to the tokenizer as `attribute_to_skip`.
+    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
+}
+
+/// Every `SearchableExtractor` is also a `DocidsExtractor`: the call is forwarded to
+/// `SearchableExtractor::run_extraction`.
+impl<T: SearchableExtractor> DocidsExtractor for T {
+    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
+        grenad_parameters: GrenadParameters,
+        document_changes: &DC,
+        indexing_context: IndexingContext<'fid, 'indexer, 'index>,
+        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
+    ) -> Result<Vec<BalancedCaches<'extractor>>> {
+        // Both traits expose a `run_extraction` associated function. Naming the trait
+        // makes it explicit that this is a delegation and not a recursive call.
+        <T as SearchableExtractor>::run_extraction(
+            grenad_parameters,
+            document_changes,
+            indexing_context,
+            extractor_allocs,
+        )
+    }
+}
--- a/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
+++ b/crates/milli/src/update/new/extract/searchable/tokenize_document.rs
@ -0,0 +1,266 @@
+use std::collections::HashMap;
+
+use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
+use serde_json::Value;
+
+use crate::proximity::MAX_DISTANCE;
+use crate::update::new::document::Document;
+use crate::update::new::extract::perm_json_p::{
+    seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
+};
+use crate::{
+    FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
+    MAX_WORD_LENGTH,
+};
+
+/// Settings shared by every call to `tokenize_document`: which attributes to visit, how to
+/// tokenize their text and how far positions may grow.
+pub struct DocumentTokenizer<'a> {
+    /// Tokenizer applied to string values.
+    pub tokenizer: &'a Tokenizer<'a>,
+    /// Optional list of attributes to extract; handed as is to `select_field` and to the
+    /// leaf-seeking helpers.
+    pub attribute_to_extract: Option<&'a [&'a str]>,
+    /// Attributes to skip; handed as is to `select_field` and to the leaf-seeking helpers.
+    pub attribute_to_skip: &'a [&'a str],
+    /// Localization rules; the first rule matching a field provides the locales allowed while
+    /// tokenizing that field.
+    pub localized_attributes_rules: &'a [LocalizedAttributesRule],
+    /// Exclusive upper bound on token positions within one field: values and tokens whose
+    /// position reaches this limit are not reported.
+    pub max_positions_per_attributes: u32,
+}
+
+impl<'a> DocumentTokenizer<'a> {
+    /// Tokenizes every selected field of `document` and reports each kept token to `token_fn`.
+    ///
+    /// Top-level fields are filtered with `select_field`. Objects and arrays are walked down to
+    /// their leaves with `seek_leaf_values_in_object` / `seek_leaf_values_in_array`, other values
+    /// are tokenized directly. Only numbers and strings produce tokens.
+    ///
+    /// `token_fn` is called with `(field name, field id, position, token)`:
+    /// - a number is reported once, as its string representation;
+    /// - a string is split by the tokenizer, and only trimmed, non-empty lemmas of at most
+    ///   `MAX_WORD_LENGTH` bytes are reported.
+    ///
+    /// Positions are tracked per field id: the first value of a field starts at 0 and every
+    /// further value of the same field starts `MAX_DISTANCE` after the last recorded position.
+    /// Once a position reaches `max_positions_per_attributes` nothing more is reported for that
+    /// field. Positions that do not fit in a `u16` are silently skipped.
+    ///
+    /// # Errors
+    ///
+    /// - `UserError::AttributeLimitReached` when `field_id_map` cannot provide an id for a field.
+    /// - `InternalError::SerdeJson` when a field value cannot be converted into a `Value`.
+    /// - Any error coming from the document iterator, the leaf-seeking helpers or `token_fn`.
+    pub fn tokenize_document<'doc>(
+        &self,
+        document: impl Document<'doc>,
+        field_id_map: &mut GlobalFieldsIdsMap,
+        token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
+    ) -> Result<()> {
+        // Last position recorded for each field id seen in this document.
+        let mut field_position = HashMap::new();
+
+        for entry in document.iter_top_level_fields() {
+            let (field_name, value) = entry?;
+
+            // Called once per leaf value; `name` is the name of that leaf, which differs from
+            // the top-level `field_name` for values nested in objects.
+            let mut tokenize_field = |name: &str, value: &Value| {
+                let Some(field_id) = field_id_map.id_or_insert(name) else {
+                    return Err(UserError::AttributeLimitReached.into());
+                };
+
+                // A new value of an already seen field starts `MAX_DISTANCE` further.
+                let position = field_position
+                    .entry(field_id)
+                    .and_modify(|counter| *counter += MAX_DISTANCE)
+                    .or_insert(0);
+                if *position >= self.max_positions_per_attributes {
+                    return Ok(());
+                }
+
+                match value {
+                    Value::Number(n) => {
+                        let token = n.to_string();
+                        if let Ok(position) = (*position).try_into() {
+                            token_fn(name, field_id, position, token.as_str())?;
+                        }
+
+                        Ok(())
+                    }
+                    Value::String(text) => {
+                        // Locale rules are matched against the name of the leaf itself, the
+                        // same name that identifies the field everywhere else in this closure,
+                        // so that rules targeting nested attributes are honored.
+                        let locales = self
+                            .localized_attributes_rules
+                            .iter()
+                            .find(|rule| rule.match_str(name))
+                            .map(|rule| rule.locales());
+                        // create an iterator of token with their positions.
+                        let tokens = process_tokens(
+                            *position,
+                            self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
+                        )
+                        .take_while(|(p, _)| *p < self.max_positions_per_attributes);
+
+                        for (index, token) in tokens {
+                            // keep a word only if it is not empty and fit in a LMDB key.
+                            let token = token.lemma().trim();
+                            if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                                *position = index;
+                                if let Ok(position) = (*position).try_into() {
+                                    token_fn(name, field_id, position, token)?;
+                                }
+                            }
+                        }
+
+                        Ok(())
+                    }
+                    _ => Ok(()),
+                }
+            };
+
+            // if the current field is searchable or contains a searchable attribute
+            if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) {
+                // parse json.
+                match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
+                    Value::Object(object) => seek_leaf_values_in_object(
+                        &object,
+                        self.attribute_to_extract,
+                        self.attribute_to_skip,
+                        field_name,
+                        &mut tokenize_field,
+                    )?,
+                    Value::Array(array) => seek_leaf_values_in_array(
+                        &array,
+                        self.attribute_to_extract,
+                        self.attribute_to_skip,
+                        field_name,
+                        &mut tokenize_field,
+                    )?,
+                    value => tokenize_field(field_name, &value)?,
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Pairs every word of `tokens` with its position, counted from `start_offset`.
+///
+/// Leading separators are dropped. The first word keeps `start_offset`; afterwards a word is
+/// `MAX_DISTANCE` further than the previous one when a hard separator stands between them, and
+/// 1 further otherwise. A soft separator never hides a hard separator seen just before it.
+/// Separators, and words whose lemma is empty, are not yielded.
+fn process_tokens<'a>(
+    start_offset: u32,
+    tokens: impl Iterator<Item = Token<'a>>,
+) -> impl Iterator<Item = (u32, Token<'a>)> {
+    // Running state: position of the last word, and kind of the last token that counted.
+    let mut position = start_offset;
+    let mut last_kind: Option<TokenKind> = None;
+
+    tokens
+        .skip_while(|token| token.is_separator())
+        .map(move |mut token| {
+            let after_hard_separator =
+                last_kind == Some(TokenKind::Separator(SeparatorKind::Hard));
+
+            match token.kind {
+                TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
+                    position += match last_kind {
+                        None => 0,
+                        Some(_) if after_hard_separator => MAX_DISTANCE,
+                        Some(_) => 1,
+                    };
+                    last_kind = Some(token.kind);
+                }
+                TokenKind::Separator(SeparatorKind::Hard) => last_kind = Some(token.kind),
+                TokenKind::Separator(SeparatorKind::Soft) if !after_hard_separator => {
+                    last_kind = Some(token.kind);
+                }
+                // Anything else is tagged `Unknown` so that the filter below discards it.
+                _ => token.kind = TokenKind::Unknown,
+            }
+
+            (position, token)
+        })
+        .filter(|(_, token)| token.is_word())
+}
+
+/// Creates a `TokenizerBuilder` and applies each optional setting that is provided.
+///
+/// - `stop_words`: set of stop words handed to the builder.
+/// - `allowed_separators`: separators handed to the builder.
+/// - `dictionary`: words dictionary handed to the builder.
+///
+/// A setting left to `None` is not touched on the new builder.
+pub fn tokenizer_builder<'a>(
+    stop_words: Option<&'a fst::Set<&'a [u8]>>,
+    allowed_separators: Option<&'a [&str]>,
+    dictionary: Option<&'a [&str]>,
+) -> TokenizerBuilder<'a, &'a [u8]> {
+    let mut builder = TokenizerBuilder::new();
+
+    if let Some(set) = stop_words {
+        builder.stop_words(set);
+    }
+    if let Some(words) = dictionary {
+        builder.words_dict(words);
+    }
+    if let Some(list) = allowed_separators {
+        builder.separators(list);
+    }
+
+    builder
+}
+
+#[cfg(test)]
+mod test {
+    use bumpalo::Bump;
+    use charabia::TokenizerBuilder;
+    use meili_snap::snapshot;
+    use raw_collections::RawMap;
+    use serde_json::json;
+    use serde_json::value::RawValue;
+
+    use super::*;
+    use crate::FieldsIdsMap;
+
+    /// Tokenizes a small nested document and snapshots every `(field id, position) -> word`
+    /// entry reported by the tokenizer, skipped attributes excluded.
+    #[test]
+    fn test_tokenize_document() {
+        let mut fields_ids_map = FieldsIdsMap::new();
+
+        let document = json!({
+            "doggo": {
+                "name": "doggo",
+                "age": 10,
+            },
+            "catto": {
+                "catto": {
+                    "name": "pesti",
+                    "age": 23,
+                }
+            },
+            "doggo.name": ["doggo", "catto"],
+            "not-me": "UNSEARCHABLE",
+            "me-nether": {"nope": "unsearchable"}
+        });
+
+        let _field_1_id = fields_ids_map.insert("doggo").unwrap();
+        let _field_2_id = fields_ids_map.insert("catto").unwrap();
+        let _field_3_id = fields_ids_map.insert("doggo.name").unwrap();
+        let _field_4_id = fields_ids_map.insert("not-me").unwrap();
+        let _field_5_id = fields_ids_map.insert("me-nether").unwrap();
+
+        let mut tb = TokenizerBuilder::default();
+        let document_tokenizer = DocumentTokenizer {
+            tokenizer: &tb.build(),
+            attribute_to_extract: None,
+            attribute_to_skip: &["not-me", "me-nether.nope"],
+            localized_attributes_rules: &[],
+            max_positions_per_attributes: 1000,
+        };
+
+        let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map);
+        let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
+
+        let mut words = std::collections::BTreeMap::new();
+
+        let document = document.to_string();
+
+        let bump = Bump::new();
+        let document: &RawValue = serde_json::from_str(&document).unwrap();
+        let document = RawMap::from_raw_value(document, &bump).unwrap();
+        let document = document.into_bump_slice();
+
+        document_tokenizer
+            .tokenize_document(
+                document,
+                &mut global_fields_ids_map,
+                &mut |_fname, fid, pos, word| {
+                    words.insert([fid, pos], word.to_string());
+                    Ok(())
+                },
+            )
+            .unwrap();
+
+        // `{:#?}` renders positions as plain integers, never as identifiers. The three values
+        // stored under field id 2 start `MAX_DISTANCE` apart, hence 0, 8 and 16.
+        snapshot!(format!("{:#?}", words), @r###"
+        {
+            [
+                2,
+                0,
+            ]: "doggo",
+            [
+                2,
+                8,
+            ]: "doggo",
+            [
+                2,
+                16,
+            ]: "catto",
+            [
+                3,
+                0,
+            ]: "10",
+            [
+                4,
+                0,
+            ]: "pesti",
+            [
+                5,
+                0,
+            ]: "23",
+        }
+        "###);
+    }
+}