	Refactor Document indexing process (searchables)
**Changes:** The searchable database extraction now relies on `AttributePatterns` and `FieldIdMapWithMetadata` to match the fields to extract. The `SearchableExtractor` trait is removed to make the code less complex.

**Impact:**
- Document addition/modification searchable indexing
- Document deletion searchable indexing
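As background for the diff below, the refactor funnels every "is this field searchable?" decision through a single helper, `match_searchable_field`, instead of the old `attributes_to_extract` / `attributes_to_skip` pair on the removed `SearchableExtractor` trait. The following is a minimal, self-contained sketch of that decision flow; the local `PatternMatch` enum and the simplified `matches_pattern` stand-in for milli's `match_field_legacy` (reduced here to exact names and a trailing `.*` wildcard) are illustrative assumptions, not the actual implementation.

```rust
/// Sketch of the field-selection flow introduced by this commit.
/// `PatternMatch` mirrors the three outcomes used by the extractors; the
/// pattern matching itself is deliberately simplified and does not
/// reproduce `match_field_legacy` exactly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PatternMatch {
    Match,   // the field must be tokenized
    Parent,  // a nested child of this field may still match
    NoMatch, // the field is not searchable
}

fn matches_pattern(pattern: &str, field_name: &str) -> PatternMatch {
    if pattern == field_name || pattern == "*" {
        PatternMatch::Match
    } else if let Some(prefix) = pattern.strip_suffix(".*") {
        if field_name == prefix || field_name.starts_with(&format!("{prefix}.")) {
            PatternMatch::Match
        } else {
            PatternMatch::NoMatch
        }
    } else if pattern.starts_with(&format!("{field_name}.")) {
        // The field is an ancestor of a searchable pattern: keep descending.
        PatternMatch::Parent
    } else {
        PatternMatch::NoMatch
    }
}

/// Same shape as the helper added in this commit: no user-defined list means
/// every field is searchable; otherwise the first `Match` wins and `Parent`
/// is remembered so nested objects are still traversed.
fn match_searchable_field(field_name: &str, searchable: Option<&[&str]>) -> PatternMatch {
    let Some(searchable) = searchable else { return PatternMatch::Match };
    let mut selection = PatternMatch::NoMatch;
    for pattern in searchable {
        match matches_pattern(pattern, field_name) {
            PatternMatch::Match => return PatternMatch::Match,
            PatternMatch::Parent => selection = PatternMatch::Parent,
            PatternMatch::NoMatch => (),
        }
    }
    selection
}

fn main() {
    let searchable = Some(&["title", "author.*"][..]);
    assert_eq!(match_searchable_field("title", searchable), PatternMatch::Match);
    assert_eq!(match_searchable_field("author.name", searchable), PatternMatch::Match);
    assert_eq!(match_searchable_field("price", searchable), PatternMatch::NoMatch);
    assert_eq!(match_searchable_field("id", None), PatternMatch::Match);
}
```

The `Parent` variant is presumably what lets traversal continue into nested objects whose children may still match a pattern; the closure-based `has_changed_for_fields` call sites in the diff below reflect the same change.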
@@ -5,8 +5,8 @@ use std::ops::DerefMut as _;

use bumpalo::collections::vec::Vec as BumpVec;
use bumpalo::Bump;
use heed::RoTxn;

use super::match_searchable_field;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::extract::perm_json_p::contained_in;
@@ -17,8 +17,7 @@ use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
use crate::{bucketed_position, DocumentId, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};

const MAX_COUNTED_WORDS: usize = 30;

@@ -207,9 +206,10 @@ impl<'extractor> WordDocidsCaches<'extractor> {
}

pub struct WordDocidsExtractorData<'a> {
    tokenizer: &'a DocumentTokenizer<'a>,
    grenad_parameters: &'a GrenadParameters,
    tokenizer: DocumentTokenizer<'a>,
    max_memory_by_thread: Option<usize>,
    buckets: usize,
    searchable_attributes: Option<Vec<&'a str>>,
}

impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
@@ -218,7 +218,7 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
        Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
            self.buckets,
            self.grenad_parameters.max_memory_by_thread(),
            self.max_memory_by_thread,
            extractor_alloc,
        ))))
    }
@@ -230,7 +230,12 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
    ) -> Result<()> {
        for change in changes {
            let change = change?;
            WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)?;
            WordDocidsExtractors::extract_document_change(
                context,
                &self.tokenizer,
                self.searchable_attributes.as_deref(),
                change,
            )?;
        }
        Ok(())
    }
@@ -248,52 +253,42 @@ impl WordDocidsExtractors {
    where
        MSP: Fn() -> bool + Sync,
    {
        let index = indexing_context.index;
        let rtxn = index.read_txn()?;

        let stop_words = index.stop_words(&rtxn)?;
        let allowed_separators = index.allowed_separators(&rtxn)?;
        // Warning: this is duplicated code from extract_word_pair_proximity_docids.rs
        let rtxn = indexing_context.index.read_txn()?;
        let stop_words = indexing_context.index.stop_words(&rtxn)?;
        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
        let allowed_separators: Option<Vec<_>> =
            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let dictionary = index.dictionary(&rtxn)?;
        let dictionary = indexing_context.index.dictionary(&rtxn)?;
        let dictionary: Option<Vec<_>> =
            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let builder = tokenizer_builder(
        let mut builder = tokenizer_builder(
            stop_words.as_ref(),
            allowed_separators.as_deref(),
            dictionary.as_deref(),
        );
        let tokenizer = builder.into_tokenizer();

        let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
        let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
        let tokenizer = builder.build();
        let localized_attributes_rules =
            index.localized_attributes_rules(&rtxn)?.unwrap_or_default();

            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
        let document_tokenizer = DocumentTokenizer {
            tokenizer: &tokenizer,
            attribute_to_extract: attributes_to_extract.as_deref(),
            attribute_to_skip: attributes_to_skip.as_slice(),
            localized_attributes_rules: &localized_attributes_rules,
            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
        };

        let extractor_data = WordDocidsExtractorData {
            tokenizer: document_tokenizer,
            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
            buckets: rayon::current_num_threads(),
            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
        };
        let datastore = ThreadLocal::new();

        {
            let span =
                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
            let _entered = span.enter();

            let extractor = WordDocidsExtractorData {
                tokenizer: &document_tokenizer,
                grenad_parameters: indexing_context.grenad_parameters,
                buckets: rayon::current_num_threads(),
            };

            extract(
                document_changes,
                &extractor,
                &extractor_data,
                indexing_context,
                extractor_allocs,
                &datastore,
@@ -312,6 +307,7 @@ impl WordDocidsExtractors {
    fn extract_document_change(
        context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
        document_tokenizer: &DocumentTokenizer,
        searchable_attributes: Option<&[&str]>,
        document_change: DocumentChange,
    ) -> Result<()> {
        let index = &context.index;
@@ -345,7 +341,9 @@ impl WordDocidsExtractors {
            }
            DocumentChange::Update(inner) => {
                if !inner.has_changed_for_fields(
                    document_tokenizer.attribute_to_extract,
                    &mut |field_name: &str| {
                        match_searchable_field(field_name, searchable_attributes)
                    },
                    &context.rtxn,
                    context.index,
                    context.db_fields_ids_map,
@@ -408,15 +406,4 @@ impl WordDocidsExtractors {
        let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc);
        cached_sorter.flush_fid_word_count(&mut buffer)
    }

    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
    ) -> Result<Option<Vec<&'a str>>> {
        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
    }

    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
        Ok(Vec::new())
    }
}

@@ -2,30 +2,114 @@ use std::cell::RefCell;
use std::collections::VecDeque;
use std::rc::Rc;

use heed::RoTxn;
use bumpalo::Bump;

use super::tokenize_document::DocumentTokenizer;
use super::SearchableExtractor;
use super::match_searchable_field;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::proximity::{index_proximity, MAX_DISTANCE};
use crate::update::new::document::Document;
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::indexer::document_changes::DocumentChangeContext;
use crate::update::new::indexer::document_changes::{
    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE};

pub struct WordPairProximityDocidsExtractorData<'a> {
    tokenizer: DocumentTokenizer<'a>,
    searchable_attributes: Option<Vec<&'a str>>,
    max_memory_by_thread: Option<usize>,
    buckets: usize,
}

impl<'a, 'extractor> Extractor<'extractor> for WordPairProximityDocidsExtractorData<'a> {
    type Data = RefCell<BalancedCaches<'extractor>>;

    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
        Ok(RefCell::new(BalancedCaches::new_in(
            self.buckets,
            self.max_memory_by_thread,
            extractor_alloc,
        )))
    }

    fn process<'doc>(
        &self,
        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
        context: &DocumentChangeContext<Self::Data>,
    ) -> Result<()> {
        for change in changes {
            let change = change?;
            WordPairProximityDocidsExtractor::extract_document_change(
                context,
                &self.tokenizer,
                self.searchable_attributes.as_deref(),
                change,
            )?;
        }
        Ok(())
    }
}

pub struct WordPairProximityDocidsExtractor;

impl SearchableExtractor for WordPairProximityDocidsExtractor {
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
    ) -> Result<Option<Vec<&'a str>>> {
        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
    }
impl WordPairProximityDocidsExtractor {
    pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
        document_changes: &DC,
        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
        step: IndexingStep,
    ) -> Result<Vec<BalancedCaches<'extractor>>>
    where
        MSP: Fn() -> bool + Sync,
    {
        // Warning: this is duplicated code from extract_word_docids.rs
        let rtxn = indexing_context.index.read_txn()?;
        let stop_words = indexing_context.index.stop_words(&rtxn)?;
        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
        let allowed_separators: Option<Vec<_>> =
            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let dictionary = indexing_context.index.dictionary(&rtxn)?;
        let dictionary: Option<Vec<_>> =
            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let mut builder = tokenizer_builder(
            stop_words.as_ref(),
            allowed_separators.as_deref(),
            dictionary.as_deref(),
        );
        let tokenizer = builder.build();
        let localized_attributes_rules =
            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
        let document_tokenizer = DocumentTokenizer {
            tokenizer: &tokenizer,
            localized_attributes_rules: &localized_attributes_rules,
            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
        };
        let extractor_data = WordPairProximityDocidsExtractorData {
            tokenizer: document_tokenizer,
            searchable_attributes: indexing_context.index.user_defined_searchable_fields(&rtxn)?,
            max_memory_by_thread: indexing_context.grenad_parameters.max_memory_by_thread(),
            buckets: rayon::current_num_threads(),
        };
        let datastore = ThreadLocal::new();
        {
            let span =
                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
            let _entered = span.enter();
            extract(
                document_changes,
                &extractor_data,
                indexing_context,
                extractor_allocs,
                &datastore,
                step,
            )?;
        }

    fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
        Ok(Vec::new())
        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
    }

    // This method is reimplemented to count the number of words in the document in each field
@@ -34,6 +118,7 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
    fn extract_document_change(
        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
        document_tokenizer: &DocumentTokenizer,
        searchable_attributes: Option<&[&str]>,
        document_change: DocumentChange,
    ) -> Result<()> {
        let doc_alloc = &context.doc_alloc;
@@ -71,7 +156,9 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
            }
            DocumentChange::Update(inner) => {
                if !inner.has_changed_for_fields(
                    document_tokenizer.attribute_to_extract,
                    &mut |field_name: &str| {
                        match_searchable_field(field_name, searchable_attributes)
                    },
                    rtxn,
                    index,
                    context.db_fields_ids_map,

@@ -2,145 +2,28 @@ mod extract_word_docids;
mod extract_word_pair_proximity_docids;
mod tokenize_document;

use std::cell::RefCell;
use std::marker::PhantomData;

use bumpalo::Bump;
pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
use heed::RoTxn;
use tokenize_document::{tokenizer_builder, DocumentTokenizer};

use super::cache::BalancedCaches;
use super::DocidsExtractor;
use crate::update::new::indexer::document_changes::{
    extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
use crate::attribute_patterns::{match_field_legacy, PatternMatch};

pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
    tokenizer: &'a DocumentTokenizer<'a>,
    grenad_parameters: &'a GrenadParameters,
    buckets: usize,
    _ex: PhantomData<EX>,
}
pub fn match_searchable_field(
    field_name: &str,
    searchable_fields: Option<&[&str]>,
) -> PatternMatch {
    let Some(searchable_fields) = searchable_fields else {
        // If no searchable fields are provided, consider all fields as searchable
        return PatternMatch::Match;
    };

impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
    for SearchableExtractorData<'a, EX>
{
    type Data = RefCell<BalancedCaches<'extractor>>;

    fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
        Ok(RefCell::new(BalancedCaches::new_in(
            self.buckets,
            self.grenad_parameters.max_memory_by_thread(),
            extractor_alloc,
        )))
    }

    fn process<'doc>(
        &self,
        changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
        context: &DocumentChangeContext<Self::Data>,
    ) -> Result<()> {
        for change in changes {
            let change = change?;
            EX::extract_document_change(context, self.tokenizer, change)?;
    let mut selection = PatternMatch::NoMatch;
    for pattern in searchable_fields {
        match match_field_legacy(pattern, field_name) {
            PatternMatch::Match => return PatternMatch::Match,
            PatternMatch::Parent => selection = PatternMatch::Parent,
            PatternMatch::NoMatch => (),
        }
        Ok(())
    }
}

pub trait SearchableExtractor: Sized + Sync {
    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
        document_changes: &DC,
        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
        step: IndexingStep,
    ) -> Result<Vec<BalancedCaches<'extractor>>>
    where
        MSP: Fn() -> bool + Sync,
    {
        let rtxn = indexing_context.index.read_txn()?;
        let stop_words = indexing_context.index.stop_words(&rtxn)?;
        let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
        let allowed_separators: Option<Vec<_>> =
            allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let dictionary = indexing_context.index.dictionary(&rtxn)?;
        let dictionary: Option<Vec<_>> =
            dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
        let mut builder = tokenizer_builder(
            stop_words.as_ref(),
            allowed_separators.as_deref(),
            dictionary.as_deref(),
        );
        let tokenizer = builder.build();

        let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
        let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
        let localized_attributes_rules =
            indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();

        let document_tokenizer = DocumentTokenizer {
            tokenizer: &tokenizer,
            attribute_to_extract: attributes_to_extract.as_deref(),
            attribute_to_skip: attributes_to_skip.as_slice(),
            localized_attributes_rules: &localized_attributes_rules,
            max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
        };

        let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
            tokenizer: &document_tokenizer,
            grenad_parameters: indexing_context.grenad_parameters,
            buckets: rayon::current_num_threads(),
            _ex: PhantomData,
        };

        let datastore = ThreadLocal::new();

        {
            let span =
                tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
            let _entered = span.enter();
            extract(
                document_changes,
                &extractor_data,
                indexing_context,
                extractor_allocs,
                &datastore,
                step,
            )?;
        }

        Ok(datastore.into_iter().map(RefCell::into_inner).collect())
    }

    fn extract_document_change(
        context: &DocumentChangeContext<RefCell<BalancedCaches>>,
        document_tokenizer: &DocumentTokenizer,
        document_change: DocumentChange,
    ) -> Result<()>;

    fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
        -> Result<Option<Vec<&'a str>>>;

    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
}

impl<T: SearchableExtractor> DocidsExtractor for T {
    fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
        document_changes: &DC,
        indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
        extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
        step: IndexingStep,
    ) -> Result<Vec<BalancedCaches<'extractor>>>
    where
        MSP: Fn() -> bool + Sync,
    {
        Self::run_extraction(document_changes, indexing_context, extractor_allocs, step)
    }
    selection
}

@@ -3,9 +3,10 @@ use std::collections::HashMap;
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
use serde_json::Value;

use crate::attribute_patterns::PatternMatch;
use crate::update::new::document::Document;
use crate::update::new::extract::perm_json_p::{
    seek_leaf_values_in_array, seek_leaf_values_in_object, select_field, Depth, Selection,
    seek_leaf_values_in_array, seek_leaf_values_in_object, Depth,
};
use crate::{
    FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
@@ -17,8 +18,6 @@ const MAX_DISTANCE: u32 = 8;

pub struct DocumentTokenizer<'a> {
    pub tokenizer: &'a Tokenizer<'a>,
    pub attribute_to_extract: Option<&'a [&'a str]>,
    pub attribute_to_skip: &'a [&'a str],
    pub localized_attributes_rules: &'a [LocalizedAttributesRule],
    pub max_positions_per_attributes: u32,
}
@@ -31,87 +30,94 @@ impl<'a> DocumentTokenizer<'a> {
        token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
    ) -> Result<()> {
        let mut field_position = HashMap::new();
        let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
            let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else {
                return Err(UserError::AttributeLimitReached.into());
            };

            if meta.is_searchable() {
                self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?;
            }

            // todo: should be a match on the field_name using `match_field_legacy` function,
            // but for legacy reasons we iterate over all the fields to fill the field_id_map.
            Ok(PatternMatch::Match)
        };

        for entry in document.iter_top_level_fields() {
            let (field_name, value) = entry?;

            let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
                let Some(field_id) = field_id_map.id_or_insert(field_name) else {
                    return Err(UserError::AttributeLimitReached.into());
                };

                if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip)
                    != Selection::Select
                {
                    return Ok(());
                }

                let position = field_position
                    .entry(field_id)
                    .and_modify(|counter| *counter += MAX_DISTANCE)
                    .or_insert(0);
                if *position >= self.max_positions_per_attributes {
                    return Ok(());
                }

                let text;
                let tokens = match value {
                    Value::Number(n) => {
                        text = n.to_string();
                        self.tokenizer.tokenize(text.as_str())
                    }
                    Value::Bool(b) => {
                        text = b.to_string();
                        self.tokenizer.tokenize(text.as_str())
                    }
                    Value::String(text) => {
                        let locales = self
                            .localized_attributes_rules
                            .iter()
                            .find(|rule| rule.match_str(field_name))
                            .map(|rule| rule.locales());
                        self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
                    }
                    _ => return Ok(()),
                };

                // create an iterator of token with their positions.
                let tokens = process_tokens(*position, tokens)
                    .take_while(|(p, _)| *p < self.max_positions_per_attributes);

                for (index, token) in tokens {
                    // keep a word only if it is not empty and fit in a LMDB key.
                    let token = token.lemma().trim();
                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                        *position = index;
                        if let Ok(position) = (*position).try_into() {
                            token_fn(field_name, field_id, position, token)?;
                        }
                    }
                }

                Ok(())
            };

            // parse json.
            match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
                Value::Object(object) => seek_leaf_values_in_object(
                    &object,
                    None,
                    &[],
                    field_name,
                    Depth::OnBaseKey,
                    &mut tokenize_field,
                )?,
                Value::Array(array) => seek_leaf_values_in_array(
                    &array,
                    None,
                    &[],
                    field_name,
                    Depth::OnBaseKey,
                    &mut tokenize_field,
                )?,
                value => tokenize_field(field_name, Depth::OnBaseKey, &value)?,
                value => {
                    tokenize_field(field_name, Depth::OnBaseKey, &value)?;
                }
            }
        }

        Ok(())
    }

    fn tokenize_field(
        &self,
        field_id: FieldId,
        field_name: &str,
        value: &Value,
        token_fn: &mut impl FnMut(&str, u16, u16, &str) -> std::result::Result<(), crate::Error>,
        field_position: &mut HashMap<u16, u32>,
    ) -> Result<()> {
        let position = field_position
            .entry(field_id)
            .and_modify(|counter| *counter += MAX_DISTANCE)
            .or_insert(0);
        if *position >= self.max_positions_per_attributes {
            return Ok(());
        }

        let text;
        let tokens = match value {
            Value::Number(n) => {
                text = n.to_string();
                self.tokenizer.tokenize(text.as_str())
            }
            Value::Bool(b) => {
                text = b.to_string();
                self.tokenizer.tokenize(text.as_str())
            }
            Value::String(text) => {
                let locales = self
                    .localized_attributes_rules
                    .iter()
                    .find(|rule| rule.match_str(field_name) == PatternMatch::Match)
                    .map(|rule| rule.locales());
                self.tokenizer.tokenize_with_allow_list(text.as_str(), locales)
            }
            _ => return Ok(()),
        };

        // create an iterator of token with their positions.
        let tokens = process_tokens(*position, tokens)
            .take_while(|(p, _)| *p < self.max_positions_per_attributes);

        for (index, token) in tokens {
            // keep a word only if it is not empty and fit in a LMDB key.
            let token = token.lemma().trim();
            if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
                *position = index;
                if let Ok(position) = (*position).try_into() {
                    token_fn(field_name, field_id, position, token)?;
                }
            }
        }

@@ -215,15 +221,20 @@ mod test {
        let mut tb = TokenizerBuilder::default();
        let document_tokenizer = DocumentTokenizer {
            tokenizer: &tb.build(),
            attribute_to_extract: None,
            attribute_to_skip: &["not-me", "me-nether.nope"],
            localized_attributes_rules: &[],
            max_positions_per_attributes: 1000,
        };

        let fields_ids_map = FieldIdMapWithMetadata::new(
            fields_ids_map,
            MetadataBuilder::new(Default::default(), Default::default(), Default::default(), None),
            MetadataBuilder::new(
                Default::default(),
                Default::default(),
                Default::default(),
                None,
                None,
                Default::default(),
            ),
        );

        let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map);
@@ -265,6 +276,10 @@ mod test {
                2,
                16,
            ]: "catto",
            [
                3,
                0,
            ]: "unsearchable",
            [
                5,
                0,
@@ -277,6 +292,10 @@ mod test {
                8,
                0,
            ]: "23",
            [
                9,
                0,
            ]: "unsearchable",
        }
        "###);
    }

@@ -199,7 +199,7 @@ where
            let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
            let _entered = span.enter();

            <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
            WordPairProximityDocidsExtractor::run_extraction(
                document_changes,
                indexing_context,
                extractor_allocs,