First better working version

Kerollmops
2025-11-13 12:31:07 +01:00
parent 30b6a41b8d
commit 9736c0e78e
4 changed files with 162 additions and 55 deletions

View File

@@ -2,12 +2,14 @@ use std::cell::RefCell;
use std::collections::HashMap;
use std::mem::size_of;
use std::ops::DerefMut as _;
use std::sync::RwLock;
use bumpalo::collections::vec::Vec as BumpVec;
use bumpalo::Bump;
use super::match_searchable_field;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::attribute_patterns::match_field_legacy;
use crate::fields_ids_map::metadata::Metadata;
use crate::update::new::document::{Document, DocumentContext};
use crate::update::new::extract::cache::BalancedCaches;
@@ -23,7 +25,10 @@ use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::new::{DocumentChange, DocumentIdentifiers};
use crate::update::settings::SettingsDelta;
use crate::{bucketed_position, DocumentId, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE};
use crate::{
bucketed_position, DocumentId, FieldId, GlobalFieldsIdsMap, PatternMatch, Result, UserError,
MAX_POSITION_PER_ATTRIBUTE,
};
const MAX_COUNTED_WORDS: usize = 30;
@@ -330,6 +335,24 @@ impl WordDocidsExtractors {
exact_attributes.iter().any(|attr| contained_in(fname, attr))
|| disabled_typos_terms.is_exact(word)
};
let mut should_tokenize = |field_name: &str| {
let Some((field_id, meta)) = new_fields_ids_map.id_with_metadata_or_insert(field_name)
else {
return Err(UserError::AttributeLimitReached.into());
};
let pattern_match = if meta.is_searchable() {
PatternMatch::Match
} else {
// TODO: this should be a match on the field_name using the `match_field_legacy` function,
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
PatternMatch::Parent
};
Ok((field_id, pattern_match))
};
match document_change {
DocumentChange::Deletion(inner) => {
let mut token_fn = |fname: &str, fid, pos, word: &str| {
@@ -344,7 +367,7 @@ impl WordDocidsExtractors {
};
document_tokenizer.tokenize_document(
inner.current(rtxn, index, context.db_fields_ids_map)?,
new_fields_ids_map,
&mut should_tokenize,
&mut token_fn,
)?;
}
@@ -372,7 +395,7 @@ impl WordDocidsExtractors {
};
document_tokenizer.tokenize_document(
inner.current(rtxn, index, context.db_fields_ids_map)?,
new_fields_ids_map,
&mut should_tokenize,
&mut token_fn,
)?;
@@ -388,7 +411,7 @@ impl WordDocidsExtractors {
};
document_tokenizer.tokenize_document(
inner.merged(rtxn, index, context.db_fields_ids_map)?,
new_fields_ids_map,
&mut should_tokenize,
&mut token_fn,
)?;
}
@@ -405,7 +428,7 @@ impl WordDocidsExtractors {
};
document_tokenizer.tokenize_document(
inner.inserted(),
new_fields_ids_map,
&mut should_tokenize,
&mut token_fn,
)?;
}
@@ -528,50 +551,113 @@ impl SettingsChangeWordDocidsExtractors {
document_tokenizer: &DocumentTokenizer,
settings_delta: &SD,
) -> Result<()> {
let mut cached_sorter_ref = context.data.borrow_mut_or_yield();
let cached_sorter = cached_sorter_ref.as_mut().unwrap();
let doc_alloc = &context.doc_alloc;
// TODO extract words based on the settings delta here
//
// Note: In insert_del_u32 we should touch the word_fid_docids and
// the fid_word_count_docids if the current field has been added
// or deleted from the list (we can add a boolean to help).
dbg!(document.external_document_id());
// TODO do this outside the loop
let new_fields_ids_map = settings_delta.new_fields_ids_map();
let old_fields_ids_map = context.index.fields_ids_map_with_metadata(&context.rtxn)?;
let old_searchable = settings_delta.old_searchable_attributes().as_ref();
let new_searchable = settings_delta.new_searchable_attributes().as_ref();
let current_document = document.current(
&context.rtxn,
context.index,
old_fields_ids_map.as_fields_ids_map(),
)?;
for result in current_document.iter_top_level_fields() {
let (field_name, field_value) = result?;
let field_id = old_fields_ids_map.id(field_name).unwrap();
let mut should_tokenize = |field_name: &str| {
let field_id = new_fields_ids_map.id(field_name).expect("all field IDs must exist");
let new_field_metadata = new_fields_ids_map.metadata(field_id).unwrap();
let old_field_metadata = old_fields_ids_map.metadata(field_id).unwrap();
match (old_field_metadata, new_field_metadata) {
(Metadata { searchable: Some(_), .. }, Metadata { searchable: None, .. }) => {
eprintln!(
"The document with id `{}` has the field `{}` that must be deleted (be careful)",
document.external_document_id(), field_name,
);
}
(Metadata { searchable: None, .. }, Metadata { searchable: Some(_), .. }) => {
eprintln!(
"The document with id `{}` has the field `{}` that must be tokenized",
document.external_document_id(),
field_name,
);
}
_ => todo!(),
}
let pattern_match =
if old_field_metadata.is_searchable() || new_field_metadata.is_searchable() {
PatternMatch::Match
// If either the old or new searchable attributes are unset (everything is searchable),
// or any old/new attribute pattern matches this field as a parent, we still need to
// descend into it to fill the fields-ids map.
} else if old_searchable.zip(new_searchable).map_or(true, |(old, new)| {
old.iter()
.chain(new)
.any(|attr| match_field_legacy(attr, field_name) == PatternMatch::Parent)
}) {
PatternMatch::Parent
} else {
PatternMatch::NoMatch
};
// TODO extract words from the document here
}
Ok((field_id, pattern_match))
};
let mut token_fn = |_field_name: &str, field_id, pos, word: &str| {
let old_field_metadata = old_fields_ids_map.metadata(field_id).unwrap();
let new_field_metadata = new_fields_ids_map.metadata(field_id).unwrap();
match (old_field_metadata, new_field_metadata) {
(
Metadata { searchable: Some(_), exact: old_exact, .. },
Metadata { searchable: None, .. },
) => {
cached_sorter.insert_del_u32(
field_id,
pos,
word,
// TODO don't forget to check `disabled_typos_terms` and add it to the SettingsDelta
old_exact,
document.docid(),
doc_alloc,
)
}
(
Metadata { searchable: None, .. },
Metadata { searchable: Some(_), exact: new_exact, .. },
) => {
cached_sorter.insert_add_u32(
field_id,
pos,
word,
// TODO same
new_exact,
document.docid(),
doc_alloc,
)
}
(Metadata { exact: old_exact, .. }, Metadata { exact: new_exact, .. }) => {
cached_sorter.insert_del_u32(
field_id,
pos,
word,
// TODO same
old_exact,
document.docid(),
doc_alloc,
)?;
cached_sorter.insert_add_u32(
field_id,
pos,
word,
// TODO same
new_exact,
document.docid(),
doc_alloc,
)
}
}
};
document_tokenizer.tokenize_document(
current_document,
&mut should_tokenize,
&mut token_fn,
)?;
Ok(())
}
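
The core of the settings-change extraction above is the per-field comparison of old and new searchability: a field that stops being searchable only produces deletions, a newly searchable field only produces additions, and a field that stays searchable is deleted and re-added because other metadata such as the `exact` flag may have changed. A minimal, self-contained sketch of that decision table follows; the types are hypothetical stand-ins, not the real `Metadata` or `BalancedCaches`.

// Hypothetical, simplified model of the del/add decision used by the
// settings-change word extractor; it mirrors the match on old/new
// searchability without the real Meilisearch types.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum Searchable {
    Yes,
    No,
}

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum WordAction {
    Delete,       // field was searchable and no longer is: remove its words
    Add,          // field becomes searchable: index its words
    DeleteAndAdd, // field stays searchable: re-extract (e.g. the `exact` flag may differ)
}

fn word_action(old: Searchable, new: Searchable) -> Option<WordAction> {
    match (old, new) {
        (Searchable::Yes, Searchable::No) => Some(WordAction::Delete),
        (Searchable::No, Searchable::Yes) => Some(WordAction::Add),
        (Searchable::Yes, Searchable::Yes) => Some(WordAction::DeleteAndAdd),
        // Fields that are searchable in neither version are skipped by `should_tokenize`.
        (Searchable::No, Searchable::No) => None,
    }
}

fn main() {
    assert_eq!(word_action(Searchable::Yes, Searchable::No), Some(WordAction::Delete));
    assert_eq!(word_action(Searchable::No, Searchable::No), None);
}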

View File

@@ -16,7 +16,9 @@ use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::{FieldId, GlobalFieldsIdsMap, Result, MAX_POSITION_PER_ATTRIBUTE};
use crate::{
FieldId, GlobalFieldsIdsMap, PatternMatch, Result, UserError, MAX_POSITION_PER_ATTRIBUTE,
};
pub struct WordPairProximityDocidsExtractorData<'a> {
tokenizer: DocumentTokenizer<'a>,
@@ -279,7 +281,24 @@ fn process_document_tokens<'doc>(
word_positions.push_back((Rc::from(word), pos));
Ok(())
};
document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
let mut should_tokenize = |field_name: &str| {
let Some((field_id, meta)) = fields_ids_map.id_with_metadata_or_insert(field_name) else {
return Err(UserError::AttributeLimitReached.into());
};
let pattern_match = if meta.is_searchable() {
PatternMatch::Match
} else {
// TODO: this should be a match on the field_name using the `match_field_legacy` function,
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
PatternMatch::Parent
};
Ok((field_id, pattern_match))
};
document_tokenizer.tokenize_document(document, &mut should_tokenize, &mut token_fn)?;
drain_word_positions(word_positions, word_pair_proximity);
Ok(())
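
The `PatternMatch` value returned by the new `should_tokenize` callback is what lets the tokenizer keep filling the fields-ids map without tokenizing everything: `Match` tokenizes the field, `Parent` only descends into nested values so their field IDs still get registered, and `NoMatch` prunes the subtree. The sketch below is a simplified, hypothetical model of that contract, not the real `perm_json_p` walker.

// Illustrative only: a toy document walker honoring the Match / Parent / NoMatch contract.
use serde_json::{json, Value};

#[derive(Clone, Copy, PartialEq)]
enum PatternMatch {
    Match,   // tokenize this field
    Parent,  // don't tokenize, but keep walking nested values
    NoMatch, // prune the whole subtree
}

fn walk(
    prefix: &str,
    value: &Value,
    should_tokenize: &mut impl FnMut(&str) -> PatternMatch,
    token_fn: &mut impl FnMut(&str, &str),
) {
    match should_tokenize(prefix) {
        PatternMatch::NoMatch => (),
        pattern => {
            if pattern == PatternMatch::Match {
                if let Value::String(s) = value {
                    token_fn(prefix, s);
                }
            }
            // Match and Parent both descend, so nested field names are still visited.
            if let Value::Object(map) = value {
                for (key, nested) in map {
                    let name = format!("{prefix}.{key}");
                    walk(&name, nested, should_tokenize, token_fn);
                }
            }
        }
    }
}

fn main() {
    let doc = json!({ "title": "Hello", "meta": { "tag": "world" } });
    let mut seen = Vec::new();
    let mut should_tokenize = |name: &str| {
        if name == "title" || name.ends_with("tag") { PatternMatch::Match } else { PatternMatch::Parent }
    };
    for (key, value) in doc.as_object().unwrap() {
        walk(key, value, &mut should_tokenize, &mut |f, w| seen.push((f.to_owned(), w.to_owned())));
    }
    seen.sort();
    assert_eq!(seen, vec![("meta.tag".into(), "world".into()), ("title".into(), "Hello".into())]);
}

Under this contract, returning `Parent` for non-searchable fields (the legacy behaviour mentioned in the TODOs above) costs a full walk of the document but guarantees every nested field name receives an id.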

View File

@@ -8,10 +8,7 @@ use crate::update::new::document::Document;
use crate::update::new::extract::perm_json_p::{
seek_leaf_values_in_array, seek_leaf_values_in_object, Depth,
};
use crate::{
FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
MAX_WORD_LENGTH,
};
use crate::{FieldId, InternalError, LocalizedAttributesRule, Result, MAX_WORD_LENGTH};
// todo: should be crate::proximity::MAX_DISTANCE but it has been forgotten
const MAX_DISTANCE: u32 = 8;
@@ -26,22 +23,16 @@ impl DocumentTokenizer<'_> {
pub fn tokenize_document<'doc>(
&self,
document: impl Document<'doc>,
field_id_map: &mut GlobalFieldsIdsMap,
should_tokenize: &mut impl FnMut(&str) -> Result<(FieldId, PatternMatch)>,
token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
) -> Result<()> {
let mut field_position = HashMap::new();
let mut tokenize_field = |field_name: &str, _depth, value: &Value| {
let Some((field_id, meta)) = field_id_map.id_with_metadata_or_insert(field_name) else {
return Err(UserError::AttributeLimitReached.into());
};
if meta.is_searchable() {
let (field_id, pattern_match) = should_tokenize(field_name)?;
if pattern_match == PatternMatch::Match {
self.tokenize_field(field_id, field_name, value, token_fn, &mut field_position)?;
}
// todo: should be a match on the field_name using `match_field_legacy` function,
// but for legacy reasons we iterate over all the fields to fill the field_id_map.
Ok(PatternMatch::Match)
Ok(pattern_match)
};
for entry in document.iter_top_level_fields() {
@@ -192,7 +183,7 @@ mod test {
use super::*;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::update::new::document::{DocumentFromVersions, Versions};
use crate::FieldsIdsMap;
use crate::{FieldsIdsMap, GlobalFieldsIdsMap};
#[test]
fn test_tokenize_document() {

View File

@@ -1633,10 +1633,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
let embedding_config_updates = self.update_embedding_configs()?;
self.update_user_defined_searchable_attributes()?;
let mut new_inner_settings =
InnerIndexSettings::from_index(self.index, self.wtxn, None)?;
// TODO maybe not needed?
new_inner_settings.recompute_searchables(self.wtxn, self.index)?;
// Note that we don't need to update the searchables here,
// as it will be done after the settings update.
let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?;
let primary_key_id = self
.index
@@ -2579,6 +2578,11 @@ fn deserialize_sub_embedder(
/// Implement this trait for the settings delta type.
/// This is used in the new settings update flow and will make it easy to replace the old settings delta type: `InnerIndexSettingsDiff`.
pub trait SettingsDelta {
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
fn new_searchable_attributes(&self) -> &Option<Vec<String>>;
fn old_searchable_attributes(&self) -> &Option<Vec<String>>;
fn new_embedders(&self) -> &RuntimeEmbedders;
fn old_embedders(&self) -> &RuntimeEmbedders;
fn new_embedder_category_id(&self) -> &HashMap<String, u8>;
@@ -2590,7 +2594,6 @@ pub trait SettingsDelta {
) -> std::result::Result<(), E>
where
F: FnMut(FragmentDiff) -> std::result::Result<(), E>;
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata;
}
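
The doc comment above frames `SettingsDelta` as the interface that will eventually replace `InnerIndexSettingsDiff`, with extractors only relying on paired old/new accessors. As a hedged sketch (a stripped-down stand-in trait, not the real one), this is how code generic over such a delta could diff the user-defined searchable attributes exposed by `old_searchable_attributes`/`new_searchable_attributes`.

// Hypothetical stand-in for the old/new searchable accessors of `SettingsDelta`.
use std::collections::BTreeSet;

trait SearchableDelta {
    fn old_searchable_attributes(&self) -> &Option<Vec<String>>;
    fn new_searchable_attributes(&self) -> &Option<Vec<String>>;
}

/// Attributes removed from / added to the user-defined searchable list.
/// `None` means "all fields are searchable", so a change to or from `None`
/// cannot be reduced to a name diff and is reported as `None` here.
fn searchable_diff<D: SearchableDelta>(delta: &D) -> Option<(BTreeSet<String>, BTreeSet<String>)> {
    let old: BTreeSet<_> = delta.old_searchable_attributes().as_ref()?.iter().cloned().collect();
    let new: BTreeSet<_> = delta.new_searchable_attributes().as_ref()?.iter().cloned().collect();
    let removed = old.difference(&new).cloned().collect();
    let added = new.difference(&old).cloned().collect();
    Some((removed, added))
}

struct Delta {
    old: Option<Vec<String>>,
    new: Option<Vec<String>>,
}

impl SearchableDelta for Delta {
    fn old_searchable_attributes(&self) -> &Option<Vec<String>> { &self.old }
    fn new_searchable_attributes(&self) -> &Option<Vec<String>> { &self.new }
}

fn main() {
    let delta = Delta {
        old: Some(vec!["title".into(), "overview".into()]),
        new: Some(vec!["title".into(), "genres".into()]),
    };
    let (removed, added) = searchable_diff(&delta).unwrap();
    assert!(removed.contains("overview") && added.contains("genres"));
}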
pub struct FragmentDiff<'a> {
@@ -2599,6 +2602,18 @@ pub struct FragmentDiff<'a> {
}
impl SettingsDelta for InnerIndexSettingsDiff {
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
&self.new.fields_ids_map
}
fn new_searchable_attributes(&self) -> &Option<Vec<String>> {
&self.new.user_defined_searchable_attributes
}
fn old_searchable_attributes(&self) -> &Option<Vec<String>> {
&self.old.user_defined_searchable_attributes
}
fn new_embedders(&self) -> &RuntimeEmbedders {
&self.new.runtime_embedders
}
@@ -2615,10 +2630,6 @@ impl SettingsDelta for InnerIndexSettingsDiff {
&self.embedding_config_updates
}
fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata {
&self.new.fields_ids_map
}
fn try_for_each_fragment_diff<F, E>(
&self,
embedder_name: &str,