WIP

2025-11-22 04:36:32 +00:00 · 2025-11-13 15:23:35 +01:00
parent 9736c0e78e
commit a6bda2e78c
1 changed files with 5 additions and 3 deletions
--- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
@@ -555,8 +555,6 @@ impl SettingsChangeWordDocidsExtractors {
        let cached_sorter = cached_sorter_ref.as_mut().unwrap();
        let doc_alloc = &context.doc_alloc;

-        // TODO extract words based on the settings delta here
-        //
        // Note: In insert_del_u32 we should touch the word_fid_docids and
        //       the fid_word_count_docids if the current field has been added
        //       or deleted from the list (we can add a boolean to help).
@@ -576,9 +574,11 @@ impl SettingsChangeWordDocidsExtractors {

        let mut should_tokenize = |field_name: &str| {
            let field_id = new_fields_ids_map.id(field_name).expect("All fields IDs must exist");
-            let new_field_metadata = new_fields_ids_map.metadata(field_id).unwrap();
            let old_field_metadata = old_fields_ids_map.metadata(field_id).unwrap();
+            let new_field_metadata = new_fields_ids_map.metadata(field_id).unwrap();

+            // TODO Optimization: if the field is in both the old and new searchable
+            //      attributes AND fields have only been added, we can return NoMatch.
            let pattern_match =
                if old_field_metadata.is_searchable() || new_field_metadata.is_searchable() {
                    PatternMatch::Match
@@ -653,6 +653,8 @@ impl SettingsChangeWordDocidsExtractors {
            }
        };

+        // TODO We must tokenize twice when global parameters change, such as stop words,
+        //      the language settings, dictionary, separators, non-separators...
        document_tokenizer.tokenize_document(
            current_document,
            &mut should_tokenize,