Mirror of <https://github.com/meilisearch/meilisearch.git>, synced 2025-11-30 01:35:36 +00:00.

Compare commits: 10 commits between two `prototype-…` branches.
- bf144a94d8
- b0b1888ef9
- cbdf80893d
- e2156ddfc7
- 13a88d6131
- d9875b782d
- cb16baab18
- d3e4b2dfe7
- d3cd5ea689
- 3ed43f9097

The diffs below fall into five areas: the `experimental_limit_batched_tasks_total_size` option becomes a `byte_unit::Byte`, typo/prefix derivations stop relying on the words FST, updates that only move a document's `_geo` point are now detected and re-extracted, the indexer reports progress for its merge phases, and the geo and typo test suites are extended to cover these changes.
**Batched-tasks size limit as a `Byte` value.** The `experimental_limit_batched_tasks_total_size` option changes from a raw `u64` to `byte_unit::Byte`, so call sites now convert with `.into()`:

```diff
@@ -329,7 +329,8 @@ impl Infos {
            http_addr: http_addr != default_http_addr(),
            http_payload_size_limit,
            experimental_max_number_of_batched_tasks,
-           experimental_limit_batched_tasks_total_size,
+           experimental_limit_batched_tasks_total_size:
+               experimental_limit_batched_tasks_total_size.into(),
            task_queue_webhook: task_webhook_url.is_some(),
            task_webhook_authorization_header: task_webhook_authorization_header.is_some(),
            log_level: log_level.to_string(),
```
```diff
@@ -228,7 +228,7 @@ pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<(Arc<IndexScheduler>, Arc<
        cleanup_enabled: !opt.experimental_replication_parameters,
        max_number_of_tasks: 1_000_000,
        max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks,
-       batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size,
+       batched_tasks_size_limit: opt.experimental_limit_batched_tasks_total_size.into(),
        index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().as_u64() as usize,
        index_count: DEFAULT_INDEX_COUNT,
        instance_features: opt.to_instance_features(),
```
```diff
@@ -445,7 +445,7 @@ pub struct Opt {
    /// see: <https://github.com/orgs/meilisearch/discussions/801>
    #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE, default_value_t = default_limit_batched_tasks_total_size())]
    #[serde(default = "default_limit_batched_tasks_total_size")]
-   pub experimental_limit_batched_tasks_total_size: u64,
+   pub experimental_limit_batched_tasks_total_size: Byte,

    /// Enables experimental caching of search query embeddings. The value represents the maximal number of entries in the cache of each
    /// distinct embedder.
```
```diff
@@ -958,8 +958,8 @@ fn default_limit_batched_tasks() -> usize {
    usize::MAX
 }
 
-fn default_limit_batched_tasks_total_size() -> u64 {
-   u64::MAX
+fn default_limit_batched_tasks_total_size() -> Byte {
+   Byte::from_u64(u64::MAX)
 }
 
 fn default_embedding_cache_entries() -> usize {
```
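The apparent motivation for the `u64` → `Byte` switch is to let the flag and the `MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE` variable carry human-readable sizes. A minimal standalone sketch of the `byte_unit` calls the diff relies on (not Meilisearch code):

```rust
use std::str::FromStr;

use byte_unit::Byte;

fn main() {
    // `Byte` parses human-readable sizes, as the "10GiB" literal above shows.
    let limit = Byte::from_str("10GiB").unwrap();
    assert_eq!(limit.as_u64(), 10 * 1024 * 1024 * 1024);

    // The new default wraps the old `u64::MAX` default unchanged.
    assert_eq!(Byte::from_u64(u64::MAX).as_u64(), u64::MAX);
}
```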
**Geo test reworked.** `update_documents_with_geo_field` now gives documents 3 and 4 distinct points on the `lng = 0` meridian and sorts ascending from `(10, 0)`, so the expected order and distances change:

```diff
@@ -1897,11 +1897,11 @@ async fn update_documents_with_geo_field() {
        },
        {
            "id": "3",
-           "_geo": { "lat": 1, "lng": 1 },
+           "_geo": { "lat": 3, "lng": 0 },
        },
        {
            "id": "4",
-           "_geo": { "lat": "1", "lng": "1" },
+           "_geo": { "lat": "4", "lng": "0" },
        },
    ]);
 
```
```diff
@@ -1928,9 +1928,7 @@ async fn update_documents_with_geo_field() {
      }
    "###);
 
-   let (response, code) = index
-       .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]}))
-       .await;
+   let (response, code) = index.search_post(json!({"sort": ["_geoPoint(10,0):asc"]})).await;
    snapshot!(code, @"200 OK");
    // we are expecting docs 4 and 3 first as they have geo
    snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }),
```
```diff
@@ -1940,18 +1938,18 @@ async fn update_documents_with_geo_field() {
      {
        "id": "4",
        "_geo": {
-         "lat": "1",
-         "lng": "1"
+         "lat": "4",
+         "lng": "0"
        },
-       "_geoDistance": 5522018
+       "_geoDistance": 667170
      },
      {
        "id": "3",
        "_geo": {
-         "lat": 1,
-         "lng": 1
+         "lat": 3,
+         "lng": 0
        },
-       "_geoDistance": 5522018
+       "_geoDistance": 778364
      },
      {
        "id": "1"
```
```diff
@@ -1969,10 +1967,13 @@ async fn update_documents_with_geo_field() {
      }
    "###);
 
-   let updated_documents = json!([{
-       "id": "3",
-       "doggo": "kefir",
-   }]);
+   let updated_documents = json!([
+       {
+           "id": "3",
+           "doggo": "kefir",
+           "_geo": { "lat": 5, "lng": 0 },
+       }
+   ]);
    let (task, _status_code) = index.update_documents(updated_documents, None).await;
    let response = index.wait_task(task.uid()).await;
    snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
```
```diff
@@ -2012,16 +2013,16 @@ async fn update_documents_with_geo_field() {
      {
        "id": "3",
        "_geo": {
-         "lat": 1,
-         "lng": 1
+         "lat": 5,
+         "lng": 0
        },
        "doggo": "kefir"
      },
      {
        "id": "4",
        "_geo": {
-         "lat": "1",
-         "lng": "1"
+         "lat": "4",
+         "lng": "0"
        }
      }
    ],
```
```diff
@@ -2031,31 +2032,29 @@ async fn update_documents_with_geo_field() {
      }
    "###);
 
-   let (response, code) = index
-       .search_post(json!({"sort": ["_geoPoint(50.629973371633746,3.0569447399419567):desc"]}))
-       .await;
+   let (response, code) = index.search_post(json!({"sort": ["_geoPoint(10,0):asc"]})).await;
    snapshot!(code, @"200 OK");
    // the search response should not have changed: we are expecting docs 4 and 3 first as they have geo
    snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }),
        @r###"
    {
      "hits": [
-       {
-         "id": "4",
-         "_geo": {
-           "lat": "1",
-           "lng": "1"
-         },
-         "_geoDistance": 5522018
-       },
        {
          "id": "3",
          "_geo": {
-           "lat": 1,
-           "lng": 1
+           "lat": 5,
+           "lng": 0
          },
          "doggo": "kefir",
-         "_geoDistance": 5522018
+         "_geoDistance": 555975
        },
+       {
+         "id": "4",
+         "_geo": {
+           "lat": "4",
+           "lng": "0"
+         },
+         "_geoDistance": 667170
+       },
        {
          "id": "1"
```
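The updated `_geoDistance` values are easy to sanity-check: every point in the test sits on the `lng = 0` meridian, so each distance from the sort point `(10, 0)` is just a latitude difference along a great circle. A standalone check (assuming the spherical mean Earth radius of 6,371 km, which reproduces the snapshot figures exactly):

```rust
/// Distance in meters between two latitudes on the same meridian,
/// on a sphere of mean radius 6,371 km.
fn meridian_distance_m(lat_a: f64, lat_b: f64) -> f64 {
    const EARTH_RADIUS_M: f64 = 6_371_000.0;
    (lat_a - lat_b).abs().to_radians() * EARTH_RADIUS_M
}

fn main() {
    assert_eq!(meridian_distance_m(10.0, 4.0).round() as u64, 667_170); // doc 4
    assert_eq!(meridian_distance_m(10.0, 3.0).round() as u64, 778_364); // doc 3, before the update
    assert_eq!(meridian_distance_m(10.0, 5.0).round() as u64, 555_975); // doc 3, after the update
}
```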
**New search test.** `test_typo_settings` exercises `typoTolerance.disableOnAttributes` together with nested `searchableAttributes`, for both a full word (`document`) and a prefix (`docume`); all four documents are expected in both cases:

```diff
@@ -1783,6 +1783,146 @@ async fn test_nested_fields() {
        .await;
 }
 
+#[actix_rt::test]
+async fn test_typo_settings() {
+    let documents = json!([
+        {
+            "id": 0,
+            "title": "The zeroth document",
+        },
+        {
+            "id": 1,
+            "title": "The first document",
+            "nested": {
+                "object": "field",
+                "machin": "bidule",
+            },
+        },
+        {
+            "id": 2,
+            "title": "The second document",
+            "nested": [
+                "array",
+                {
+                    "object": "field",
+                },
+                {
+                    "prout": "truc",
+                    "machin": "lol",
+                },
+            ],
+        },
+        {
+            "id": 3,
+            "title": "The third document",
+            "nested": "I lied",
+        },
+    ]);
+
+    test_settings_documents_indexing_swapping_and_search(
+        &documents,
+        &json!({
+            "searchableAttributes": ["title", "nested.object", "nested.machin"],
+            "typoTolerance": {
+                "enabled": true,
+                "disableOnAttributes": ["title"]
+            }
+        }),
+        &json!({"q": "document"}),
+        |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 0,
+                "title": "The zeroth document"
+              },
+              {
+                "id": 1,
+                "title": "The first document",
+                "nested": {
+                  "object": "field",
+                  "machin": "bidule"
+                }
+              },
+              {
+                "id": 2,
+                "title": "The second document",
+                "nested": [
+                  "array",
+                  {
+                    "object": "field"
+                  },
+                  {
+                    "prout": "truc",
+                    "machin": "lol"
+                  }
+                ]
+              },
+              {
+                "id": 3,
+                "title": "The third document",
+                "nested": "I lied"
+              }
+            ]
+            "###);
+        },
+    )
+    .await;
+
+    // Test prefix search
+    test_settings_documents_indexing_swapping_and_search(
+        &documents,
+        &json!({
+            "searchableAttributes": ["title", "nested.object", "nested.machin"],
+            "typoTolerance": {
+                "enabled": true,
+                "disableOnAttributes": ["title"]
+            }
+        }),
+        &json!({"q": "docume"}),
+        |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 0,
+                "title": "The zeroth document"
+              },
+              {
+                "id": 1,
+                "title": "The first document",
+                "nested": {
+                  "object": "field",
+                  "machin": "bidule"
+                }
+              },
+              {
+                "id": 2,
+                "title": "The second document",
+                "nested": [
+                  "array",
+                  {
+                    "object": "field"
+                  },
+                  {
+                    "prout": "truc",
+                    "machin": "lol"
+                  }
+                ]
+              },
+              {
+                "id": 3,
+                "title": "The third document",
+                "nested": "I lied"
+              }
+            ]
+            "###);
+        },
+    )
+    .await;
+}
+
 /// Modifying facets with different casing should work correctly
 #[actix_rt::test]
 async fn change_facet_casing() {
```
**New `Index::contains_word` helper**, checking both `word_docids` and `exact_word_docids`:

```diff
@@ -1755,6 +1755,19 @@ impl Index {
        }
        Ok(stats)
    }
+
+   /// Check if the word is indexed in the index.
+   ///
+   /// This function checks if the word is indexed in the index by looking at the word_docids and exact_word_docids.
+   ///
+   /// # Arguments
+   ///
+   /// * `rtxn`: The read transaction.
+   /// * `word`: The word to check.
+   pub fn contains_word(&self, rtxn: &RoTxn<'_>, word: &str) -> Result<bool> {
+       Ok(self.word_docids.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some()
+           || self.exact_word_docids.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some())
+   }
 }
 
 #[derive(Debug, Deserialize, Serialize)]
```
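The `remap_data_type::<DecodeIgnore>()` calls are the usual heed idiom for a cheap existence probe: the key is looked up, but the stored roaring bitmap is never decoded. The pattern in isolation (a sketch with placeholder database types, not Meilisearch's actual schema):

```rust
use heed::types::{Bytes, DecodeIgnore, Str};
use heed::{Database, Result, RoTxn};

/// Returns whether `word` exists as a key, without decoding its value.
fn key_exists(db: Database<Str, Bytes>, rtxn: &RoTxn<'_>, word: &str) -> Result<bool> {
    Ok(db.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some())
}
```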
**Finer-grained merge progress.** A `MergingWordCache` progress enum is added (and stray spaces in the `make_atomic_progress!` invocations are cleaned up):

```diff
@@ -190,8 +190,18 @@ macro_rules! make_atomic_progress {
    };
 }
 
-make_atomic_progress!(Document alias AtomicDocumentStep => "document" );
-make_atomic_progress!(Payload alias AtomicPayloadStep => "payload" );
+make_atomic_progress!(Document alias AtomicDocumentStep => "document");
+make_atomic_progress!(Payload alias AtomicPayloadStep => "payload");
+
+make_enum_progress! {
+    pub enum MergingWordCache {
+        WordDocids,
+        WordFieldIdDocids,
+        ExactWordDocids,
+        WordPositionDocids,
+        FieldIdWordCountDocids,
+    }
+}
 
 #[derive(Debug, Serialize, Clone, ToSchema)]
 #[serde(rename_all = "camelCase")]
```
**Typo derivation refactor.** Prefix derivations now come from prefix-iterating `word_docids`/`exact_word_docids` rather than searching the words FST, and the zero-typo case is dropped from the derivation callbacks (the exact check moves to `Index::contains_word` above). First, the import changes:

```diff
@@ -1,10 +1,12 @@
 use std::borrow::Cow;
+use std::cmp::Ordering;
 use std::collections::BTreeSet;
 use std::ops::ControlFlow;
 
 use fst::automaton::Str;
-use fst::{Automaton, IntoStreamer, Streamer};
+use fst::{IntoStreamer, Streamer};
 use heed::types::DecodeIgnore;
+use itertools::{merge_join_by, EitherOrBoth};
 
 use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
 use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
```
```diff
@@ -16,16 +18,10 @@ use crate::{Result, MAX_WORD_LENGTH};
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum NumberOfTypos {
-    Zero,
    One,
    Two,
 }
 
-pub enum ZeroOrOneTypo {
-    Zero,
-    One,
-}
-
 impl Interned<QueryTerm> {
    pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
        let s = ctx.term_interner.get_mut(self);
```
```diff
@@ -47,34 +43,45 @@
 }
 
 fn find_zero_typo_prefix_derivations(
+    ctx: &mut SearchContext<'_>,
    word_interned: Interned<String>,
-    fst: fst::Set<Cow<'_, [u8]>>,
-    word_interner: &mut DedupInterner<String>,
    mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
 ) -> Result<()> {
-    let word = word_interner.get(word_interned).to_owned();
+    let word = ctx.word_interner.get(word_interned).to_owned();
    let word = word.as_str();
-    let prefix = Str::new(word).starts_with();
-    let mut stream = fst.search(prefix).into_stream();
 
-    while let Some(derived_word) = stream.next() {
-        let derived_word = std::str::from_utf8(derived_word)?.to_owned();
-        let derived_word_interned = word_interner.insert(derived_word);
-        if derived_word_interned != word_interned {
-            let cf = visit(derived_word_interned)?;
-            if cf.is_break() {
-                break;
+    let words =
+        ctx.index.word_docids.remap_data_type::<DecodeIgnore>().prefix_iter(ctx.txn, word)?;
+    let exact_words =
+        ctx.index.exact_word_docids.remap_data_type::<DecodeIgnore>().prefix_iter(ctx.txn, word)?;
+
+    for eob in merge_join_by(words, exact_words, |lhs, rhs| match (lhs, rhs) {
+        (Ok((word, _)), Ok((exact_word, _))) => word.cmp(exact_word),
+        (Err(_), _) | (_, Err(_)) => Ordering::Equal,
+    }) {
+        match eob {
+            EitherOrBoth::Both(kv, _) | EitherOrBoth::Left(kv) | EitherOrBoth::Right(kv) => {
+                let (derived_word, _) = kv?;
+                let derived_word = derived_word.to_string();
+                let derived_word_interned = ctx.word_interner.insert(derived_word);
+                if derived_word_interned != word_interned {
+                    let cf = visit(derived_word_interned)?;
+                    if cf.is_break() {
+                        break;
+                    }
+                }
            }
        }
    }
 
    Ok(())
 }
 
-fn find_zero_one_typo_derivations(
+fn find_one_typo_derivations(
    ctx: &mut SearchContext<'_>,
    word_interned: Interned<String>,
    is_prefix: bool,
-    mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
+    mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
 ) -> Result<()> {
    let fst = ctx.get_words_fst()?;
    let word = ctx.word_interner.get(word_interned).to_owned();
```
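For reference, here is the `merge_join_by` pattern from the rewritten `find_zero_typo_prefix_derivations` in isolation (a standalone sketch, not Meilisearch code): two sorted streams are merged, and collapsing `Both`/`Left`/`Right` into one binding deduplicates words present in both databases:

```rust
use itertools::{merge_join_by, EitherOrBoth};

fn main() {
    // Two sorted word streams, analogous to word_docids and exact_word_docids.
    let words = ["dog", "doge", "dogma"];
    let exact_words = ["dog", "dogged"];

    let merged: Vec<&str> = merge_join_by(words, exact_words, |l, r| l.cmp(r))
        .map(|eob| match eob {
            // "dog" appears in both streams but is yielded once, as `Both`.
            EitherOrBoth::Both(w, _) | EitherOrBoth::Left(w) | EitherOrBoth::Right(w) => w,
        })
        .collect();

    assert_eq!(merged, ["dog", "doge", "dogged", "dogma"]);
}
```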
```diff
@@ -89,16 +96,9 @@ fn find_zero_one_typo_derivations(
        let derived_word = ctx.word_interner.insert(derived_word.to_owned());
        let d = dfa.distance(state.1);
        match d.to_u8() {
-            0 => {
-                if derived_word != word_interned {
-                    let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
-                    if cf.is_break() {
-                        break;
-                    }
-                }
-            }
+            0 => (),
            1 => {
-                let cf = visit(derived_word, ZeroOrOneTypo::One)?;
+                let cf = visit(derived_word)?;
                if cf.is_break() {
                    break;
                }
```
```diff
@@ -111,7 +111,7 @@ fn find_zero_one_typo_derivations(
    Ok(())
 }
 
-fn find_zero_one_two_typo_derivations(
+fn find_one_two_typo_derivations(
    word_interned: Interned<String>,
    is_prefix: bool,
    fst: fst::Set<Cow<'_, [u8]>>,
```
```diff
@@ -144,14 +144,7 @@ fn find_zero_one_two_typo_derivations(
            // correct distance
            let d = second_dfa.distance((state.1).0);
            match d.to_u8() {
-                0 => {
-                    if derived_word_interned != word_interned {
-                        let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
-                        if cf.is_break() {
-                            break;
-                        }
-                    }
-                }
+                0 => (),
                1 => {
                    let cf = visit(derived_word_interned, NumberOfTypos::One)?;
                    if cf.is_break() {
```
```diff
@@ -194,8 +187,6 @@ pub fn partially_initialized_term_from_word(
        });
    }
 
-    let fst = ctx.index.words_fst(ctx.txn)?;
-
    let use_prefix_db = is_prefix
        && (ctx
            .index
```
```diff
@@ -215,24 +206,19 @@
    let mut zero_typo = None;
    let mut prefix_of = BTreeSet::new();
 
-    if fst.contains(word) || ctx.index.exact_word_docids.get(ctx.txn, word)?.is_some() {
+    if ctx.index.contains_word(ctx.txn, word)? {
        zero_typo = Some(word_interned);
    }
 
    if is_prefix && use_prefix_db.is_none() {
-        find_zero_typo_prefix_derivations(
-            word_interned,
-            fst,
-            &mut ctx.word_interner,
-            |derived_word| {
-                if prefix_of.len() < limits::MAX_PREFIX_COUNT {
-                    prefix_of.insert(derived_word);
-                    Ok(ControlFlow::Continue(()))
-                } else {
-                    Ok(ControlFlow::Break(()))
-                }
-            },
-        )?;
+        find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| {
+            if prefix_of.len() < limits::MAX_PREFIX_COUNT {
+                prefix_of.insert(derived_word);
+                Ok(ControlFlow::Continue(()))
+            } else {
+                Ok(ControlFlow::Break(()))
+            }
+        })?;
    }
    let synonyms = ctx.index.synonyms(ctx.txn)?;
    let mut synonym_word_count = 0;
```
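All of these helpers share the same visitor shape: the callback returns `Result<ControlFlow<()>>`, and `Break` stops the scan early once a limit such as `limits::MAX_PREFIX_COUNT` is reached. The idiom in isolation (a standalone sketch):

```rust
use std::ops::ControlFlow;

/// Visits items until the callback errors or signals `Break`.
fn visit_all<E>(
    items: &[&str],
    mut visit: impl FnMut(&str) -> Result<ControlFlow<()>, E>,
) -> Result<(), E> {
    for item in items {
        if visit(item)?.is_break() {
            break;
        }
    }
    Ok(())
}

fn main() {
    let mut collected = Vec::new();
    // Collect at most two items, mirroring the MAX_PREFIX_COUNT cap above.
    let _ = visit_all::<()>(&["a", "b", "c"], |w| {
        if collected.len() < 2 {
            collected.push(w.to_owned());
            Ok(ControlFlow::Continue(()))
        } else {
            Ok(ControlFlow::Break(()))
        }
    });
    assert_eq!(collected, ["a", "b"]);
}
```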
```diff
@@ -295,18 +281,13 @@ impl Interned<QueryTerm> {
        let mut one_typo_words = BTreeSet::new();
 
        if *max_nbr_typos > 0 {
-            find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
-                match nbr_typos {
-                    ZeroOrOneTypo::Zero => {}
-                    ZeroOrOneTypo::One => {
-                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
-                            one_typo_words.insert(derived_word);
-                        } else {
-                            return Ok(ControlFlow::Break(()));
-                        }
-                    }
+            find_one_typo_derivations(ctx, original, is_prefix, |derived_word| {
+                if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
+                    one_typo_words.insert(derived_word);
+                    Ok(ControlFlow::Continue(()))
+                } else {
+                    Ok(ControlFlow::Break(()))
                }
-                Ok(ControlFlow::Continue(()))
            })?;
        }
 
```
```diff
@@ -357,7 +338,7 @@ impl Interned<QueryTerm> {
        let mut two_typo_words = BTreeSet::new();
 
        if *max_nbr_typos > 0 {
-            find_zero_one_two_typo_derivations(
+            find_one_two_typo_derivations(
                *original,
                *is_prefix,
                ctx.index.words_fst(ctx.txn)?,
```
```diff
@@ -370,7 +351,6 @@ impl Interned<QueryTerm> {
                    return Ok(ControlFlow::Break(()));
                }
                match nbr_typos {
-                    NumberOfTypos::Zero => {}
                    NumberOfTypos::One => {
                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
                            one_typo_words.insert(derived_word);
```
**Geo-aware change detection.** `DocumentChange::Update` learns to detect `_geo` changes so the faceted/geo extractor no longer skips documents whose only change is their geo point:

```diff
@@ -1,5 +1,6 @@
 use bumpalo::Bump;
 use heed::RoTxn;
+use serde_json::Value;
 
 use super::document::{
    Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions,
```
```diff
@@ -10,7 +11,7 @@ use super::vector_document::{
 use crate::attribute_patterns::PatternMatch;
 use crate::documents::FieldIdMapper;
 use crate::vector::EmbeddingConfigs;
-use crate::{DocumentId, Index, Result};
+use crate::{DocumentId, Index, InternalError, Result};
 
 pub enum DocumentChange<'doc> {
    Deletion(Deletion<'doc>),
```
```diff
@@ -243,6 +244,29 @@ impl<'doc> Update<'doc> {
        Ok(has_deleted_fields)
    }
 
+    /// Returns `true` if the geo fields have changed.
+    pub fn has_changed_for_geo_fields<'t, Mapper: FieldIdMapper>(
+        &self,
+        rtxn: &'t RoTxn,
+        index: &'t Index,
+        mapper: &'t Mapper,
+    ) -> Result<bool> {
+        let current = self.current(rtxn, index, mapper)?;
+        let current_geo = current.geo_field()?;
+        let updated_geo = self.only_changed_fields().geo_field()?;
+        match (current_geo, updated_geo) {
+            (Some(current_geo), Some(updated_geo)) => {
+                let current: Value =
+                    serde_json::from_str(current_geo.get()).map_err(InternalError::SerdeJson)?;
+                let updated: Value =
+                    serde_json::from_str(updated_geo.get()).map_err(InternalError::SerdeJson)?;
+                Ok(current != updated)
+            }
+            (None, None) => Ok(false),
+            _ => Ok(true),
+        }
+    }
+
    pub fn only_changed_vectors(
        &self,
        doc_alloc: &'doc Bump,
```
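Parsing both sides into `serde_json::Value` before comparing makes the geo check structural rather than textual, so formatting or key-order differences in the raw JSON do not register as changes. A standalone illustration of that equality semantics:

```rust
use serde_json::{json, Value};

fn main() {
    // Same object, different key order and whitespace in the raw text.
    let current: Value = serde_json::from_str(r#"{ "lat": 5, "lng": 0 }"#).unwrap();
    let updated: Value = json!({"lng": 0, "lat": 5});
    assert_eq!(current, updated); // structurally equal, so "no geo change"

    // An actual move is detected.
    assert_ne!(current, json!({"lat": 3, "lng": 0}));
}
```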
```diff
@@ -117,7 +117,7 @@ impl FacetedDocidsExtractor {
                },
            ),
            DocumentChange::Update(inner) => {
-                if !inner.has_changed_for_fields(
+                let has_changed = inner.has_changed_for_fields(
                    &mut |field_name| {
                        match_faceted_field(
                            field_name,
```
```diff
@@ -130,7 +130,10 @@ impl FacetedDocidsExtractor {
                    rtxn,
                    index,
                    context.db_fields_ids_map,
-                )? {
+                )?;
+                let has_changed_for_geo_fields =
+                    inner.has_changed_for_geo_fields(rtxn, index, context.db_fields_ids_map)?;
+                if !has_changed && !has_changed_for_geo_fields {
                    return Ok(());
                }
 
```
**Progress for the merge phases.** The write path now reports `MergingFacetCaches`, `MergingWordCaches` (with per-cache `MergingWordCache` substeps), and `MergingWordProximity`:

```diff
@@ -13,6 +13,7 @@ use super::super::thread_local::{FullySend, ThreadLocal};
 use super::super::FacetFieldIdsDelta;
 use super::document_changes::{extract, DocumentChanges, IndexingContext};
 use crate::index::IndexEmbeddingConfig;
+use crate::progress::MergingWordCache;
 use crate::proximity::ProximityPrecision;
 use crate::update::new::extract::EmbeddingExtractor;
 use crate::update::new::merger::merge_and_send_rtree;
```
```diff
@@ -96,6 +97,7 @@ where
            {
                let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted");
                let _entered = span.enter();
+                indexing_context.progress.update_progress(IndexingStep::MergingFacetCaches);
 
                facet_field_ids_delta = merge_and_send_facet_docids(
                    caches,
```
```diff
@@ -117,7 +119,6 @@ where
        } = {
            let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
            let _entered = span.enter();
-
            WordDocidsExtractors::run_extraction(
                document_changes,
                indexing_context,
```
```diff
@@ -126,9 +127,13 @@ where
            )?
        };
 
+        indexing_context.progress.update_progress(IndexingStep::MergingWordCaches);
+
        {
            let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
            let _entered = span.enter();
+            indexing_context.progress.update_progress(MergingWordCache::WordDocids);
+
            merge_and_send_docids(
                word_docids,
                index.word_docids.remap_types(),
```
```diff
@@ -142,6 +147,8 @@ where
            let span =
                tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
            let _entered = span.enter();
+            indexing_context.progress.update_progress(MergingWordCache::WordFieldIdDocids);
+
            merge_and_send_docids(
                word_fid_docids,
                index.word_fid_docids.remap_types(),
```
```diff
@@ -155,6 +162,8 @@ where
            let span =
                tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
            let _entered = span.enter();
+            indexing_context.progress.update_progress(MergingWordCache::ExactWordDocids);
+
            merge_and_send_docids(
                exact_word_docids,
                index.exact_word_docids.remap_types(),
```
```diff
@@ -168,6 +177,8 @@ where
            let span =
                tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
            let _entered = span.enter();
+            indexing_context.progress.update_progress(MergingWordCache::WordPositionDocids);
+
            merge_and_send_docids(
                word_position_docids,
                index.word_position_docids.remap_types(),
```
```diff
@@ -181,6 +192,8 @@ where
            let span =
                tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
            let _entered = span.enter();
+            indexing_context.progress.update_progress(MergingWordCache::FieldIdWordCountDocids);
+
            merge_and_send_docids(
                fid_word_count_docids,
                index.field_id_word_count_docids.remap_types(),
```
```diff
@@ -210,6 +223,7 @@ where
        {
            let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids");
            let _entered = span.enter();
+            indexing_context.progress.update_progress(IndexingStep::MergingWordProximity);
 
            merge_and_send_docids(
                caches,
```
**Merger cleanup.** The write/delete arms now return the sender results directly:

```diff
@@ -82,14 +82,8 @@ where
    merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| {
        let current = database.get(&rtxn, key)?;
        match merge_cbo_bitmaps(current, del, add)? {
-            Operation::Write(bitmap) => {
-                docids_sender.write(key, &bitmap)?;
-                Ok(())
-            }
-            Operation::Delete => {
-                docids_sender.delete(key)?;
-                Ok(())
-            }
+            Operation::Write(bitmap) => docids_sender.write(key, &bitmap),
+            Operation::Delete => docids_sender.delete(key),
            Operation::Ignore => Ok(()),
        }
    })
```
```diff
@@ -130,7 +124,6 @@ pub fn merge_and_send_facet_docids<'extractor>(
                    Operation::Ignore => Ok(()),
                }
            })?;
-
            Ok(facet_field_ids_delta)
        })
        .reduce(
```
**`IndexingStep` rewritten with `make_enum_progress!`.** The hand-written `Step` impl is dropped and the three new merge steps are added:

```diff
@@ -1,52 +1,22 @@
-use std::borrow::Cow;
+use crate::make_enum_progress;
 
-use enum_iterator::Sequence;
-
-use crate::progress::Step;
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)]
-#[repr(u8)]
-pub enum IndexingStep {
-    PreparingPayloads,
-    ExtractingDocuments,
-    ExtractingFacets,
-    ExtractingWords,
-    ExtractingWordProximity,
-    ExtractingEmbeddings,
-    WritingGeoPoints,
-    WaitingForDatabaseWrites,
-    WaitingForExtractors,
-    WritingEmbeddingsToDatabase,
-    PostProcessingFacets,
-    PostProcessingWords,
-    Finalizing,
-}
-
-impl Step for IndexingStep {
-    fn name(&self) -> Cow<'static, str> {
-        match self {
-            IndexingStep::PreparingPayloads => "preparing update file",
-            IndexingStep::ExtractingDocuments => "extracting documents",
-            IndexingStep::ExtractingFacets => "extracting facets",
-            IndexingStep::ExtractingWords => "extracting words",
-            IndexingStep::ExtractingWordProximity => "extracting word proximity",
-            IndexingStep::ExtractingEmbeddings => "extracting embeddings",
-            IndexingStep::WritingGeoPoints => "writing geo points",
-            IndexingStep::WaitingForDatabaseWrites => "waiting for database writes",
-            IndexingStep::WaitingForExtractors => "waiting for extractors",
-            IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database",
-            IndexingStep::PostProcessingFacets => "post-processing facets",
-            IndexingStep::PostProcessingWords => "post-processing words",
-            IndexingStep::Finalizing => "finalizing",
-        }
-        .into()
-    }
-
-    fn current(&self) -> u32 {
-        *self as u32
-    }
-
-    fn total(&self) -> u32 {
-        Self::CARDINALITY as u32
+make_enum_progress! {
+    pub enum IndexingStep {
+        PreparingPayloads,
+        ExtractingDocuments,
+        ExtractingFacets,
+        ExtractingWords,
+        ExtractingWordProximity,
+        ExtractingEmbeddings,
+        MergingFacetCaches,
+        MergingWordCaches,
+        MergingWordProximity,
+        WritingGeoPoints,
+        WaitingForDatabaseWrites,
+        WaitingForExtractors,
+        WritingEmbeddingsToDatabase,
+        PostProcessingFacets,
+        PostProcessingWords,
+        Finalizing,
    }
 }
```
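The deleted manual impl shows what the macro has to provide; its two numeric methods lean on `enum_iterator::Sequence` and the `#[repr(u8)]` discriminants. In isolation (a standalone sketch, not the macro's actual expansion):

```rust
use enum_iterator::Sequence;

#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)]
#[repr(u8)]
enum Demo {
    First,
    Second,
    Third,
}

fn main() {
    // `CARDINALITY` counts the variants; `as u32` yields the step's index.
    assert_eq!(Demo::CARDINALITY, 3);
    assert_eq!(Demo::Third as u32, 2);
    // `current`/`total` in the removed impl were exactly these two quantities.
    for step in enum_iterator::all::<Demo>() {
        println!("{}/{}", step as u32 + 1, Demo::CARDINALITY);
    }
}
```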