Merge pull request #5617 from workbackai/workback/patch/5594/FB6ED899-E821-4C88-AA79-8BB975E1937A

fix(milli/search): Cyrillic has different typo tolerance due to byte counting bug
2025-08-02 03:40:00 +00:00 · 2025-06-12 07:39:19 +00:00
parent 170ad87e44 666680bd87
commit aefebdeb8b
2 changed files with 66 additions and 14 deletions
--- a/crates/meilisearch/tests/search/locales.rs
+++ b/crates/meilisearch/tests/search/locales.rs
@ -147,23 +147,20 @@ async fn simple_search() {
        .search(
            json!({"q": "進撃", "locales": ["jpn"], "attributesToRetrieve": ["id"]}),
            |response, code| {
-                snapshot!(response, @r###"
+                snapshot!(response, @r#"
                {
                  "hits": [
                    {
                      "id": 852
                    },
                    {
                      "id": 853
                    }
                  ],
                  "query": "進撃",
                  "processingTimeMs": "[duration]",
                  "limit": 20,
                  "offset": 0,
-                  "estimatedTotalHits": 2
+                  "estimatedTotalHits": 1
                }
-                "###);
+                "#);
                snapshot!(code, @"200 OK");
            },
        )
@ -172,23 +169,20 @@ async fn simple_search() {
    // chinese
    index
        .search(json!({"q": "进击", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(response, @r###"
+            snapshot!(response, @r#"
            {
              "hits": [
                {
                  "id": 853
                },
                {
                  "id": 852
                }
              ],
              "query": "进击",
              "processingTimeMs": "[duration]",
              "limit": 20,
              "offset": 0,
-              "estimatedTotalHits": 2
+              "estimatedTotalHits": 1
            }
-            "###);
+            "#);
            snapshot!(code, @"200 OK");
        })
        .await;
--- a/crates/milli/src/search/new/query_term/parse_query.rs
+++ b/crates/milli/src/search/new/query_term/parse_query.rs
@ -202,11 +202,11 @@ pub fn number_of_typos_allowed<'ctx>(
    Ok(Box::new(move |word: &str| {
        if !authorize_typos
-            || word.len() < min_len_one_typo as usize
+            || word.chars().count() < min_len_one_typo as usize
            || exact_words.as_ref().is_some_and(|fst| fst.contains(word))
        {
            0
-        } else if word.len() < min_len_two_typos as usize {
+        } else if word.chars().count() < min_len_two_typos as usize {
            1
        } else {
            2
@ -380,4 +380,62 @@ mod tests {
        Ok(())
    }
    /// Regression test: the typo budget must be derived from the number of
    /// characters in a word, not from its UTF-8 byte length.
    #[test]
    fn test_unicode_typo_tolerance_fixed() -> Result<()> {
        let index = temp_index_with_documents();
        let txn = index.read_txn()?;
        let search_ctx = SearchContext::new(&index, &txn)?;
        let allowed_typos = number_of_typos_allowed(&search_ctx)?;

        // "doggy" is 5 chars / 5 bytes, "собак" is 5 chars / 10 bytes.
        let latin_budget = allowed_typos("doggy");
        let cyrillic_budget = allowed_typos("собак");

        // Same character count => same budget, regardless of encoding width.
        assert_eq!(
            latin_budget, cyrillic_budget,
            "Words with same character count should get same typo tolerance"
        );

        // The assertions below assume the default thresholds (oneTypo=5, twoTypos=9),
        // under which a 5-character word is allowed exactly one typo.
        assert_eq!(latin_budget, 1, "5-character word should get 1 typo tolerance");
        assert_eq!(cyrillic_budget, 1, "5-character word should get 1 typo tolerance");

        Ok(())
    }
    /// Checks that 5-character words from several scripts all receive the same
    /// typo budget even though their UTF-8 byte lengths differ.
    #[test]
    fn test_various_unicode_scripts() -> Result<()> {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn()?;
        let ctx = SearchContext::new(&temp_index, &rtxn)?;
        let nbr_typos = number_of_typos_allowed(&ctx)?;

        // Every fixture has exactly 5 characters; only the byte length varies.
        // A fixed-size array is enough here: no heap allocation is needed for
        // a literal list that is iterated once.
        let five_char_words = [
            ("doggy", "ASCII"),    // 5 chars, 5 bytes
            ("café!", "Accented"), // 5 chars, 6 bytes ('é' is 2 bytes in UTF-8)
            ("собак", "Cyrillic"), // 5 chars, 10 bytes
        ];

        // Assumes the default thresholds (oneTypo=5, twoTypos=9): 5-char words get 1 typo.
        let expected_typos = 1;

        for (word, script) in five_char_words {
            let typos = nbr_typos(word);
            assert_eq!(
                typos, expected_typos,
                "{} word '{}' should get {} typo(s)",
                script, word, expected_typos
            );
        }

        Ok(())
    }
 }