From ef9fc6c85433706d4d961cd42e664262be36946c Mon Sep 17 00:00:00 2001
From: arthurgousset <46296830+arthurgousset@users.noreply.github.com>
Date: Thu, 29 May 2025 23:24:01 +0100
Subject: [PATCH] fix(parse_query): cyrillic bug

---
 .../src/search/new/query_term/parse_query.rs  | 64 ++++++++++++++++++-
 1 file changed, 62 insertions(+), 2 deletions(-)

diff --git a/crates/milli/src/search/new/query_term/parse_query.rs b/crates/milli/src/search/new/query_term/parse_query.rs
index e492363f8..58ddff206 100644
--- a/crates/milli/src/search/new/query_term/parse_query.rs
+++ b/crates/milli/src/search/new/query_term/parse_query.rs
@@ -202,11 +202,11 @@ pub fn number_of_typos_allowed<'ctx>(
 
     Ok(Box::new(move |word: &str| {
         if !authorize_typos
-            || word.len() < min_len_one_typo as usize
+            || word.chars().count() < min_len_one_typo as usize
             || exact_words.as_ref().is_some_and(|fst| fst.contains(word))
         {
             0
-        } else if word.len() < min_len_two_typos as usize {
+        } else if word.chars().count() < min_len_two_typos as usize {
             1
         } else {
             2
@@ -381,3 +381,63 @@ mod tests {
         Ok(())
     }
 }
+
+    #[test]
+    fn test_unicode_typo_tolerance_fixed() -> Result<()> {
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn()?;
+        let ctx = SearchContext::new(&temp_index, &rtxn)?;
+        
+        let nbr_typos = number_of_typos_allowed(&ctx)?;
+        
+        // ASCII word "doggy" (5 chars, 5 bytes)
+        let ascii_word = "doggy";
+        let ascii_typos = nbr_typos(ascii_word);
+        
+        // Cyrillic word "собак" (5 chars, 10 bytes)  
+        let cyrillic_word = "собак";
+        let cyrillic_typos = nbr_typos(cyrillic_word);
+        
+        eprintln!("ASCII '{}': char_count={}, typos={}", 
+                  ascii_word, ascii_word.chars().count(), ascii_typos);
+        eprintln!("Cyrillic '{}': char_count={}, typos={}", 
+                  cyrillic_word, cyrillic_word.chars().count(), cyrillic_typos);
+        
+        // Both words have 5 characters, so they should have the same typo tolerance
+        assert_eq!(ascii_typos, cyrillic_typos,
+            "Words with same character count should get same typo tolerance");
+        
+        // With default settings (oneTypo=5, twoTypos=9), 5-char words should get 1 typo
+        assert_eq!(ascii_typos, 1, "5-character word should get 1 typo tolerance");
+        assert_eq!(cyrillic_typos, 1, "5-character word should get 1 typo tolerance");
+        
+        Ok(())
+    }
+
+    #[test] 
+    fn test_various_unicode_scripts() -> Result<()> {
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn()?;
+        let ctx = SearchContext::new(&temp_index, &rtxn)?;
+        
+        let nbr_typos = number_of_typos_allowed(&ctx)?;
+        
+        // Let's use 5-character words for consistent testing
+        let five_char_words = vec![
+            ("doggy", "ASCII"),           // 5 chars, 5 bytes
+            ("café!", "Accented"),        // 5 chars, 7 bytes
+            ("собак", "Cyrillic"),        // 5 chars, 10 bytes 
+        ];
+        
+        let expected_typos = 1; // With default settings, 5-char words get 1 typo
+        
+        for (word, script) in five_char_words {
+            let typos = nbr_typos(word);
+            eprintln!("{} '{}': chars={}, bytes={}, typos={}", 
+                      script, word, word.chars().count(), word.chars().count(), typos);
+            assert_eq!(typos, expected_typos,
+                "{} word '{}' should get {} typo(s)", script, word, expected_typos);
+        }
+        
+        Ok(())
+    }