mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-30 23:46:28 +00:00 
			
		
		
		
	fix(parse_query): cyrillic bug
This commit is contained in:
		| @@ -202,11 +202,11 @@ pub fn number_of_typos_allowed<'ctx>( | ||||
|  | ||||
|     Ok(Box::new(move |word: &str| { | ||||
|         if !authorize_typos | ||||
|             || word.len() < min_len_one_typo as usize | ||||
|             || word.chars().count() < min_len_one_typo as usize | ||||
|             || exact_words.as_ref().is_some_and(|fst| fst.contains(word)) | ||||
|         { | ||||
|             0 | ||||
|         } else if word.len() < min_len_two_typos as usize { | ||||
|         } else if word.chars().count() < min_len_two_typos as usize { | ||||
|             1 | ||||
|         } else { | ||||
|             2 | ||||
| @@ -381,3 +381,63 @@ mod tests { | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
|     #[test] | ||||
|     fn test_unicode_typo_tolerance_fixed() -> Result<()> { | ||||
|         let temp_index = temp_index_with_documents(); | ||||
|         let rtxn = temp_index.read_txn()?; | ||||
|         let ctx = SearchContext::new(&temp_index, &rtxn)?; | ||||
|          | ||||
|         let nbr_typos = number_of_typos_allowed(&ctx)?; | ||||
|          | ||||
|         // ASCII word "doggy" (5 chars, 5 bytes) | ||||
|         let ascii_word = "doggy"; | ||||
|         let ascii_typos = nbr_typos(ascii_word); | ||||
|          | ||||
|         // Cyrillic word "собак" (5 chars, 10 bytes)   | ||||
|         let cyrillic_word = "собак"; | ||||
|         let cyrillic_typos = nbr_typos(cyrillic_word); | ||||
|          | ||||
|         eprintln!("ASCII '{}': char_count={}, typos={}",  | ||||
|                   ascii_word, ascii_word.chars().count(), ascii_typos); | ||||
|         eprintln!("Cyrillic '{}': char_count={}, typos={}",  | ||||
|                   cyrillic_word, cyrillic_word.chars().count(), cyrillic_typos); | ||||
|          | ||||
|         // Both words have 5 characters, so they should have the same typo tolerance | ||||
|         assert_eq!(ascii_typos, cyrillic_typos, | ||||
|             "Words with same character count should get same typo tolerance"); | ||||
|          | ||||
|         // With default settings (oneTypo=5, twoTypos=9), 5-char words should get 1 typo | ||||
|         assert_eq!(ascii_typos, 1, "5-character word should get 1 typo tolerance"); | ||||
|         assert_eq!(cyrillic_typos, 1, "5-character word should get 1 typo tolerance"); | ||||
|          | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     #[test]  | ||||
|     fn test_various_unicode_scripts() -> Result<()> { | ||||
|         let temp_index = temp_index_with_documents(); | ||||
|         let rtxn = temp_index.read_txn()?; | ||||
|         let ctx = SearchContext::new(&temp_index, &rtxn)?; | ||||
|          | ||||
|         let nbr_typos = number_of_typos_allowed(&ctx)?; | ||||
|          | ||||
|         // Let's use 5-character words for consistent testing | ||||
|         let five_char_words = vec![ | ||||
|             ("doggy", "ASCII"),           // 5 chars, 5 bytes | ||||
|             ("café!", "Accented"),        // 5 chars, 7 bytes | ||||
|             ("собак", "Cyrillic"),        // 5 chars, 10 bytes  | ||||
|         ]; | ||||
|          | ||||
|         let expected_typos = 1; // With default settings, 5-char words get 1 typo | ||||
|          | ||||
|         for (word, script) in five_char_words { | ||||
|             let typos = nbr_typos(word); | ||||
|             eprintln!("{} '{}': chars={}, bytes={}, typos={}",  | ||||
|                       script, word, word.chars().count(), word.chars().count(), typos); | ||||
|             assert_eq!(typos, expected_typos, | ||||
|                 "{} word '{}' should get {} typo(s)", script, word, expected_typos); | ||||
|         } | ||||
|          | ||||
|         Ok(()) | ||||
|     } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user