fix(parse_query): cyrillic bug

This commit is contained in:
arthurgousset 2025-05-29 23:24:01 +01:00
parent 72736c0ea9
commit ef9fc6c854
No known key found for this signature in database

View File

@ -202,11 +202,11 @@ pub fn number_of_typos_allowed<'ctx>(
Ok(Box::new(move |word: &str| {
if !authorize_typos
|| word.len() < min_len_one_typo as usize
|| word.chars().count() < min_len_one_typo as usize
|| exact_words.as_ref().is_some_and(|fst| fst.contains(word))
{
0
} else if word.len() < min_len_two_typos as usize {
} else if word.chars().count() < min_len_two_typos as usize {
1
} else {
2
@ -381,3 +381,63 @@ mod tests {
Ok(())
}
}
/// Regression test: an ASCII word and a Cyrillic word with the same number of
/// characters must be granted the same number of typos, even though their
/// UTF-8 byte lengths differ.
#[test]
fn test_unicode_typo_tolerance_fixed() -> Result<()> {
    let index = temp_index_with_documents();
    let txn = index.read_txn()?;
    let search_ctx = SearchContext::new(&index, &txn)?;
    let allowed_typos = number_of_typos_allowed(&search_ctx)?;

    // Both words are 5 characters long; "doggy" is 5 bytes, "собак" is 10 bytes.
    let (latin, cyrillic) = ("doggy", "собак");
    let latin_typos = allowed_typos(latin);
    let cyrillic_typos = allowed_typos(cyrillic);

    // Diagnostic output, visible when the test fails or runs with `--nocapture`.
    eprintln!("ASCII '{}': char_count={}, typos={}", latin, latin.chars().count(), latin_typos);
    eprintln!(
        "Cyrillic '{}': char_count={}, typos={}",
        cyrillic,
        cyrillic.chars().count(),
        cyrillic_typos
    );

    // Equal character counts must yield equal tolerance.
    assert_eq!(
        latin_typos, cyrillic_typos,
        "Words with same character count should get same typo tolerance"
    );

    // With default settings (oneTypo=5, twoTypos=9), a 5-character word is allowed 1 typo.
    assert_eq!(latin_typos, 1, "5-character word should get 1 typo tolerance");
    assert_eq!(cyrillic_typos, 1, "5-character word should get 1 typo tolerance");

    Ok(())
}
/// Checks that words from several scripts, all short enough to fall in the
/// one-typo bucket by character count, are each granted exactly one typo
/// regardless of how many bytes their UTF-8 encoding occupies.
#[test]
fn test_various_unicode_scripts() -> Result<()> {
    let temp_index = temp_index_with_documents();
    let rtxn = temp_index.read_txn()?;
    let ctx = SearchContext::new(&temp_index, &rtxn)?;
    let nbr_typos = number_of_typos_allowed(&ctx)?;

    // (word, script label). Character and byte counts are printed below rather
    // than hard-coded here, so the comments cannot drift from the literals.
    let five_char_words = [
        ("doggy", "ASCII"),    // single-byte characters only
        ("café!", "Accented"), // contains a multi-byte accented letter
        ("собак", "Cyrillic"), // every character is multi-byte
    ];

    // With default settings, words of this length get 1 typo.
    let expected_typos = 1;

    for (word, script) in five_char_words {
        let typos = nbr_typos(word);
        // `chars().count()` is the character count; `len()` is the UTF-8 byte length.
        eprintln!(
            "{} '{}': chars={}, bytes={}, typos={}",
            script,
            word,
            word.chars().count(),
            word.len(),
            typos
        );
        assert_eq!(
            typos, expected_typos,
            "{} word '{}' should get {} typo(s)",
            script, word, expected_typos
        );
    }

    Ok(())
}