Merge pull request #5617 from workbackai/workback/patch/5594/FB6ED899-E821-4C88-AA79-8BB975E1937A

fix(milli/search): Cyrillic has different typo tolerance due to byte counting bug
This commit is contained in:
Many the fish 2025-06-12 07:39:19 +00:00 committed by GitHub
commit aefebdeb8b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 66 additions and 14 deletions

View File

@ -147,23 +147,20 @@ async fn simple_search() {
.search( .search(
json!({"q": "進撃", "locales": ["jpn"], "attributesToRetrieve": ["id"]}), json!({"q": "進撃", "locales": ["jpn"], "attributesToRetrieve": ["id"]}),
|response, code| { |response, code| {
snapshot!(response, @r###" snapshot!(response, @r#"
{ {
"hits": [ "hits": [
{ {
"id": 852 "id": 852
},
{
"id": 853
} }
], ],
"query": "進撃", "query": "進撃",
"processingTimeMs": "[duration]", "processingTimeMs": "[duration]",
"limit": 20, "limit": 20,
"offset": 0, "offset": 0,
"estimatedTotalHits": 2 "estimatedTotalHits": 1
} }
"###); "#);
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
}, },
) )
@ -172,23 +169,20 @@ async fn simple_search() {
// chinese // chinese
index index
.search(json!({"q": "进击", "attributesToRetrieve": ["id"]}), |response, code| { .search(json!({"q": "进击", "attributesToRetrieve": ["id"]}), |response, code| {
snapshot!(response, @r###" snapshot!(response, @r#"
{ {
"hits": [ "hits": [
{ {
"id": 853 "id": 853
},
{
"id": 852
} }
], ],
"query": "进击", "query": "进击",
"processingTimeMs": "[duration]", "processingTimeMs": "[duration]",
"limit": 20, "limit": 20,
"offset": 0, "offset": 0,
"estimatedTotalHits": 2 "estimatedTotalHits": 1
} }
"###); "#);
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
}) })
.await; .await;

View File

@ -202,11 +202,11 @@ pub fn number_of_typos_allowed<'ctx>(
Ok(Box::new(move |word: &str| { Ok(Box::new(move |word: &str| {
if !authorize_typos if !authorize_typos
|| word.len() < min_len_one_typo as usize || word.chars().count() < min_len_one_typo as usize
|| exact_words.as_ref().is_some_and(|fst| fst.contains(word)) || exact_words.as_ref().is_some_and(|fst| fst.contains(word))
{ {
0 0
} else if word.len() < min_len_two_typos as usize { } else if word.chars().count() < min_len_two_typos as usize {
1 1
} else { } else {
2 2
@ -380,4 +380,62 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn test_unicode_typo_tolerance_fixed() -> Result<()> {
    // Regression test: the typo budget must follow the number of characters
    // in a word, not the number of UTF-8 bytes it occupies.
    let index = temp_index_with_documents();
    let txn = index.read_txn()?;
    let search_ctx = SearchContext::new(&index, &txn)?;
    let allowed_typos = number_of_typos_allowed(&search_ctx)?;

    // "doggy" is 5 chars / 5 bytes; "собак" is 5 chars / 10 bytes.
    let typos_for_ascii = allowed_typos("doggy");
    let typos_for_cyrillic = allowed_typos("собак");

    // Same character count => same budget, whatever the byte length is.
    assert_eq!(
        typos_for_ascii, typos_for_cyrillic,
        "Words with same character count should get same typo tolerance"
    );

    // The test expects the default thresholds (oneTypo=5, twoTypos=9),
    // under which a 5-character word is allowed exactly one typo.
    assert_eq!(typos_for_ascii, 1, "5-character word should get 1 typo tolerance");
    assert_eq!(typos_for_cyrillic, 1, "5-character word should get 1 typo tolerance");

    Ok(())
}
#[test]
fn test_various_unicode_scripts() -> Result<()> {
    // Checks that words with the same character count but different UTF-8
    // byte lengths all receive the same typo budget.
    let temp_index = temp_index_with_documents();
    let rtxn = temp_index.read_txn()?;
    let ctx = SearchContext::new(&temp_index, &rtxn)?;
    let nbr_typos = number_of_typos_allowed(&ctx)?;

    // Every entry is exactly 5 characters long; only the byte length varies.
    // The accented word spells its "é" as the precomposed U+00E9 escape so the
    // character count cannot change if the file is re-normalized by an editor.
    let five_char_words = [
        ("doggy", "ASCII"),         // 5 chars, 5 bytes
        ("caf\u{e9}!", "Accented"), // 5 chars, 6 bytes
        ("собак", "Cyrillic"),      // 5 chars, 10 bytes
    ];

    // The test expects the default thresholds (oneTypo=5, twoTypos=9),
    // under which a 5-character word is allowed exactly one typo.
    let expected_typos = 1;

    for (word, script) in five_char_words {
        // Guard the premise of the table itself before checking the budget.
        assert_eq!(
            word.chars().count(),
            5,
            "{} word '{}' is expected to be 5 characters long",
            script,
            word
        );

        let typos = nbr_typos(word);
        assert_eq!(
            typos, expected_typos,
            "{} word '{}' should get {} typo(s)",
            script, word, expected_typos
        );
    }

    Ok(())
}
} }