mirror of https://github.com/meilisearch/meilisearch.git
synced 2025-06-16 00:41:00 +00:00
Merge pull request #5617 from workbackai/workback/patch/5594/FB6ED899-E821-4C88-AA79-8BB975E1937A
fix(milli/search): Cyrillic has different typo tolerance due to byte counting bug
This commit is contained in: commit aefebdeb8b
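For context on the bug (not part of the commit): Rust's str::len() returns the length in UTF-8 bytes, while chars().count() returns the number of Unicode scalar values, so non-ASCII words look far longer to a byte-based length check. A minimal standalone sketch of the discrepancy the fix addresses:

// Standalone illustration, not code from this repository.
fn main() {
    // ASCII: one byte per character, so both measures agree.
    assert_eq!("doggy".len(), 5);
    assert_eq!("doggy".chars().count(), 5);

    // Cyrillic: two bytes per letter in UTF-8, so len() doubles the apparent length.
    assert_eq!("собак".len(), 10);
    assert_eq!("собак".chars().count(), 5);

    // CJK: three bytes per character in UTF-8.
    assert_eq!("進撃".len(), 6);
    assert_eq!("進撃".chars().count(), 2);
}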
@@ -147,23 +147,20 @@ async fn simple_search() {
         .search(
             json!({"q": "進撃", "locales": ["jpn"], "attributesToRetrieve": ["id"]}),
             |response, code| {
-                snapshot!(response, @r###"
+                snapshot!(response, @r#"
                 {
                   "hits": [
                     {
                       "id": 852
                     },
                     {
                       "id": 853
                     }
                   ],
                   "query": "進撃",
                   "processingTimeMs": "[duration]",
                   "limit": 20,
                   "offset": 0,
-                  "estimatedTotalHits": 2
+                  "estimatedTotalHits": 1
                 }
-                "###);
+                "#);
                 snapshot!(code, @"200 OK");
             },
         )
@@ -172,23 +169,20 @@ async fn simple_search() {
     // chinese
     index
         .search(json!({"q": "进击", "attributesToRetrieve": ["id"]}), |response, code| {
-            snapshot!(response, @r###"
+            snapshot!(response, @r#"
             {
               "hits": [
                 {
                   "id": 853
                 },
                 {
                   "id": 852
                 }
               ],
               "query": "进击",
               "processingTimeMs": "[duration]",
               "limit": 20,
               "offset": 0,
-              "estimatedTotalHits": 2
+              "estimatedTotalHits": 1
             }
-            "###);
+            "#);
             snapshot!(code, @"200 OK");
         })
         .await;
@@ -202,11 +202,11 @@ pub fn number_of_typos_allowed<'ctx>(
 
     Ok(Box::new(move |word: &str| {
         if !authorize_typos
-            || word.len() < min_len_one_typo as usize
+            || word.chars().count() < min_len_one_typo as usize
            || exact_words.as_ref().is_some_and(|fst| fst.contains(word))
         {
             0
-        } else if word.len() < min_len_two_typos as usize {
+        } else if word.chars().count() < min_len_two_typos as usize {
             1
         } else {
             2
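To see why the search snapshots above change, here is a standalone sketch (not part of the diff). It assumes the default thresholds oneTypo = 5 and twoTypos = 9 mentioned in the new tests below, and it uses a simplified helper in place of the real closure, which additionally checks authorize_typos and the exact_words list: byte counting granted the two-character query "進撃" (6 bytes) one typo and the five-character "собак" (10 bytes) two, while character counting grants 0 and 1 respectively.

// Hypothetical simplified helper mirroring only the threshold logic of the closure above.
fn typos_allowed(length: usize, min_len_one_typo: usize, min_len_two_typos: usize) -> u8 {
    if length < min_len_one_typo {
        0
    } else if length < min_len_two_typos {
        1
    } else {
        2
    }
}

fn main() {
    let (one, two) = (5, 9); // assumed default oneTypo / twoTypos settings

    // Before the fix: length measured in bytes.
    assert_eq!(typos_allowed("進撃".len(), one, two), 1); // 6 bytes  -> 1 typo
    assert_eq!(typos_allowed("собак".len(), one, two), 2); // 10 bytes -> 2 typos

    // After the fix: length measured in characters.
    assert_eq!(typos_allowed("進撃".chars().count(), one, two), 0); // 2 chars -> no typos
    assert_eq!(typos_allowed("собак".chars().count(), one, two), 1); // 5 chars -> 1 typo
}

The stricter tolerance for the two-character CJK queries is consistent with estimatedTotalHits dropping from 2 to 1 in the test snapshots above.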
@@ -380,4 +380,62 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_unicode_typo_tolerance_fixed() -> Result<()> {
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn()?;
+        let ctx = SearchContext::new(&temp_index, &rtxn)?;
+
+        let nbr_typos = number_of_typos_allowed(&ctx)?;
+
+        // ASCII word "doggy" (5 chars, 5 bytes)
+        let ascii_word = "doggy";
+        let ascii_typos = nbr_typos(ascii_word);
+
+        // Cyrillic word "собак" (5 chars, 10 bytes)
+        let cyrillic_word = "собак";
+        let cyrillic_typos = nbr_typos(cyrillic_word);
+
+        // Both words have 5 characters, so they should have the same typo tolerance
+        assert_eq!(
+            ascii_typos, cyrillic_typos,
+            "Words with same character count should get same typo tolerance"
+        );
+
+        // With default settings (oneTypo=5, twoTypos=9), 5-char words should get 1 typo
+        assert_eq!(ascii_typos, 1, "5-character word should get 1 typo tolerance");
+        assert_eq!(cyrillic_typos, 1, "5-character word should get 1 typo tolerance");
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_various_unicode_scripts() -> Result<()> {
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn()?;
+        let ctx = SearchContext::new(&temp_index, &rtxn)?;
+
+        let nbr_typos = number_of_typos_allowed(&ctx)?;
+
+        // Let's use 5-character words for consistent testing
+        let five_char_words = vec![
+            ("doggy", "ASCII"),    // 5 chars, 5 bytes
+            ("café!", "Accented"), // 5 chars, 7 bytes
+            ("собак", "Cyrillic"), // 5 chars, 10 bytes
+        ];
+
+        let expected_typos = 1; // With default settings, 5-char words get 1 typo
+
+        for (word, script) in five_char_words {
+            let typos = nbr_typos(word);
+            assert_eq!(
+                typos, expected_typos,
+                "{} word '{}' should get {} typo(s)",
+                script, word, expected_typos
+            );
+        }
+
+        Ok(())
+    }
 }