From ef9fc6c85433706d4d961cd42e664262be36946c Mon Sep 17 00:00:00 2001 From: arthurgousset <46296830+arthurgousset@users.noreply.github.com> Date: Thu, 29 May 2025 23:24:01 +0100 Subject: [PATCH 1/5] fix(parse_query): cyrillic bug --- .../src/search/new/query_term/parse_query.rs | 64 ++++++++++++++++++- 1 file changed, 62 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/search/new/query_term/parse_query.rs b/crates/milli/src/search/new/query_term/parse_query.rs index e492363f8..58ddff206 100644 --- a/crates/milli/src/search/new/query_term/parse_query.rs +++ b/crates/milli/src/search/new/query_term/parse_query.rs @@ -202,11 +202,11 @@ pub fn number_of_typos_allowed<'ctx>( Ok(Box::new(move |word: &str| { if !authorize_typos - || word.len() < min_len_one_typo as usize + || word.chars().count() < min_len_one_typo as usize || exact_words.as_ref().is_some_and(|fst| fst.contains(word)) { 0 - } else if word.len() < min_len_two_typos as usize { + } else if word.chars().count() < min_len_two_typos as usize { 1 } else { 2 @@ -381,3 +381,63 @@ mod tests { Ok(()) } } + + #[test] + fn test_unicode_typo_tolerance_fixed() -> Result<()> { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn()?; + let ctx = SearchContext::new(&temp_index, &rtxn)?; + + let nbr_typos = number_of_typos_allowed(&ctx)?; + + // ASCII word "doggy" (5 chars, 5 bytes) + let ascii_word = "doggy"; + let ascii_typos = nbr_typos(ascii_word); + + // Cyrillic word "собак" (5 chars, 10 bytes) + let cyrillic_word = "собак"; + let cyrillic_typos = nbr_typos(cyrillic_word); + + eprintln!("ASCII '{}': char_count={}, typos={}", + ascii_word, ascii_word.chars().count(), ascii_typos); + eprintln!("Cyrillic '{}': char_count={}, typos={}", + cyrillic_word, cyrillic_word.chars().count(), cyrillic_typos); + + // Both words have 5 characters, so they should have the same typo tolerance + assert_eq!(ascii_typos, cyrillic_typos, + "Words with same character count should get same typo 
tolerance"); + + // With default settings (oneTypo=5, twoTypos=9), 5-char words should get 1 typo + assert_eq!(ascii_typos, 1, "5-character word should get 1 typo tolerance"); + assert_eq!(cyrillic_typos, 1, "5-character word should get 1 typo tolerance"); + + Ok(()) + } + + #[test] + fn test_various_unicode_scripts() -> Result<()> { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn()?; + let ctx = SearchContext::new(&temp_index, &rtxn)?; + + let nbr_typos = number_of_typos_allowed(&ctx)?; + + // Let's use 5-character words for consistent testing + let five_char_words = vec![ + ("doggy", "ASCII"), // 5 chars, 5 bytes + ("café!", "Accented"), // 5 chars, 7 bytes + ("собак", "Cyrillic"), // 5 chars, 10 bytes + ]; + + let expected_typos = 1; // With default settings, 5-char words get 1 typo + + for (word, script) in five_char_words { + let typos = nbr_typos(word); + eprintln!("{} '{}': chars={}, bytes={}, typos={}", + script, word, word.chars().count(), word.chars().count(), typos); + assert_eq!(typos, expected_typos, + "{} word '{}' should get {} typo(s)", script, word, expected_typos); + } + + Ok(()) + } From ab3d92d16392166f96f570874980f4b41f5cc0b3 Mon Sep 17 00:00:00 2001 From: arthurgousset <46296830+arthurgousset@users.noreply.github.com> Date: Thu, 29 May 2025 23:33:32 +0100 Subject: [PATCH 2/5] chore(parse_query): delete println and move test inside tests module --- crates/milli/src/search/new/query_term/parse_query.rs | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/crates/milli/src/search/new/query_term/parse_query.rs b/crates/milli/src/search/new/query_term/parse_query.rs index 58ddff206..8a23a0e61 100644 --- a/crates/milli/src/search/new/query_term/parse_query.rs +++ b/crates/milli/src/search/new/query_term/parse_query.rs @@ -380,7 +380,6 @@ mod tests { Ok(()) } -} #[test] fn test_unicode_typo_tolerance_fixed() -> Result<()> { @@ -398,11 +397,6 @@ mod tests { let cyrillic_word = "собак"; let 
cyrillic_typos = nbr_typos(cyrillic_word); - eprintln!("ASCII '{}': char_count={}, typos={}", - ascii_word, ascii_word.chars().count(), ascii_typos); - eprintln!("Cyrillic '{}': char_count={}, typos={}", - cyrillic_word, cyrillic_word.chars().count(), cyrillic_typos); - // Both words have 5 characters, so they should have the same typo tolerance assert_eq!(ascii_typos, cyrillic_typos, "Words with same character count should get same typo tolerance"); @@ -433,11 +427,10 @@ mod tests { for (word, script) in five_char_words { let typos = nbr_typos(word); - eprintln!("{} '{}': chars={}, bytes={}, typos={}", - script, word, word.chars().count(), word.chars().count(), typos); assert_eq!(typos, expected_typos, "{} word '{}' should get {} typo(s)", script, word, expected_typos); } Ok(()) } +} From 263300b3a30c7c5eb56c422d08e9157ed5ac5a64 Mon Sep 17 00:00:00 2001 From: arthurgousset <46296830+arthurgousset@users.noreply.github.com> Date: Wed, 4 Jun 2025 10:56:02 +0100 Subject: [PATCH 3/5] style(milli): linting --- .../src/search/new/query_term/parse_query.rs | 45 ++++++++++--------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/crates/milli/src/search/new/query_term/parse_query.rs b/crates/milli/src/search/new/query_term/parse_query.rs index 8a23a0e61..64bbb94c0 100644 --- a/crates/milli/src/search/new/query_term/parse_query.rs +++ b/crates/milli/src/search/new/query_term/parse_query.rs @@ -386,51 +386,56 @@ mod tests { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn()?; let ctx = SearchContext::new(&temp_index, &rtxn)?; - + let nbr_typos = number_of_typos_allowed(&ctx)?; - + // ASCII word "doggy" (5 chars, 5 bytes) let ascii_word = "doggy"; let ascii_typos = nbr_typos(ascii_word); - - // Cyrillic word "собак" (5 chars, 10 bytes) + + // Cyrillic word "собак" (5 chars, 10 bytes) let cyrillic_word = "собак"; let cyrillic_typos = nbr_typos(cyrillic_word); - + // Both words have 5 characters, so they should have the same typo 
tolerance - assert_eq!(ascii_typos, cyrillic_typos, - "Words with same character count should get same typo tolerance"); - + assert_eq!( + ascii_typos, cyrillic_typos, + "Words with same character count should get same typo tolerance" + ); + // With default settings (oneTypo=5, twoTypos=9), 5-char words should get 1 typo assert_eq!(ascii_typos, 1, "5-character word should get 1 typo tolerance"); assert_eq!(cyrillic_typos, 1, "5-character word should get 1 typo tolerance"); - + Ok(()) } - #[test] + #[test] fn test_various_unicode_scripts() -> Result<()> { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn()?; let ctx = SearchContext::new(&temp_index, &rtxn)?; - + let nbr_typos = number_of_typos_allowed(&ctx)?; - + // Let's use 5-character words for consistent testing let five_char_words = vec![ - ("doggy", "ASCII"), // 5 chars, 5 bytes - ("café!", "Accented"), // 5 chars, 7 bytes - ("собак", "Cyrillic"), // 5 chars, 10 bytes + ("doggy", "ASCII"), // 5 chars, 5 bytes + ("café!", "Accented"), // 5 chars, 6 bytes + ("собак", "Cyrillic"), // 5 chars, 10 bytes ]; - + let expected_typos = 1; // With default settings, 5-char words get 1 typo - + for (word, script) in five_char_words { let typos = nbr_typos(word); - assert_eq!(typos, expected_typos, - "{} word '{}' should get {} typo(s)", script, word, expected_typos); + assert_eq!( + typos, expected_typos, + "{} word '{}' should get {} typo(s)", + script, word, expected_typos + ); } - + Ok(()) } } From 27527849bb58ec8323e3394a138f4d515cd2d599 Mon Sep 17 00:00:00 2001 From: arthurgousset <46296830+arthurgousset@users.noreply.github.com> Date: Wed, 4 Jun 2025 14:17:10 +0100 Subject: [PATCH 4/5] test(meilisearch/search/locales.rs): updates snapshot Used `cargo insta test` Reviewed with `cargo insta review` --- crates/meilisearch/tests/search/locales.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/crates/meilisearch/tests/search/locales.rs 
b/crates/meilisearch/tests/search/locales.rs index b1c9b2bc2..8da7f585b 100644 --- a/crates/meilisearch/tests/search/locales.rs +++ b/crates/meilisearch/tests/search/locales.rs @@ -147,23 +147,20 @@ async fn simple_search() { .search( json!({"q": "進撃", "locales": ["jpn"], "attributesToRetrieve": ["id"]}), |response, code| { - snapshot!(response, @r###" + snapshot!(response, @r#" { "hits": [ { "id": 852 - }, - { - "id": 853 } ], "query": "進撃", "processingTimeMs": "[duration]", "limit": 20, "offset": 0, - "estimatedTotalHits": 2 + "estimatedTotalHits": 1 } - "###); + "#); snapshot!(code, @"200 OK"); }, ) From 666680bd87edace5dc6acc0c726ebd2fd9edf946 Mon Sep 17 00:00:00 2001 From: arthurgousset <46296830+arthurgousset@users.noreply.github.com> Date: Wed, 4 Jun 2025 14:18:20 +0100 Subject: [PATCH 5/5] test(meilisearch/search/locales.rs): updates snapshot Used `cargo insta test` Reviewed with `cargo insta review` --- crates/meilisearch/tests/search/locales.rs | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/crates/meilisearch/tests/search/locales.rs b/crates/meilisearch/tests/search/locales.rs index 8da7f585b..f45554d41 100644 --- a/crates/meilisearch/tests/search/locales.rs +++ b/crates/meilisearch/tests/search/locales.rs @@ -169,23 +169,20 @@ async fn simple_search() { // chinese index .search(json!({"q": "进击", "attributesToRetrieve": ["id"]}), |response, code| { - snapshot!(response, @r###" + snapshot!(response, @r#" { "hits": [ { "id": 853 - }, - { - "id": 852 } ], "query": "进击", "processingTimeMs": "[duration]", "limit": 20, "offset": 0, - "estimatedTotalHits": 2 + "estimatedTotalHits": 1 } - "###); + "#); snapshot!(code, @"200 OK"); }) .await;