mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 07:56:28 +00:00 
			
		
		
		
	Compute split words derivations of terms that don't accept typos
This commit is contained in:
		| @@ -28,11 +28,9 @@ pub enum ZeroOrOneTypo { | ||||
| impl Interned<QueryTerm> { | ||||
|     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { | ||||
|         let s = ctx.term_interner.get_mut(self); | ||||
|         if s.max_nbr_typos == 0 { | ||||
|             s.one_typo = Lazy::Init(OneTypoTerm::default()); | ||||
|             s.two_typo = Lazy::Init(TwoTypoTerm::default()); | ||||
|         } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() { | ||||
|         if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() { | ||||
|             assert!(s.two_typo.is_uninit()); | ||||
|             // Initialize one_typo subterm even if max_nbr_typos is 0 because of split words | ||||
|             self.initialize_one_typo_subterm(ctx)?; | ||||
|             let s = ctx.term_interner.get_mut(self); | ||||
|             assert!(s.one_typo.is_init()); | ||||
| @@ -277,7 +275,7 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern | ||||
| impl Interned<QueryTerm> { | ||||
|     fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { | ||||
|         let self_mut = ctx.term_interner.get_mut(self); | ||||
|         let QueryTerm { original, is_prefix, one_typo, .. } = self_mut; | ||||
|         let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut; | ||||
|         let original = *original; | ||||
|         let is_prefix = *is_prefix; | ||||
|         // let original_str = ctx.word_interner.get(*original).to_owned(); | ||||
| @@ -286,19 +284,22 @@ impl Interned<QueryTerm> { | ||||
|         } | ||||
|         let mut one_typo_words = BTreeSet::new(); | ||||
|  | ||||
|         find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { | ||||
|             match nbr_typos { | ||||
|                 ZeroOrOneTypo::Zero => {} | ||||
|                 ZeroOrOneTypo::One => { | ||||
|                     if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { | ||||
|                         one_typo_words.insert(derived_word); | ||||
|                     } else { | ||||
|                         return Ok(ControlFlow::Break(())); | ||||
|         if *max_nbr_typos > 0 { | ||||
|             find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { | ||||
|                 match nbr_typos { | ||||
|                     ZeroOrOneTypo::Zero => {} | ||||
|                     ZeroOrOneTypo::One => { | ||||
|                         if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { | ||||
|                             one_typo_words.insert(derived_word); | ||||
|                         } else { | ||||
|                             return Ok(ControlFlow::Break(())); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|             Ok(ControlFlow::Continue(())) | ||||
|         })?; | ||||
|                 Ok(ControlFlow::Continue(())) | ||||
|             })?; | ||||
|         } | ||||
|  | ||||
|         let original_str = ctx.word_interner.get(original).to_owned(); | ||||
|         let split_words = find_split_words(ctx, original_str.as_str())?; | ||||
|  | ||||
| @@ -327,7 +328,7 @@ impl Interned<QueryTerm> { | ||||
|     } | ||||
|     fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { | ||||
|         let self_mut = ctx.term_interner.get_mut(self); | ||||
|         let QueryTerm { original, is_prefix, two_typo, .. } = self_mut; | ||||
|         let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut; | ||||
|         let original_str = ctx.word_interner.get(*original).to_owned(); | ||||
|         if two_typo.is_init() { | ||||
|             return Ok(()); | ||||
| @@ -335,34 +336,37 @@ impl Interned<QueryTerm> { | ||||
|         let mut one_typo_words = BTreeSet::new(); | ||||
|         let mut two_typo_words = BTreeSet::new(); | ||||
|  | ||||
|         find_zero_one_two_typo_derivations( | ||||
|             *original, | ||||
|             *is_prefix, | ||||
|             ctx.index.words_fst(ctx.txn)?, | ||||
|             &mut ctx.word_interner, | ||||
|             |derived_word, nbr_typos| { | ||||
|                 if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT | ||||
|                     && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT | ||||
|                 { | ||||
|                     // No chance we will add either one- or two-typo derivations anymore, stop iterating. | ||||
|                     return Ok(ControlFlow::Break(())); | ||||
|                 } | ||||
|                 match nbr_typos { | ||||
|                     NumberOfTypos::Zero => {} | ||||
|                     NumberOfTypos::One => { | ||||
|                         if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { | ||||
|                             one_typo_words.insert(derived_word); | ||||
|         if *max_nbr_typos > 0 { | ||||
|             find_zero_one_two_typo_derivations( | ||||
|                 *original, | ||||
|                 *is_prefix, | ||||
|                 ctx.index.words_fst(ctx.txn)?, | ||||
|                 &mut ctx.word_interner, | ||||
|                 |derived_word, nbr_typos| { | ||||
|                     if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT | ||||
|                         && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT | ||||
|                     { | ||||
|                         // No chance we will add either one- or two-typo derivations anymore, stop iterating. | ||||
|                         return Ok(ControlFlow::Break(())); | ||||
|                     } | ||||
|                     match nbr_typos { | ||||
|                         NumberOfTypos::Zero => {} | ||||
|                         NumberOfTypos::One => { | ||||
|                             if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { | ||||
|                                 one_typo_words.insert(derived_word); | ||||
|                             } | ||||
|                         } | ||||
|                         NumberOfTypos::Two => { | ||||
|                             if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { | ||||
|                                 two_typo_words.insert(derived_word); | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
|                     NumberOfTypos::Two => { | ||||
|                         if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { | ||||
|                             two_typo_words.insert(derived_word); | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|                 Ok(ControlFlow::Continue(())) | ||||
|             }, | ||||
|         )?; | ||||
|                     Ok(ControlFlow::Continue(())) | ||||
|                 }, | ||||
|             )?; | ||||
|         } | ||||
|  | ||||
|         let split_words = find_split_words(ctx, original_str.as_str())?; | ||||
|         let self_mut = ctx.term_interner.get_mut(self); | ||||
|  | ||||
|   | ||||
| @@ -3,9 +3,9 @@ This module tests the following properties: | ||||
|  | ||||
| 1. Two consecutive words from a query can be combined into a "2gram" | ||||
| 2. Three consecutive words from a query can be combined into a "3gram" | ||||
| 3. A word from the query can be split into two consecutive words (split words) | ||||
| 3. A word from the query can be split into two consecutive words (split words), no matter how short it is | ||||
| 4. A 2gram can be split into two words | ||||
| 5. A 3gram cannot be split into two words | ||||
| 5. A 3gram can be split into two words | ||||
| 6. 2grams can contain up to 1 typo | ||||
| 7. 3grams cannot have typos | ||||
| 8. 2grams and 3grams can be prefix tolerant | ||||
| @@ -14,6 +14,7 @@ This module tests the following properties: | ||||
| 11. Disabling typo tolerance does not disable ngram tolerance | ||||
| 12. Prefix tolerance is disabled for the last word if a space follows it | ||||
| 13. Ngrams cannot be formed by combining a phrase and a word or two phrases | ||||
| 14. Split words are not disabled by the `disableOnAttribute` or `disableOnWords` typo settings | ||||
| */ | ||||
|  | ||||
| use crate::index::tests::TempIndex; | ||||
| @@ -56,6 +57,10 @@ fn create_index() -> TempIndex { | ||||
|             { | ||||
|                 "id": 5, | ||||
|                 "text": "sunflowering is not a verb" | ||||
|             }, | ||||
|             { | ||||
|                 "id": 6, | ||||
|                 "text": "xy z" | ||||
|             } | ||||
|         ])) | ||||
|         .unwrap(); | ||||
| @@ -263,10 +268,11 @@ fn test_disable_split_words() { | ||||
|     s.query("sunflower "); | ||||
|     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||
|     // the document containing `sun flower` is now returned: split words are never disabled | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]"); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
|     insta::assert_debug_snapshot!(texts, @r###" | ||||
|     [ | ||||
|         "\"the sun flower is tall\"", | ||||
|         "\"the sunflower is tall\"", | ||||
|     ] | ||||
|     "###); | ||||
| @@ -307,10 +313,11 @@ fn test_3gram_no_split_words() { | ||||
|     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     // the document with `sun flower` is now returned: 3grams can be split into two words | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]"); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 5]"); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
|     insta::assert_debug_snapshot!(texts, @r###" | ||||
|     [ | ||||
|         "\"the sun flower is tall\"", | ||||
|         "\"the sunflowers are pretty\"", | ||||
|         "\"the sunflower is tall\"", | ||||
|         "\"sunflowering is not a verb\"", | ||||
| @@ -369,3 +376,50 @@ fn test_no_ngram_phrases() { | ||||
|     ] | ||||
|     "###); | ||||
| } | ||||
|  | ||||
| #[test] | ||||
| fn test_short_split_words() { | ||||
|     let index = create_index(); | ||||
|     let txn = index.read_txn().unwrap(); | ||||
|  | ||||
|     let mut s = Search::new(&txn, &index); | ||||
|     s.terms_matching_strategy(TermsMatchingStrategy::All); | ||||
|     s.query("xyz"); | ||||
|     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
|     insta::assert_debug_snapshot!(texts, @r###" | ||||
|     [ | ||||
|         "\"xy z\"", | ||||
|     ] | ||||
|     "###); | ||||
| } | ||||
|  | ||||
| #[test] | ||||
| fn test_split_words_never_disabled() { | ||||
|     let index = create_index(); | ||||
|  | ||||
|     index | ||||
|         .update_settings(|s| { | ||||
|             s.set_exact_words(["sunflower"].iter().map(ToString::to_string).collect()); | ||||
|             s.set_exact_attributes(["text"].iter().map(ToString::to_string).collect()); | ||||
|         }) | ||||
|         .unwrap(); | ||||
|  | ||||
|     let txn = index.read_txn().unwrap(); | ||||
|  | ||||
|     let mut s = Search::new(&txn, &index); | ||||
|     s.terms_matching_strategy(TermsMatchingStrategy::All); | ||||
|     s.query("the sunflower is tall"); | ||||
|     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||
|  | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]"); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
|     insta::assert_debug_snapshot!(texts, @r###" | ||||
|     [ | ||||
|         "\"the sun flower is tall\"", | ||||
|         "\"the sunflower is tall\"", | ||||
|     ] | ||||
|     "###); | ||||
| } | ||||
|   | ||||
| @@ -9,7 +9,7 @@ This module tests the following properties: | ||||
| 6. A typo on the first letter of a word counts as two typos | ||||
| 7. Phrases are not typo tolerant | ||||
| 8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos` | ||||
| 9. 3grams are not typo tolerant | ||||
| 9. 3grams are not typo tolerant (but they can be split into two words) | ||||
| 10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly | ||||
| if `words` doesn't exist before it. | ||||
| 11. The `typo` ranking rule places documents with the same number of typos in the same bucket | ||||
| @@ -287,16 +287,17 @@ fn test_typo_exact_word() { | ||||
|     ] | ||||
|     "###); | ||||
|  | ||||
|     // exact words do not disable prefix (sunflowering OK, but no sunflowar or sun flower) | ||||
|     // exact words do not disable prefix (sunflowering OK, but no sunflowar) | ||||
|     let mut s = Search::new(&txn, &index); | ||||
|     s.terms_matching_strategy(TermsMatchingStrategy::All); | ||||
|     s.query("network interconnection sunflower"); | ||||
|     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18]"); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 17, 18]"); | ||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||
|     insta::assert_debug_snapshot!(texts, @r###" | ||||
|     [ | ||||
|         "\"network interconnection sunflower\"", | ||||
|         "\"network interconnection sun flower\"", | ||||
|         "\"network interconnection sunflowering\"", | ||||
|     ] | ||||
|     "###); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user