mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-29 23:16:26 +00:00 
			
		
		
		
	Compute split words derivations of terms that don't accept typos
This commit is contained in:
		| @@ -28,11 +28,9 @@ pub enum ZeroOrOneTypo { | |||||||
| impl Interned<QueryTerm> { | impl Interned<QueryTerm> { | ||||||
|     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { |     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { | ||||||
|         let s = ctx.term_interner.get_mut(self); |         let s = ctx.term_interner.get_mut(self); | ||||||
|         if s.max_nbr_typos == 0 { |         if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() { | ||||||
|             s.one_typo = Lazy::Init(OneTypoTerm::default()); |  | ||||||
|             s.two_typo = Lazy::Init(TwoTypoTerm::default()); |  | ||||||
|         } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() { |  | ||||||
|             assert!(s.two_typo.is_uninit()); |             assert!(s.two_typo.is_uninit()); | ||||||
|  |             // Initialize one_typo subterm even if max_nbr_typos is 0 because of split words | ||||||
|             self.initialize_one_typo_subterm(ctx)?; |             self.initialize_one_typo_subterm(ctx)?; | ||||||
|             let s = ctx.term_interner.get_mut(self); |             let s = ctx.term_interner.get_mut(self); | ||||||
|             assert!(s.one_typo.is_init()); |             assert!(s.one_typo.is_init()); | ||||||
| @@ -277,7 +275,7 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern | |||||||
| impl Interned<QueryTerm> { | impl Interned<QueryTerm> { | ||||||
|     fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { |     fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { | ||||||
|         let self_mut = ctx.term_interner.get_mut(self); |         let self_mut = ctx.term_interner.get_mut(self); | ||||||
|         let QueryTerm { original, is_prefix, one_typo, .. } = self_mut; |         let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut; | ||||||
|         let original = *original; |         let original = *original; | ||||||
|         let is_prefix = *is_prefix; |         let is_prefix = *is_prefix; | ||||||
|         // let original_str = ctx.word_interner.get(*original).to_owned(); |         // let original_str = ctx.word_interner.get(*original).to_owned(); | ||||||
| @@ -286,19 +284,22 @@ impl Interned<QueryTerm> { | |||||||
|         } |         } | ||||||
|         let mut one_typo_words = BTreeSet::new(); |         let mut one_typo_words = BTreeSet::new(); | ||||||
|  |  | ||||||
|         find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { |         if *max_nbr_typos > 0 { | ||||||
|             match nbr_typos { |             find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { | ||||||
|                 ZeroOrOneTypo::Zero => {} |                 match nbr_typos { | ||||||
|                 ZeroOrOneTypo::One => { |                     ZeroOrOneTypo::Zero => {} | ||||||
|                     if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { |                     ZeroOrOneTypo::One => { | ||||||
|                         one_typo_words.insert(derived_word); |                         if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { | ||||||
|                     } else { |                             one_typo_words.insert(derived_word); | ||||||
|                         return Ok(ControlFlow::Break(())); |                         } else { | ||||||
|  |                             return Ok(ControlFlow::Break(())); | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|             } |                 Ok(ControlFlow::Continue(())) | ||||||
|             Ok(ControlFlow::Continue(())) |             })?; | ||||||
|         })?; |         } | ||||||
|  |  | ||||||
|         let original_str = ctx.word_interner.get(original).to_owned(); |         let original_str = ctx.word_interner.get(original).to_owned(); | ||||||
|         let split_words = find_split_words(ctx, original_str.as_str())?; |         let split_words = find_split_words(ctx, original_str.as_str())?; | ||||||
|  |  | ||||||
| @@ -327,7 +328,7 @@ impl Interned<QueryTerm> { | |||||||
|     } |     } | ||||||
|     fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { |     fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { | ||||||
|         let self_mut = ctx.term_interner.get_mut(self); |         let self_mut = ctx.term_interner.get_mut(self); | ||||||
|         let QueryTerm { original, is_prefix, two_typo, .. } = self_mut; |         let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut; | ||||||
|         let original_str = ctx.word_interner.get(*original).to_owned(); |         let original_str = ctx.word_interner.get(*original).to_owned(); | ||||||
|         if two_typo.is_init() { |         if two_typo.is_init() { | ||||||
|             return Ok(()); |             return Ok(()); | ||||||
| @@ -335,34 +336,37 @@ impl Interned<QueryTerm> { | |||||||
|         let mut one_typo_words = BTreeSet::new(); |         let mut one_typo_words = BTreeSet::new(); | ||||||
|         let mut two_typo_words = BTreeSet::new(); |         let mut two_typo_words = BTreeSet::new(); | ||||||
|  |  | ||||||
|         find_zero_one_two_typo_derivations( |         if *max_nbr_typos > 0 { | ||||||
|             *original, |             find_zero_one_two_typo_derivations( | ||||||
|             *is_prefix, |                 *original, | ||||||
|             ctx.index.words_fst(ctx.txn)?, |                 *is_prefix, | ||||||
|             &mut ctx.word_interner, |                 ctx.index.words_fst(ctx.txn)?, | ||||||
|             |derived_word, nbr_typos| { |                 &mut ctx.word_interner, | ||||||
|                 if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT |                 |derived_word, nbr_typos| { | ||||||
|                     && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT |                     if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT | ||||||
|                 { |                         && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT | ||||||
|                     // No chance we will add either one- or two-typo derivations anymore, stop iterating. |                     { | ||||||
|                     return Ok(ControlFlow::Break(())); |                         // No chance we will add either one- or two-typo derivations anymore, stop iterating. | ||||||
|                 } |                         return Ok(ControlFlow::Break(())); | ||||||
|                 match nbr_typos { |                     } | ||||||
|                     NumberOfTypos::Zero => {} |                     match nbr_typos { | ||||||
|                     NumberOfTypos::One => { |                         NumberOfTypos::Zero => {} | ||||||
|                         if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { |                         NumberOfTypos::One => { | ||||||
|                             one_typo_words.insert(derived_word); |                             if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { | ||||||
|  |                                 one_typo_words.insert(derived_word); | ||||||
|  |                             } | ||||||
|  |                         } | ||||||
|  |                         NumberOfTypos::Two => { | ||||||
|  |                             if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { | ||||||
|  |                                 two_typo_words.insert(derived_word); | ||||||
|  |                             } | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                     NumberOfTypos::Two => { |                     Ok(ControlFlow::Continue(())) | ||||||
|                         if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { |                 }, | ||||||
|                             two_typo_words.insert(derived_word); |             )?; | ||||||
|                         } |         } | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|                 Ok(ControlFlow::Continue(())) |  | ||||||
|             }, |  | ||||||
|         )?; |  | ||||||
|         let split_words = find_split_words(ctx, original_str.as_str())?; |         let split_words = find_split_words(ctx, original_str.as_str())?; | ||||||
|         let self_mut = ctx.term_interner.get_mut(self); |         let self_mut = ctx.term_interner.get_mut(self); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -3,9 +3,9 @@ This module tests the following properties: | |||||||
|  |  | ||||||
| 1. Two consecutive words from a query can be combined into a "2gram" | 1. Two consecutive words from a query can be combined into a "2gram" | ||||||
| 2. Three consecutive words from a query can be combined into a "3gram" | 2. Three consecutive words from a query can be combined into a "3gram" | ||||||
| 3. A word from the query can be split into two consecutive words (split words) | 3. A word from the query can be split into two consecutive words (split words), no matter how short it is | ||||||
| 4. A 2gram can be split into two words | 4. A 2gram can be split into two words | ||||||
| 5. A 3gram cannot be split into two words | 5. A 3gram can be split into two words | ||||||
| 6. 2grams can contain up to 1 typo | 6. 2grams can contain up to 1 typo | ||||||
| 7. 3grams cannot have typos | 7. 3grams cannot have typos | ||||||
| 8. 2grams and 3grams can be prefix tolerant | 8. 2grams and 3grams can be prefix tolerant | ||||||
| @@ -14,6 +14,7 @@ This module tests the following properties: | |||||||
| 11. Disabling typo tolerance does not disable ngram tolerance | 11. Disabling typo tolerance does not disable ngram tolerance | ||||||
| 12. Prefix tolerance is disabled for the last word if a space follows it | 12. Prefix tolerance is disabled for the last word if a space follows it | ||||||
| 13. Ngrams cannot be formed by combining a phrase and a word or two phrases | 13. Ngrams cannot be formed by combining a phrase and a word or two phrases | ||||||
|  | 14. Split words are not disabled by the `disableOnAttribute` or `disableOnWords` typo settings | ||||||
| */ | */ | ||||||
|  |  | ||||||
| use crate::index::tests::TempIndex; | use crate::index::tests::TempIndex; | ||||||
| @@ -56,6 +57,10 @@ fn create_index() -> TempIndex { | |||||||
|             { |             { | ||||||
|                 "id": 5, |                 "id": 5, | ||||||
|                 "text": "sunflowering is not a verb" |                 "text": "sunflowering is not a verb" | ||||||
|  |             }, | ||||||
|  |             { | ||||||
|  |                 "id": 6, | ||||||
|  |                 "text": "xy z" | ||||||
|             } |             } | ||||||
|         ])) |         ])) | ||||||
|         .unwrap(); |         .unwrap(); | ||||||
| @@ -263,10 +268,11 @@ fn test_disable_split_words() { | |||||||
|     s.query("sunflower "); |     s.query("sunflower "); | ||||||
|     let SearchResult { documents_ids, .. } = s.execute().unwrap(); |     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||||
|     // no document containing `sun flower` |     // no document containing `sun flower` | ||||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); |     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]"); | ||||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); |     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||||
|     insta::assert_debug_snapshot!(texts, @r###" |     insta::assert_debug_snapshot!(texts, @r###" | ||||||
|     [ |     [ | ||||||
|  |         "\"the sun flower is tall\"", | ||||||
|         "\"the sunflower is tall\"", |         "\"the sunflower is tall\"", | ||||||
|     ] |     ] | ||||||
|     "###); |     "###); | ||||||
| @@ -307,10 +313,11 @@ fn test_3gram_no_split_words() { | |||||||
|     let SearchResult { documents_ids, .. } = s.execute().unwrap(); |     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||||
|  |  | ||||||
|     // no document with `sun flower` |     // no document with `sun flower` | ||||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]"); |     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 5]"); | ||||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); |     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||||
|     insta::assert_debug_snapshot!(texts, @r###" |     insta::assert_debug_snapshot!(texts, @r###" | ||||||
|     [ |     [ | ||||||
|  |         "\"the sun flower is tall\"", | ||||||
|         "\"the sunflowers are pretty\"", |         "\"the sunflowers are pretty\"", | ||||||
|         "\"the sunflower is tall\"", |         "\"the sunflower is tall\"", | ||||||
|         "\"sunflowering is not a verb\"", |         "\"sunflowering is not a verb\"", | ||||||
| @@ -369,3 +376,50 @@ fn test_no_ngram_phrases() { | |||||||
|     ] |     ] | ||||||
|     "###); |     "###); | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[test] | ||||||
|  | fn test_short_split_words() { | ||||||
|  |     let index = create_index(); | ||||||
|  |     let txn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|  |     let mut s = Search::new(&txn, &index); | ||||||
|  |     s.terms_matching_strategy(TermsMatchingStrategy::All); | ||||||
|  |     s.query("xyz"); | ||||||
|  |     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||||
|  |  | ||||||
|  |     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); | ||||||
|  |     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||||
|  |     insta::assert_debug_snapshot!(texts, @r###" | ||||||
|  |     [ | ||||||
|  |         "\"xy z\"", | ||||||
|  |     ] | ||||||
|  |     "###); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[test] | ||||||
|  | fn test_split_words_never_disabled() { | ||||||
|  |     let index = create_index(); | ||||||
|  |  | ||||||
|  |     index | ||||||
|  |         .update_settings(|s| { | ||||||
|  |             s.set_exact_words(["sunflower"].iter().map(ToString::to_string).collect()); | ||||||
|  |             s.set_exact_attributes(["text"].iter().map(ToString::to_string).collect()); | ||||||
|  |         }) | ||||||
|  |         .unwrap(); | ||||||
|  |  | ||||||
|  |     let txn = index.read_txn().unwrap(); | ||||||
|  |  | ||||||
|  |     let mut s = Search::new(&txn, &index); | ||||||
|  |     s.terms_matching_strategy(TermsMatchingStrategy::All); | ||||||
|  |     s.query("the sunflower is tall"); | ||||||
|  |     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||||
|  |  | ||||||
|  |     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]"); | ||||||
|  |     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||||
|  |     insta::assert_debug_snapshot!(texts, @r###" | ||||||
|  |     [ | ||||||
|  |         "\"the sun flower is tall\"", | ||||||
|  |         "\"the sunflower is tall\"", | ||||||
|  |     ] | ||||||
|  |     "###); | ||||||
|  | } | ||||||
|   | |||||||
| @@ -9,7 +9,7 @@ This module tests the following properties: | |||||||
| 6. A typo on the first letter of a word counts as two typos | 6. A typo on the first letter of a word counts as two typos | ||||||
| 7. Phrases are not typo tolerant | 7. Phrases are not typo tolerant | ||||||
| 8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos` | 8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos` | ||||||
| 9. 3grams are not typo tolerant | 9. 3grams are not typo tolerant (but they can be split into two words) | ||||||
| 10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly | 10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly | ||||||
| if `words` doesn't exist before it. | if `words` doesn't exist before it. | ||||||
| 11. The `typo` ranking rule places documents with the same number of typos in the same bucket | 11. The `typo` ranking rule places documents with the same number of typos in the same bucket | ||||||
| @@ -287,16 +287,17 @@ fn test_typo_exact_word() { | |||||||
|     ] |     ] | ||||||
|     "###); |     "###); | ||||||
|  |  | ||||||
|     // exact words do not disable prefix (sunflowering OK, but no sunflowar or sun flower) |     // exact words do not disable prefix (sunflowering OK, but no sunflowar) | ||||||
|     let mut s = Search::new(&txn, &index); |     let mut s = Search::new(&txn, &index); | ||||||
|     s.terms_matching_strategy(TermsMatchingStrategy::All); |     s.terms_matching_strategy(TermsMatchingStrategy::All); | ||||||
|     s.query("network interconnection sunflower"); |     s.query("network interconnection sunflower"); | ||||||
|     let SearchResult { documents_ids, .. } = s.execute().unwrap(); |     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18]"); |     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 17, 18]"); | ||||||
|     let texts = collect_field_values(&index, &txn, "text", &documents_ids); |     let texts = collect_field_values(&index, &txn, "text", &documents_ids); | ||||||
|     insta::assert_debug_snapshot!(texts, @r###" |     insta::assert_debug_snapshot!(texts, @r###" | ||||||
|     [ |     [ | ||||||
|         "\"network interconnection sunflower\"", |         "\"network interconnection sunflower\"", | ||||||
|  |         "\"network interconnection sun flower\"", | ||||||
|         "\"network interconnection sunflowering\"", |         "\"network interconnection sunflowering\"", | ||||||
|     ] |     ] | ||||||
|     "###); |     "###); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user