mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-30 23:46:28 +00:00 
			
		
		
		
	Merge pull request #5587 from meilisearch/fix-derivations-again
Fix another derivation-related panic in the search
This commit is contained in:
		| @@ -1,7 +1,6 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::cmp::Ordering; | use std::cmp::Ordering; | ||||||
| use std::collections::BTreeSet; | use std::collections::BTreeSet; | ||||||
| use std::ops::ControlFlow; |  | ||||||
|  |  | ||||||
| use fst::automaton::Str; | use fst::automaton::Str; | ||||||
| use fst::{IntoStreamer, Streamer}; | use fst::{IntoStreamer, Streamer}; | ||||||
| @@ -16,12 +15,6 @@ use crate::search::new::{limits, SearchContext}; | |||||||
| use crate::search::{build_dfa, get_first}; | use crate::search::{build_dfa, get_first}; | ||||||
| use crate::{Result, MAX_WORD_LENGTH}; | use crate::{Result, MAX_WORD_LENGTH}; | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] |  | ||||||
| pub enum NumberOfTypos { |  | ||||||
|     One, |  | ||||||
|     Two, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl Interned<QueryTerm> { | impl Interned<QueryTerm> { | ||||||
|     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> { |     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> { | ||||||
|         let s = ctx.term_interner.get_mut(self); |         let s = ctx.term_interner.get_mut(self); | ||||||
| @@ -45,7 +38,7 @@ impl Interned<QueryTerm> { | |||||||
| fn find_zero_typo_prefix_derivations( | fn find_zero_typo_prefix_derivations( | ||||||
|     ctx: &mut SearchContext<'_>, |     ctx: &mut SearchContext<'_>, | ||||||
|     word_interned: Interned<String>, |     word_interned: Interned<String>, | ||||||
|     mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>, |     prefix_of: &mut BTreeSet<Interned<String>>, | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     let word = ctx.word_interner.get(word_interned).to_owned(); |     let word = ctx.word_interner.get(word_interned).to_owned(); | ||||||
|     let word = word.as_str(); |     let word = word.as_str(); | ||||||
| @@ -65,8 +58,8 @@ fn find_zero_typo_prefix_derivations( | |||||||
|                 let derived_word = derived_word.to_string(); |                 let derived_word = derived_word.to_string(); | ||||||
|                 let derived_word_interned = ctx.word_interner.insert(derived_word); |                 let derived_word_interned = ctx.word_interner.insert(derived_word); | ||||||
|                 if derived_word_interned != word_interned { |                 if derived_word_interned != word_interned { | ||||||
|                     let cf = visit(derived_word_interned)?; |                     prefix_of.insert(derived_word_interned); | ||||||
|                     if cf.is_break() { |                     if prefix_of.len() >= limits::MAX_PREFIX_COUNT { | ||||||
|                         break; |                         break; | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
| @@ -81,7 +74,7 @@ fn find_one_typo_derivations( | |||||||
|     ctx: &mut SearchContext<'_>, |     ctx: &mut SearchContext<'_>, | ||||||
|     word_interned: Interned<String>, |     word_interned: Interned<String>, | ||||||
|     is_prefix: bool, |     is_prefix: bool, | ||||||
|     mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>, |     one_typo_words: &mut BTreeSet<Interned<String>>, | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     let fst = ctx.get_words_fst()?; |     let fst = ctx.get_words_fst()?; | ||||||
|     let word = ctx.word_interner.get(word_interned).to_owned(); |     let word = ctx.word_interner.get(word_interned).to_owned(); | ||||||
| @@ -98,8 +91,8 @@ fn find_one_typo_derivations( | |||||||
|             1 => { |             1 => { | ||||||
|                 let derived_word = std::str::from_utf8(derived_word)?; |                 let derived_word = std::str::from_utf8(derived_word)?; | ||||||
|                 let derived_word = ctx.word_interner.insert(derived_word.to_owned()); |                 let derived_word = ctx.word_interner.insert(derived_word.to_owned()); | ||||||
|                 let cf = visit(derived_word)?; |                 one_typo_words.insert(derived_word); | ||||||
|                 if cf.is_break() { |                 if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT { | ||||||
|                     break; |                     break; | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
| @@ -116,7 +109,8 @@ fn find_one_two_typo_derivations( | |||||||
|     is_prefix: bool, |     is_prefix: bool, | ||||||
|     fst: fst::Set<Cow<'_, [u8]>>, |     fst: fst::Set<Cow<'_, [u8]>>, | ||||||
|     word_interner: &mut DedupInterner<String>, |     word_interner: &mut DedupInterner<String>, | ||||||
|     mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>, |     one_typo_words: &mut BTreeSet<Interned<String>>, | ||||||
|  |     two_typo_words: &mut BTreeSet<Interned<String>>, | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     let word = word_interner.get(word_interned).to_owned(); |     let word = word_interner.get(word_interned).to_owned(); | ||||||
|     let word = word.as_str(); |     let word = word.as_str(); | ||||||
| @@ -130,16 +124,20 @@ fn find_one_two_typo_derivations( | |||||||
|     let mut stream = fst.search_with_state(automaton).into_stream(); |     let mut stream = fst.search_with_state(automaton).into_stream(); | ||||||
|  |  | ||||||
|     while let Some((derived_word, state)) = stream.next() { |     while let Some((derived_word, state)) = stream.next() { | ||||||
|  |         let finished_one_typo_words = one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT; | ||||||
|  |         let finished_two_typo_words = two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT; | ||||||
|  |         if finished_one_typo_words && finished_two_typo_words { | ||||||
|  |             // No chance we will add either one- or two-typo derivations anymore, stop iterating. | ||||||
|  |             break; | ||||||
|  |         } | ||||||
|         let derived_word = std::str::from_utf8(derived_word)?; |         let derived_word = std::str::from_utf8(derived_word)?; | ||||||
|         // No need to intern here |         // No need to intern here | ||||||
|         // in the case the typo is on the first letter, we know the number of typo |         // in the case the typo is on the first letter, we know the number of typo | ||||||
|         // is two |         // is two | ||||||
|         if get_first(derived_word) != get_first(word) { |         if get_first(derived_word) != get_first(word) && !finished_two_typo_words { | ||||||
|             let derived_word_interned = word_interner.insert(derived_word.to_owned()); |             let derived_word_interned = word_interner.insert(derived_word.to_owned()); | ||||||
|             let cf = visit(derived_word_interned, NumberOfTypos::Two)?; |             two_typo_words.insert(derived_word_interned); | ||||||
|             if cf.is_break() { |             continue; | ||||||
|                 break; |  | ||||||
|             } |  | ||||||
|         } else { |         } else { | ||||||
|             // Else, we know that it is the second dfa that matched and compute the |             // Else, we know that it is the second dfa that matched and compute the | ||||||
|             // correct distance |             // correct distance | ||||||
| @@ -147,18 +145,18 @@ fn find_one_two_typo_derivations( | |||||||
|             match d.to_u8() { |             match d.to_u8() { | ||||||
|                 0 => (), |                 0 => (), | ||||||
|                 1 => { |                 1 => { | ||||||
|                     let derived_word_interned = word_interner.insert(derived_word.to_owned()); |                     if finished_one_typo_words { | ||||||
|                     let cf = visit(derived_word_interned, NumberOfTypos::One)?; |                         continue; | ||||||
|                     if cf.is_break() { |  | ||||||
|                         break; |  | ||||||
|                     } |                     } | ||||||
|  |                     let derived_word_interned = word_interner.insert(derived_word.to_owned()); | ||||||
|  |                     one_typo_words.insert(derived_word_interned); | ||||||
|                 } |                 } | ||||||
|                 2 => { |                 2 => { | ||||||
|                     let derived_word_interned = word_interner.insert(derived_word.to_owned()); |                     if finished_two_typo_words { | ||||||
|                     let cf = visit(derived_word_interned, NumberOfTypos::Two)?; |                         continue; | ||||||
|                     if cf.is_break() { |  | ||||||
|                         break; |  | ||||||
|                     } |                     } | ||||||
|  |                     let derived_word_interned = word_interner.insert(derived_word.to_owned()); | ||||||
|  |                     two_typo_words.insert(derived_word_interned); | ||||||
|                 } |                 } | ||||||
|                 _ => unreachable!("2 typos DFA produced a distance greater than 2"), |                 _ => unreachable!("2 typos DFA produced a distance greater than 2"), | ||||||
|             } |             } | ||||||
| @@ -214,14 +212,7 @@ pub fn partially_initialized_term_from_word( | |||||||
|     } |     } | ||||||
|  |  | ||||||
|     if is_prefix && use_prefix_db.is_none() { |     if is_prefix && use_prefix_db.is_none() { | ||||||
|         find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| { |         find_zero_typo_prefix_derivations(ctx, word_interned, &mut prefix_of)?; | ||||||
|             if prefix_of.len() < limits::MAX_PREFIX_COUNT { |  | ||||||
|                 prefix_of.insert(derived_word); |  | ||||||
|                 Ok(ControlFlow::Continue(())) |  | ||||||
|             } else { |  | ||||||
|                 Ok(ControlFlow::Break(())) |  | ||||||
|             } |  | ||||||
|         })?; |  | ||||||
|     } |     } | ||||||
|     let synonyms = ctx.index.synonyms(ctx.txn)?; |     let synonyms = ctx.index.synonyms(ctx.txn)?; | ||||||
|     let mut synonym_word_count = 0; |     let mut synonym_word_count = 0; | ||||||
| @@ -284,14 +275,7 @@ impl Interned<QueryTerm> { | |||||||
|         let mut one_typo_words = BTreeSet::new(); |         let mut one_typo_words = BTreeSet::new(); | ||||||
|  |  | ||||||
|         if *max_nbr_typos > 0 { |         if *max_nbr_typos > 0 { | ||||||
|             find_one_typo_derivations(ctx, original, is_prefix, |derived_word| { |             find_one_typo_derivations(ctx, original, is_prefix, &mut one_typo_words)?; | ||||||
|                 if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { |  | ||||||
|                     one_typo_words.insert(derived_word); |  | ||||||
|                     Ok(ControlFlow::Continue(())) |  | ||||||
|                 } else { |  | ||||||
|                     Ok(ControlFlow::Break(())) |  | ||||||
|                 } |  | ||||||
|             })?; |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let split_words = if allows_split_words { |         let split_words = if allows_split_words { | ||||||
| @@ -346,27 +330,8 @@ impl Interned<QueryTerm> { | |||||||
|                 *is_prefix, |                 *is_prefix, | ||||||
|                 ctx.index.words_fst(ctx.txn)?, |                 ctx.index.words_fst(ctx.txn)?, | ||||||
|                 &mut ctx.word_interner, |                 &mut ctx.word_interner, | ||||||
|                 |derived_word, nbr_typos| { |                 &mut one_typo_words, | ||||||
|                     if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT |                 &mut two_typo_words, | ||||||
|                         && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT |  | ||||||
|                     { |  | ||||||
|                         // No chance we will add either one- or two-typo derivations anymore, stop iterating. |  | ||||||
|                         return Ok(ControlFlow::Break(())); |  | ||||||
|                     } |  | ||||||
|                     match nbr_typos { |  | ||||||
|                         NumberOfTypos::One => { |  | ||||||
|                             if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { |  | ||||||
|                                 one_typo_words.insert(derived_word); |  | ||||||
|                             } |  | ||||||
|                         } |  | ||||||
|                         NumberOfTypos::Two => { |  | ||||||
|                             if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { |  | ||||||
|                                 two_typo_words.insert(derived_word); |  | ||||||
|                             } |  | ||||||
|                         } |  | ||||||
|                     } |  | ||||||
|                     Ok(ControlFlow::Continue(())) |  | ||||||
|                 }, |  | ||||||
|             )?; |             )?; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user