mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-06-09 05:35:41 +00:00
Remove lambdas from the find_*_derivations
Make sure the number of inserts they perform in the interner is bounded
This commit is contained in:
parent
ae3c4e27c4
commit
c9b78970c9
@ -1,7 +1,6 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::cmp::Ordering;
|
use std::cmp::Ordering;
|
||||||
use std::collections::BTreeSet;
|
use std::collections::BTreeSet;
|
||||||
use std::ops::ControlFlow;
|
|
||||||
|
|
||||||
use fst::automaton::Str;
|
use fst::automaton::Str;
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
@ -16,12 +15,6 @@ use crate::search::new::{limits, SearchContext};
|
|||||||
use crate::search::{build_dfa, get_first};
|
use crate::search::{build_dfa, get_first};
|
||||||
use crate::{Result, MAX_WORD_LENGTH};
|
use crate::{Result, MAX_WORD_LENGTH};
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
|
||||||
pub enum NumberOfTypos {
|
|
||||||
One,
|
|
||||||
Two,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Interned<QueryTerm> {
|
impl Interned<QueryTerm> {
|
||||||
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
|
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
|
||||||
let s = ctx.term_interner.get_mut(self);
|
let s = ctx.term_interner.get_mut(self);
|
||||||
@ -45,7 +38,7 @@ impl Interned<QueryTerm> {
|
|||||||
fn find_zero_typo_prefix_derivations(
|
fn find_zero_typo_prefix_derivations(
|
||||||
ctx: &mut SearchContext<'_>,
|
ctx: &mut SearchContext<'_>,
|
||||||
word_interned: Interned<String>,
|
word_interned: Interned<String>,
|
||||||
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
|
prefix_of: &mut BTreeSet<Interned<String>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let word = ctx.word_interner.get(word_interned).to_owned();
|
let word = ctx.word_interner.get(word_interned).to_owned();
|
||||||
let word = word.as_str();
|
let word = word.as_str();
|
||||||
@ -65,8 +58,8 @@ fn find_zero_typo_prefix_derivations(
|
|||||||
let derived_word = derived_word.to_string();
|
let derived_word = derived_word.to_string();
|
||||||
let derived_word_interned = ctx.word_interner.insert(derived_word);
|
let derived_word_interned = ctx.word_interner.insert(derived_word);
|
||||||
if derived_word_interned != word_interned {
|
if derived_word_interned != word_interned {
|
||||||
let cf = visit(derived_word_interned)?;
|
prefix_of.insert(derived_word_interned);
|
||||||
if cf.is_break() {
|
if prefix_of.len() >= limits::MAX_PREFIX_COUNT {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -81,7 +74,7 @@ fn find_one_typo_derivations(
|
|||||||
ctx: &mut SearchContext<'_>,
|
ctx: &mut SearchContext<'_>,
|
||||||
word_interned: Interned<String>,
|
word_interned: Interned<String>,
|
||||||
is_prefix: bool,
|
is_prefix: bool,
|
||||||
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
|
one_typo_words: &mut BTreeSet<Interned<String>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let fst = ctx.get_words_fst()?;
|
let fst = ctx.get_words_fst()?;
|
||||||
let word = ctx.word_interner.get(word_interned).to_owned();
|
let word = ctx.word_interner.get(word_interned).to_owned();
|
||||||
@ -98,8 +91,8 @@ fn find_one_typo_derivations(
|
|||||||
1 => {
|
1 => {
|
||||||
let derived_word = std::str::from_utf8(derived_word)?;
|
let derived_word = std::str::from_utf8(derived_word)?;
|
||||||
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
|
let derived_word = ctx.word_interner.insert(derived_word.to_owned());
|
||||||
let cf = visit(derived_word)?;
|
one_typo_words.insert(derived_word);
|
||||||
if cf.is_break() {
|
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -116,7 +109,8 @@ fn find_one_two_typo_derivations(
|
|||||||
is_prefix: bool,
|
is_prefix: bool,
|
||||||
fst: fst::Set<Cow<'_, [u8]>>,
|
fst: fst::Set<Cow<'_, [u8]>>,
|
||||||
word_interner: &mut DedupInterner<String>,
|
word_interner: &mut DedupInterner<String>,
|
||||||
mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>,
|
one_typo_words: &mut BTreeSet<Interned<String>>,
|
||||||
|
two_typo_words: &mut BTreeSet<Interned<String>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let word = word_interner.get(word_interned).to_owned();
|
let word = word_interner.get(word_interned).to_owned();
|
||||||
let word = word.as_str();
|
let word = word.as_str();
|
||||||
@ -130,16 +124,20 @@ fn find_one_two_typo_derivations(
|
|||||||
let mut stream = fst.search_with_state(automaton).into_stream();
|
let mut stream = fst.search_with_state(automaton).into_stream();
|
||||||
|
|
||||||
while let Some((derived_word, state)) = stream.next() {
|
while let Some((derived_word, state)) = stream.next() {
|
||||||
|
let finished_one_typo_words = one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT;
|
||||||
|
let finished_two_typo_words = two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT;
|
||||||
|
if finished_one_typo_words && finished_two_typo_words {
|
||||||
|
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
|
||||||
|
break;
|
||||||
|
}
|
||||||
let derived_word = std::str::from_utf8(derived_word)?;
|
let derived_word = std::str::from_utf8(derived_word)?;
|
||||||
// No need to intern here
|
// No need to intern here
|
||||||
// in the case the typo is on the first letter, we know the number of typo
|
// in the case the typo is on the first letter, we know the number of typo
|
||||||
// is two
|
// is two
|
||||||
if get_first(derived_word) != get_first(word) {
|
if get_first(derived_word) != get_first(word) && !finished_two_typo_words {
|
||||||
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
||||||
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
|
two_typo_words.insert(derived_word_interned);
|
||||||
if cf.is_break() {
|
continue;
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
// Else, we know that it is the second dfa that matched and compute the
|
// Else, we know that it is the second dfa that matched and compute the
|
||||||
// correct distance
|
// correct distance
|
||||||
@ -147,18 +145,18 @@ fn find_one_two_typo_derivations(
|
|||||||
match d.to_u8() {
|
match d.to_u8() {
|
||||||
0 => (),
|
0 => (),
|
||||||
1 => {
|
1 => {
|
||||||
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
if finished_one_typo_words {
|
||||||
let cf = visit(derived_word_interned, NumberOfTypos::One)?;
|
continue;
|
||||||
if cf.is_break() {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
||||||
|
one_typo_words.insert(derived_word_interned);
|
||||||
}
|
}
|
||||||
2 => {
|
2 => {
|
||||||
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
if finished_two_typo_words {
|
||||||
let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
|
continue;
|
||||||
if cf.is_break() {
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
let derived_word_interned = word_interner.insert(derived_word.to_owned());
|
||||||
|
two_typo_words.insert(derived_word_interned);
|
||||||
}
|
}
|
||||||
_ => unreachable!("2 typos DFA produced a distance greater than 2"),
|
_ => unreachable!("2 typos DFA produced a distance greater than 2"),
|
||||||
}
|
}
|
||||||
@ -214,14 +212,7 @@ pub fn partially_initialized_term_from_word(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if is_prefix && use_prefix_db.is_none() {
|
if is_prefix && use_prefix_db.is_none() {
|
||||||
find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| {
|
find_zero_typo_prefix_derivations(ctx, word_interned, &mut prefix_of)?;
|
||||||
if prefix_of.len() < limits::MAX_PREFIX_COUNT {
|
|
||||||
prefix_of.insert(derived_word);
|
|
||||||
Ok(ControlFlow::Continue(()))
|
|
||||||
} else {
|
|
||||||
Ok(ControlFlow::Break(()))
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
}
|
}
|
||||||
let synonyms = ctx.index.synonyms(ctx.txn)?;
|
let synonyms = ctx.index.synonyms(ctx.txn)?;
|
||||||
let mut synonym_word_count = 0;
|
let mut synonym_word_count = 0;
|
||||||
@ -284,14 +275,7 @@ impl Interned<QueryTerm> {
|
|||||||
let mut one_typo_words = BTreeSet::new();
|
let mut one_typo_words = BTreeSet::new();
|
||||||
|
|
||||||
if *max_nbr_typos > 0 {
|
if *max_nbr_typos > 0 {
|
||||||
find_one_typo_derivations(ctx, original, is_prefix, |derived_word| {
|
find_one_typo_derivations(ctx, original, is_prefix, &mut one_typo_words)?;
|
||||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
|
||||||
one_typo_words.insert(derived_word);
|
|
||||||
Ok(ControlFlow::Continue(()))
|
|
||||||
} else {
|
|
||||||
Ok(ControlFlow::Break(()))
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
let split_words = if allows_split_words {
|
let split_words = if allows_split_words {
|
||||||
@ -346,27 +330,8 @@ impl Interned<QueryTerm> {
|
|||||||
*is_prefix,
|
*is_prefix,
|
||||||
ctx.index.words_fst(ctx.txn)?,
|
ctx.index.words_fst(ctx.txn)?,
|
||||||
&mut ctx.word_interner,
|
&mut ctx.word_interner,
|
||||||
|derived_word, nbr_typos| {
|
&mut one_typo_words,
|
||||||
if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
|
&mut two_typo_words,
|
||||||
&& two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
|
|
||||||
{
|
|
||||||
// No chance we will add either one- or two-typo derivations anymore, stop iterating.
|
|
||||||
return Ok(ControlFlow::Break(()));
|
|
||||||
}
|
|
||||||
match nbr_typos {
|
|
||||||
NumberOfTypos::One => {
|
|
||||||
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
|
|
||||||
one_typo_words.insert(derived_word);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
NumberOfTypos::Two => {
|
|
||||||
if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
|
|
||||||
two_typo_words.insert(derived_word);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(ControlFlow::Continue(()))
|
|
||||||
},
|
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user