mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-03 17:36:29 +00:00 
			
		
		
		
	Stop using the FST to find a word without any typo
This commit is contained in:
		@@ -1755,6 +1755,19 @@ impl Index {
 | 
			
		||||
        }
 | 
			
		||||
        Ok(stats)
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    /// Check if the word is indexed in the index.
 | 
			
		||||
    ///
 | 
			
		||||
    /// This function checks if the word is indexed in the index by looking at the word_docids and exact_word_docids.
 | 
			
		||||
    ///
 | 
			
		||||
    /// # Arguments
 | 
			
		||||
    ///
 | 
			
		||||
    /// * `rtxn`: The read transaction.
 | 
			
		||||
    /// * `word`: The word to check.
 | 
			
		||||
    pub fn contains_word(&self, rtxn: &RoTxn<'_>, word: &str) -> Result<bool> {
 | 
			
		||||
        Ok(self.word_docids.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some()
 | 
			
		||||
            || self.exact_word_docids.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some())
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#[derive(Debug, Deserialize, Serialize)]
 | 
			
		||||
 
 | 
			
		||||
@@ -1,10 +1,12 @@
 | 
			
		||||
use std::borrow::Cow;
 | 
			
		||||
use std::cmp::Ordering;
 | 
			
		||||
use std::collections::BTreeSet;
 | 
			
		||||
use std::ops::ControlFlow;
 | 
			
		||||
 | 
			
		||||
use fst::automaton::Str;
 | 
			
		||||
use fst::{Automaton, IntoStreamer, Streamer};
 | 
			
		||||
use fst::{IntoStreamer, Streamer};
 | 
			
		||||
use heed::types::DecodeIgnore;
 | 
			
		||||
use itertools::{merge_join_by, EitherOrBoth};
 | 
			
		||||
 | 
			
		||||
use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
 | 
			
		||||
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
 | 
			
		||||
@@ -16,16 +18,10 @@ use crate::{Result, MAX_WORD_LENGTH};
 | 
			
		||||
 | 
			
		||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 | 
			
		||||
pub enum NumberOfTypos {
 | 
			
		||||
    Zero,
 | 
			
		||||
    One,
 | 
			
		||||
    Two,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
pub enum ZeroOrOneTypo {
 | 
			
		||||
    Zero,
 | 
			
		||||
    One,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl Interned<QueryTerm> {
 | 
			
		||||
    pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
 | 
			
		||||
        let s = ctx.term_interner.get_mut(self);
 | 
			
		||||
@@ -47,34 +43,45 @@ impl Interned<QueryTerm> {
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn find_zero_typo_prefix_derivations(
 | 
			
		||||
    ctx: &mut SearchContext<'_>,
 | 
			
		||||
    word_interned: Interned<String>,
 | 
			
		||||
    fst: fst::Set<Cow<'_, [u8]>>,
 | 
			
		||||
    word_interner: &mut DedupInterner<String>,
 | 
			
		||||
    mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
 | 
			
		||||
) -> Result<()> {
 | 
			
		||||
    let word = word_interner.get(word_interned).to_owned();
 | 
			
		||||
    let word = ctx.word_interner.get(word_interned).to_owned();
 | 
			
		||||
    let word = word.as_str();
 | 
			
		||||
    let prefix = Str::new(word).starts_with();
 | 
			
		||||
    let mut stream = fst.search(prefix).into_stream();
 | 
			
		||||
 | 
			
		||||
    while let Some(derived_word) = stream.next() {
 | 
			
		||||
        let derived_word = std::str::from_utf8(derived_word)?.to_owned();
 | 
			
		||||
        let derived_word_interned = word_interner.insert(derived_word);
 | 
			
		||||
        if derived_word_interned != word_interned {
 | 
			
		||||
            let cf = visit(derived_word_interned)?;
 | 
			
		||||
            if cf.is_break() {
 | 
			
		||||
                break;
 | 
			
		||||
    let words =
 | 
			
		||||
        ctx.index.word_docids.remap_data_type::<DecodeIgnore>().prefix_iter(ctx.txn, word)?;
 | 
			
		||||
    let exact_words =
 | 
			
		||||
        ctx.index.exact_word_docids.remap_data_type::<DecodeIgnore>().prefix_iter(ctx.txn, word)?;
 | 
			
		||||
 | 
			
		||||
    for eob in merge_join_by(words, exact_words, |lhs, rhs| match (lhs, rhs) {
 | 
			
		||||
        (Ok((word, _)), Ok((exact_word, _))) => word.cmp(exact_word),
 | 
			
		||||
        (Err(_), _) | (_, Err(_)) => Ordering::Equal,
 | 
			
		||||
    }) {
 | 
			
		||||
        match eob {
 | 
			
		||||
            EitherOrBoth::Both(kv, _) | EitherOrBoth::Left(kv) | EitherOrBoth::Right(kv) => {
 | 
			
		||||
                let (derived_word, _) = kv?;
 | 
			
		||||
                let derived_word = derived_word.to_string();
 | 
			
		||||
                let derived_word_interned = ctx.word_interner.insert(derived_word);
 | 
			
		||||
                if derived_word_interned != word_interned {
 | 
			
		||||
                    let cf = visit(derived_word_interned)?;
 | 
			
		||||
                    if cf.is_break() {
 | 
			
		||||
                        break;
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    Ok(())
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn find_zero_one_typo_derivations(
 | 
			
		||||
fn find_one_typo_derivations(
 | 
			
		||||
    ctx: &mut SearchContext<'_>,
 | 
			
		||||
    word_interned: Interned<String>,
 | 
			
		||||
    is_prefix: bool,
 | 
			
		||||
    mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
 | 
			
		||||
    mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
 | 
			
		||||
) -> Result<()> {
 | 
			
		||||
    let fst = ctx.get_words_fst()?;
 | 
			
		||||
    let word = ctx.word_interner.get(word_interned).to_owned();
 | 
			
		||||
@@ -89,16 +96,9 @@ fn find_zero_one_typo_derivations(
 | 
			
		||||
        let derived_word = ctx.word_interner.insert(derived_word.to_owned());
 | 
			
		||||
        let d = dfa.distance(state.1);
 | 
			
		||||
        match d.to_u8() {
 | 
			
		||||
            0 => {
 | 
			
		||||
                if derived_word != word_interned {
 | 
			
		||||
                    let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
 | 
			
		||||
                    if cf.is_break() {
 | 
			
		||||
                        break;
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            0 => (),
 | 
			
		||||
            1 => {
 | 
			
		||||
                let cf = visit(derived_word, ZeroOrOneTypo::One)?;
 | 
			
		||||
                let cf = visit(derived_word)?;
 | 
			
		||||
                if cf.is_break() {
 | 
			
		||||
                    break;
 | 
			
		||||
                }
 | 
			
		||||
@@ -111,7 +111,7 @@ fn find_zero_one_typo_derivations(
 | 
			
		||||
    Ok(())
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
fn find_zero_one_two_typo_derivations(
 | 
			
		||||
fn find_one_two_typo_derivations(
 | 
			
		||||
    word_interned: Interned<String>,
 | 
			
		||||
    is_prefix: bool,
 | 
			
		||||
    fst: fst::Set<Cow<'_, [u8]>>,
 | 
			
		||||
@@ -144,14 +144,7 @@ fn find_zero_one_two_typo_derivations(
 | 
			
		||||
            // correct distance
 | 
			
		||||
            let d = second_dfa.distance((state.1).0);
 | 
			
		||||
            match d.to_u8() {
 | 
			
		||||
                0 => {
 | 
			
		||||
                    if derived_word_interned != word_interned {
 | 
			
		||||
                        let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
 | 
			
		||||
                        if cf.is_break() {
 | 
			
		||||
                            break;
 | 
			
		||||
                        }
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
                0 => (),
 | 
			
		||||
                1 => {
 | 
			
		||||
                    let cf = visit(derived_word_interned, NumberOfTypos::One)?;
 | 
			
		||||
                    if cf.is_break() {
 | 
			
		||||
@@ -194,8 +187,6 @@ pub fn partially_initialized_term_from_word(
 | 
			
		||||
        });
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    let fst = ctx.index.words_fst(ctx.txn)?;
 | 
			
		||||
 | 
			
		||||
    let use_prefix_db = is_prefix
 | 
			
		||||
        && (ctx
 | 
			
		||||
            .index
 | 
			
		||||
@@ -215,24 +206,19 @@ pub fn partially_initialized_term_from_word(
 | 
			
		||||
    let mut zero_typo = None;
 | 
			
		||||
    let mut prefix_of = BTreeSet::new();
 | 
			
		||||
 | 
			
		||||
    if fst.contains(word) || ctx.index.exact_word_docids.get(ctx.txn, word)?.is_some() {
 | 
			
		||||
    if ctx.index.contains_word(ctx.txn, word)? {
 | 
			
		||||
        zero_typo = Some(word_interned);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    if is_prefix && use_prefix_db.is_none() {
 | 
			
		||||
        find_zero_typo_prefix_derivations(
 | 
			
		||||
            word_interned,
 | 
			
		||||
            fst,
 | 
			
		||||
            &mut ctx.word_interner,
 | 
			
		||||
            |derived_word| {
 | 
			
		||||
                if prefix_of.len() < limits::MAX_PREFIX_COUNT {
 | 
			
		||||
                    prefix_of.insert(derived_word);
 | 
			
		||||
                    Ok(ControlFlow::Continue(()))
 | 
			
		||||
                } else {
 | 
			
		||||
                    Ok(ControlFlow::Break(()))
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
        )?;
 | 
			
		||||
        find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| {
 | 
			
		||||
            if prefix_of.len() < limits::MAX_PREFIX_COUNT {
 | 
			
		||||
                prefix_of.insert(derived_word);
 | 
			
		||||
                Ok(ControlFlow::Continue(()))
 | 
			
		||||
            } else {
 | 
			
		||||
                Ok(ControlFlow::Break(()))
 | 
			
		||||
            }
 | 
			
		||||
        })?;
 | 
			
		||||
    }
 | 
			
		||||
    let synonyms = ctx.index.synonyms(ctx.txn)?;
 | 
			
		||||
    let mut synonym_word_count = 0;
 | 
			
		||||
@@ -295,18 +281,13 @@ impl Interned<QueryTerm> {
 | 
			
		||||
        let mut one_typo_words = BTreeSet::new();
 | 
			
		||||
 | 
			
		||||
        if *max_nbr_typos > 0 {
 | 
			
		||||
            find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
 | 
			
		||||
                match nbr_typos {
 | 
			
		||||
                    ZeroOrOneTypo::Zero => {}
 | 
			
		||||
                    ZeroOrOneTypo::One => {
 | 
			
		||||
                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
 | 
			
		||||
                            one_typo_words.insert(derived_word);
 | 
			
		||||
                        } else {
 | 
			
		||||
                            return Ok(ControlFlow::Break(()));
 | 
			
		||||
                        }
 | 
			
		||||
                    }
 | 
			
		||||
            find_one_typo_derivations(ctx, original, is_prefix, |derived_word| {
 | 
			
		||||
                if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
 | 
			
		||||
                    one_typo_words.insert(derived_word);
 | 
			
		||||
                    Ok(ControlFlow::Continue(()))
 | 
			
		||||
                } else {
 | 
			
		||||
                    Ok(ControlFlow::Break(()))
 | 
			
		||||
                }
 | 
			
		||||
                Ok(ControlFlow::Continue(()))
 | 
			
		||||
            })?;
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
@@ -357,7 +338,7 @@ impl Interned<QueryTerm> {
 | 
			
		||||
        let mut two_typo_words = BTreeSet::new();
 | 
			
		||||
 | 
			
		||||
        if *max_nbr_typos > 0 {
 | 
			
		||||
            find_zero_one_two_typo_derivations(
 | 
			
		||||
            find_one_two_typo_derivations(
 | 
			
		||||
                *original,
 | 
			
		||||
                *is_prefix,
 | 
			
		||||
                ctx.index.words_fst(ctx.txn)?,
 | 
			
		||||
@@ -370,7 +351,6 @@ impl Interned<QueryTerm> {
 | 
			
		||||
                        return Ok(ControlFlow::Break(()));
 | 
			
		||||
                    }
 | 
			
		||||
                    match nbr_typos {
 | 
			
		||||
                        NumberOfTypos::Zero => {}
 | 
			
		||||
                        NumberOfTypos::One => {
 | 
			
		||||
                            if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
 | 
			
		||||
                                one_typo_words.insert(derived_word);
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user