Stop using the FST to find a word without any typo

This commit is contained in:
ManyTheFish
2025-03-12 15:44:41 +01:00
parent b0b1888ef9
commit bf144a94d8
2 changed files with 61 additions and 68 deletions

View File

@@ -1755,6 +1755,19 @@ impl Index {
} }
Ok(stats) Ok(stats)
} }
/// Tell whether `word` has an entry in this index.
///
/// The word is looked up in `word_docids` first and, only when it is absent
/// there, in `exact_word_docids`. Both databases are remapped to
/// `DecodeIgnore` and the result is only tested with `is_some()`, so the
/// stored value itself is never inspected.
///
/// # Arguments
///
/// * `rtxn`: The read transaction used for both lookups.
/// * `word`: The word to look for.
///
/// # Errors
///
/// Propagates any error returned by either database lookup.
pub fn contains_word(&self, rtxn: &RoTxn<'_>, word: &str) -> Result<bool> {
    let in_word_docids =
        self.word_docids.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some();
    if in_word_docids {
        // Found in the regular database: the exact database is not queried.
        return Ok(true);
    }

    let in_exact_word_docids =
        self.exact_word_docids.remap_data_type::<DecodeIgnore>().get(rtxn, word)?.is_some();
    Ok(in_exact_word_docids)
}
} }
#[derive(Debug, Deserialize, Serialize)] #[derive(Debug, Deserialize, Serialize)]

View File

@@ -1,10 +1,12 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::BTreeSet; use std::collections::BTreeSet;
use std::ops::ControlFlow; use std::ops::ControlFlow;
use fst::automaton::Str; use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use heed::types::DecodeIgnore; use heed::types::DecodeIgnore;
use itertools::{merge_join_by, EitherOrBoth};
use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm}; use super::{OneTypoTerm, Phrase, QueryTerm, ZeroTypoTerm};
use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
@@ -16,16 +18,10 @@ use crate::{Result, MAX_WORD_LENGTH};
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum NumberOfTypos { pub enum NumberOfTypos {
Zero,
One, One,
Two, Two,
} }
pub enum ZeroOrOneTypo {
Zero,
One,
}
impl Interned<QueryTerm> { impl Interned<QueryTerm> {
pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> { pub fn compute_fully_if_needed(self, ctx: &mut SearchContext<'_>) -> Result<()> {
let s = ctx.term_interner.get_mut(self); let s = ctx.term_interner.get_mut(self);
@@ -47,34 +43,45 @@ impl Interned<QueryTerm> {
} }
fn find_zero_typo_prefix_derivations( fn find_zero_typo_prefix_derivations(
ctx: &mut SearchContext<'_>,
word_interned: Interned<String>, word_interned: Interned<String>,
fst: fst::Set<Cow<'_, [u8]>>,
word_interner: &mut DedupInterner<String>,
mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>, mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
) -> Result<()> { ) -> Result<()> {
let word = word_interner.get(word_interned).to_owned(); let word = ctx.word_interner.get(word_interned).to_owned();
let word = word.as_str(); let word = word.as_str();
let prefix = Str::new(word).starts_with();
let mut stream = fst.search(prefix).into_stream();
while let Some(derived_word) = stream.next() { let words =
let derived_word = std::str::from_utf8(derived_word)?.to_owned(); ctx.index.word_docids.remap_data_type::<DecodeIgnore>().prefix_iter(ctx.txn, word)?;
let derived_word_interned = word_interner.insert(derived_word); let exact_words =
if derived_word_interned != word_interned { ctx.index.exact_word_docids.remap_data_type::<DecodeIgnore>().prefix_iter(ctx.txn, word)?;
let cf = visit(derived_word_interned)?;
if cf.is_break() { for eob in merge_join_by(words, exact_words, |lhs, rhs| match (lhs, rhs) {
break; (Ok((word, _)), Ok((exact_word, _))) => word.cmp(exact_word),
(Err(_), _) | (_, Err(_)) => Ordering::Equal,
}) {
match eob {
EitherOrBoth::Both(kv, _) | EitherOrBoth::Left(kv) | EitherOrBoth::Right(kv) => {
let (derived_word, _) = kv?;
let derived_word = derived_word.to_string();
let derived_word_interned = ctx.word_interner.insert(derived_word);
if derived_word_interned != word_interned {
let cf = visit(derived_word_interned)?;
if cf.is_break() {
break;
}
}
} }
} }
} }
Ok(()) Ok(())
} }
fn find_zero_one_typo_derivations( fn find_one_typo_derivations(
ctx: &mut SearchContext<'_>, ctx: &mut SearchContext<'_>,
word_interned: Interned<String>, word_interned: Interned<String>,
is_prefix: bool, is_prefix: bool,
mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>, mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
) -> Result<()> { ) -> Result<()> {
let fst = ctx.get_words_fst()?; let fst = ctx.get_words_fst()?;
let word = ctx.word_interner.get(word_interned).to_owned(); let word = ctx.word_interner.get(word_interned).to_owned();
@@ -89,16 +96,9 @@ fn find_zero_one_typo_derivations(
let derived_word = ctx.word_interner.insert(derived_word.to_owned()); let derived_word = ctx.word_interner.insert(derived_word.to_owned());
let d = dfa.distance(state.1); let d = dfa.distance(state.1);
match d.to_u8() { match d.to_u8() {
0 => { 0 => (),
if derived_word != word_interned {
let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
if cf.is_break() {
break;
}
}
}
1 => { 1 => {
let cf = visit(derived_word, ZeroOrOneTypo::One)?; let cf = visit(derived_word)?;
if cf.is_break() { if cf.is_break() {
break; break;
} }
@@ -111,7 +111,7 @@ fn find_zero_one_typo_derivations(
Ok(()) Ok(())
} }
fn find_zero_one_two_typo_derivations( fn find_one_two_typo_derivations(
word_interned: Interned<String>, word_interned: Interned<String>,
is_prefix: bool, is_prefix: bool,
fst: fst::Set<Cow<'_, [u8]>>, fst: fst::Set<Cow<'_, [u8]>>,
@@ -144,14 +144,7 @@ fn find_zero_one_two_typo_derivations(
// correct distance // correct distance
let d = second_dfa.distance((state.1).0); let d = second_dfa.distance((state.1).0);
match d.to_u8() { match d.to_u8() {
0 => { 0 => (),
if derived_word_interned != word_interned {
let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
if cf.is_break() {
break;
}
}
}
1 => { 1 => {
let cf = visit(derived_word_interned, NumberOfTypos::One)?; let cf = visit(derived_word_interned, NumberOfTypos::One)?;
if cf.is_break() { if cf.is_break() {
@@ -194,8 +187,6 @@ pub fn partially_initialized_term_from_word(
}); });
} }
let fst = ctx.index.words_fst(ctx.txn)?;
let use_prefix_db = is_prefix let use_prefix_db = is_prefix
&& (ctx && (ctx
.index .index
@@ -215,24 +206,19 @@ pub fn partially_initialized_term_from_word(
let mut zero_typo = None; let mut zero_typo = None;
let mut prefix_of = BTreeSet::new(); let mut prefix_of = BTreeSet::new();
if fst.contains(word) || ctx.index.exact_word_docids.get(ctx.txn, word)?.is_some() { if ctx.index.contains_word(ctx.txn, word)? {
zero_typo = Some(word_interned); zero_typo = Some(word_interned);
} }
if is_prefix && use_prefix_db.is_none() { if is_prefix && use_prefix_db.is_none() {
find_zero_typo_prefix_derivations( find_zero_typo_prefix_derivations(ctx, word_interned, |derived_word| {
word_interned, if prefix_of.len() < limits::MAX_PREFIX_COUNT {
fst, prefix_of.insert(derived_word);
&mut ctx.word_interner, Ok(ControlFlow::Continue(()))
|derived_word| { } else {
if prefix_of.len() < limits::MAX_PREFIX_COUNT { Ok(ControlFlow::Break(()))
prefix_of.insert(derived_word); }
Ok(ControlFlow::Continue(())) })?;
} else {
Ok(ControlFlow::Break(()))
}
},
)?;
} }
let synonyms = ctx.index.synonyms(ctx.txn)?; let synonyms = ctx.index.synonyms(ctx.txn)?;
let mut synonym_word_count = 0; let mut synonym_word_count = 0;
@@ -295,18 +281,13 @@ impl Interned<QueryTerm> {
let mut one_typo_words = BTreeSet::new(); let mut one_typo_words = BTreeSet::new();
if *max_nbr_typos > 0 { if *max_nbr_typos > 0 {
find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { find_one_typo_derivations(ctx, original, is_prefix, |derived_word| {
match nbr_typos { if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
ZeroOrOneTypo::Zero => {} one_typo_words.insert(derived_word);
ZeroOrOneTypo::One => { Ok(ControlFlow::Continue(()))
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { } else {
one_typo_words.insert(derived_word); Ok(ControlFlow::Break(()))
} else {
return Ok(ControlFlow::Break(()));
}
}
} }
Ok(ControlFlow::Continue(()))
})?; })?;
} }
@@ -357,7 +338,7 @@ impl Interned<QueryTerm> {
let mut two_typo_words = BTreeSet::new(); let mut two_typo_words = BTreeSet::new();
if *max_nbr_typos > 0 { if *max_nbr_typos > 0 {
find_zero_one_two_typo_derivations( find_one_two_typo_derivations(
*original, *original,
*is_prefix, *is_prefix,
ctx.index.words_fst(ctx.txn)?, ctx.index.words_fst(ctx.txn)?,
@@ -370,7 +351,6 @@ impl Interned<QueryTerm> {
return Ok(ControlFlow::Break(())); return Ok(ControlFlow::Break(()));
} }
match nbr_typos { match nbr_typos {
NumberOfTypos::Zero => {}
NumberOfTypos::One => { NumberOfTypos::One => {
if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
one_typo_words.insert(derived_word); one_typo_words.insert(derived_word);