Remove old query_tree code and make clippy happy

This commit is contained in:
Loïc Lecrenier
2023-03-23 09:39:16 +01:00
parent f5f5f03ec0
commit 7169d85115
8 changed files with 82 additions and 1590 deletions

View File

@ -1,21 +1,14 @@
pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
use self::fst_utils::{Complement, Intersection, StartsWith, Union};
pub use self::matches::{
FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
};
use crate::{
execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
};
use fst::automaton::Str;
use fst::{Automaton, IntoStreamer, Streamer};
use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
use std::borrow::Cow;
use std::collections::hash_map::{Entry, HashMap};
use std::fmt;
use std::result::Result as StdResult;
use std::str::Utf8Error;
// Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
@ -26,7 +19,6 @@ pub mod facet;
mod fst_utils;
mod matches;
pub mod new;
mod query_tree;
pub struct Search<'a> {
query: Option<String>,
@ -200,70 +192,6 @@ impl Default for TermsMatchingStrategy {
}
}
pub type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>;
pub fn word_derivations<'c>(
word: &str,
is_prefix: bool,
max_typo: u8,
fst: &fst::Set<Cow<[u8]>>,
cache: &'c mut WordDerivationsCache,
) -> StdResult<&'c [(String, u8)], Utf8Error> {
match cache.entry((word.to_string(), is_prefix, max_typo)) {
Entry::Occupied(entry) => Ok(entry.into_mut()),
Entry::Vacant(entry) => {
// println!("word derivations {word} {is_prefix} {max_typo}");
let mut derived_words = Vec::new();
if max_typo == 0 {
if is_prefix {
let prefix = Str::new(word).starts_with();
let mut stream = fst.search(prefix).into_stream();
while let Some(word) = stream.next() {
let word = std::str::from_utf8(word)?;
derived_words.push((word.to_string(), 0));
}
} else if fst.contains(word) {
derived_words.push((word.to_string(), 0));
}
} else if max_typo == 1 {
let dfa = build_dfa(word, 1, is_prefix);
let starts = StartsWith(Str::new(get_first(word)));
let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
while let Some((word, state)) = stream.next() {
let word = std::str::from_utf8(word)?;
let d = dfa.distance(state.1);
derived_words.push((word.to_string(), d.to_u8()));
}
} else {
let starts = StartsWith(Str::new(get_first(word)));
let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
let second_dfa = build_dfa(word, 2, is_prefix);
let second = Intersection(&second_dfa, &starts);
let automaton = Union(first, &second);
let mut stream = fst.search_with_state(automaton).into_stream();
while let Some((found_word, state)) = stream.next() {
let found_word = std::str::from_utf8(found_word)?;
// in the case the typo is on the first letter, we know the number of typo
// is two
if get_first(found_word) != get_first(word) {
derived_words.push((found_word.to_string(), 2));
} else {
// Else, we know that it is the second dfa that matched and compute the
// correct distance
let d = second_dfa.distance((state.1).0);
derived_words.push((found_word.to_string(), d.to_u8()));
}
}
}
Ok(entry.insert(derived_words))
}
}
}
fn get_first(s: &str) -> &str {
match s.chars().next() {
Some(c) => &s[..c.len_utf8()],
@ -337,66 +265,66 @@ mod test {
assert!(!search.is_typo_authorized().unwrap());
}
#[test]
fn test_one_typos_tolerance() {
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
let mut cache = HashMap::new();
let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap();
// #[test]
// fn test_one_typos_tolerance() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap();
assert_eq!(found, &[("zealand".to_string(), 1)]);
}
// assert_eq!(found, &[("zealand".to_string(), 1)]);
// }
#[test]
fn test_one_typos_first_letter() {
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
let mut cache = HashMap::new();
let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap();
// #[test]
// fn test_one_typos_first_letter() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap();
assert_eq!(found, &[]);
}
// assert_eq!(found, &[]);
// }
#[test]
fn test_two_typos_tolerance() {
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
let mut cache = HashMap::new();
let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap();
// #[test]
// fn test_two_typos_tolerance() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap();
assert_eq!(found, &[("zealand".to_string(), 2)]);
}
// assert_eq!(found, &[("zealand".to_string(), 2)]);
// }
#[test]
fn test_two_typos_first_letter() {
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
let mut cache = HashMap::new();
let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap();
// #[test]
// fn test_two_typos_first_letter() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap();
assert_eq!(found, &[("zealand".to_string(), 2)]);
}
// assert_eq!(found, &[("zealand".to_string(), 2)]);
// }
#[test]
fn test_prefix() {
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
let mut cache = HashMap::new();
let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap();
// #[test]
// fn test_prefix() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap();
assert_eq!(found, &[("zealand".to_string(), 0)]);
}
// assert_eq!(found, &[("zealand".to_string(), 0)]);
// }
#[test]
fn test_bad_prefix() {
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
let mut cache = HashMap::new();
let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap();
// #[test]
// fn test_bad_prefix() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap();
assert_eq!(found, &[]);
}
// assert_eq!(found, &[]);
// }
#[test]
fn test_prefix_with_typo() {
let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
let mut cache = HashMap::new();
let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap();
// #[test]
// fn test_prefix_with_typo() {
// let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
// let mut cache = HashMap::new();
// let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap();
assert_eq!(found, &[("zealand".to_string(), 1)]);
}
// assert_eq!(found, &[("zealand".to_string(), 1)]);
// }
}