Avoid creating a MatchingWord for words that exceed the length limit

This commit is contained in:
Loïc Lecrenier
2022-11-24 09:00:53 +01:00
parent 86c34a996b
commit 8d0ace2d64
8 changed files with 111 additions and 62 deletions

View File

@ -8,6 +8,7 @@ use charabia::Token;
use levenshtein_automata::{Distance, DFA};
use crate::search::build_dfa;
use crate::MAX_WORD_LENGTH;
type IsPrefix = bool;
@ -18,6 +19,17 @@ pub struct MatchingWords {
inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
}
/// Multi-line debug rendering: an opening `[` line, then one
/// `(matching words, primitive word ids)` line per entry of `inner`,
/// then a closing `]` line.
impl fmt::Debug for MatchingWords {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "[")?;
        // Stop at the first formatting error, exactly like `?` inside a loop would.
        self.inner.iter().try_for_each(|(matching_words, primitive_word_id)| {
            writeln!(f, "({matching_words:?}, {primitive_word_id:?})")
        })?;
        // The result of the final write is the result of the whole rendering.
        writeln!(f, "]")
    }
}
impl MatchingWords {
pub fn new(mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>) -> Self {
// Sort word by len in DESC order prioritizing the longest matches,
@ -93,10 +105,13 @@ impl PartialEq for MatchingWord {
}
impl MatchingWord {
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self {
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option<Self> {
if word.len() > MAX_WORD_LENGTH {
return None;
}
let dfa = build_dfa(&word, typo, prefix);
Self { dfa, word, typo, prefix }
Some(Self { dfa, word, typo, prefix })
}
/// Returns the length in chars of the match in case the token matches the term.
@ -335,9 +350,9 @@ mod tests {
#[test]
fn matching_words() {
let all = vec![
Rc::new(MatchingWord::new("split".to_string(), 1, true)),
Rc::new(MatchingWord::new("this".to_string(), 0, false)),
Rc::new(MatchingWord::new("world".to_string(), 1, true)),
Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()),
Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
];
let matching_words = vec![
(vec![all[0].clone()], vec![0]),

View File

@ -503,9 +503,9 @@ mod tests {
fn matching_words() -> MatchingWords {
let all = vec![
Rc::new(MatchingWord::new("split".to_string(), 0, false)),
Rc::new(MatchingWord::new("the".to_string(), 0, false)),
Rc::new(MatchingWord::new("world".to_string(), 1, true)),
Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
];
let matching_words = vec![
(vec![all[0].clone()], vec![0]),
@ -595,8 +595,8 @@ mod tests {
#[test]
fn highlight_unicode() {
let all = vec![
Rc::new(MatchingWord::new("wessfali".to_string(), 1, true)),
Rc::new(MatchingWord::new("world".to_string(), 1, true)),
Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()),
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
];
let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
@ -832,12 +832,12 @@ mod tests {
#[test]
fn partial_matches() {
let all = vec![
Rc::new(MatchingWord::new("the".to_string(), 0, false)),
Rc::new(MatchingWord::new("t".to_string(), 0, false)),
Rc::new(MatchingWord::new("he".to_string(), 0, false)),
Rc::new(MatchingWord::new("door".to_string(), 0, false)),
Rc::new(MatchingWord::new("do".to_string(), 0, false)),
Rc::new(MatchingWord::new("or".to_string(), 0, false)),
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()),
];
let matching_words = vec![
(vec![all[0].clone()], vec![0]),