mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-29 18:04:47 +00:00
Deduplicate matching words
This commit is contained in:
@ -2,6 +2,7 @@ use std::cmp::{min, Reverse};
|
||||
use std::collections::BTreeMap;
|
||||
use std::fmt;
|
||||
use std::ops::{Index, IndexMut};
|
||||
use std::rc::Rc;
|
||||
|
||||
use charabia::Token;
|
||||
use levenshtein_automata::{Distance, DFA};
|
||||
@ -14,11 +15,11 @@ type IsPrefix = bool;
|
||||
/// referencing words that match the given query tree.
|
||||
#[derive(Default)]
|
||||
pub struct MatchingWords {
|
||||
inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
|
||||
inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self {
|
||||
pub fn new(mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>) -> Self {
|
||||
// Sort words by length in DESC order, prioritizing the longest matches,
|
||||
// in order to highlight the longest part of the matched word.
|
||||
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
|
||||
@ -35,7 +36,8 @@ impl MatchingWords {
|
||||
/// Iterator over terms that match the given token.
|
||||
/// This allows matches to be evaluated lazily.
|
||||
pub struct MatchesIter<'a, 'b> {
|
||||
inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
|
||||
#[allow(clippy::type_complexity)]
|
||||
inner: Box<dyn Iterator<Item = &'a (Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)> + 'a>,
|
||||
token: &'b Token<'b>,
|
||||
}
|
||||
|
||||
@ -126,7 +128,7 @@ pub enum MatchType<'a> {
|
||||
/// Structure helper to match several tokens in a row in order to complete a partial match.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PartialMatch<'a> {
|
||||
matching_words: &'a [MatchingWord],
|
||||
matching_words: &'a [Rc<MatchingWord>],
|
||||
ids: &'a [PrimitiveWordId],
|
||||
char_len: usize,
|
||||
}
|
||||
@ -332,10 +334,15 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn matching_words() {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("split".to_string(), 1, true)),
|
||||
Rc::new(MatchingWord::new("this".to_string(), 0, false)),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true)),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]),
|
||||
(vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]),
|
||||
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone()], vec![1]),
|
||||
(vec![all[2].clone()], vec![2]),
|
||||
];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words);
|
||||
|
@ -494,16 +494,23 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::rc::Rc;
|
||||
|
||||
use charabia::TokenizerBuilder;
|
||||
|
||||
use super::*;
|
||||
use crate::search::matches::matching_words::MatchingWord;
|
||||
|
||||
fn matching_words() -> MatchingWords {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("split".to_string(), 0, false)),
|
||||
Rc::new(MatchingWord::new("the".to_string(), 0, false)),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true)),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]),
|
||||
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]),
|
||||
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone()], vec![1]),
|
||||
(vec![all[2].clone()], vec![2]),
|
||||
];
|
||||
|
||||
MatchingWords::new(matching_words)
|
||||
@ -587,10 +594,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn highlight_unicode() {
|
||||
let matching_words = vec![
|
||||
(vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]),
|
||||
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]),
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("wessfali".to_string(), 1, true)),
|
||||
Rc::new(MatchingWord::new("world".to_string(), 1, true)),
|
||||
];
|
||||
let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words);
|
||||
|
||||
@ -823,24 +831,20 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn partial_matches() {
|
||||
let all = vec![
|
||||
Rc::new(MatchingWord::new("the".to_string(), 0, false)),
|
||||
Rc::new(MatchingWord::new("t".to_string(), 0, false)),
|
||||
Rc::new(MatchingWord::new("he".to_string(), 0, false)),
|
||||
Rc::new(MatchingWord::new("door".to_string(), 0, false)),
|
||||
Rc::new(MatchingWord::new("do".to_string(), 0, false)),
|
||||
Rc::new(MatchingWord::new("or".to_string(), 0, false)),
|
||||
];
|
||||
let matching_words = vec![
|
||||
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]),
|
||||
(
|
||||
vec![
|
||||
MatchingWord::new("t".to_string(), 0, false),
|
||||
MatchingWord::new("he".to_string(), 0, false),
|
||||
],
|
||||
vec![0],
|
||||
),
|
||||
(vec![MatchingWord::new("door".to_string(), 0, false)], vec![1]),
|
||||
(
|
||||
vec![
|
||||
MatchingWord::new("do".to_string(), 0, false),
|
||||
MatchingWord::new("or".to_string(), 0, false),
|
||||
],
|
||||
vec![1],
|
||||
),
|
||||
(vec![MatchingWord::new("do".to_string(), 0, false)], vec![2]),
|
||||
(vec![all[0].clone()], vec![0]),
|
||||
(vec![all[1].clone(), all[2].clone()], vec![0]),
|
||||
(vec![all[3].clone()], vec![1]),
|
||||
(vec![all[4].clone(), all[5].clone()], vec![1]),
|
||||
(vec![all[4].clone()], vec![2]),
|
||||
];
|
||||
|
||||
let matching_words = MatchingWords::new(matching_words);
|
||||
|
Reference in New Issue
Block a user