Fix match count

ManyTheFish
2022-04-04 18:56:59 +02:00
parent 56e0edd621
commit 3bb1e35ada
4 changed files with 469 additions and 231 deletions
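In summary, MatchingWords is no longer derived from the query tree inside the highlighter: it is built from explicit groups of MatchingWord values, each group tagged with the ids of the primitive query words it covers, and every Match now records those ids so that match counting and ordering work per primitive word. A minimal sketch of the new construction path, reusing the constructors exercised by the tests in this diff (crate-internal paths; the word list and typo budgets are illustrative, not part of this commit):

use crate::search::matches::matching_words::{MatchingWord, MatchingWords};
use crate::search::matches::MatcherBuilder;

// Sketch: the caller now builds the matching words explicitly instead of passing a query tree.
fn example_builder() -> MatcherBuilder {
    // One group per primitive query word; the right-hand Vec lists the primitive
    // word ids covered by that group (an ngram group would list several ids).
    let matching_words = vec![
        (vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]),
        (vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]),
        (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
    ];
    MatcherBuilder::from_matching_words(MatchingWords::new(matching_words))
}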


@@ -1,12 +1,12 @@
use std::cmp::{min, Reverse};
use std::collections::{BTreeMap, HashMap};
use std::collections::BTreeMap;
use std::fmt;
use std::ops::{Index, IndexMut};
use levenshtein_automata::{Distance, DFA};
use meilisearch_tokenizer::Token;
use crate::search::build_dfa;
use crate::search::query_tree::{Operation, Query};
type IsPrefix = bool;
@@ -14,83 +14,129 @@ type IsPrefix = bool;
/// referencing words that match the given query tree.
#[derive(Default)]
pub struct MatchingWords {
dfas: Vec<(DFA, String, u8, IsPrefix, usize)>,
inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
}
impl MatchingWords {
pub fn from_query_tree(tree: &Operation) -> Self {
// fetch matchable words from the query tree
let mut dfas: Vec<_> = fetch_queries(tree)
.into_iter()
// create DFAs for each word
.map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id))
.collect();
// Sort words by length in DESC order prioritizing the longest word,
pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self {
// Sort words by length in DESC order prioritizing the longest matches,
// in order to highlight the longest part of the matched word.
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| {
Reverse(query_word.len())
});
Self { dfas }
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
Self { inner: matching_words }
}
/// Returns the number of matching bytes if the word matches one of the query words.
pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len)
}
pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> {
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| {
match dfa.eval(word_to_highlight.text()) {
Distance::Exact(t) if t <= *typo => {
if *is_prefix {
let len = bytes_to_highlight(word_to_highlight.text(), query_word);
Some((word_to_highlight.num_chars_from_bytes(len), *id))
} else {
Some((
word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()),
*id,
))
}
}
_otherwise => None,
}
})
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { inner: Box::new(self.inner.iter()), token }
}
}
/// Lists all words which can be considered as a match for the query tree.
fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> {
fn resolve_ops<'a>(
tree: &'a Operation,
out: &mut HashMap<(&'a str, u8, IsPrefix), usize>,
id: &mut usize,
) {
match tree {
Operation::Or(_, ops) | Operation::And(ops) => {
ops.as_slice().iter().for_each(|op| resolve_ops(op, out, id));
}
Operation::Query(Query { prefix, kind }) => {
let typo = if kind.is_exact() { 0 } else { kind.typo() };
out.entry((kind.word(), typo, *prefix)).or_insert_with(|| {
*id += 1;
*id
});
}
Operation::Phrase(words) => {
for word in words {
out.entry((word, 0, false)).or_insert_with(|| {
*id += 1;
*id
});
pub struct MatchesIter<'a, 'b> {
inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
token: &'b Token<'b>,
}
impl<'a> Iterator for MatchesIter<'a, '_> {
type Item = MatchType<'a>;
fn next(&mut self) -> Option<Self::Item> {
match self.inner.next() {
Some((matching_words, ids)) => match matching_words[0].match_token(&self.token) {
Some(char_len) => {
if matching_words.len() > 1 {
Some(MatchType::Partial(PartialMatch {
matching_words: &matching_words[1..],
ids,
char_len,
}))
} else {
Some(MatchType::Full { char_len, ids })
}
}
}
None => self.next(),
},
None => None,
}
}
}
let mut queries = HashMap::new();
let mut id = 0;
resolve_ops(tree, &mut queries, &mut id);
queries
pub type PrimitiveWordId = u8;
pub struct MatchingWord {
pub dfa: DFA,
pub word: String,
pub typo: u8,
pub prefix: IsPrefix,
}
impl fmt::Debug for MatchingWord {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("MatchingWord")
.field("word", &self.word)
.field("typo", &self.typo)
.field("prefix", &self.prefix)
.finish()
}
}
impl PartialEq for MatchingWord {
fn eq(&self, other: &Self) -> bool {
self.prefix == other.prefix && self.typo == other.typo && self.word == other.word
}
}
impl MatchingWord {
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self {
let dfa = build_dfa(&word, typo, prefix);
Self { dfa, word, typo, prefix }
}
pub fn match_token(&self, token: &Token) -> Option<usize> {
match self.dfa.eval(token.text()) {
Distance::Exact(t) if t <= self.typo => {
if self.prefix {
let len = bytes_to_highlight(token.text(), &self.word);
Some(token.num_chars_from_bytes(len))
} else {
Some(token.num_chars_from_bytes(token.text().len()))
}
}
_otherwise => None,
}
}
}
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_len: usize, ids: &'a [PrimitiveWordId] },
Partial(PartialMatch<'a>),
}
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: &'a [MatchingWord],
ids: &'a [PrimitiveWordId],
char_len: usize,
}
impl<'a> PartialMatch<'a> {
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
self.matching_words[0].match_token(token).map(|char_len| {
if self.matching_words.len() > 1 {
MatchType::Partial(PartialMatch {
matching_words: &self.matching_words[1..],
ids: self.ids,
char_len,
})
} else {
MatchType::Full { char_len, ids: self.ids }
}
})
}
pub fn char_len(&self) -> usize {
self.char_len
}
}
// A simple wrapper around a Vec so we get a contiguous memory layout but can index it like a 2D array.
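For context, the comment above refers to a flat Vec indexed as a two-dimensional array (the file imports Index and IndexMut for this). A rough sketch of such a wrapper, with illustrative type and field names rather than the ones used in the file:

use std::ops::{Index, IndexMut};

// Illustrative flat-Vec "2D" array: a single contiguous allocation indexed by (row, column).
struct Array2D<T> {
    width: usize,
    data: Vec<T>,
}

impl<T: Clone> Array2D<T> {
    fn new(width: usize, height: usize, value: T) -> Self {
        Self { width, data: vec![value; width * height] }
    }
}

impl<T> Index<(usize, usize)> for Array2D<T> {
    type Output = T;

    fn index(&self, (row, col): (usize, usize)) -> &T {
        &self.data[row * self.width + col]
    }
}

impl<T> IndexMut<(usize, usize)> for Array2D<T> {
    fn index_mut(&mut self, (row, col): (usize, usize)) -> &mut T {
        &mut self.data[row * self.width + col]
    }
}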
@@ -203,7 +249,6 @@ mod tests {
use meilisearch_tokenizer::TokenKind;
use super::*;
use crate::search::query_tree::{Operation, Query, QueryKind};
use crate::MatchingWords;
#[test]
@@ -271,102 +316,104 @@ mod tests {
#[test]
fn matching_words() {
let query_tree = Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: true,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("this".to_string()),
}),
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
);
let matching_words = vec![
(vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]),
(vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]),
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
];
let matching_words = MatchingWords::from_query_tree(&query_tree);
let matching_words = MatchingWords::new(matching_words);
assert_eq!(
matching_words.matching_bytes(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("word"),
byte_start: 0,
char_index: 0,
byte_end: "word".len(),
char_map: None,
}),
Some(3)
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("word"),
byte_start: 0,
char_index: 0,
byte_end: "word".len(),
char_map: None,
})
.next(),
Some(MatchType::Full { char_len: 3, ids: &[2] })
);
assert_eq!(
matching_words.matching_bytes(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("nyc"),
byte_start: 0,
char_index: 0,
byte_end: "nyc".len(),
char_map: None,
}),
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("nyc"),
byte_start: 0,
char_index: 0,
byte_end: "nyc".len(),
char_map: None,
})
.next(),
None
);
assert_eq!(
matching_words.matching_bytes(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("world"),
byte_start: 0,
char_index: 0,
byte_end: "world".len(),
char_map: None,
}),
Some(5)
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("world"),
byte_start: 0,
char_index: 0,
byte_end: "world".len(),
char_map: None,
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
);
assert_eq!(
matching_words.matching_bytes(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("splitted"),
byte_start: 0,
char_index: 0,
byte_end: "splitted".len(),
char_map: None,
}),
Some(5)
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("splitted"),
byte_start: 0,
char_index: 0,
byte_end: "splitted".len(),
char_map: None,
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[0] })
);
assert_eq!(
matching_words.matching_bytes(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("thisnew"),
byte_start: 0,
char_index: 0,
byte_end: "thisnew".len(),
char_map: None,
}),
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("thisnew"),
byte_start: 0,
char_index: 0,
byte_end: "thisnew".len(),
char_map: None,
})
.next(),
None
);
assert_eq!(
matching_words.matching_bytes(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("borld"),
byte_start: 0,
char_index: 0,
byte_end: "borld".len(),
char_map: None,
}),
Some(5)
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("borld"),
byte_start: 0,
char_index: 0,
byte_end: "borld".len(),
char_map: None,
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
);
assert_eq!(
matching_words.matching_bytes(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("wordsplit"),
byte_start: 0,
char_index: 0,
byte_end: "wordsplit".len(),
char_map: None,
}),
Some(4)
matching_words
.match_token(&Token {
kind: TokenKind::Word,
word: Cow::Borrowed("wordsplit"),
byte_start: 0,
char_index: 0,
byte_end: "wordsplit".len(),
char_map: None,
})
.next(),
Some(MatchType::Full { char_len: 4, ids: &[2] })
);
}
}


@@ -1,11 +1,10 @@
use std::borrow::Cow;
pub use matching_words::MatchingWords;
use matching_words::{MatchType, PrimitiveWordId};
use meilisearch_tokenizer::token::{SeparatorKind, Token};
use crate::search::query_tree::Operation;
mod matching_words;
pub mod matching_words;
const DEFAULT_CROP_SIZE: usize = 10;
const DEFAULT_CROP_MARKER: &'static str = "";
@@ -21,18 +20,6 @@ pub struct MatcherBuilder {
}
impl MatcherBuilder {
pub fn from_query_tree(query_tree: &Operation) -> Self {
let matching_words = MatchingWords::from_query_tree(query_tree);
Self {
matching_words,
crop_size: DEFAULT_CROP_SIZE,
crop_marker: None,
highlight_prefix: None,
highlight_suffix: None,
}
}
pub fn from_matching_words(matching_words: MatchingWords) -> Self {
Self {
matching_words,
@@ -93,8 +80,8 @@ impl MatcherBuilder {
#[derive(Clone, Debug)]
pub struct Match {
match_len: usize,
// id of the query word that matches.
id: usize,
// ids of the query words that match.
ids: Vec<PrimitiveWordId>,
// position of the word in the whole text.
word_position: usize,
// position of the token in the whole text.
@@ -123,10 +110,72 @@ impl<'t> Matcher<'t, '_> {
let mut matches = Vec::new();
let mut word_position = 0;
let mut token_position = 0;
for token in self.tokens {
while let Some(token) = self.tokens.get(token_position) {
if token.is_separator().is_none() {
if let Some((match_len, id)) = self.matching_words.matching_bytes_with_id(&token) {
matches.push(Match { match_len, id, word_position, token_position });
'matches: for match_type in self.matching_words.match_token(&token) {
match match_type {
MatchType::Full { char_len, ids } => {
matches.push(Match {
match_len: char_len,
ids: ids.to_vec(),
word_position,
token_position,
});
// stop on the first match
break;
}
MatchType::Partial(mut partial) => {
let mut potential_matches =
vec![(token_position, word_position, partial.char_len())];
let mut t_position = 1;
let mut w_position = 1;
'partials: for token in &self.tokens[token_position + 1..] {
if token.is_separator().is_none() {
partial = match partial.match_token(&token) {
Some(MatchType::Partial(partial)) => {
potential_matches.push((
token_position + t_position,
word_position + w_position,
partial.char_len(),
));
partial
}
// the partial match is now full, we keep these matches and advance the positions
Some(MatchType::Full { char_len, ids }) => {
let iter = potential_matches.into_iter().map(
|(token_position, word_position, match_len)| {
Match {
match_len,
ids: ids.to_vec(),
word_position,
token_position,
}
},
);
matches.extend(iter);
word_position += w_position;
token_position += t_position;
matches.push(Match {
match_len: char_len,
ids: ids.to_vec(),
word_position,
token_position,
});
break 'matches;
}
// no match found, abandon this partial match and try the next tentative match.
None => break 'partials,
};
w_position += 1;
}
t_position += 1;
}
}
}
}
word_position += 1;
}
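The loop above is the heart of the change: a group holding several MatchingWords (an ngram) first answers with MatchType::Partial, and the matcher keeps feeding it the following tokens until the partial match either completes into a Full match, at which point every traversed token is recorded with the group's ids, or fails and is dropped. A rough standalone sketch of that hand-off at the MatchingWords level (the "new"/"york" group and token values are illustrative; the sketch assumes it lives next to the tests in this matches module):

#[cfg(test)]
mod partial_match_sketch {
    use std::borrow::Cow;

    use meilisearch_tokenizer::{Token, TokenKind};

    use super::matching_words::{MatchType, MatchingWord, MatchingWords};

    #[test]
    fn ngram_match_across_tokens() {
        // A single group holding two MatchingWords, tagged with the two
        // primitive word ids it represents.
        let matching_words = MatchingWords::new(vec![(
            vec![
                MatchingWord::new("new".to_string(), 0, false),
                MatchingWord::new("york".to_string(), 0, false),
            ],
            vec![0, 1],
        )]);

        let new_token = Token {
            kind: TokenKind::Word,
            word: Cow::Borrowed("new"),
            byte_start: 0,
            char_index: 0,
            byte_end: "new".len(),
            char_map: None,
        };
        let york_token = Token {
            kind: TokenKind::Word,
            word: Cow::Borrowed("york"),
            byte_start: 4,
            char_index: 4,
            byte_end: 4 + "york".len(),
            char_map: None,
        };

        // The first token only yields a partial match...
        match matching_words.match_token(&new_token).next() {
            Some(MatchType::Partial(partial)) => {
                // ...which the matcher tries to complete with the following token.
                assert_eq!(
                    partial.match_token(&york_token),
                    Some(MatchType::Full { char_len: 4, ids: &[0, 1] })
                );
            }
            other => panic!("expected a partial match, got {:?}", other),
        }
    }
}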
@@ -229,7 +278,7 @@ impl<'t> Matcher<'t, '_> {
}
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
let mut ids = Vec::with_capacity(matches.len());
let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
let mut order_score = 0;
let mut distance_score = 0;
@@ -237,7 +286,7 @@ impl<'t> Matcher<'t, '_> {
while let Some(m) = iter.next() {
if let Some(next_match) = iter.peek() {
// if matches are ordered
if next_match.id > m.id {
if next_match.ids.iter().min() > m.ids.iter().min() {
order_score += 1;
}
@@ -245,7 +294,7 @@ impl<'t> Matcher<'t, '_> {
distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
}
ids.push(m.id);
ids.extend(m.ids.iter());
}
ids.sort_unstable();
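Because a Match can now carry several primitive word ids, the order check above compares the smallest id on each side rather than a single id per match. A tiny standalone illustration of that comparison (not the matcher code itself):

// Two matches count as "in order" when the smallest primitive word id of the
// next match is strictly greater than the smallest id of the current one.
fn in_order(current_ids: &[u8], next_ids: &[u8]) -> bool {
    next_ids.iter().min() > current_ids.iter().min()
}

fn main() {
    assert!(in_order(&[0, 1], &[2])); // e.g. an ngram (ids 0,1) followed by the word with id 2
    assert!(!in_order(&[2], &[0, 1])); // reversed word order is not rewarded
    assert!(!in_order(&[1], &[1])); // a repeated occurrence of the same word does not count either
}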
@@ -348,7 +397,8 @@ impl<'t> Matcher<'t, '_> {
.char_indices()
.enumerate()
.find(|(i, _)| *i == m.match_len)
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start)
.min(token.byte_end);
formatted.push(self.highlight_prefix);
formatted.push(&self.text[token.byte_start..highlight_byte_index]);
formatted.push(self.highlight_suffix);
@@ -386,33 +436,23 @@ mod tests {
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use super::*;
use crate::search::query_tree::{Query, QueryKind};
use crate::search::matches::matching_words::MatchingWord;
fn query_tree() -> Operation {
Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: true,
kind: QueryKind::exact("split".to_string()),
}),
Operation::Query(Query {
prefix: false,
kind: QueryKind::exact("the".to_string()),
}),
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
)
fn matching_words() -> MatchingWords {
let matching_words = vec![
(vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]),
(vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]),
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
];
MatchingWords::new(matching_words)
}
#[test]
fn format_identity() {
let query_tree = query_tree();
let matching_words = matching_words();
let builder = MatcherBuilder::from_query_tree(&query_tree);
let builder = MatcherBuilder::from_matching_words(matching_words);
let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
let highlight = false;
@@ -445,9 +485,9 @@ mod tests {
#[test]
fn format_highlight() {
let query_tree = query_tree();
let matching_words = matching_words();
let builder = MatcherBuilder::from_query_tree(&query_tree);
let builder = MatcherBuilder::from_matching_words(matching_words);
let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
let highlight = true;
@@ -497,21 +537,14 @@ mod tests {
#[test]
fn highlight_unicode() {
let query_tree = Operation::Or(
false,
vec![Operation::And(vec![
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(1, "wessfalia".to_string()),
}),
Operation::Query(Query {
prefix: true,
kind: QueryKind::tolerant(1, "world".to_string()),
}),
])],
);
let matching_words = vec![
(vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]),
(vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]),
];
let builder = MatcherBuilder::from_query_tree(&query_tree);
let matching_words = MatchingWords::new(matching_words);
let builder = MatcherBuilder::from_matching_words(matching_words);
let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
let highlight = true;
@@ -539,14 +572,14 @@ mod tests {
let tokens: Vec<_> = analyzed.tokens().collect();
let mut matcher = builder.build(&tokens[..], text);
// no crop should return complete text with highlighted matches.
assert_eq!(&matcher.format(highlight, crop), "<em>Westfália</em>");
assert_eq!(&matcher.format(highlight, crop), "<em>Westfáli</em>a");
}
#[test]
fn format_crop() {
let query_tree = query_tree();
let matching_words = matching_words();
let builder = MatcherBuilder::from_query_tree(&query_tree);
let builder = MatcherBuilder::from_matching_words(matching_words);
let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
let highlight = false;
@@ -657,9 +690,9 @@ mod tests {
#[test]
fn format_highlight_crop() {
let query_tree = query_tree();
let matching_words = matching_words();
let builder = MatcherBuilder::from_query_tree(&query_tree);
let builder = MatcherBuilder::from_matching_words(matching_words);
let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
let highlight = true;
@@ -724,9 +757,9 @@ mod tests {
#[test]
fn smaller_crop_size() {
//! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
let query_tree = query_tree();
let matching_words = matching_words();
let mut builder = MatcherBuilder::from_query_tree(&query_tree);
let mut builder = MatcherBuilder::from_matching_words(matching_words);
let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
let highlight = false;