Create formatter with some tests

This commit is contained in:
ManyTheFish
2022-03-22 15:22:14 +01:00
parent 900825bac0
commit d96e72e5dc
3 changed files with 469 additions and 17 deletions

View File

@ -0,0 +1,372 @@
use std::cmp::{min, Reverse};
use std::collections::{BTreeMap, HashMap};
use std::ops::{Index, IndexMut};
use levenshtein_automata::{Distance, DFA};
use meilisearch_tokenizer::Token;
use crate::search::build_dfa;
use crate::search::query_tree::{Operation, Query};
/// Flag stored next to each query word: `Query::prefix` for query words, `false` for phrase words.
type IsPrefix = bool;
/// Structure created from a query tree
/// referencing words that match the given query tree.
#[derive(Default)]
pub struct MatchingWords {
// One entry per distinct query word, laid out as
// (DFA built for the word, the word itself, typo value given to `build_dfa`, prefix flag, word id).
dfas: Vec<(DFA, String, u8, IsPrefix, usize)>,
}
impl MatchingWords {
pub fn from_query_tree(tree: &Operation) -> Self {
// fetch matchable words from the query tree
let mut dfas: Vec<_> = fetch_queries(tree)
.into_iter()
// create DFAs for each word
.map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id))
.collect();
// Sort word by len in DESC order prioritizing the longuest word,
// in order to highlight the longuest part of the matched word.
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| {
Reverse(query_word.len())
});
Self { dfas }
}
/// Returns the number of matching bytes if the word matches one of the query words.
pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len)
}
pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> {
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| {
match dfa.eval(word_to_highlight.text()) {
Distance::Exact(t) if t <= *typo => {
if *is_prefix {
let len = bytes_to_highlight(word_to_highlight.text(), query_word);
Some((word_to_highlight.num_chars_from_bytes(len), *id))
} else {
Some((
word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()),
*id,
))
}
}
_otherwise => None,
}
})
}
}
/// Lists all words which can be considered as a match for the query tree.
///
/// Every distinct `(word, typo, prefix)` triple is mapped to an id assigned on first
/// encounter while walking the tree; ids start at 1.
fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> {
    /// Bumps the counter and returns its new value.
    fn next_id(counter: &mut usize) -> usize {
        *counter += 1;
        *counter
    }

    /// Recursively walks `tree`, registering each word it references into `out`.
    fn resolve_ops<'a>(
        tree: &'a Operation,
        out: &mut HashMap<(&'a str, u8, IsPrefix), usize>,
        id: &mut usize,
    ) {
        match tree {
            Operation::Or(_, ops) | Operation::And(ops) => {
                for op in ops {
                    resolve_ops(op, out, id);
                }
            }
            Operation::Query(Query { prefix, kind }) => {
                // Exact queries are registered with a typo value of 0.
                let typo = if kind.is_exact() { 0 } else { kind.typo() };
                out.entry((kind.word(), typo, *prefix)).or_insert_with(|| next_id(id));
            }
            Operation::Phrase(words) => {
                // Phrase words are registered with no typo and no prefix matching.
                for word in words {
                    out.entry((word, 0, false)).or_insert_with(|| next_id(id));
                }
            }
        }
    }

    let mut id = 0;
    let mut queries = HashMap::new();
    resolve_ops(tree, &mut queries, &mut id);
    queries
}
// Thin wrapper over a single contiguous `Vec`, addressed with `(x, y)` pairs as if it were a
// 2D array. The cell `(x, y)` lives at offset `x * y_size + y`.
struct N2Array<T> {
    y_size: usize,
    buf: Vec<T>,
}

impl<T: Clone> N2Array<T> {
    // Allocates `x * y` cells, all initialised with clones of `value`.
    fn new(x: usize, y: usize, value: T) -> N2Array<T> {
        let buf = vec![value; x * y];
        Self { y_size: y, buf }
    }
}

impl<T> N2Array<T> {
    // Translates an `(x, y)` coordinate into its offset in the flat buffer.
    #[inline]
    fn offset(&self, x: usize, y: usize) -> usize {
        x * self.y_size + y
    }
}

impl<T> Index<(usize, usize)> for N2Array<T> {
    type Output = T;

    #[inline]
    fn index(&self, (x, y): (usize, usize)) -> &T {
        let at = self.offset(x, y);
        &self.buf[at]
    }
}

impl<T> IndexMut<(usize, usize)> for N2Array<T> {
    #[inline]
    fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
        let at = self.offset(x, y);
        &mut self.buf[at]
    }
}
/// Returns the number of **bytes** we want to highlight in the `source` word.
///
/// The idea is to highlight as many characters of `source` as possible before it accumulates
/// too many typos against `target`. The distance matrix is filled with a modified
/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
/// recurrence.
///
/// Shortcuts taken before the matrix is built:
/// - an empty `source` yields 0;
/// - a `target` shorter than 3 characters yields the byte length of the first
///   `target`-length characters of `source`;
/// - identical words yield `source.len()`.
///
/// Otherwise the column holding the smallest value in the last matrix row (first one on ties)
/// gives the number of `source` characters to keep, which is converted to a byte count.
fn bytes_to_highlight(source: &str, target: &str) -> usize {
    // Byte length of the first `count` characters of `source`.
    let prefix_len =
        |count: usize| -> usize { source.chars().take(count).map(char::len_utf8).sum() };

    let n = source.chars().count();
    let m = target.chars().count();

    if n == 0 {
        return 0;
    }
    // since we allow two typos we can send two characters even if it's completely wrong
    if m < 3 {
        return prefix_len(m);
    }
    if n == m && source == target {
        return source.len();
    }

    // Sentinel larger than any distance the recurrence can produce.
    let inf = n + m;
    let mut matrix = N2Array::new(n + 2, m + 2, 0usize);

    matrix[(0, 0)] = inf;
    for i in 0..=n {
        matrix[(i + 1, 0)] = inf;
        matrix[(i + 1, 1)] = i;
    }
    for j in 0..=m {
        matrix[(0, j + 1)] = inf;
        matrix[(1, j + 1)] = j;
    }

    // For each character, the last (1-based) row of `source` where it was seen.
    let mut last_row: BTreeMap<char, usize> = BTreeMap::new();

    for (row, char_s) in (1..).zip(source.chars()) {
        // Last (1-based) column of `target` that matched `char_s` in this row.
        let mut last_match_col = 0;

        for (col, char_t) in (1..).zip(target.chars()) {
            let last_match_row = last_row.get(&char_t).copied().unwrap_or(0);
            let cost = if char_s == char_t { 0 } else { 1 };

            let dist_add = matrix[(row, col + 1)] + 1;
            let dist_del = matrix[(row + 1, col)] + 1;
            let dist_sub = matrix[(row, col)] + cost;
            let dist_trans = matrix[(last_match_row, last_match_col)]
                + (row - last_match_row - 1)
                + 1
                + (col - last_match_col - 1);

            matrix[(row + 1, col + 1)] = min(min(dist_add, dist_del), min(dist_sub, dist_trans));

            if cost == 0 {
                last_match_col = col;
            }
        }

        last_row.insert(char_s, row);
    }

    // Pick the column with the smallest value in the last row, keeping the first one on ties.
    let best_col = (0..=m).min_by_key(|&x| matrix[(n + 1, x + 1)] as u32).unwrap_or(0);

    // everything was done character-wise and now we want to return a number of bytes
    prefix_len(best_col)
}
#[cfg(test)]
mod tests {
    use std::borrow::Cow;
    use std::str::from_utf8;

    use meilisearch_tokenizer::TokenKind;

    use super::*;
    use crate::search::query_tree::{Operation, Query, QueryKind};
    use crate::MatchingWords;

    /// Builds a standalone word token whose text is `word`, starting at byte 0.
    fn word_token(word: &str) -> Token {
        Token {
            kind: TokenKind::Word,
            word: Cow::Borrowed(word),
            byte_start: 0,
            char_index: 0,
            byte_end: word.len(),
            char_map: None,
        }
    }

    #[test]
    fn test_bytes_to_highlight() {
        struct TestBytesToHighlight {
            query: &'static str,
            text: &'static str,
            length: usize,
        }
        let tests = [
            TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() },
            TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() },
            TestBytesToHighlight {
                query: "Levenshtein",
                text: "Levenshtein",
                length: "Levenshtein".len(),
            },
            // we get to the end of our word with only one typo
            TestBytesToHighlight {
                query: "Levenste",
                text: "Levenshtein",
                length: "Levenste".len(),
            },
            // we get our third and last authorized typo right on the last character
            TestBytesToHighlight {
                query: "Levenstein",
                text: "Levenshte",
                length: "Levenste".len(),
            },
            // we get to the end of our word with only two typos at the beginning
            TestBytesToHighlight {
                query: "Bavenshtein",
                text: "Levenshtein",
                length: "Bavenshtein".len(),
            },
            TestBytesToHighlight {
                query: "Альфа", text: "Альфой", length: "Альф".len()
            },
            TestBytesToHighlight {
                query: "Go💼", text: "Go💼od luck.", length: "Go💼".len()
            },
            TestBytesToHighlight {
                query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len()
            },
            TestBytesToHighlight {
                query: "chäräcters",
                text: "chäräcters",
                length: "chäräcters".len(),
            },
            TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() },
            TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() },
        ];

        for test in &tests {
            let length = bytes_to_highlight(test.text, test.query);
            assert_eq!(length, test.length, r#"length between: "{}" "{}""#, test.query, test.text);
            // The returned length counts bytes of the first argument (`test.text`), so the
            // char-boundary check has to slice that string, not the query.
            assert!(
                from_utf8(&test.text.as_bytes()[..length]).is_ok(),
                r#"converting {}[..{}] to an utf8 str failed"#,
                test.text,
                length
            );
        }
    }

    #[test]
    fn matching_words() {
        let query_tree = Operation::Or(
            false,
            vec![Operation::And(vec![
                Operation::Query(Query {
                    prefix: true,
                    kind: QueryKind::exact("split".to_string()),
                }),
                Operation::Query(Query {
                    prefix: false,
                    kind: QueryKind::exact("this".to_string()),
                }),
                Operation::Query(Query {
                    prefix: true,
                    kind: QueryKind::tolerant(1, "world".to_string()),
                }),
            ])],
        );

        let matching_words = MatchingWords::from_query_tree(&query_tree);

        // (token text, expected result of `matching_bytes`)
        let cases: [(&str, Option<usize>); 7] = [
            ("word", Some(3)),
            ("nyc", None),
            ("world", Some(5)),
            ("splitted", Some(5)),
            ("thisnew", None),
            ("borld", Some(5)),
            ("wordsplit", Some(4)),
        ];

        for &(word, expected) in cases.iter() {
            assert_eq!(
                matching_words.matching_bytes(&word_token(word)),
                expected,
                "unexpected match length for {:?}",
                word
            );
        }
    }
}

View File

@ -0,0 +1,434 @@
use std::borrow::Cow;
use matching_words::MatchingWords;
use meilisearch_tokenizer::token::SeparatorKind;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token};
use crate::search::query_tree::Operation;
pub mod matching_words;
/// Number of words kept by a crop when the builder is not given a crop size.
const DEFAULT_CROP_SIZE: usize = 10;
/// Marker inserted where text was cropped out when the builder is not given one.
/// The formatter tests of this module expect `…` around cropped text with an untouched builder.
const DEFAULT_CROP_MARKER: &str = "…";
/// Text inserted before a highlighted match when the builder is not given one.
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
/// Text inserted after a highlighted match when the builder is not given one.
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
/// Holds the matching words of a query plus the formatting options used to create
/// a `Matcher` for a given text.
pub struct MatcherBuilder {
// words to look for, extracted from the query tree.
matching_words: MatchingWords,
// number of words kept when cropping.
crop_size: usize,
// custom crop marker; `None` falls back to `DEFAULT_CROP_MARKER`.
crop_marker: Option<String>,
// custom text put before a match; `None` falls back to `DEFAULT_HIGHLIGHT_PREFIX`.
highlight_prefix: Option<String>,
// custom text put after a match; `None` falls back to `DEFAULT_HIGHLIGHT_SUFFIX`.
highlight_suffix: Option<String>,
}
impl MatcherBuilder {
    /// Creates a builder matching the words of `query_tree`, with the default crop size and
    /// no custom markers.
    pub fn from_query_tree(query_tree: &Operation) -> Self {
        let matching_words = MatchingWords::from_query_tree(query_tree);

        Self {
            matching_words,
            crop_size: DEFAULT_CROP_SIZE,
            crop_marker: None,
            highlight_prefix: None,
            highlight_suffix: None,
        }
    }

    /// Sets the number of words kept when cropping.
    ///
    /// Returns `&mut Self` so that setters can be chained.
    pub fn crop_size(&mut self, word_count: usize) -> &mut Self {
        self.crop_size = word_count;
        self
    }

    /// Sets the marker inserted where text was cropped out.
    ///
    /// Returns `&mut Self` so that setters can be chained.
    pub fn crop_marker(&mut self, marker: String) -> &mut Self {
        self.crop_marker = Some(marker);
        self
    }

    /// Sets the text inserted before each highlighted match.
    ///
    /// Returns `&mut Self` so that setters can be chained.
    pub fn highlight_prefix(&mut self, prefix: String) -> &mut Self {
        self.highlight_prefix = Some(prefix);
        self
    }

    /// Sets the text inserted after each highlighted match.
    ///
    /// Returns `&mut Self` so that setters can be chained.
    pub fn highlight_suffix(&mut self, suffix: String) -> &mut Self {
        self.highlight_suffix = Some(suffix);
        self
    }

    /// Creates a `Matcher` over `text` and its `tokens`.
    ///
    /// Markers that were not customised fall back to the module defaults. Matches are not
    /// computed here: the matcher starts with `matches: None`.
    pub fn build<'t, 'm>(&'m self, tokens: &'t [Token], text: &'t str) -> Matcher<'t, 'm> {
        Matcher {
            text,
            tokens,
            matching_words: &self.matching_words,
            crop_size: self.crop_size,
            crop_marker: self.crop_marker.as_deref().unwrap_or(DEFAULT_CROP_MARKER),
            highlight_prefix: self.highlight_prefix.as_deref().unwrap_or(DEFAULT_HIGHLIGHT_PREFIX),
            highlight_suffix: self.highlight_suffix.as_deref().unwrap_or(DEFAULT_HIGHLIGHT_SUFFIX),
            matches: None,
        }
    }
}
// impl Default for MatcherBuilder {
// fn default() -> Self {
// Self {
// crop_size: DEFAULT_CROP_SIZE,
// crop_marker: None,
// highlight_prefix: None,
// highlight_suffix: None,
// }
// }
// }
/// A token of the text that was accepted by one of the query words.
pub struct Match<'t> {
// the matching token, borrowed from the token slice handed to the matcher.
token: &'t Token<'t>,
// length returned by `MatchingWords::matching_bytes_with_id` for this token.
match_len: usize,
// id of the query word that matches.
id: usize,
// position of the word in the whole text.
position: usize,
}
/// Location of a match inside the formatted text, as returned by `Matcher::matches`.
///
/// The fields are public so that callers outside this module can read the bounds they
/// are given.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MatchBounds {
    /// Byte offset at which the matching token starts in the text.
    pub start: usize,
    /// Length of the match, copied from the match it was built from.
    pub length: usize,
}
impl<'t> From<&Match<'t>> for MatchBounds {
fn from(m: &Match) -> Self {
MatchBounds { start: m.token.byte_start, length: m.match_len }
}
}
/// Finds the query words in a tokenized text and formats that text (highlight and/or crop).
///
/// `'t` is the lifetime of the text and its tokens, `'m` the lifetime of the data borrowed
/// from the `MatcherBuilder`.
pub struct Matcher<'t, 'm> {
// the text to format.
text: &'t str,
// tokens of `text`.
tokens: &'t [Token<'t>],
// words to look for, borrowed from the builder.
matching_words: &'m MatchingWords,
// number of words kept when cropping.
crop_size: usize,
// marker inserted where text was cropped out; skipped when empty.
crop_marker: &'m str,
// text inserted before each highlighted match.
highlight_prefix: &'m str,
// text inserted after each highlighted match.
highlight_suffix: &'m str,
// matches found in `tokens`; `None` until they have been computed.
matches: Option<Vec<Match<'t>>>,
}
impl<'t> Matcher<'t, '_> {
    /// Scans every token and stores the ones accepted by the matching words.
    ///
    /// The word position advances by 1 for each non-separator token and by 7 for each hard
    /// separator; other separators leave it untouched.
    fn compute_matches(&mut self) -> &mut Self {
        let mut found = Vec::new();
        let mut word_position = 0;

        for token in self.tokens {
            match token.is_separator() {
                Some(SeparatorKind::Hard) => word_position += 7,
                Some(_) => {}
                None => {
                    if let Some((match_len, id)) =
                        self.matching_words.matching_bytes_with_id(token)
                    {
                        found.push(Match { token, match_len, id, position: word_position });
                    }
                    word_position += 1;
                }
            }
        }

        self.matches = Some(found);
        self
    }

    /// Returns the bounds of every match, computing the matches first if needed.
    pub fn matches(&mut self) -> Vec<MatchBounds> {
        if self.matches.is_none() {
            self.compute_matches();
        }

        self.matches.iter().flatten().map(MatchBounds::from).collect()
    }

    /// Returns the `(byte_start, byte_end)` window kept when cropping.
    ///
    /// The window always starts at 0 and ends with the `crop_size`-th word of the text; when
    /// no word is selected it ends at the end of the text. The matches are not taken into
    /// account yet.
    fn crop_bounds(&self, _matches: &[Match<'t>]) -> (usize, usize) {
        let last_kept_word = self
            .tokens
            .iter()
            .filter(|token| token.is_separator().is_none())
            .take(self.crop_size)
            .last();

        let byte_end = match last_kept_word {
            Some(token) => token.byte_end,
            None => self.text.len(),
        };

        (0, byte_end)
    }

    /// Returns the text, optionally cropped and with the matches wrapped in highlight markers.
    ///
    /// The original text is borrowed whenever nothing has to be inserted.
    pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
        let text = self.text;

        // compute matches is not needed if no highlight or crop is requested.
        if !(highlight || crop) {
            return Cow::Borrowed(text);
        }

        if self.matches.is_none() {
            self.compute_matches();
        }
        let matches: &[Match<'t>] = match &self.matches {
            Some(found) => found.as_slice(),
            None => &[],
        };

        let (byte_start, byte_end) =
            if crop { self.crop_bounds(matches) } else { (0, text.len()) };

        let mut pieces: Vec<&str> = Vec::new();

        // push crop marker if it's not the start of the text.
        if byte_start > 0 && !self.crop_marker.is_empty() {
            pieces.push(self.crop_marker);
        }

        // first byte of the text that has not been emitted yet.
        let mut cursor = byte_start;

        if highlight {
            let in_window = matches
                .iter()
                .skip_while(|m| m.token.byte_start < byte_start)
                .take_while(|m| m.token.byte_start < byte_end);

            // insert highlight markers around matches.
            for m in in_window {
                let (start, end) = (m.token.byte_start, m.token.byte_end);
                if cursor < start {
                    pieces.push(&text[cursor..start]);
                }
                pieces.push(self.highlight_prefix);
                pieces.push(&text[start..end]);
                pieces.push(self.highlight_suffix);
                cursor = end;
            }
        }

        // push the rest of the text between last match and the end of crop.
        if cursor < byte_end {
            pieces.push(&text[cursor..byte_end]);
        }

        // push crop marker if it's not the end of the text.
        if byte_end < text.len() && !self.crop_marker.is_empty() {
            pieces.push(self.crop_marker);
        }

        if pieces.len() == 1 {
            // avoid concatenating if there is already 1 slice.
            Cow::Borrowed(&text[byte_start..byte_end])
        } else {
            Cow::Owned(pieces.concat())
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::search::query_tree::{Query, QueryKind};

    // Text without any match.
    const NO_MATCH: &str = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
    // Text containing all matches.
    const ALL_MATCHES: &str = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
    // Text containing some matches.
    const SOME_MATCHES: &str = "Natalie risk her future to build a world with the boy she loves.";
    // Text containing a match unordered and a match ordered.
    const UNORDERED_THEN_ORDERED: &str = "The world split void void void void void void void void void split the world void void";

    fn query_tree() -> Operation {
        let split = Query { prefix: true, kind: QueryKind::exact("split".to_string()) };
        let the = Query { prefix: false, kind: QueryKind::exact("the".to_string()) };
        let world = Query { prefix: true, kind: QueryKind::tolerant(1, "world".to_string()) };

        Operation::Or(
            false,
            vec![Operation::And(vec![
                Operation::Query(split),
                Operation::Query(the),
                Operation::Query(world),
            ])],
        )
    }

    /// Tokenizes `text` and formats it with a default builder created from `query_tree()`.
    fn formatted(text: &str, highlight: bool, crop: bool) -> String {
        let query_tree = query_tree();
        let builder = MatcherBuilder::from_query_tree(&query_tree);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());

        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);

        matcher.format(highlight, crop).into_owned()
    }

    #[test]
    fn format_identity() {
        let (highlight, crop) = (false, false);

        // no crop and no highlight should return complete text.
        let texts = [NO_MATCH, ALL_MATCHES, SOME_MATCHES];
        for text in texts.iter().copied() {
            assert_eq!(formatted(text, highlight, crop), text);
        }
    }

    #[test]
    fn format_highlight() {
        let (highlight, crop) = (true, false);

        let cases = [
            // no crop should return complete text, because there is no matches.
            (NO_MATCH, NO_MATCH),
            // no crop should return complete text with highlighted matches.
            (
                ALL_MATCHES,
                "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>",
            ),
            // no crop should return complete text with highlighted matches.
            (
                SOME_MATCHES,
                "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves.",
            ),
        ];

        for (text, expected) in cases.iter().copied() {
            assert_eq!(formatted(text, highlight, crop), expected);
        }
    }

    #[test]
    fn format_crop() {
        let (highlight, crop) = (false, true);

        let cases = [
            // no highlight should return 10 first words with a marker at the end.
            (NO_MATCH, "A quick brown fox can not jump 32 feet, right…"),
            // no highlight should return 10 last words with a marker at the start.
            (ALL_MATCHES, "…she loves. Emily Henry: The Love That Split The World"),
            // no highlight should return 10 last words with a marker at the start.
            (SOME_MATCHES, "…future to build a world with the boy she loves."),
            // crop should return 10 last words with a marker at the start.
            (UNORDERED_THEN_ORDERED, "…void void void void void split the world void void"),
        ];

        for (text, expected) in cases.iter().copied() {
            assert_eq!(formatted(text, highlight, crop), expected);
        }
    }

    #[test]
    fn format_highlight_crop() {
        let (highlight, crop) = (true, true);

        let cases = [
            // both should return 10 first words with a marker at the end.
            (NO_MATCH, "A quick brown fox can not jump 32 feet, right…"),
            // both should return 10 last words with a marker at the start and highlighted matches.
            (
                ALL_MATCHES,
                "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>",
            ),
            // both should return 10 last words with a marker at the start and highlighted matches.
            (
                SOME_MATCHES,
                "…future to build a <em>world</em> with <em>the</em> boy she loves.",
            ),
            // crop should return 10 last words with a marker at the start.
            (
                UNORDERED_THEN_ORDERED,
                "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void",
            ),
        ];

        for (text, expected) in cases.iter().copied() {
            assert_eq!(formatted(text, highlight, crop), expected);
        }
    }
}