mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-21 12:16:26 +00:00
Merge with main branch
This commit is contained in:
139
crates/milli/src/search/new/matches/best_match_interval.rs
Normal file
139
crates/milli/src/search/new/matches/best_match_interval.rs
Normal file
@ -0,0 +1,139 @@
|
||||
use super::matching_words::WordId;
|
||||
use super::{Match, MatchPosition};
|
||||
|
||||
/// A candidate interval of matches together with the score it was given.
struct MatchIntervalWithScore {
    /// Indices of the first and last match of the interval.
    interval: [usize; 2],
    /// Score of the interval, compared lexicographically.
    score: [i16; 3],
}
|
||||
|
||||
/// Adds the contribution of the words inside a phrase to the running scores.
///
/// `fwp` and `lwp` are the positions of the first and last word of the phrase.
fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
    // number of gaps between consecutive words of the phrase
    let gaps = (*lwp - *fwp) as i16;

    // words of a phrase are always ordered: +1 per gap
    *order_score += gaps;
    // words of a phrase are always at distance 1 from each other: -1 per gap
    *distance_score -= gaps;
}
|
||||
|
||||
/// Compute the score of a match interval:
|
||||
/// 1) count unique matches
|
||||
/// 2) calculate distance between matches
|
||||
/// 3) count ordered matches
|
||||
fn get_interval_score(matches: &[Match]) -> [i16; 3] {
|
||||
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
|
||||
let mut order_score = 0;
|
||||
let mut distance_score = 0;
|
||||
|
||||
let mut iter = matches.iter().peekable();
|
||||
while let Some(m) = iter.next() {
|
||||
if let Some(next_match) = iter.peek() {
|
||||
// if matches are ordered
|
||||
if next_match.ids.iter().min() > m.ids.iter().min() {
|
||||
order_score += 1;
|
||||
}
|
||||
|
||||
let m_last_word_pos = match m.position {
|
||||
MatchPosition::Word { word_position, .. } => word_position,
|
||||
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
|
||||
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
|
||||
lwp
|
||||
}
|
||||
};
|
||||
let next_match_first_word_pos = next_match.get_first_word_pos();
|
||||
|
||||
// compute distance between matches
|
||||
distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
|
||||
} else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
|
||||
// in case last match is a phrase, count score for its words
|
||||
tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
|
||||
}
|
||||
|
||||
ids.extend(m.ids.iter());
|
||||
}
|
||||
|
||||
ids.sort_unstable();
|
||||
ids.dedup();
|
||||
let uniq_score = ids.len() as i16;
|
||||
|
||||
// rank by unique match count, then by distance between matches, then by ordered match count.
|
||||
[uniq_score, distance_score, order_score]
|
||||
}
|
||||
|
||||
/// Returns the first and last match where the score computed by match_interval_score is the best.
|
||||
pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
|
||||
if matches.is_empty() {
|
||||
panic!("`matches` should not be empty at this point");
|
||||
}
|
||||
|
||||
// positions of the first and the last match of the best matches interval in `matches`.
|
||||
let mut best_interval: Option<MatchIntervalWithScore> = None;
|
||||
|
||||
let mut save_best_interval = |interval_first, interval_last| {
|
||||
let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
|
||||
let is_interval_score_better = &best_interval
|
||||
.as_ref()
|
||||
.map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score);
|
||||
|
||||
if *is_interval_score_better {
|
||||
best_interval = Some(MatchIntervalWithScore {
|
||||
interval: [interval_first, interval_last],
|
||||
score: interval_score,
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// we compute the matches interval if we have at least 2 matches.
|
||||
// current interval positions.
|
||||
let mut interval_first = 0;
|
||||
let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
|
||||
|
||||
for (index, next_match) in matches.iter().enumerate() {
|
||||
// if next match would make interval gross more than crop_size,
|
||||
// we compare the current interval with the best one,
|
||||
// then we increase `interval_first` until next match can be added.
|
||||
let next_match_last_word_pos = next_match.get_last_word_pos();
|
||||
|
||||
// if the next match would mean that we pass the crop size window,
|
||||
// we take the last valid match, that didn't pass this boundry, which is `index` - 1,
|
||||
// and calculate a score for it, and check if it's better than our best so far
|
||||
if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
|
||||
// if index is 0 there is no last viable match
|
||||
if index != 0 {
|
||||
let interval_last = index - 1;
|
||||
// keep interval if it's the best
|
||||
save_best_interval(interval_first, interval_last);
|
||||
}
|
||||
|
||||
// advance start of the interval while interval is longer than crop_size.
|
||||
loop {
|
||||
interval_first += 1;
|
||||
if interval_first == matches.len() {
|
||||
interval_first -= 1;
|
||||
break;
|
||||
}
|
||||
|
||||
interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
|
||||
|
||||
if interval_first_match_first_word_pos > next_match_last_word_pos
|
||||
|| next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
|
||||
{
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// compute the last interval score and compare it to the best one.
|
||||
let interval_last = matches.len() - 1;
|
||||
// if it's the last match with itself, we need to make sure it's
|
||||
// not a phrase longer than the crop window
|
||||
if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
|
||||
save_best_interval(interval_first, interval_last);
|
||||
}
|
||||
|
||||
// if none of the matches fit the criteria above, default to the first one
|
||||
best_interval.map_or(
|
||||
[&matches[0], &matches[0]],
|
||||
|MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
|
||||
)
|
||||
}
|
62
crates/milli/src/search/new/matches/match.rs
Normal file
62
crates/milli/src/search/new/matches/match.rs
Normal file
@ -0,0 +1,62 @@
|
||||
use super::matching_words::WordId;
|
||||
|
||||
/// Location of a match in the text, either a single word or a phrase.
#[derive(Clone, Debug)]
pub enum MatchPosition {
    /// The match covers a single word.
    Word {
        /// Position of the word in the whole text.
        word_position: usize,
        /// Position of the token in the whole text.
        token_position: usize,
    },
    /// The match covers a phrase.
    Phrase {
        /// Position of the first and last word in the phrase in the whole text.
        word_positions: [usize; 2],
        /// Position of the first and last token in the phrase in the whole text.
        token_positions: [usize; 2],
    },
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct Match {
|
||||
pub char_count: usize,
|
||||
// ids of the query words that matches.
|
||||
pub ids: Vec<WordId>,
|
||||
pub position: MatchPosition,
|
||||
}
|
||||
|
||||
impl Match {
|
||||
pub(super) fn get_first_word_pos(&self) -> usize {
|
||||
match self.position {
|
||||
MatchPosition::Word { word_position, .. } => word_position,
|
||||
MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn get_last_word_pos(&self) -> usize {
|
||||
match self.position {
|
||||
MatchPosition::Word { word_position, .. } => word_position,
|
||||
MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn get_first_token_pos(&self) -> usize {
|
||||
match self.position {
|
||||
MatchPosition::Word { token_position, .. } => token_position,
|
||||
MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn get_last_token_pos(&self) -> usize {
|
||||
match self.position {
|
||||
MatchPosition::Word { token_position, .. } => token_position,
|
||||
MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn get_word_count(&self) -> usize {
|
||||
match self.position {
|
||||
MatchPosition::Word { .. } => 1,
|
||||
MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
|
||||
}
|
||||
}
|
||||
}
|
331
crates/milli/src/search/new/matches/matching_words.rs
Normal file
331
crates/milli/src/search/new/matches/matching_words.rs
Normal file
@ -0,0 +1,331 @@
|
||||
use std::cmp::Reverse;
|
||||
use std::fmt;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use charabia::Token;
|
||||
|
||||
use super::super::interner::Interned;
|
||||
use super::super::query_term::LocatedQueryTerm;
|
||||
use super::super::{DedupInterner, Phrase};
|
||||
use crate::SearchContext;
|
||||
|
||||
pub struct LocatedMatchingPhrase {
|
||||
pub value: Interned<Phrase>,
|
||||
pub positions: RangeInclusive<WordId>,
|
||||
}
|
||||
|
||||
pub struct LocatedMatchingWords {
|
||||
pub value: Vec<Interned<String>>,
|
||||
pub positions: RangeInclusive<WordId>,
|
||||
pub is_prefix: bool,
|
||||
pub original_char_count: usize,
|
||||
}
|
||||
|
||||
/// Structure created from a query tree
|
||||
/// referencing words that match the given query tree.
|
||||
#[derive(Default)]
|
||||
pub struct MatchingWords {
|
||||
word_interner: DedupInterner<String>,
|
||||
phrase_interner: DedupInterner<Phrase>,
|
||||
phrases: Vec<LocatedMatchingPhrase>,
|
||||
words: Vec<LocatedMatchingWords>,
|
||||
}
|
||||
|
||||
impl MatchingWords {
|
||||
pub fn new(ctx: SearchContext<'_>, located_terms: Vec<LocatedQueryTerm>) -> Self {
|
||||
let mut phrases = Vec::new();
|
||||
let mut words = Vec::new();
|
||||
|
||||
// Extract and centralize the different phrases and words to match stored in a QueryTerm
|
||||
// and wrap them in dedicated structures.
|
||||
for located_term in located_terms {
|
||||
let term = ctx.term_interner.get(located_term.value);
|
||||
let (matching_words, matching_phrases) = term.all_computed_derivations();
|
||||
|
||||
for matching_phrase in matching_phrases {
|
||||
phrases.push(LocatedMatchingPhrase {
|
||||
value: matching_phrase,
|
||||
positions: located_term.positions.clone(),
|
||||
});
|
||||
}
|
||||
|
||||
words.push(LocatedMatchingWords {
|
||||
value: matching_words,
|
||||
positions: located_term.positions.clone(),
|
||||
is_prefix: term.is_prefix(),
|
||||
original_char_count: term.original_word(&ctx).chars().count(),
|
||||
});
|
||||
}
|
||||
|
||||
// Sort word to put prefixes at the bottom prioritizing the exact matches.
|
||||
words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
|
||||
|
||||
Self {
|
||||
phrases,
|
||||
words,
|
||||
word_interner: ctx.word_interner,
|
||||
phrase_interner: ctx.phrase_interner,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator over terms that match or partially match the given token.
|
||||
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
|
||||
MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
|
||||
}
|
||||
|
||||
/// Try to match the token with one of the located_words.
|
||||
fn match_unique_words<'a>(&'a self, token: &Token<'_>) -> Option<MatchType<'a>> {
|
||||
for located_words in &self.words {
|
||||
for word in &located_words.value {
|
||||
let word = self.word_interner.get(*word);
|
||||
// if the word is a prefix we match using starts_with.
|
||||
if located_words.is_prefix && token.lemma().starts_with(word) {
|
||||
let Some((char_index, c)) =
|
||||
word.char_indices().take(located_words.original_char_count).last()
|
||||
else {
|
||||
continue;
|
||||
};
|
||||
let prefix_length = char_index + c.len_utf8();
|
||||
let (char_count, byte_len) = token.original_lengths(prefix_length);
|
||||
let ids = &located_words.positions;
|
||||
return Some(MatchType::Full { ids, char_count, byte_len });
|
||||
// else we exact match the token.
|
||||
} else if token.lemma() == word {
|
||||
let ids = &located_words.positions;
|
||||
return Some(MatchType::Full {
|
||||
char_count: token.char_end - token.char_start,
|
||||
byte_len: token.byte_end - token.byte_start,
|
||||
ids,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterator over terms that match the given token,
|
||||
/// This allow to lazily evaluate matches.
|
||||
pub struct MatchesIter<'a, 'b> {
|
||||
matching_words: &'a MatchingWords,
|
||||
phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
|
||||
token: &'b Token<'b>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for MatchesIter<'a, '_> {
|
||||
type Item = MatchType<'a>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.phrases.next() {
|
||||
// Try to match all the phrases first.
|
||||
Some(located_phrase) => {
|
||||
let phrase = self.matching_words.phrase_interner.get(located_phrase.value);
|
||||
|
||||
// create a PartialMatch struct to make it compute the first match
|
||||
// instead of duplicating the code.
|
||||
let ids = &located_phrase.positions;
|
||||
// collect the references of words from the interner.
|
||||
let words = phrase
|
||||
.words
|
||||
.iter()
|
||||
.map(|word| {
|
||||
word.map(|word| self.matching_words.word_interner.get(word).as_str())
|
||||
})
|
||||
.collect();
|
||||
let partial = PartialMatch { matching_words: words, ids };
|
||||
|
||||
partial.match_token(self.token).or_else(|| self.next())
|
||||
}
|
||||
// If no phrases matches, try to match uiques words.
|
||||
None => self.matching_words.match_unique_words(self.token),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Id of a matching term corresponding to a word written by the end user.
|
||||
pub type WordId = u16;
|
||||
|
||||
/// A given token can partially match a query word for several reasons:
|
||||
/// - split words
|
||||
/// - multi-word synonyms
|
||||
/// In these cases we need to match consecutively several tokens to consider that the match is full.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum MatchType<'a> {
|
||||
Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
|
||||
Partial(PartialMatch<'a>),
|
||||
}
|
||||
|
||||
/// Structure helper to match several tokens in a row in order to complete a partial match.
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub struct PartialMatch<'a> {
|
||||
matching_words: Vec<Option<&'a str>>,
|
||||
ids: &'a RangeInclusive<WordId>,
|
||||
}
|
||||
|
||||
impl<'a> PartialMatch<'a> {
|
||||
/// Returns:
|
||||
/// - None if the given token breaks the partial match
|
||||
/// - Partial if the given token matches the partial match but doesn't complete it
|
||||
/// - Full if the given token completes the partial match
|
||||
pub fn match_token(self, token: &Token<'_>) -> Option<MatchType<'a>> {
|
||||
let Self { mut matching_words, ids, .. } = self;
|
||||
|
||||
let is_matching = match matching_words.first()? {
|
||||
Some(word) => &token.lemma() == word,
|
||||
// a None value in the phrase corresponds to a stop word,
|
||||
// the walue is considered a match if the current token is categorized as a stop word.
|
||||
None => token.is_stopword(),
|
||||
};
|
||||
|
||||
// if there are remaining words to match in the phrase and the current token is matching,
|
||||
// return a new Partial match allowing the highlighter to continue.
|
||||
if is_matching && matching_words.len() > 1 {
|
||||
matching_words.remove(0);
|
||||
Some(MatchType::Partial(Self { matching_words, ids }))
|
||||
// if there is no remaining word to match in the phrase and the current token is matching,
|
||||
// return a Full match.
|
||||
} else if is_matching {
|
||||
Some(MatchType::Full {
|
||||
char_count: token.char_end - token.char_start,
|
||||
byte_len: token.byte_end - token.byte_start,
|
||||
ids,
|
||||
})
|
||||
// if the current token doesn't match, return None to break the match sequence.
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for MatchingWords {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
|
||||
|
||||
let phrases: Vec<_> = phrases
|
||||
.iter()
|
||||
.map(|p| {
|
||||
(
|
||||
phrase_interner
|
||||
.get(p.value)
|
||||
.words
|
||||
.iter()
|
||||
.map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
|
||||
.collect::<Vec<_>>()
|
||||
.join(" "),
|
||||
p.positions.clone(),
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let words: Vec<_> = words
|
||||
.iter()
|
||||
.flat_map(|w| {
|
||||
w.value
|
||||
.iter()
|
||||
.map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
|
||||
.collect::<Vec<_>>()
|
||||
})
|
||||
.collect();
|
||||
|
||||
f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
pub(crate) mod tests {
    use std::borrow::Cow;

    use charabia::{TokenKind, TokenizerBuilder};

    use super::super::super::located_query_terms_from_tokens;
    use super::*;
    use crate::index::tests::TempIndex;
    use crate::search::new::query_term::ExtractedTokens;

    /// Builds a temporary index holding the documents used by the matching tests.
    pub(crate) fn temp_index_with_documents() -> TempIndex {
        let temp_index = TempIndex::new();
        temp_index
            .add_documents(documents!([
                { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
                { "id": 2, "name": "Westfália" },
                { "id": 3, "name": "Ŵôřlḑôle" },
            ]))
            .unwrap();
        temp_index
    }

    /// Builds a word token starting at offset 0 whose lemma is `lemma`.
    fn word_token(lemma: &'static str) -> Token<'static> {
        Token {
            kind: TokenKind::Word,
            lemma: Cow::Borrowed(lemma),
            char_end: lemma.chars().count(),
            byte_end: lemma.len(),
            ..Default::default()
        }
    }

    #[test]
    fn matching_words() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
        let mut builder = TokenizerBuilder::default();
        let tokenizer = builder.build();
        let tokens = tokenizer.tokenize("split this world");
        let ExtractedTokens { query_terms, .. } =
            located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
        let matching_words = MatchingWords::new(ctx, query_terms);

        // exact match on the first query word
        assert_eq!(
            matching_words.match_token(&word_token("split")).next(),
            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) })
        );
        // word absent from the query
        assert_eq!(matching_words.match_token(&word_token("nyc")).next(), None);
        // exact match on the last query word
        assert_eq!(
            matching_words.match_token(&word_token("world")).next(),
            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
        );
        // the last query word matches as a prefix
        assert_eq!(
            matching_words.match_token(&word_token("worlded")).next(),
            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
        );
        // a non-last query word does not match as a prefix
        assert_eq!(matching_words.match_token(&word_token("thisnew")).next(), None);
    }
}
|
929
crates/milli/src/search/new/matches/mod.rs
Normal file
929
crates/milli/src/search/new/matches/mod.rs
Normal file
@ -0,0 +1,929 @@
|
||||
mod best_match_interval;
|
||||
mod r#match;
|
||||
mod matching_words;
|
||||
mod simple_token_kind;
|
||||
|
||||
use charabia::{Language, SeparatorKind, Token, Tokenizer};
|
||||
use either::Either;
|
||||
pub use matching_words::MatchingWords;
|
||||
use matching_words::{MatchType, PartialMatch};
|
||||
use r#match::{Match, MatchPosition};
|
||||
use serde::Serialize;
|
||||
use simple_token_kind::SimpleTokenKind;
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
cmp::{max, min},
|
||||
};
|
||||
|
||||
/// Crop marker used when none is configured on the builder.
const DEFAULT_CROP_MARKER: &str = "…";
/// Highlight prefix used when none is configured on the builder.
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
/// Highlight suffix used when none is configured on the builder.
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
|
||||
|
||||
/// Structure used to build a Matcher allowing to customize formating tags.
|
||||
pub struct MatcherBuilder<'m> {
|
||||
matching_words: MatchingWords,
|
||||
tokenizer: Tokenizer<'m>,
|
||||
crop_marker: Option<String>,
|
||||
highlight_prefix: Option<String>,
|
||||
highlight_suffix: Option<String>,
|
||||
}
|
||||
|
||||
impl<'m> MatcherBuilder<'m> {
|
||||
pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self {
|
||||
Self {
|
||||
matching_words,
|
||||
tokenizer,
|
||||
crop_marker: None,
|
||||
highlight_prefix: None,
|
||||
highlight_suffix: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn crop_marker(&mut self, marker: String) -> &Self {
|
||||
self.crop_marker = Some(marker);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn highlight_prefix(&mut self, prefix: String) -> &Self {
|
||||
self.highlight_prefix = Some(prefix);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn highlight_suffix(&mut self, suffix: String) -> &Self {
|
||||
self.highlight_suffix = Some(suffix);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn build<'t, 'lang>(
|
||||
&self,
|
||||
text: &'t str,
|
||||
locales: Option<&'lang [Language]>,
|
||||
) -> Matcher<'t, 'm, '_, 'lang> {
|
||||
let crop_marker = match &self.crop_marker {
|
||||
Some(marker) => marker.as_str(),
|
||||
None => DEFAULT_CROP_MARKER,
|
||||
};
|
||||
|
||||
let highlight_prefix = match &self.highlight_prefix {
|
||||
Some(marker) => marker.as_str(),
|
||||
None => DEFAULT_HIGHLIGHT_PREFIX,
|
||||
};
|
||||
let highlight_suffix = match &self.highlight_suffix {
|
||||
Some(marker) => marker.as_str(),
|
||||
None => DEFAULT_HIGHLIGHT_SUFFIX,
|
||||
};
|
||||
Matcher {
|
||||
text,
|
||||
matching_words: &self.matching_words,
|
||||
tokenizer: &self.tokenizer,
|
||||
crop_marker,
|
||||
highlight_prefix,
|
||||
highlight_suffix,
|
||||
matches: None,
|
||||
locales,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Formatting requested for a text.
#[derive(Copy, Clone, Default, Debug)]
pub struct FormatOptions {
    /// Whether highlighting is requested.
    pub highlight: bool,
    /// Requested crop size, if any.
    pub crop: Option<usize>,
}

impl FormatOptions {
    /// Combines two sets of options: highlighting is requested if either side
    /// requests it, and the crop size of `self` takes precedence over the one of `other`.
    pub fn merge(self, other: Self) -> Self {
        let highlight = self.highlight || other.highlight;
        let crop = match self.crop {
            Some(size) => Some(size),
            None => other.crop,
        };
        Self { highlight, crop }
    }

    /// Returns `true` if highlighting or cropping is requested.
    pub fn should_format(&self) -> bool {
        self.crop.is_some() || self.highlight
    }
}
|
||||
|
||||
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
|
||||
pub struct MatchBounds {
|
||||
pub start: usize,
|
||||
pub length: usize,
|
||||
}
|
||||
|
||||
/// Structure used to analyze a string, compute words that match,
|
||||
/// and format the source string, returning a highlighted and cropped sub-string.
|
||||
pub struct Matcher<'t, 'tokenizer, 'b, 'lang> {
|
||||
text: &'t str,
|
||||
matching_words: &'b MatchingWords,
|
||||
tokenizer: &'b Tokenizer<'tokenizer>,
|
||||
locales: Option<&'lang [Language]>,
|
||||
crop_marker: &'b str,
|
||||
highlight_prefix: &'b str,
|
||||
highlight_suffix: &'b str,
|
||||
matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
|
||||
}
|
||||
|
||||
impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
||||
/// Iterates over tokens and save any of them that matches the query.
|
||||
fn compute_matches(&mut self) -> &mut Self {
|
||||
/// some words are counted as matches only if they are close together and in the good order,
|
||||
/// compute_partial_match peek into next words to validate if the match is complete.
|
||||
fn compute_partial_match<'a>(
|
||||
mut partial: PartialMatch<'a>,
|
||||
first_token_position: usize,
|
||||
first_word_position: usize,
|
||||
first_word_char_start: &usize,
|
||||
words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
|
||||
matches: &mut Vec<Match>,
|
||||
) -> bool {
|
||||
for (token_position, word_position, word) in words_positions {
|
||||
partial = match partial.match_token(word) {
|
||||
// token matches the partial match, but the match is not full,
|
||||
// we temporarily save the current token then we try to match the next one.
|
||||
Some(MatchType::Partial(partial)) => partial,
|
||||
// partial match is now full, we keep this matches and we advance positions
|
||||
Some(MatchType::Full { ids, .. }) => {
|
||||
// save the token that closes the partial match as a match.
|
||||
matches.push(Match {
|
||||
char_count: word.char_end - *first_word_char_start,
|
||||
ids: ids.clone().collect(),
|
||||
position: MatchPosition::Phrase {
|
||||
word_positions: [first_word_position, word_position],
|
||||
token_positions: [first_token_position, token_position],
|
||||
},
|
||||
});
|
||||
|
||||
// the match is complete, we return true.
|
||||
return true;
|
||||
}
|
||||
// no match, continue to next match.
|
||||
None => break,
|
||||
};
|
||||
}
|
||||
|
||||
// the match is not complete, we return false.
|
||||
false
|
||||
}
|
||||
|
||||
let tokens: Vec<_> =
|
||||
self.tokenizer.tokenize_with_allow_list(self.text, self.locales).collect();
|
||||
let mut matches = Vec::new();
|
||||
|
||||
let mut words_positions = tokens
|
||||
.iter()
|
||||
.scan((0, 0), |(token_position, word_position), token| {
|
||||
let current_token_position = *token_position;
|
||||
let current_word_position = *word_position;
|
||||
*token_position += 1;
|
||||
if !token.is_separator() {
|
||||
*word_position += 1;
|
||||
}
|
||||
|
||||
Some((current_token_position, current_word_position, token))
|
||||
})
|
||||
.filter(|(_, _, token)| !token.is_separator());
|
||||
|
||||
while let Some((token_position, word_position, word)) = words_positions.next() {
|
||||
for match_type in self.matching_words.match_token(word) {
|
||||
match match_type {
|
||||
// we match, we save the current token as a match,
|
||||
// then we continue the rest of the tokens.
|
||||
MatchType::Full { ids, char_count, .. } => {
|
||||
let ids: Vec<_> = ids.clone().collect();
|
||||
matches.push(Match {
|
||||
char_count,
|
||||
ids,
|
||||
position: MatchPosition::Word { word_position, token_position },
|
||||
});
|
||||
break;
|
||||
}
|
||||
// we match partially, iterate over next tokens to check if we can complete the match.
|
||||
MatchType::Partial(partial) => {
|
||||
// if match is completed, we break the matching loop over the current token,
|
||||
// then we continue the rest of the tokens.
|
||||
let mut wp = words_positions.clone();
|
||||
if compute_partial_match(
|
||||
partial,
|
||||
token_position,
|
||||
word_position,
|
||||
&word.char_start,
|
||||
&mut wp,
|
||||
&mut matches,
|
||||
) {
|
||||
words_positions = wp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
self.matches = Some((tokens, matches));
|
||||
self
|
||||
}
|
||||
|
||||
/// Returns boundaries of the words that match the query.
|
||||
pub fn matches(&mut self) -> Vec<MatchBounds> {
|
||||
match &self.matches {
|
||||
None => self.compute_matches().matches(),
|
||||
Some((tokens, matches)) => matches
|
||||
.iter()
|
||||
.map(|m| MatchBounds {
|
||||
start: tokens[m.get_first_token_pos()].byte_start,
|
||||
// TODO: Why is this in chars, while start is in bytes?
|
||||
length: m.char_count,
|
||||
})
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the bounds in byte index of the crop window.
|
||||
fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
|
||||
let (
|
||||
mut remaining_words,
|
||||
is_iterating_forward,
|
||||
before_tokens_starting_index,
|
||||
after_tokens_starting_index,
|
||||
) = if !matches.is_empty() {
|
||||
let [matches_first, matches_last] =
|
||||
best_match_interval::find_best_match_interval(matches, crop_size);
|
||||
|
||||
let matches_size =
|
||||
matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1;
|
||||
|
||||
let is_crop_size_gte_match_size = crop_size >= matches_size;
|
||||
let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size;
|
||||
|
||||
let remaining_words = if is_crop_size_gte_match_size {
|
||||
crop_size - matches_size
|
||||
} else {
|
||||
// in case matches size is greater than crop size, which implies there's only one match,
|
||||
// we count words backwards, because we have to remove words, as they're extra words outside of
|
||||
// crop window
|
||||
matches_size - crop_size
|
||||
};
|
||||
|
||||
let after_tokens_starting_index = if matches_size == 0 {
|
||||
0
|
||||
} else {
|
||||
let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1;
|
||||
if last_match_last_token_position_plus_one < tokens.len() {
|
||||
last_match_last_token_position_plus_one
|
||||
} else {
|
||||
// we have matched the end of possible tokens, there's nothing to advance
|
||||
tokens.len() - 1
|
||||
}
|
||||
};
|
||||
|
||||
(
|
||||
remaining_words,
|
||||
is_iterating_forward,
|
||||
if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 },
|
||||
after_tokens_starting_index,
|
||||
)
|
||||
} else {
|
||||
(crop_size, true, 0, 0)
|
||||
};
|
||||
|
||||
// create the initial state of the crop window: 2 iterators starting from the matches positions,
|
||||
// a reverse iterator starting from the first match token position and going towards the beginning of the text,
|
||||
let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable();
|
||||
// an iterator ...
|
||||
let mut after_tokens = if is_iterating_forward {
|
||||
// ... starting from the last match token position and going towards the end of the text.
|
||||
Either::Left(tokens[after_tokens_starting_index..].iter().peekable())
|
||||
} else {
|
||||
// ... starting from the last match token position and going towards the start of the text.
|
||||
Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable())
|
||||
};
|
||||
|
||||
// grows the crop window peeking in both directions
|
||||
// until the window contains the good number of words:
|
||||
while remaining_words > 0 {
|
||||
let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
|
||||
let after_token_kind =
|
||||
after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new);
|
||||
|
||||
match (before_token_kind, after_token_kind) {
|
||||
// we can expand both sides.
|
||||
(Some(before_token_kind), Some(after_token_kind)) => {
|
||||
match (before_token_kind, after_token_kind) {
|
||||
// if they are both separators and are the same kind then advance both,
|
||||
// or expand in the soft separator separator side.
|
||||
(
|
||||
SimpleTokenKind::Separator(before_token_separator_kind),
|
||||
SimpleTokenKind::Separator(after_token_separator_kind),
|
||||
) => {
|
||||
if before_token_separator_kind == after_token_separator_kind {
|
||||
before_tokens.next();
|
||||
|
||||
// this avoid having an ending separator before crop marker.
|
||||
if remaining_words > 1 {
|
||||
after_tokens.next();
|
||||
}
|
||||
} else if matches!(before_token_separator_kind, SeparatorKind::Hard) {
|
||||
after_tokens.next();
|
||||
} else {
|
||||
before_tokens.next();
|
||||
}
|
||||
}
|
||||
// if one of the tokens is a word, we expend in the side of the word.
|
||||
// left is a word, advance left.
|
||||
(SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
|
||||
before_tokens.next();
|
||||
remaining_words -= 1;
|
||||
}
|
||||
// right is a word, advance right.
|
||||
(SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
|
||||
after_tokens.next();
|
||||
remaining_words -= 1;
|
||||
}
|
||||
// both are words, advance left then right if remaining_word > 0.
|
||||
(SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
|
||||
before_tokens.next();
|
||||
remaining_words -= 1;
|
||||
|
||||
if remaining_words > 0 {
|
||||
after_tokens.next();
|
||||
remaining_words -= 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
// the end of the text is reached, advance left.
|
||||
(Some(before_token_kind), None) => {
|
||||
before_tokens.next();
|
||||
if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
|
||||
remaining_words -= 1;
|
||||
}
|
||||
}
|
||||
// the start of the text is reached, advance right.
|
||||
(None, Some(after_token_kind)) => {
|
||||
after_tokens.next();
|
||||
if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
|
||||
remaining_words -= 1;
|
||||
}
|
||||
}
|
||||
// no more token to add.
|
||||
(None, None) => break,
|
||||
}
|
||||
}
|
||||
|
||||
// finally, keep the byte index of each bound of the crop window.
|
||||
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
|
||||
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
|
||||
|
||||
[crop_byte_start, crop_byte_end]
|
||||
}
|
||||
|
||||
// Returns the formatted version of the original text.
|
||||
pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> {
|
||||
if !format_options.highlight && format_options.crop.is_none() {
|
||||
// compute matches is not needed if no highlight nor crop is requested.
|
||||
Cow::Borrowed(self.text)
|
||||
} else {
|
||||
match &self.matches {
|
||||
Some((tokens, matches)) => {
|
||||
// If the text has to be cropped, crop around the best interval.
|
||||
let [crop_byte_start, crop_byte_end] = match format_options.crop {
|
||||
Some(crop_size) if crop_size > 0 => {
|
||||
self.crop_bounds(tokens, matches, crop_size)
|
||||
}
|
||||
_ => [0, self.text.len()],
|
||||
};
|
||||
|
||||
let mut formatted = Vec::new();
|
||||
|
||||
// push crop marker if it's not the start of the text.
|
||||
if crop_byte_start > 0 && !self.crop_marker.is_empty() {
|
||||
formatted.push(self.crop_marker);
|
||||
}
|
||||
|
||||
let mut byte_index = crop_byte_start;
|
||||
|
||||
if format_options.highlight {
|
||||
// insert highlight markers around matches.
|
||||
for m in matches {
|
||||
let [m_byte_start, m_byte_end] = match m.position {
|
||||
MatchPosition::Word { token_position, .. } => {
|
||||
let token = &tokens[token_position];
|
||||
[&token.byte_start, &token.byte_end]
|
||||
}
|
||||
MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
|
||||
[&tokens[ftp].byte_start, &tokens[ltp].byte_end]
|
||||
}
|
||||
};
|
||||
|
||||
// skip matches out of the crop window
|
||||
if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end {
|
||||
continue;
|
||||
}
|
||||
|
||||
// adjust start and end to the crop window size
|
||||
let [m_byte_start, m_byte_end] = [
|
||||
max(m_byte_start, &crop_byte_start),
|
||||
min(m_byte_end, &crop_byte_end),
|
||||
];
|
||||
|
||||
// push text that is positioned before our matches
|
||||
if byte_index < *m_byte_start {
|
||||
formatted.push(&self.text[byte_index..*m_byte_start]);
|
||||
}
|
||||
|
||||
formatted.push(self.highlight_prefix);
|
||||
|
||||
// TODO: This is additional work done, charabia::token::Token byte_len
|
||||
// should already get us the original byte length, however, that doesn't work as
|
||||
// it's supposed to, investigate why
|
||||
let highlight_byte_index = self.text[*m_byte_start..]
|
||||
.char_indices()
|
||||
.nth(m.char_count)
|
||||
.map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end));
|
||||
formatted.push(&self.text[*m_byte_start..highlight_byte_index]);
|
||||
|
||||
formatted.push(self.highlight_suffix);
|
||||
|
||||
// if it's a prefix highlight, we put the end of the word after the highlight marker.
|
||||
if highlight_byte_index < *m_byte_end {
|
||||
formatted.push(&self.text[highlight_byte_index..*m_byte_end]);
|
||||
}
|
||||
|
||||
byte_index = *m_byte_end;
|
||||
}
|
||||
}
|
||||
|
||||
// push the rest of the text between last match and the end of crop.
|
||||
if byte_index < crop_byte_end {
|
||||
formatted.push(&self.text[byte_index..crop_byte_end]);
|
||||
}
|
||||
|
||||
// push crop marker if it's not the end of the text.
|
||||
if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() {
|
||||
formatted.push(self.crop_marker);
|
||||
}
|
||||
|
||||
if formatted.len() == 1 {
|
||||
// avoid concatenating if there is already 1 slice.
|
||||
Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end])
|
||||
} else {
|
||||
Cow::Owned(formatted.concat())
|
||||
}
|
||||
}
|
||||
None => self.compute_matches().format(format_options),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use charabia::TokenizerBuilder;
|
||||
use matching_words::tests::temp_index_with_documents;
|
||||
|
||||
use super::*;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::{execute_search, filtered_universe, SearchContext, TimeBudget};
|
||||
|
||||
impl<'a> MatcherBuilder<'a> {
|
||||
fn new_test(rtxn: &'a heed::RoTxn<'a>, index: &'a TempIndex, query: &str) -> Self {
|
||||
let mut ctx = SearchContext::new(index, rtxn).unwrap();
|
||||
let universe = filtered_universe(ctx.index, ctx.txn, &None).unwrap();
|
||||
let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search(
|
||||
&mut ctx,
|
||||
Some(query),
|
||||
crate::TermsMatchingStrategy::default(),
|
||||
crate::score_details::ScoringStrategy::Skip,
|
||||
false,
|
||||
universe,
|
||||
&None,
|
||||
&None,
|
||||
crate::search::new::GeoSortStrategy::default(),
|
||||
0,
|
||||
100,
|
||||
Some(10),
|
||||
&mut crate::DefaultSearchLogger,
|
||||
&mut crate::DefaultSearchLogger,
|
||||
TimeBudget::max(),
|
||||
None,
|
||||
None,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
// consume context and located_query_terms to build MatchingWords.
|
||||
let matching_words = match located_query_terms {
|
||||
Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
|
||||
None => MatchingWords::default(),
|
||||
};
|
||||
|
||||
MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn format_identity() {
    let index = temp_index_with_documents();
    let rtxn = index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &index, "split the world");

    let options = FormatOptions { highlight: false, crop: None };

    let texts = [
        // Text without any match.
        "A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
        // Text containing all matches.
        "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
        // Text containing some matches.
        "Natalie risk her future to build a world with the boy she loves.",
    ];

    for text in texts {
        let mut matcher = builder.build(text, None);
        // without crop and without highlight, the complete text must be returned.
        assert_eq!(&matcher.format(options), &text);
    }
}
|
||||
|
||||
#[test]
fn format_highlight() {
    let index = temp_index_with_documents();
    let rtxn = index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &index, "split the world");

    let options = FormatOptions { highlight: true, crop: None };

    // An empty text stays empty.
    assert_eq!(&builder.build("", None).format(options), "");

    // A text made only of separators is returned untouched.
    assert_eq!(&builder.build(":-)", None).format(options), ":-)");

    // Without any match and without crop, the complete text is returned.
    let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
    assert_eq!(&builder.build(text, None).format(options), &text);

    // With all the query words present, the complete text is returned with every match
    // highlighted.
    let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
    );

    // With only some of the query words present, the complete text is returned with the
    // available matches highlighted.
    let text = "Natalie risk her future to build a world with the boy she loves.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
    );
}
|
||||
|
||||
#[test]
fn highlight_unicode() {
    let index = temp_index_with_documents();
    let rtxn = index.read_txn().unwrap();
    let options = FormatOptions { highlight: true, crop: None };

    let builder = MatcherBuilder::new_test(&rtxn, &index, "world");

    // Prefix match inside a longer unicode word: only the matching prefix is highlighted.
    insta::assert_snapshot!(
        builder.build("Ŵôřlḑôle", None).format(options),
        @"<em>Ŵôřlḑ</em>ôle"
    );

    // Full unicode match: the complete word is highlighted.
    insta::assert_snapshot!(
        builder.build("Ŵôřlḑ", None).format(options),
        @"<em>Ŵôřlḑ</em>"
    );

    let builder = MatcherBuilder::new_test(&rtxn, &index, "westfali");

    // Prefix match on a word containing an accented character.
    insta::assert_snapshot!(
        builder.build("Westfália", None).format(options),
        @"<em>Westfáli</em>a"
    );
}
|
||||
|
||||
#[test]
fn format_crop() {
    let index = temp_index_with_documents();
    let rtxn = index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &index, "split the world");

    let options = FormatOptions { highlight: false, crop: Some(10) };

    // An empty text stays empty.
    insta::assert_snapshot!(
        builder.build("", None).format(options),
        @""
    );

    // A text made only of separators is returned untouched.
    insta::assert_snapshot!(
        builder.build(":-)", None).format(options),
        @":-)"
    );

    // Without any match, the 10 first words are kept, followed by a crop marker.
    let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"A quick brown fox can not jump 32 feet, right…"
    );

    // Same without any match when the text starts with a separator.
    let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"(A quick brown fox can not jump 32 feet, right…"
    );

    // Phrase propagation: the crop starts at the phrase instead of being centered on the
    // match.
    let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…Split The World is a book written by Emily Henry…"
    );

    // Some matches: the 10 last words are kept, with a crop marker at the start.
    let text = "Natalie risk her future to build a world with the boy she loves.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…future to build a world with the boy she loves…"
    );

    // All matches: the 10 last words are kept, with a crop marker at the start.
    let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…she loves. Emily Henry: The Love That Split The World."
    );

    // One unordered group of matches followed by an ordered one: the crop keeps the 10 last
    // words, with a crop marker at the start.
    let text = "The world split void void void void void void void void void split the world void void";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…void void void void void split the world void void"
    );

    // Groups of matches with a different density: the crop keeps the 10 last words, with a
    // crop marker at the start.
    let text = "split void the void void world void void void void void void void void void void split the world void void";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…void void void void void split the world void void"
    );

    // Repetitions of the same matching word: the crop keeps the 10 last words, with a crop
    // marker at the start.
    let text = "split split split split split split void void void void void void void void void void split the world void void";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…void void void void void split the world void void"
    );
}
|
||||
|
||||
#[test]
fn format_highlight_crop() {
    let index = temp_index_with_documents();
    let rtxn = index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &index, "split the world");

    let options = FormatOptions { highlight: true, crop: Some(10) };

    // An empty text stays empty.
    insta::assert_snapshot!(
        builder.build("", None).format(options),
        @""
    );

    // A text made only of separators is returned untouched.
    insta::assert_snapshot!(
        builder.build(":-)", None).format(options),
        @":-)"
    );

    // Without any match, the 10 first words are kept, followed by a crop marker.
    let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"A quick brown fox can not jump 32 feet, right…"
    );

    // Some matches: the 10 last words are kept, with a crop marker at the start and the
    // matches highlighted.
    let text = "Natalie risk her future to build a world with the boy she loves.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…future to build a <em>world</em> with <em>the</em> boy she loves…"
    );

    // All matches: the 10 last words are kept, with a crop marker at the start and the
    // matches highlighted.
    let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
    );

    // One unordered group of matches followed by an ordered one: the crop keeps the 10 last
    // words, with a crop marker at the start.
    let text = "The world split void void void void void void void void void split the world void void";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
    );
}
|
||||
|
||||
#[test]
/// testing: https://github.com/meilisearch/meilisearch/issues/3975
fn format_highlight_crop_phrase_query() {
    let index = TempIndex::new();

    let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
    index
        .add_documents(documents!([
            { "id": 1, "text": text }
        ]))
        .unwrap();

    let rtxn = index.read_txn().unwrap();

    let options = FormatOptions { highlight: true, crop: Some(10) };

    // A single phrase: 10 words are kept, with a crop marker on both sides and the phrase
    // highlighted as a whole.
    let builder = MatcherBuilder::new_test(&rtxn, &index, "\"the world\"");
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…the power to split <em>the world</em> between those who embraced…"
    );

    // A word and a phrase: "those" and the phrase "and those" are highlighted.
    let builder = MatcherBuilder::new_test(&rtxn, &index, "those \"and those\"");
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
    );

    // A phrase of 10 words at the start of the text.
    let builder = MatcherBuilder::new_test(
        &rtxn,
        &index,
        "\"The groundbreaking invention had the power to split the world\"",
    );
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"<em>The groundbreaking invention had the power to split the world</em>…"
    );

    // A phrase longer than the crop size at the start of the text.
    let builder = MatcherBuilder::new_test(
        &rtxn,
        &index,
        "\"The groundbreaking invention had the power to split the world between those\"",
    );
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"<em>The groundbreaking invention had the power to split the world</em>…"
    );

    // Two phrases, the second one reaching the end of the text.
    let builder = MatcherBuilder::new_test(
        &rtxn,
        &index,
        "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
    );
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        // TODO: Should include exclamation mark without crop markers
        @"…between those who <em>embraced progress and those who resisted change</em>…"
    );

    // Two phrases that both fit in the crop window.
    let builder = MatcherBuilder::new_test(
        &rtxn,
        &index,
        "\"groundbreaking invention\" \"split the world between\"",
    );
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
    );

    // Two phrases that do not both fit in the crop window: the first one is only partially
    // kept.
    let builder = MatcherBuilder::new_test(
        &rtxn,
        &index,
        "\"groundbreaking invention\" \"had the power to split the world between those\"",
    );
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…<em>invention</em> <em>had the power to split the world between those</em>…"
    );
}
|
||||
|
||||
#[test]
/// testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
fn smaller_crop_size() {
    let index = temp_index_with_documents();
    let rtxn = index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &index, "split the world");

    let text = "void void split the world void void.";

    // Crop size of 2: smaller than the query size, so the matches are only partially kept.
    let options = FormatOptions { highlight: false, crop: Some(2) };
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…split the…"
    );

    // Crop size of 1: smaller than the query size, so the matches are only partially kept.
    let options = FormatOptions { highlight: false, crop: Some(1) };
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"…split…"
    );

    // Crop size of 0: the crop is ignored.
    let options = FormatOptions { highlight: false, crop: Some(0) };
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"void void split the world void void."
    );
}
|
||||
|
||||
#[test]
fn partial_matches() {
    let index = temp_index_with_documents();
    let rtxn = index.read_txn().unwrap();

    // the query mixes plain words with phrases made of the same letters.
    let mut builder = MatcherBuilder::new_test(&rtxn, &index, "the \"t he\" door \"do or\"");
    // use "_" on both sides of a match so that the highlighted spans are easy to read.
    builder.highlight_prefix("_".to_string());
    builder.highlight_suffix("_".to_string());

    let options = FormatOptions { highlight: true, crop: None };

    let text = "the do or die can't be he do and or isn't he";
    insta::assert_snapshot!(
        builder.build(text, None).format(options),
        @"_the_ _do or_ die can't be he do and or isn'_t he_"
    );
}
|
||||
}
|
15
crates/milli/src/search/new/matches/simple_token_kind.rs
Normal file
15
crates/milli/src/search/new/matches/simple_token_kind.rs
Normal file
@ -0,0 +1,15 @@
|
||||
use charabia::{SeparatorKind, Token, TokenKind};
|
||||
|
||||
pub enum SimpleTokenKind {
|
||||
Separator(SeparatorKind),
|
||||
NotSeparator,
|
||||
}
|
||||
|
||||
impl SimpleTokenKind {
|
||||
pub fn new(token: &&Token<'_>) -> Self {
|
||||
match token.kind {
|
||||
TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind),
|
||||
_ => Self::NotSeparator,
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user