mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 16:06:31 +00:00 
			
		
		
		
	add new highlighter
This commit is contained in:
		
							
								
								
									
										334
									
								
								milli/src/search/new/matches/matching_words.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										334
									
								
								milli/src/search/new/matches/matching_words.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,334 @@ | ||||
| use std::cmp::Reverse; | ||||
| use std::ops::RangeInclusive; | ||||
|  | ||||
| use charabia::Token; | ||||
|  | ||||
| use super::super::interner::Interned; | ||||
| use super::super::query_term::{ | ||||
|     Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm, | ||||
| }; | ||||
| use super::super::{DedupInterner, Phrase}; | ||||
| use crate::SearchContext; | ||||
|  | ||||
/// A phrase to match against document tokens,
/// together with the range of query-word ids it was extracted from.
pub struct LocatedMatchingPhrase {
    /// Interned phrase; resolve it through the phrase interner.
    pub value: Interned<Phrase>,
    /// Ids of the query words this phrase covers.
    pub positions: RangeInclusive<WordId>,
}
|  | ||||
/// A set of words to match against document tokens,
/// together with the range of query-word ids they were extracted from.
pub struct LocatedMatchingWords {
    /// Interned words; resolve them through the word interner.
    pub value: Vec<Interned<String>>,
    /// Ids of the query words these words cover.
    pub positions: RangeInclusive<WordId>,
    /// When true, a token only needs to start with one of the words to match.
    pub is_prefix: bool,
}
|  | ||||
/// Structure created from a query tree
/// referencing words that match the given query tree.
pub struct MatchingWords<'ctx> {
    /// Interners borrowed from the `SearchContext` used to resolve `Interned` handles.
    word_interner: &'ctx DedupInterner<String>,
    phrase_interner: &'ctx DedupInterner<Phrase>,
    /// Phrases to try first when matching a token (see `MatchesIter`).
    phrases: Vec<LocatedMatchingPhrase>,
    /// Single words, sorted so exact (non-prefix) and longer-range entries come first.
    words: Vec<LocatedMatchingWords>,
}
|  | ||||
| /// Extract and centralize the different phrases and words to match stored in a QueryTerm. | ||||
| fn extract_matching_terms(term: &QueryTerm) -> (Vec<Interned<Phrase>>, Vec<Interned<String>>) { | ||||
|     let mut matching_words = Vec::new(); | ||||
|     let mut matching_phrases = Vec::new(); | ||||
|  | ||||
|     // the structure is exhaustively extracted to ensure that no field is missing. | ||||
|     let QueryTerm { | ||||
|         original: _, | ||||
|         is_multiple_words: _, | ||||
|         max_nbr_typos: _, | ||||
|         is_prefix: _, | ||||
|         zero_typo, | ||||
|         one_typo, | ||||
|         two_typo, | ||||
|     } = term; | ||||
|  | ||||
|     // the structure is exhaustively extracted to ensure that no field is missing. | ||||
|     let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo; | ||||
|  | ||||
|     // zero typo | ||||
|     if let Some(phrase) = phrase { | ||||
|         matching_phrases.push(*phrase); | ||||
|     } | ||||
|     if let Some(zero_typo) = zero_typo { | ||||
|         matching_words.push(*zero_typo); | ||||
|     } | ||||
|     for synonym in synonyms { | ||||
|         matching_phrases.push(*synonym); | ||||
|     } | ||||
|  | ||||
|     // one typo | ||||
|     // the structure is exhaustively extracted to ensure that no field is missing. | ||||
|     if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo { | ||||
|         if let Some(split_words) = split_words { | ||||
|             matching_phrases.push(*split_words); | ||||
|         } | ||||
|         for one_typo in one_typo { | ||||
|             matching_words.push(*one_typo); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // two typos | ||||
|     // the structure is exhaustively extracted to ensure that no field is missing. | ||||
|     if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo { | ||||
|         for two_typos in two_typos { | ||||
|             matching_words.push(*two_typos); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     (matching_phrases, matching_words) | ||||
| } | ||||
|  | ||||
impl<'ctx> MatchingWords<'ctx> {
    /// Builds the matching structures from the located query terms,
    /// borrowing the word and phrase interners from the search context.
    pub fn new(ctx: &'ctx SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
        let mut phrases = Vec::new();
        let mut words = Vec::new();

        // Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms
        // and wrap them in dedicated structures.
        for located_term in located_terms {
            let term = ctx.term_interner.get(located_term.value);
            let (matching_phrases, matching_words) = extract_matching_terms(term);

            // Every phrase keeps the position range of the query term it was extracted from.
            for matching_phrase in matching_phrases {
                phrases.push(LocatedMatchingPhrase {
                    value: matching_phrase,
                    positions: located_term.positions.clone(),
                });
            }
            words.push(LocatedMatchingWords {
                value: matching_words,
                positions: located_term.positions.clone(),
                is_prefix: term.is_prefix,
            });
        }

        // Sort word to put prefixes at the bottom prioritizing the exact matches.
        // Key: (is_prefix, Reverse(range length)) — exact words first, then longer ranges first.
        words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));

        Self {
            phrases,
            words,
            word_interner: &ctx.word_interner,
            phrase_interner: &ctx.phrase_interner,
        }
    }

    /// Returns an iterator over terms that match or partially match the given token.
    pub fn match_token<'b>(&'ctx self, token: &'b Token<'b>) -> MatchesIter<'ctx, 'b> {
        MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
    }

    /// Try to match the token with one of the located_words.
    /// Returns the first `MatchType::Full` found, relying on the sort order of
    /// `self.words` (exact matches are tried before prefixes).
    fn match_unique_words(&'ctx self, token: &Token) -> Option<MatchType<'ctx>> {
        for located_words in &self.words {
            for word in &located_words.value {
                let word = self.word_interner.get(*word);
                // if the word is a prefix we match using starts_with.
                if located_words.is_prefix && token.lemma().starts_with(word) {
                    let char_len = token.original_lengths(word.len()).0;
                    let ids = &located_words.positions;
                    return Some(MatchType::Full { char_len, ids });
                // else we exact match the token.
                } else if token.lemma() == word {
                    let char_len = token.char_end - token.char_start;
                    let ids = &located_words.positions;
                    return Some(MatchType::Full { char_len, ids });
                }
            }
        }

        None
    }
}
|  | ||||
/// Iterator over terms that match the given token,
/// This allows to lazily evaluate matches.
pub struct MatchesIter<'a, 'b> {
    matching_words: &'a MatchingWords<'a>,
    /// Remaining phrases to try before falling back to unique words.
    phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
    /// The document token currently being matched.
    token: &'b Token<'b>,
}
|  | ||||
| impl<'a> Iterator for MatchesIter<'a, '_> { | ||||
|     type Item = MatchType<'a>; | ||||
|  | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         match self.phrases.next() { | ||||
|             // Try to match all the phrases first. | ||||
|             Some(located_phrase) => { | ||||
|                 let phrase = self.matching_words.phrase_interner.get(located_phrase.value); | ||||
|  | ||||
|                 // create a PartialMatch struct to make it compute the first match | ||||
|                 // instead of duplicating the code. | ||||
|                 let ids = &located_phrase.positions; | ||||
|                 // collect the references of words from the interner. | ||||
|                 let words = phrase | ||||
|                     .words | ||||
|                     .iter() | ||||
|                     .map(|word| { | ||||
|                         word.map(|word| self.matching_words.word_interner.get(word).as_str()) | ||||
|                     }) | ||||
|                     .collect(); | ||||
|                 let partial = PartialMatch { matching_words: words, ids, char_len: 0 }; | ||||
|  | ||||
|                 partial.match_token(self.token).or_else(|| self.next()) | ||||
|             } | ||||
|             // If no phrases matches, try to match uiques words. | ||||
|             None => self.matching_words.match_unique_words(self.token), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Id of a matching term corresponding to a word written by the end user.
pub type WordId = u16;
|  | ||||
/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases we need to match consecutively several tokens to consider that the match is full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
    /// The token fully matches; `char_len` is the matched length in chars and
    /// `ids` the range of query-word ids covered.
    Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
    /// The token starts a phrase; following tokens must be fed to the
    /// `PartialMatch` to complete (or break) the match.
    Partial(PartialMatch<'a>),
}
|  | ||||
/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
    /// Remaining words of the phrase; `None` entries stand for stop words.
    matching_words: Vec<Option<&'a str>>,
    /// Range of query-word ids covered by the phrase.
    ids: &'a RangeInclusive<WordId>,
    /// Char length of the token matched at the previous step.
    char_len: usize,
}
|  | ||||
| impl<'a> PartialMatch<'a> { | ||||
|     /// Returns: | ||||
|     /// - None if the given token breaks the partial match | ||||
|     /// - Partial if the given token matches the partial match but doesn't complete it | ||||
|     /// - Full if the given token completes the partial match | ||||
|     pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> { | ||||
|         let Self { mut matching_words, ids, .. } = self; | ||||
|  | ||||
|         let is_matching = match matching_words.first()? { | ||||
|             Some(word) => &token.lemma() == word, | ||||
|             // a None value in the phrase corresponds to a stop word, | ||||
|             // the walue is considered a match if the current token is categorized as a stop word. | ||||
|             None => token.is_stopword(), | ||||
|         }; | ||||
|  | ||||
|         let char_len = token.char_end - token.char_start; | ||||
|         // if there are remaining words to match in the phrase and the current token is matching, | ||||
|         // return a new Partial match allowing the highlighter to continue. | ||||
|         if is_matching && matching_words.len() > 1 { | ||||
|             matching_words.remove(0); | ||||
|             Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len })) | ||||
|         // if there is no remaining word to match in the phrase and the current token is matching, | ||||
|         // return a Full match. | ||||
|         } else if is_matching { | ||||
|             Some(MatchType::Full { char_len, ids }) | ||||
|         // if the current token doesn't match, return None to break the match sequence. | ||||
|         } else { | ||||
|             None | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn char_len(&self) -> usize { | ||||
|         self.char_len | ||||
|     } | ||||
| } | ||||
|  | ||||
#[cfg(test)]
pub(crate) mod tests {
    use std::borrow::Cow;

    use charabia::{TokenKind, TokenizerBuilder};

    use super::super::super::located_query_terms_from_string;
    use super::*;
    use crate::index::tests::TempIndex;

    /// Builds a temporary index with a single document containing
    /// the words used by the matching tests below.
    pub(crate) fn temp_index_with_documents() -> TempIndex {
        let temp_index = TempIndex::new();
        temp_index
            .add_documents(documents!([
                { "id": 1, "name": "split this world westfali westfalia the" },
            ]))
            .unwrap();
        temp_index
    }

    /// Builds `MatchingWords` from the query "split this world" and checks that:
    /// - exact words ("split", "world") yield a Full match with the right ids,
    /// - a token extending the last query word ("worlded") still fully matches,
    /// - unrelated ("nyc") or wrongly-extended ("thisnew") tokens do not match.
    #[test]
    fn matching_words() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let tokenizer = TokenizerBuilder::new().build();
        let tokens = tokenizer.tokenize("split this world");
        let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
        let matching_words = MatchingWords::new(&ctx, query_terms);

        // "split" is the first query word: Full match on ids 0..=0.
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("split"),
                    char_end: "split".chars().count(),
                    byte_end: "split".len(),
                    ..Default::default()
                })
                .next(),
            Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
        );
        // "nyc" is not in the query: no match.
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("nyc"),
                    char_end: "nyc".chars().count(),
                    byte_end: "nyc".len(),
                    ..Default::default()
                })
                .next(),
            None
        );
        // "world" is the third query word: Full match on ids 2..=2.
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("world"),
                    char_end: "world".chars().count(),
                    byte_end: "world".len(),
                    ..Default::default()
                })
                .next(),
            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
        );
        // "worlded" starts with "world": Full match, char_len limited to the word.
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("worlded"),
                    char_end: "worlded".chars().count(),
                    byte_end: "worlded".len(),
                    ..Default::default()
                })
                .next(),
            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
        );
        // "thisnew" extends "this", which is not the last query word: no match.
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("thisnew"),
                    char_end: "thisnew".chars().count(),
                    byte_end: "thisnew".len(),
                    ..Default::default()
                })
                .next(),
            None
        );
    }
}
							
								
								
									
										848
									
								
								milli/src/search/new/matches/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										848
									
								
								milli/src/search/new/matches/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,848 @@ | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use charabia::{SeparatorKind, Token, Tokenizer}; | ||||
| use matching_words::{MatchType, MatchingWords, PartialMatch, WordId}; | ||||
| use serde::Serialize; | ||||
|  | ||||
| use super::query_term::LocatedQueryTerm; | ||||
| use crate::SearchContext; | ||||
|  | ||||
| pub mod matching_words; | ||||
|  | ||||
/// Marker inserted at the boundaries of the text when it is cropped.
const DEFAULT_CROP_MARKER: &str = "…";
/// Default opening tag wrapped around highlighted words.
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
/// Default closing tag wrapped around highlighted words.
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
|  | ||||
| /// Structure used to build a Matcher allowing to customize formating tags. | ||||
| pub struct MatcherBuilder<'a, 'ctx, A> { | ||||
|     matching_words: MatchingWords<'ctx>, | ||||
|     tokenizer: Tokenizer<'a, 'a, A>, | ||||
|     crop_marker: Option<String>, | ||||
|     highlight_prefix: Option<String>, | ||||
|     highlight_suffix: Option<String>, | ||||
| } | ||||
|  | ||||
| impl<'a, 'ctx, A> MatcherBuilder<'a, 'ctx, A> { | ||||
|     pub fn new( | ||||
|         ctx: &'ctx SearchContext, | ||||
|         located_terms: Vec<LocatedQueryTerm>, | ||||
|         tokenizer: Tokenizer<'a, 'a, A>, | ||||
|     ) -> Self { | ||||
|         let matching_words = MatchingWords::new(ctx, located_terms); | ||||
|         Self { | ||||
|             matching_words, | ||||
|             tokenizer, | ||||
|             crop_marker: None, | ||||
|             highlight_prefix: None, | ||||
|             highlight_suffix: None, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn crop_marker(&mut self, marker: String) -> &Self { | ||||
|         self.crop_marker = Some(marker); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn highlight_prefix(&mut self, prefix: String) -> &Self { | ||||
|         self.highlight_prefix = Some(prefix); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn highlight_suffix(&mut self, suffix: String) -> &Self { | ||||
|         self.highlight_suffix = Some(suffix); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { | ||||
|         let crop_marker = match &self.crop_marker { | ||||
|             Some(marker) => marker.as_str(), | ||||
|             None => DEFAULT_CROP_MARKER, | ||||
|         }; | ||||
|  | ||||
|         let highlight_prefix = match &self.highlight_prefix { | ||||
|             Some(marker) => marker.as_str(), | ||||
|             None => DEFAULT_HIGHLIGHT_PREFIX, | ||||
|         }; | ||||
|         let highlight_suffix = match &self.highlight_suffix { | ||||
|             Some(marker) => marker.as_str(), | ||||
|             None => DEFAULT_HIGHLIGHT_SUFFIX, | ||||
|         }; | ||||
|         Matcher { | ||||
|             text, | ||||
|             matching_words: &self.matching_words, | ||||
|             tokenizer: &self.tokenizer, | ||||
|             crop_marker, | ||||
|             highlight_prefix, | ||||
|             highlight_suffix, | ||||
|             matches: None, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Formatting options requested for a field.
#[derive(Copy, Clone, Default)]
pub struct FormatOptions {
    /// Whether matched words must be wrapped in highlight tags.
    pub highlight: bool,
    /// Maximum number of words of the crop window; `None` disables cropping.
    pub crop: Option<usize>,
}

impl FormatOptions {
    /// Combines two sets of options: highlighting is enabled if either side
    /// enables it, and the crop size of `self` takes precedence over `other`'s.
    pub fn merge(self, other: Self) -> Self {
        let highlight = self.highlight || other.highlight;
        let crop = match self.crop {
            Some(size) => Some(size),
            None => other.crop,
        };
        Self { highlight, crop }
    }
}
|  | ||||
/// A single token of the analyzed text that matched the query.
#[derive(Clone, Debug)]
pub struct Match {
    /// Length (in chars) of the matched part of the token.
    match_len: usize,
    // ids of the query words that matches.
    ids: Vec<WordId>,
    // position of the word in the whole text.
    word_position: usize,
    // position of the token in the whole text.
    token_position: usize,
}
|  | ||||
/// Byte boundaries of a match in the source text, as exposed to the caller.
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct MatchBounds {
    /// Byte index where the match starts in the source text.
    pub start: usize,
    /// Length of the match (see `Match::match_len`).
    pub length: usize,
}
|  | ||||
/// Structure used to analyze a string, compute words that match,
/// and format the source string, returning a highlighted and cropped sub-string.
pub struct Matcher<'t, 'm, A> {
    /// The source text to analyze and format.
    text: &'t str,
    matching_words: &'m MatchingWords<'m>,
    tokenizer: &'m Tokenizer<'m, 'm, A>,
    crop_marker: &'m str,
    highlight_prefix: &'m str,
    highlight_suffix: &'m str,
    /// Lazily-computed tokens and matches of `text` (see `compute_matches`).
    matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
}
|  | ||||
| impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { | ||||
    /// Iterates over tokens and save any of them that matches the query.
    ///
    /// Stores the tokenized text and the collected `Match`es in `self.matches`.
    fn compute_matches(&mut self) -> &mut Self {
        /// some words are counted as matches only if they are close together and in the good order,
        /// compute_partial_match peek into next words to validate if the match is complete.
        /// Returns true when the partial match was completed (and matches were recorded).
        fn compute_partial_match<'a>(
            mut partial: PartialMatch,
            token_position: usize,
            word_position: usize,
            words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
            matches: &mut Vec<Match>,
        ) -> bool {
            // candidate (token, word, char_len) triples; only promoted to real
            // matches if the phrase match eventually completes.
            let mut potential_matches = vec![(token_position, word_position, partial.char_len())];

            for (token_position, word_position, word) in words_positions {
                partial = match partial.match_token(word) {
                    // token matches the partial match, but the match is not full,
                    // we temporarily save the current token then we try to match the next one.
                    Some(MatchType::Partial(partial)) => {
                        potential_matches.push((token_position, word_position, partial.char_len()));
                        partial
                    }
                    // partial match is now full, we keep this matches and we advance positions
                    Some(MatchType::Full { char_len, ids }) => {
                        let ids: Vec<_> = ids.clone().into_iter().collect();
                        // save previously matched tokens as matches.
                        let iter = potential_matches.into_iter().map(
                            |(token_position, word_position, match_len)| Match {
                                match_len,
                                ids: ids.clone(),
                                word_position,
                                token_position,
                            },
                        );
                        matches.extend(iter);

                        // save the token that closes the partial match as a match.
                        matches.push(Match {
                            match_len: char_len,
                            ids,
                            word_position,
                            token_position,
                        });

                        // the match is complete, we return true.
                        return true;
                    }
                    // no match, continue to next match.
                    None => break,
                };
            }

            // the match is not complete, we return false.
            false
        }

        let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect();
        let mut matches = Vec::new();

        // Pair each non-separator token with its token position (index among all
        // tokens) and word position (index among non-separator tokens only).
        let mut words_positions = tokens
            .iter()
            .scan((0, 0), |(token_position, word_position), token| {
                let current_token_position = *token_position;
                let current_word_position = *word_position;
                *token_position += 1;
                if !token.is_separator() {
                    *word_position += 1;
                }

                Some((current_token_position, current_word_position, token))
            })
            .filter(|(_, _, token)| !token.is_separator());

        while let Some((token_position, word_position, word)) = words_positions.next() {
            for match_type in self.matching_words.match_token(word) {
                match match_type {
                    // we match, we save the current token as a match,
                    // then we continue the rest of the tokens.
                    MatchType::Full { char_len, ids } => {
                        let ids: Vec<_> = ids.clone().into_iter().collect();
                        matches.push(Match {
                            match_len: char_len,
                            ids,
                            word_position,
                            token_position,
                        });
                        break;
                    }
                    // we match partially, iterate over next tokens to check if we can complete the match.
                    MatchType::Partial(partial) => {
                        // if match is completed, we break the matching loop over the current token,
                        // then we continue the rest of the tokens.
                        // NOTE: the lookahead runs on a clone so a failed phrase
                        // attempt does not consume tokens from the main iterator.
                        let mut wp = words_positions.clone();
                        if compute_partial_match(
                            partial,
                            token_position,
                            word_position,
                            &mut wp,
                            &mut matches,
                        ) {
                            words_positions = wp;
                            break;
                        }
                    }
                }
            }
        }

        self.matches = Some((tokens, matches));
        self
    }
|  | ||||
|     /// Returns boundaries of the words that match the query. | ||||
|     pub fn matches(&mut self) -> Vec<MatchBounds> { | ||||
|         match &self.matches { | ||||
|             None => self.compute_matches().matches(), | ||||
|             Some((tokens, matches)) => matches | ||||
|                 .iter() | ||||
|                 .map(|m| MatchBounds { | ||||
|                     start: tokens[m.token_position].byte_start, | ||||
|                     length: m.match_len, | ||||
|                 }) | ||||
|                 .collect(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
    /// Returns the bounds in byte index of the crop window.
    ///
    /// The window is centered on the span of matches (or the start of the text
    /// when there is none) and grown on both sides until it holds `crop_size` words.
    fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) {
        // if there is no match, we start from the beginning of the string by default.
        let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
        let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
        let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);

        // matches needs to be counted in the crop len.
        let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;

        // create the initial state of the crop window: 2 iterators starting from the matches positions,
        // a reverse iterator starting from the first match token position and going towards the beginning of the text,
        let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
        // an iterator starting from the last match token position and going towards the end of the text.
        let mut after_tokens = tokens[last_match_token_position..].iter().peekable();

        // grows the crop window peeking in both directions
        // until the window contains the good number of words:
        // (outer Option: is there a token left on that side;
        //  inner Option: Some(kind) for a separator token, None for a word)
        while remaining_words > 0 {
            let before_token = before_tokens.peek().map(|t| t.separator_kind());
            let after_token = after_tokens.peek().map(|t| t.separator_kind());

            match (before_token, after_token) {
                // we can expand both sides.
                (Some(before_token), Some(after_token)) => {
                    match (before_token, after_token) {
                        // if they are both separators and are the same kind then advance both,
                        // or expand in the soft separator separator side.
                        (Some(before_token_kind), Some(after_token_kind)) => {
                            if before_token_kind == after_token_kind {
                                before_tokens.next();

                                // this avoid having an ending separator before crop marker.
                                if remaining_words > 1 {
                                    after_tokens.next();
                                }
                            } else if before_token_kind == SeparatorKind::Hard {
                                after_tokens.next();
                            } else {
                                before_tokens.next();
                            }
                        }
                        // if one of the tokens is a word, we expend in the side of the word.
                        // left is a word, advance left.
                        (None, Some(_)) => {
                            before_tokens.next();
                            remaining_words -= 1;
                        }
                        // right is a word, advance right.
                        (Some(_), None) => {
                            after_tokens.next();
                            remaining_words -= 1;
                        }
                        // both are words, advance left then right if remaining_word > 0.
                        (None, None) => {
                            before_tokens.next();
                            remaining_words -= 1;

                            if remaining_words > 0 {
                                after_tokens.next();
                                remaining_words -= 1;
                            }
                        }
                    }
                }
                // the end of the text is reached, advance left.
                (Some(before_token), None) => {
                    before_tokens.next();
                    if before_token.is_none() {
                        remaining_words -= 1;
                    }
                }
                // the start of the text is reached, advance right.
                (None, Some(after_token)) => {
                    after_tokens.next();
                    if after_token.is_none() {
                        remaining_words -= 1;
                    }
                }
                // no more token to add.
                (None, None) => break,
            }
        }

        // finally, keep the byte index of each bound of the crop window.
        let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
        let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);

        (crop_byte_start, crop_byte_end)
    }
|  | ||||
|     /// Compute the score of a match interval: | ||||
|     /// 1) count unique matches | ||||
|     /// 2) calculate distance between matches | ||||
|     /// 3) count ordered matches | ||||
|     fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { | ||||
|         let mut ids: Vec<WordId> = Vec::with_capacity(matches.len()); | ||||
|         let mut order_score = 0; | ||||
|         let mut distance_score = 0; | ||||
|  | ||||
|         let mut iter = matches.iter().peekable(); | ||||
|         while let Some(m) = iter.next() { | ||||
|             if let Some(next_match) = iter.peek() { | ||||
|                 // if matches are ordered | ||||
|                 if next_match.ids.iter().min() > m.ids.iter().min() { | ||||
|                     order_score += 1; | ||||
|                 } | ||||
|  | ||||
|                 // compute distance between matches | ||||
|                 distance_score -= (next_match.word_position - m.word_position).min(7) as i16; | ||||
|             } | ||||
|  | ||||
|             ids.extend(m.ids.iter()); | ||||
|         } | ||||
|  | ||||
|         ids.sort_unstable(); | ||||
|         ids.dedup(); | ||||
|         let uniq_score = ids.len() as i16; | ||||
|  | ||||
|         // rank by unique match count, then by distance between matches, then by ordered match count. | ||||
|         (uniq_score, distance_score, order_score) | ||||
|     } | ||||
|  | ||||
    /// Returns the matches interval where the score computed by match_interval_score is the best.
    ///
    /// Slides a window over `matches`: a window is a run of matches whose word
    /// positions span less than `crop_size`, and each candidate window is
    /// ranked with `match_interval_score`.
    fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
        // we compute the matches interval if we have at least 2 matches.
        if matches.len() > 1 {
            // positions of the first and the last match of the best matches interval in `matches`.
            let mut best_interval = (0, 0);
            // best score seen so far, seeded with the interval made of the first match alone.
            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
            // current interval positions.
            let mut interval_first = 0;
            let mut interval_last = 0;
            for (index, next_match) in matches.iter().enumerate().skip(1) {
                // if next match would make interval grow more than crop_size,
                // we compare the current interval with the best one,
                // then we increase `interval_first` until next match can be added.
                if next_match.word_position - matches[interval_first].word_position >= crop_size {
                    let interval_score =
                        self.match_interval_score(&matches[interval_first..=interval_last]);

                    // keep interval if it's the best
                    if interval_score > best_interval_score {
                        best_interval = (interval_first, interval_last);
                        best_interval_score = interval_score;
                    }

                    // advance start of the interval while interval is longer than crop_size.
                    while next_match.word_position - matches[interval_first].word_position
                        >= crop_size
                    {
                        interval_first += 1;
                    }
                }
                interval_last = index;
            }

            // compute the last interval score and compare it to the best one.
            // (no need to update `best_interval_score` here: this is the final comparison.)
            let interval_score =
                self.match_interval_score(&matches[interval_first..=interval_last]);
            if interval_score > best_interval_score {
                best_interval = (interval_first, interval_last);
            }

            &matches[best_interval.0..=best_interval.1]
        } else {
            // 0 or 1 match: the best interval is trivially the whole slice.
            matches
        }
    }
|  | ||||
    /// Returns the formatted version of the original text,
    /// applying the requested highlighting and/or cropping.
    pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> {
        if !format_options.highlight && format_options.crop.is_none() {
            // compute matches is not needed if no highlight nor crop is requested.
            Cow::Borrowed(self.text)
        } else {
            match &self.matches {
                Some((tokens, matches)) => {
                    // If the text has to be cropped,
                    // compute the best interval to crop around.
                    let matches = match format_options.crop {
                        Some(crop_size) if crop_size > 0 => {
                            self.find_best_match_interval(matches, crop_size)
                        }
                        _ => matches,
                    };

                    // If the text has to be cropped,
                    // crop around the best interval.
                    let (byte_start, byte_end) = match format_options.crop {
                        Some(crop_size) if crop_size > 0 => {
                            self.crop_bounds(tokens, matches, crop_size)
                        }
                        _ => (0, self.text.len()),
                    };

                    // the output is assembled as a list of `&str` slices,
                    // concatenated only once at the end.
                    let mut formatted = Vec::new();

                    // push crop marker if it's not the start of the text.
                    if byte_start > 0 && !self.crop_marker.is_empty() {
                        formatted.push(self.crop_marker);
                    }

                    // byte offset of the end of the last slice pushed so far.
                    let mut byte_index = byte_start;

                    if format_options.highlight {
                        // insert highlight markers around matches.
                        for m in matches {
                            let token = &tokens[m.token_position];

                            // push the text between the previous match
                            // (or the crop start) and the current match.
                            if byte_index < token.byte_start {
                                formatted.push(&self.text[byte_index..token.byte_start]);
                            }

                            // `m.match_len` counts chars: convert it into the byte offset
                            // where the highlighted part of the token ends.
                            let highlight_byte_index = self.text[token.byte_start..]
                                .char_indices()
                                .enumerate()
                                .find(|(i, _)| *i == m.match_len)
                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
                            formatted.push(self.highlight_prefix);
                            formatted.push(&self.text[token.byte_start..highlight_byte_index]);
                            formatted.push(self.highlight_suffix);
                            // if it's a prefix highlight, we put the end of the word after the highlight marker.
                            if highlight_byte_index < token.byte_end {
                                formatted.push(&self.text[highlight_byte_index..token.byte_end]);
                            }

                            byte_index = token.byte_end;
                        }
                    }

                    // push the rest of the text between last match and the end of crop.
                    if byte_index < byte_end {
                        formatted.push(&self.text[byte_index..byte_end]);
                    }

                    // push crop marker if it's not the end of the text.
                    if byte_end < self.text.len() && !self.crop_marker.is_empty() {
                        formatted.push(self.crop_marker);
                    }

                    if formatted.len() == 1 {
                        // avoid concatenating if there is already 1 slice.
                        Cow::Borrowed(&self.text[byte_start..byte_end])
                    } else {
                        Cow::Owned(formatted.concat())
                    }
                }
                // matches were not computed yet: compute them, then format.
                None => self.compute_matches().format(format_options),
            }
        }
    }
| } | ||||
|  | ||||
#[cfg(test)]
mod tests {
    use charabia::TokenizerBuilder;
    use matching_words::tests::temp_index_with_documents;

    use super::super::located_query_terms_from_string;
    use super::*;

    impl<'a, 'ctx> MatcherBuilder<'a, 'ctx, &[u8]> {
        /// Test helper: builds a `MatcherBuilder` directly from a raw query string.
        pub fn new_test(ctx: &'ctx mut SearchContext, query: &'a str) -> Self {
            let tokenizer = TokenizerBuilder::new().build();
            let tokens = tokenizer.tokenize(query);
            let query_terms = located_query_terms_from_string(ctx, tokens, None).unwrap();
            Self::new(ctx, query_terms, TokenizerBuilder::new().build())
        }
    }

    /// With neither highlight nor crop, `format` must return the text untouched.
    #[test]
    fn format_identity() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");

        let format_options = FormatOptions { highlight: false, crop: None };

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text);
        // no crop and no highlight should return complete text.
        assert_eq!(&matcher.format(format_options), &text);

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text);
        // no crop and no highlight should return complete text.
        assert_eq!(&matcher.format(format_options), &text);

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text);
        // no crop and no highlight should return complete text.
        assert_eq!(&matcher.format(format_options), &text);
    }

    /// Highlight only (no crop): matches are wrapped in `<em>` markers,
    /// the rest of the text is preserved.
    #[test]
    fn format_highlight() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");

        let format_options = FormatOptions { highlight: true, crop: None };

        // empty text.
        let text = "";
        let mut matcher = builder.build(text);
        assert_eq!(&matcher.format(format_options), "");

        // text containing only separators.
        let text = ":-)";
        let mut matcher = builder.build(text);
        assert_eq!(&matcher.format(format_options), ":-)");

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text);
        // no crop should return complete text, because there is no matches.
        assert_eq!(&matcher.format(format_options), &text);

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text);
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
        );

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text);
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
        );
    }

    /// Highlighting must respect char boundaries of non-ASCII text
    /// (prefix matches stop at the right byte offset).
    #[test]
    fn highlight_unicode() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "world");
        let format_options = FormatOptions { highlight: true, crop: None };

        // Text containing prefix match.
        let text = "Ŵôřlḑôle";
        let mut matcher = builder.build(text);
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>Ŵôřlḑ</em>ôle"
        );

        // Text containing unicode match.
        let text = "Ŵôřlḑ";
        let mut matcher = builder.build(text);
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>Ŵôřlḑ</em>"
        );

        let builder = MatcherBuilder::new_test(&mut ctx, "westfali");
        let format_options = FormatOptions { highlight: true, crop: None };

        // Text containing unicode match.
        let text = "Westfália";
        let mut matcher = builder.build(text);
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>Westfáli</em>a"
        );
    }

    /// Crop only (no highlight): the crop window is centered on the best
    /// match interval and bounded by crop markers (`…`) where text was cut.
    #[test]
    fn format_crop() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");

        let format_options = FormatOptions { highlight: false, crop: Some(10) };

        // empty text.
        let text = "";
        let mut matcher = builder.build(text);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @""
        );

        // text containing only separators.
        let text = ":-)";
        let mut matcher = builder.build(text);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @":-)"
        );

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text);
        // no highlight should return 10 first words with a marker at the end.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"A quick brown fox can not jump 32 feet, right…"
        );

        // Text without any match starting by a separator.
        let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
        let mut matcher = builder.build(text);
        // no highlight should return 10 first words with a marker at the end.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"(A quick brown fox can not jump 32 feet, right…"
        );

        // Test phrase propagation
        let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
        let mut matcher = builder.build(text);
        // should crop the phrase instead of cropping around the match.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"… Split The World is a book written by Emily Henry…"
        );

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text);
        // no highlight should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…future to build a world with the boy she loves…"
        );

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text);
        // no highlight should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…she loves. Emily Henry: The Love That Split The World."
        );

        // Text containing a match unordered and a match ordered.
        let text = "The world split void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text);
        // crop should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void split the world void void"
        );

        // Text containing matches with different density.
        let text = "split void the void void world void void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text);
        // crop should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void split the world void void"
        );

        // Text containing matches with same word.
        let text = "split split split split split split void void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text);
        // crop should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void split the world void void"
        );
    }

    /// Highlight and crop combined.
    #[test]
    fn format_highlight_crop() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");

        let format_options = FormatOptions { highlight: true, crop: Some(10) };

        // empty text.
        let text = "";
        let mut matcher = builder.build(text);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @""
        );

        // text containing only separators.
        let text = ":-)";
        let mut matcher = builder.build(text);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @":-)"
        );

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text);
        // both should return 10 first words with a marker at the end.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"A quick brown fox can not jump 32 feet, right…"
        );

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text);
        // both should return 10 last words with a marker at the start and highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…future to build a <em>world</em> with <em>the</em> boy she loves…"
        );

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text);
        // both should return 10 last words with a marker at the start and highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
        );

        // Text containing a match unordered and a match ordered.
        let text = "The world split void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text);
        // crop should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
        );
    }

    /// A crop size smaller than the query length still partially
    /// formats the matches instead of panicking or over-cropping.
    #[test]
    fn smaller_crop_size() {
        //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");

        let text = "void void split the world void void.";

        // set a smaller crop size
        let format_options = FormatOptions { highlight: false, crop: Some(2) };
        let mut matcher = builder.build(text);
        // because crop size < query size, partially format matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…split the…"
        );

        // set a smaller crop size
        let format_options = FormatOptions { highlight: false, crop: Some(1) };
        let mut matcher = builder.build(text);
        // because crop size < query size, partially format matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…split…"
        );

        // set crop size to 0
        let format_options = FormatOptions { highlight: false, crop: Some(0) };
        let mut matcher = builder.build(text);
        // because crop size is 0, crop is ignored.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"void void split the world void void."
        );
    }

    /// Phrases in the query ("t he", "do or") may match split words in the
    /// document: each matched part is highlighted separately.
    #[test]
    fn partial_matches() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let mut builder = MatcherBuilder::new_test(&mut ctx, "the \"t he\" door \"do or\"");
        builder.highlight_prefix("_".to_string());
        builder.highlight_suffix("_".to_string());

        let format_options = FormatOptions { highlight: true, crop: None };

        let text = "the do or die can't be he do and or isn't he";
        let mut matcher = builder.build(text);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
        );
    }
}
| @@ -4,6 +4,7 @@ mod graph_based_ranking_rule; | ||||
| mod interner; | ||||
| mod limits; | ||||
| mod logger; | ||||
| mod matches; | ||||
| mod query_graph; | ||||
| mod query_term; | ||||
| mod ranking_rule_graph; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user