mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 16:06:31 +00:00 
			
		
		
		
	Make some cleaning and add comments
This commit is contained in:
		| @@ -4,6 +4,8 @@ pub use matching_words::MatchingWords; | ||||
| use matching_words::{MatchType, PrimitiveWordId}; | ||||
| use meilisearch_tokenizer::token::{SeparatorKind, Token}; | ||||
|  | ||||
| use crate::search::matches::matching_words::PartialMatch; | ||||
|  | ||||
| pub mod matching_words; | ||||
|  | ||||
| const DEFAULT_CROP_SIZE: usize = 10; | ||||
| @@ -106,14 +108,80 @@ pub struct Matcher<'t, 'm> { | ||||
| } | ||||
|  | ||||
| impl<'t> Matcher<'t, '_> { | ||||
|     /// Iterates over tokens and save any of them that matches the query. | ||||
|     fn compute_matches(&mut self) -> &mut Self { | ||||
|         fn compute_partial_match( | ||||
|             mut partial: PartialMatch, | ||||
|             tokens: &[Token], | ||||
|             token_position: &mut usize, | ||||
|             word_position: &mut usize, | ||||
|             matches: &mut Vec<Match>, | ||||
|         ) -> bool { | ||||
|             let mut potential_matches = vec![(*token_position, *word_position, partial.char_len())]; | ||||
|             let mut t_position = 1; | ||||
|             let mut w_position = 1; | ||||
|             for token in &tokens[*token_position + 1..] { | ||||
|                 if token.is_separator().is_none() { | ||||
|                     partial = match partial.match_token(&token) { | ||||
|                         // token matches the partial match, but the match is not full yet: | ||||
|                         // we temporarily save the current token, then we try to match the next one. | ||||
|                         Some(MatchType::Partial(partial)) => { | ||||
|                             potential_matches.push(( | ||||
|                                 *token_position + t_position, | ||||
|                                 *word_position + w_position, | ||||
|                                 partial.char_len(), | ||||
|                             )); | ||||
|                             partial | ||||
|                         } | ||||
|                         // partial match is now full, we keep these matches and we advance positions. | ||||
|                         Some(MatchType::Full { char_len, ids }) => { | ||||
|                             // save previously matched tokens as matches. | ||||
|                             let iter = potential_matches.into_iter().map( | ||||
|                                 |(token_position, word_position, match_len)| Match { | ||||
|                                     match_len, | ||||
|                                     ids: ids.to_vec(), | ||||
|                                     word_position, | ||||
|                                     token_position, | ||||
|                                 }, | ||||
|                             ); | ||||
|                             matches.extend(iter); | ||||
|  | ||||
|                             // move word and token positions after the end of the match. | ||||
|                             *word_position += w_position; | ||||
|                             *token_position += t_position; | ||||
|  | ||||
|                             // save the token that closes the partial match as a match. | ||||
|                             matches.push(Match { | ||||
|                                 match_len: char_len, | ||||
|                                 ids: ids.to_vec(), | ||||
|                                 word_position: *word_position, | ||||
|                                 token_position: *token_position, | ||||
|                             }); | ||||
|  | ||||
|                             // the match is complete, we return true. | ||||
|                             return true; | ||||
|                         } | ||||
|                         // no match: stop scanning and report failure so the caller can try its next match. | ||||
|                         None => break, | ||||
|                     }; | ||||
|                     w_position += 1; | ||||
|                 } | ||||
|                 t_position += 1; | ||||
|             } | ||||
|  | ||||
|             // the match is not complete, we return false. | ||||
|             false | ||||
|         } | ||||
|  | ||||
|         let mut matches = Vec::new(); | ||||
|         let mut word_position = 0; | ||||
|         let mut token_position = 0; | ||||
|         while let Some(token) = self.tokens.get(token_position) { | ||||
|             if token.is_separator().is_none() { | ||||
|                 'matches: for match_type in self.matching_words.match_token(&token) { | ||||
|                 for match_type in self.matching_words.match_token(&token) { | ||||
|                     match match_type { | ||||
|                         // we match, we save the current token as a match, | ||||
|                         // then we continue the rest of the tokens. | ||||
|                         MatchType::Full { char_len, ids } => { | ||||
|                             matches.push(Match { | ||||
|                                 match_len: char_len, | ||||
| @@ -121,58 +189,20 @@ impl<'t> Matcher<'t, '_> { | ||||
|                                 word_position, | ||||
|                                 token_position, | ||||
|                             }); | ||||
|                             // stop on the first match | ||||
|                             break; | ||||
|                         } | ||||
|                         MatchType::Partial(mut partial) => { | ||||
|                             let mut potential_matches = | ||||
|                                 vec![(token_position, word_position, partial.char_len())]; | ||||
|                             let mut t_position = 1; | ||||
|                             let mut w_position = 1; | ||||
|                             'partials: for token in &self.tokens[token_position + 1..] { | ||||
|                                 if token.is_separator().is_none() { | ||||
|                                     partial = match partial.match_token(&token) { | ||||
|                                         Some(MatchType::Partial(partial)) => { | ||||
|                                             potential_matches.push(( | ||||
|                                                 token_position + t_position, | ||||
|                                                 word_position + w_position, | ||||
|                                                 partial.char_len(), | ||||
|                                             )); | ||||
|                                             partial | ||||
|                                         } | ||||
|                                         // partial match is now full, we keep this matches and we advance positions | ||||
|                                         Some(MatchType::Full { char_len, ids }) => { | ||||
|                                             let iter = potential_matches.into_iter().map( | ||||
|                                                 |(token_position, word_position, match_len)| { | ||||
|                                                     Match { | ||||
|                                                         match_len, | ||||
|                                                         ids: ids.to_vec(), | ||||
|                                                         word_position, | ||||
|                                                         token_position, | ||||
|                                                     } | ||||
|                                                 }, | ||||
|                                             ); | ||||
|  | ||||
|                                             matches.extend(iter); | ||||
|  | ||||
|                                             word_position += w_position; | ||||
|                                             token_position += t_position; | ||||
|  | ||||
|                                             matches.push(Match { | ||||
|                                                 match_len: char_len, | ||||
|                                                 ids: ids.to_vec(), | ||||
|                                                 word_position, | ||||
|                                                 token_position, | ||||
|                                             }); | ||||
|  | ||||
|                                             break 'matches; | ||||
|                                         } | ||||
|                                         // no match, continue to next match. | ||||
|                                         None => break 'partials, | ||||
|                                     }; | ||||
|                                     w_position += 1; | ||||
|                                 } | ||||
|                                 t_position += 1; | ||||
|                         // we match partially, iterate over next tokens to check if we can complete the match. | ||||
|                         MatchType::Partial(partial) => { | ||||
|                             // if match is completed, we break the matching loop over the current token, | ||||
|                             // then we continue the rest of the tokens. | ||||
|                             if compute_partial_match( | ||||
|                                 partial, | ||||
|                                 &self.tokens, | ||||
|                                 &mut token_position, | ||||
|                                 &mut word_position, | ||||
|                                 &mut matches, | ||||
|                             ) { | ||||
|                                 break; | ||||
|                             } | ||||
|                         } | ||||
|                     } | ||||
| @@ -186,6 +216,7 @@ impl<'t> Matcher<'t, '_> { | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     /// Returns boundaries of the words that match the query. | ||||
|     pub fn matches(&mut self) -> Vec<MatchBounds> { | ||||
|         match &self.matches { | ||||
|             None => self.compute_matches().matches(), | ||||
| @@ -199,30 +230,37 @@ impl<'t> Matcher<'t, '_> { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Returns token position of the window to crop around. | ||||
|     fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) { | ||||
|         // if there is no match, we start from the beginning of the string by default. | ||||
|         let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); | ||||
|         let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); | ||||
|         let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); | ||||
|         let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); | ||||
|  | ||||
|         // TODO: buggy if no match and first token is a separator | ||||
|         // matches needs to be counted in the crop len. | ||||
|         let mut remaining_words = | ||||
|             self.crop_size + first_match_word_position - last_match_word_position; | ||||
|         // if first token is a word, then remove 1 to remaining_words. | ||||
|         if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) { | ||||
|             remaining_words -= 1; | ||||
|         } | ||||
|  | ||||
|         // we start from matches positions, then we expand the window in both sides. | ||||
|         let mut first_token_position = first_match_token_position; | ||||
|         let mut last_token_position = last_match_token_position; | ||||
|  | ||||
|         while remaining_words > 0 { | ||||
|             match ( | ||||
|                 // try to expand left | ||||
|                 first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)), | ||||
|                 // try to expand right | ||||
|                 last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)), | ||||
|             ) { | ||||
|                 // we can expand both sides. | ||||
|                 (Some(ft), Some(lt)) => { | ||||
|                     match (ft.is_separator(), lt.is_separator()) { | ||||
|                         // if they are both separators and are the same kind then advance both | ||||
|                         // if they are both separators and are the same kind then advance both, | ||||
|                         // or expand on the soft separator side. | ||||
|                         (Some(f_kind), Some(s_kind)) => { | ||||
|                             if f_kind == s_kind { | ||||
|                                 first_token_position -= 1; | ||||
| @@ -233,17 +271,18 @@ impl<'t> Matcher<'t, '_> { | ||||
|                                 first_token_position -= 1; | ||||
|                             } | ||||
|                         } | ||||
|                         // left is a word, advance left | ||||
|                         // if one of the tokens is a word, we expand on the side of the word. | ||||
|                         // left is a word, advance left. | ||||
|                         (None, Some(_)) => { | ||||
|                             first_token_position -= 1; | ||||
|                             remaining_words -= 1; | ||||
|                         } | ||||
|                         // right is a word, advance right | ||||
|                         // right is a word, advance right. | ||||
|                         (Some(_), None) => { | ||||
|                             last_token_position += 1; | ||||
|                             remaining_words -= 1; | ||||
|                         } | ||||
|                         // both are words, advance left then right if remaining_word > 0 | ||||
|                         // both are words, advance left then right if remaining_word > 0. | ||||
|                         (None, None) => { | ||||
|                             first_token_position -= 1; | ||||
|                             remaining_words -= 1; | ||||
| @@ -277,6 +316,10 @@ impl<'t> Matcher<'t, '_> { | ||||
|         (first_token_position, last_token_position) | ||||
|     } | ||||
|  | ||||
|     /// Compute the score of a match interval: | ||||
|     /// 1) count unique matches | ||||
|     /// 2) calculate distance between matches | ||||
|     /// 3) count ordered matches | ||||
|     fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { | ||||
|         let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len()); | ||||
|         let mut order_score = 0; | ||||
| @@ -305,14 +348,20 @@ impl<'t> Matcher<'t, '_> { | ||||
|         (uniq_score, distance_score, order_score) | ||||
|     } | ||||
|  | ||||
|     /// Returns the matches interval where the score computed by match_interval_score is maximal. | ||||
|     fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] { | ||||
|         // we compute the matches interval if we have at least 2 matches. | ||||
|         if matches.len() > 1 { | ||||
|             // positions of the first and the last match of the best matches interval in `matches`. | ||||
|             let mut best_interval = (0, 0); | ||||
|             let mut best_interval_score = self.match_interval_score(&matches[0..=0]); | ||||
|             // current interval positions. | ||||
|             let mut interval_first = 0; | ||||
|             let mut interval_last = 0; | ||||
|             for (index, next_match) in matches.iter().enumerate().skip(1) { | ||||
|                 // if next match would make interval grow more than crop_size | ||||
|                 // if next match would make interval grow more than crop_size, | ||||
|                 // we compare the current interval with the best one, | ||||
|                 // then we increase `interval_first` until next match can be added. | ||||
|                 if next_match.word_position - matches[interval_first].word_position | ||||
|                     >= self.crop_size | ||||
|                 { | ||||
| @@ -325,7 +374,7 @@ impl<'t> Matcher<'t, '_> { | ||||
|                         best_interval_score = interval_score; | ||||
|                     } | ||||
|  | ||||
|                     // advance start of the interval while interval is longer than crop_size | ||||
|                     // advance start of the interval while interval is longer than crop_size. | ||||
|                     while next_match.word_position - matches[interval_first].word_position | ||||
|                         >= self.crop_size | ||||
|                     { | ||||
| @@ -335,6 +384,7 @@ impl<'t> Matcher<'t, '_> { | ||||
|                 interval_last = index; | ||||
|             } | ||||
|  | ||||
|             // compute the last interval score and compare it to the best one. | ||||
|             let interval_score = | ||||
|                 self.match_interval_score(&matches[interval_first..=interval_last]); | ||||
|             if interval_score > best_interval_score { | ||||
| @@ -347,6 +397,7 @@ impl<'t> Matcher<'t, '_> { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Returns the bounds in byte index of the crop window. | ||||
|     fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { | ||||
|         let match_interval = self.find_best_match_interval(matches); | ||||
|  | ||||
| @@ -357,12 +408,13 @@ impl<'t> Matcher<'t, '_> { | ||||
|         (byte_start, byte_end) | ||||
|     } | ||||
|  | ||||
|     /// Returns the formatted version of the original text. | ||||
|     pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { | ||||
|         // If 0 it will be considered null and thus not crop the field | ||||
|         // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 | ||||
|         let crop = crop && self.crop_size > 0; | ||||
|         if !highlight && !crop { | ||||
|             // compute matches is not needed if no highlight or crop is requested. | ||||
|             // compute matches is not needed if no highlight nor crop is requested. | ||||
|             Cow::Borrowed(self.text) | ||||
|         } else { | ||||
|             match &self.matches { | ||||
| @@ -397,12 +449,14 @@ impl<'t> Matcher<'t, '_> { | ||||
|                                 .char_indices() | ||||
|                                 .enumerate() | ||||
|                                 .find(|(i, _)| *i == m.match_len) | ||||
|                                 .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start) | ||||
|                                 .min(token.byte_end); | ||||
|                                 .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); | ||||
|                             formatted.push(self.highlight_prefix); | ||||
|                             formatted.push(&self.text[token.byte_start..highlight_byte_index]); | ||||
|                             formatted.push(self.highlight_suffix); | ||||
|                             formatted.push(&self.text[highlight_byte_index..token.byte_end]); | ||||
|                             // if it's a prefix highlight, we put the end of the word after the highlight marker. | ||||
|                             if highlight_byte_index < token.byte_end { | ||||
|                                 formatted.push(&self.text[highlight_byte_index..token.byte_end]); | ||||
|                             } | ||||
|  | ||||
|                             byte_index = token.byte_end; | ||||
|                         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user