mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Improve changes to Matcher
This commit is contained in:
		| @@ -93,15 +93,28 @@ impl FormatOptions { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[derive(Clone, Debug)] | ||||||
|  | pub enum MatchPosition { | ||||||
|  |     Word { | ||||||
|  |         // position of the word in the whole text. | ||||||
|  |         word_position: usize, | ||||||
|  |         // position of the token in the whole text. | ||||||
|  |         token_position: usize, | ||||||
|  |     }, | ||||||
|  |     Phrase { | ||||||
|  |         // position of the first and last word in the phrase in the whole text. | ||||||
|  |         word_positions: (usize, usize), | ||||||
|  |         // position of the first and last token in the phrase in the whole text. | ||||||
|  |         token_positions: (usize, usize), | ||||||
|  |     }, | ||||||
|  | } | ||||||
|  |  | ||||||
| #[derive(Clone, Debug)] | #[derive(Clone, Debug)] | ||||||
| pub struct Match { | pub struct Match { | ||||||
|     match_len: usize, |     match_len: usize, | ||||||
|     // ids of the query words that match. |     // ids of the query words that match. | ||||||
|     ids: Vec<WordId>, |     ids: Vec<WordId>, | ||||||
|     // position of the word in the whole text. |     position: MatchPosition, | ||||||
|     word_position: usize, |  | ||||||
|     // position of the token in the whole text. |  | ||||||
|     token_position: usize, |  | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Serialize, Debug, Clone, PartialEq, Eq)] | #[derive(Serialize, Debug, Clone, PartialEq, Eq)] | ||||||
| @@ -130,13 +143,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | |||||||
|         /// compute_partial_match peeks into the next words to validate whether the match is complete. |         /// compute_partial_match peeks into the next words to validate whether the match is complete. | ||||||
|         fn compute_partial_match<'a>( |         fn compute_partial_match<'a>( | ||||||
|             mut partial: PartialMatch<'a>, |             mut partial: PartialMatch<'a>, | ||||||
|             token_position: usize, |             first_token_position: usize, | ||||||
|             word_position: usize, |             first_word_position: usize, | ||||||
|             first_word_char_start: &usize, |             first_word_char_start: &usize, | ||||||
|             words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>, |             words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>, | ||||||
|             matches: &mut Vec<Match>, |             matches: &mut Vec<Match>, | ||||||
|         ) -> bool { |         ) -> bool { | ||||||
|             for (_, _, word) in words_positions { |             for (token_position, word_position, word) in words_positions { | ||||||
|                 partial = match partial.match_token(word) { |                 partial = match partial.match_token(word) { | ||||||
|                     // token matches the partial match, but the match is not full, |                     // token matches the partial match, but the match is not full, | ||||||
|                     // we temporarily save the current token then we try to match the next one. |                     // we temporarily save the current token then we try to match the next one. | ||||||
| @@ -145,10 +158,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | |||||||
|                     Some(MatchType::Full { ids, .. }) => { |                     Some(MatchType::Full { ids, .. }) => { | ||||||
|                         // save the token that closes the partial match as a match. |                         // save the token that closes the partial match as a match. | ||||||
|                         matches.push(Match { |                         matches.push(Match { | ||||||
|                             match_len: word.char_end - first_word_char_start, |                             match_len: word.char_end - *first_word_char_start, | ||||||
|                             ids: ids.clone().collect(), |                             ids: ids.clone().collect(), | ||||||
|                             word_position, |                             position: MatchPosition::Phrase { | ||||||
|                             token_position, |                                 word_positions: (first_word_position, word_position), | ||||||
|  |                                 token_positions: (first_token_position, token_position), | ||||||
|  |                             }, | ||||||
|                         }); |                         }); | ||||||
|  |  | ||||||
|                         // the match is complete, we return true. |                         // the match is complete, we return true. | ||||||
| @@ -191,8 +206,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | |||||||
|                         matches.push(Match { |                         matches.push(Match { | ||||||
|                             match_len: char_len, |                             match_len: char_len, | ||||||
|                             ids, |                             ids, | ||||||
|                             word_position, |                             position: MatchPosition::Word { word_position, token_position }, | ||||||
|                             token_position, |  | ||||||
|                         }); |                         }); | ||||||
|                         break; |                         break; | ||||||
|                     } |                     } | ||||||
| @@ -228,13 +242,47 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | |||||||
|             Some((tokens, matches)) => matches |             Some((tokens, matches)) => matches | ||||||
|                 .iter() |                 .iter() | ||||||
|                 .map(|m| MatchBounds { |                 .map(|m| MatchBounds { | ||||||
|                     start: tokens[m.token_position].byte_start, |                     start: tokens[match m.position { | ||||||
|  |                         MatchPosition::Word { token_position, .. } => token_position, | ||||||
|  |                         MatchPosition::Phrase { | ||||||
|  |                             token_positions: (first_token_position, _), | ||||||
|  |                             .. | ||||||
|  |                         } => first_token_position, | ||||||
|  |                     }] | ||||||
|  |                     .byte_start, | ||||||
|                     length: m.match_len, |                     length: m.match_len, | ||||||
|                 }) |                 }) | ||||||
|                 .collect(), |                 .collect(), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     // @TODO: This should be improved, looks nasty | ||||||
|  |     fn get_match_pos(&self, m: &Match, is_first: bool, is_word: bool) -> usize { | ||||||
|  |         match m.position { | ||||||
|  |             MatchPosition::Word { word_position, token_position } => { | ||||||
|  |                 if is_word { | ||||||
|  |                     word_position | ||||||
|  |                 } else { | ||||||
|  |                     token_position | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             MatchPosition::Phrase { word_positions: (wpf, wpl), token_positions: (tpf, tpl) } => { | ||||||
|  |                 if is_word { | ||||||
|  |                     if is_first { | ||||||
|  |                         return wpf; | ||||||
|  |                     } else { | ||||||
|  |                         return wpl; | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |                 if is_first { | ||||||
|  |                     tpf | ||||||
|  |                 } else { | ||||||
|  |                     tpl | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|     /// Returns the bounds in byte index of the crop window. |     /// Returns the bounds in byte index of the crop window. | ||||||
|     fn crop_bounds( |     fn crop_bounds( | ||||||
|         &self, |         &self, | ||||||
| @@ -243,10 +291,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | |||||||
|         crop_size: usize, |         crop_size: usize, | ||||||
|     ) -> (usize, usize) { |     ) -> (usize, usize) { | ||||||
|         // if there is no match, we start from the beginning of the string by default. |         // if there is no match, we start from the beginning of the string by default. | ||||||
|         let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); |         let first_match_word_position = | ||||||
|         let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); |             matches.first().map(|m| self.get_match_pos(m, true, true)).unwrap_or(0); | ||||||
|         let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); |         let first_match_token_position = | ||||||
|         let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); |             matches.first().map(|m| self.get_match_pos(m, true, false)).unwrap_or(0); | ||||||
|  |         let last_match_word_position = | ||||||
|  |             matches.last().map(|m| self.get_match_pos(m, false, true)).unwrap_or(0); | ||||||
|  |         let last_match_token_position = | ||||||
|  |             matches.last().map(|m| self.get_match_pos(m, false, false)).unwrap_or(0); | ||||||
|  |  | ||||||
|         // matches need to be counted in the crop length. |         // matches need to be counted in the crop length. | ||||||
|         let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; |         let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; | ||||||
| @@ -350,7 +402,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | |||||||
|                 } |                 } | ||||||
|  |  | ||||||
|                 // compute distance between matches |                 // compute distance between matches | ||||||
|                 distance_score -= (next_match.word_position - m.word_position).min(7) as i16; |                 distance_score -= (self.get_match_pos(next_match, true, true) | ||||||
|  |                     - self.get_match_pos(m, true, true)) | ||||||
|  |                 .min(7) as i16; | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             ids.extend(m.ids.iter()); |             ids.extend(m.ids.iter()); | ||||||
| @@ -378,7 +432,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | |||||||
|                 // if the next match would make the interval grow to more than crop_size, |                 // if the next match would make the interval grow to more than crop_size, | ||||||
|                 // we compare the current interval with the best one, |                 // we compare the current interval with the best one, | ||||||
|                 // then we increase `interval_first` until next match can be added. |                 // then we increase `interval_first` until next match can be added. | ||||||
|                 if next_match.word_position - matches[interval_first].word_position >= crop_size { |                 let next_match_word_position = self.get_match_pos(next_match, true, true); | ||||||
|  |  | ||||||
|  |                 if next_match_word_position | ||||||
|  |                     - self.get_match_pos(&matches[interval_first], false, true) | ||||||
|  |                     >= crop_size | ||||||
|  |                 { | ||||||
|                     let interval_score = |                     let interval_score = | ||||||
|                         self.match_interval_score(&matches[interval_first..=interval_last]); |                         self.match_interval_score(&matches[interval_first..=interval_last]); | ||||||
|  |  | ||||||
| @@ -389,10 +448,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | |||||||
|                     } |                     } | ||||||
|  |  | ||||||
|                     // advance start of the interval while interval is longer than crop_size. |                     // advance start of the interval while interval is longer than crop_size. | ||||||
|                     while next_match.word_position - matches[interval_first].word_position |                     loop { | ||||||
|                         >= crop_size |  | ||||||
|                     { |  | ||||||
|                         interval_first += 1; |                         interval_first += 1; | ||||||
|  |  | ||||||
|  |                         if next_match_word_position | ||||||
|  |                             - self.get_match_pos(&matches[interval_first], false, true) | ||||||
|  |                             < crop_size | ||||||
|  |                         { | ||||||
|  |                             break; | ||||||
|  |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 interval_last = index; |                 interval_last = index; | ||||||
| @@ -441,33 +505,41 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | |||||||
|                     if format_options.highlight { |                     if format_options.highlight { | ||||||
|                         // insert highlight markers around matches. |                         // insert highlight markers around matches. | ||||||
|                         for m in matches { |                         for m in matches { | ||||||
|                             let token = &tokens[m.token_position]; |                             let (current_byte_start, current_byte_end) = match m.position { | ||||||
|  |                                 MatchPosition::Word { token_position, .. } => { | ||||||
|  |                                     let token = &tokens[token_position]; | ||||||
|  |                                     (&token.byte_start, &token.byte_end) | ||||||
|  |                                 } | ||||||
|  |                                 MatchPosition::Phrase { token_positions: (ftp, ltp), .. } => { | ||||||
|  |                                     (&tokens[ftp].byte_start, &tokens[ltp].byte_end) | ||||||
|  |                                 } | ||||||
|  |                             }; | ||||||
|  |  | ||||||
|                             // skip matches out of the crop window. |                             // skip matches out of the crop window. | ||||||
|                             if token.byte_start < byte_start || token.byte_end > byte_end { |                             if *current_byte_start < byte_start || *current_byte_end > byte_end { | ||||||
|                                 continue; |                                 continue; | ||||||
|                             } |                             } | ||||||
|  |  | ||||||
|                             if byte_index < token.byte_start { |                             if byte_index < *current_byte_start { | ||||||
|                                 formatted.push(&self.text[byte_index..token.byte_start]); |                                 formatted.push(&self.text[byte_index..*current_byte_start]); | ||||||
|                             } |                             } | ||||||
|  |  | ||||||
|                             let highlight_byte_index = self.text[token.byte_start..] |                             let highlight_byte_index = self.text[*current_byte_start..] | ||||||
|                                 .char_indices() |                                 .char_indices() | ||||||
|                                 .enumerate() |                                 .enumerate() | ||||||
|                                 .find(|(i, _)| *i == m.match_len) |                                 .find(|(i, _)| *i == m.match_len) | ||||||
|                                 .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); |                                 .map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start); | ||||||
|  |  | ||||||
|                             formatted.push(self.highlight_prefix); |                             formatted.push(self.highlight_prefix); | ||||||
|                             formatted.push(&self.text[token.byte_start..highlight_byte_index]); |                             formatted.push(&self.text[*current_byte_start..highlight_byte_index]); | ||||||
|                             formatted.push(self.highlight_suffix); |                             formatted.push(self.highlight_suffix); | ||||||
|  |  | ||||||
|                             // if it's a prefix highlight, we put the end of the word after the highlight marker. |                             // if it's a prefix highlight, we put the end of the word after the highlight marker. | ||||||
|                             if highlight_byte_index < token.byte_end { |                             if highlight_byte_index < *current_byte_end { | ||||||
|                                 formatted.push(&self.text[highlight_byte_index..token.byte_end]); |                                 formatted.push(&self.text[highlight_byte_index..*current_byte_end]); | ||||||
|                             } |                             } | ||||||
|  |  | ||||||
|                             byte_index = token.byte_start + m.match_len; |                             byte_index = *current_byte_end; | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user