Make some cleaning and add comments

2025-10-31 16:06:31 +00:00 · 2022-04-05 17:35:52 +02:00
parent 3bb1e35ada
commit fa7d3a37c0
1 changed files with 117 additions and 63 deletions
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -4,6 +4,8 @@ pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PrimitiveWordId};
 use meilisearch_tokenizer::token::{SeparatorKind, Token};

+use crate::search::matches::matching_words::PartialMatch;
+
 pub mod matching_words;

 const DEFAULT_CROP_SIZE: usize = 10;
@@ -106,14 +108,80 @@ pub struct Matcher<'t, 'm> {
 }

 impl<'t> Matcher<'t, '_> {
+    /// Iterates over the tokens and saves every one that matches the query.
    fn compute_matches(&mut self) -> &mut Self {
+        fn compute_partial_match(
+            mut partial: PartialMatch,
+            tokens: &[Token],
+            token_position: &mut usize,
+            word_position: &mut usize,
+            matches: &mut Vec<Match>,
+        ) -> bool {
+            let mut potential_matches = vec![(*token_position, *word_position, partial.char_len())];
+            let mut t_position = 1;
+            let mut w_position = 1;
+            for token in &tokens[*token_position + 1..] {
+                if token.is_separator().is_none() {
+                    partial = match partial.match_token(&token) {
+                        // token matches the partial match, but the match is not full,
+                        // we temporarily save the current token then we try to match the next one.
+                        Some(MatchType::Partial(partial)) => {
+                            potential_matches.push((
+                                *token_position + t_position,
+                                *word_position + w_position,
+                                partial.char_len(),
+                            ));
+                            partial
+                        }
+                        // partial match is now full, we keep these matches and we advance positions
+                        Some(MatchType::Full { char_len, ids }) => {
+                            // save previously matched tokens as matches.
+                            let iter = potential_matches.into_iter().map(
+                                |(token_position, word_position, match_len)| Match {
+                                    match_len,
+                                    ids: ids.to_vec(),
+                                    word_position,
+                                    token_position,
+                                },
+                            );
+                            matches.extend(iter);
+
+                            // move word and token positions after the end of the match.
+                            *word_position += w_position;
+                            *token_position += t_position;
+
+                            // save the token that closes the partial match as a match.
+                            matches.push(Match {
+                                match_len: char_len,
+                                ids: ids.to_vec(),
+                                word_position: *word_position,
+                                token_position: *token_position,
+                            });
+
+                            // the match is complete, we return true.
+                            return true;
+                        }
+                        // no match, continue to next match.
+                        None => break,
+                    };
+                    w_position += 1;
+                }
+                t_position += 1;
+            }
+
+            // the match is not complete, we return false.
+            false
+        }
+
        let mut matches = Vec::new();
        let mut word_position = 0;
        let mut token_position = 0;
        while let Some(token) = self.tokens.get(token_position) {
            if token.is_separator().is_none() {
-                'matches: for match_type in self.matching_words.match_token(&token) {
+                for match_type in self.matching_words.match_token(&token) {
                    match match_type {
+                        // we match, we save the current token as a match,
+                        // then we continue the rest of the tokens.
                        MatchType::Full { char_len, ids } => {
                            matches.push(Match {
                                match_len: char_len,
@@ -121,58 +189,20 @@ impl<'t> Matcher<'t, '_> {
                                word_position,
                                token_position,
                            });
-                            // stop on the first match
                            break;
                        }
-                        MatchType::Partial(mut partial) => {
-                            let mut potential_matches =
-                                vec![(token_position, word_position, partial.char_len())];
-                            let mut t_position = 1;
-                            let mut w_position = 1;
-                            'partials: for token in &self.tokens[token_position + 1..] {
-                                if token.is_separator().is_none() {
-                                    partial = match partial.match_token(&token) {
-                                        Some(MatchType::Partial(partial)) => {
-                                            potential_matches.push((
-                                                token_position + t_position,
-                                                word_position + w_position,
-                                                partial.char_len(),
-                                            ));
-                                            partial
-                                        }
-                                        // partial match is now full, we keep this matches and we advance positions
-                                        Some(MatchType::Full { char_len, ids }) => {
-                                            let iter = potential_matches.into_iter().map(
-                                                |(token_position, word_position, match_len)| {
-                                                    Match {
-                                                        match_len,
-                                                        ids: ids.to_vec(),
-                                                        word_position,
-                                                        token_position,
-                                                    }
-                                                },
-                                            );
-
-                                            matches.extend(iter);
-
-                                            word_position += w_position;
-                                            token_position += t_position;
-
-                                            matches.push(Match {
-                                                match_len: char_len,
-                                                ids: ids.to_vec(),
-                                                word_position,
-                                                token_position,
-                                            });
-
-                                            break 'matches;
-                                        }
-                                        // no match, continue to next match.
-                                        None => break 'partials,
-                                    };
-                                    w_position += 1;
-                                }
-                                t_position += 1;
+                        // we match partially, iterate over next tokens to check if we can complete the match.
+                        MatchType::Partial(partial) => {
+                            // if match is completed, we break the matching loop over the current token,
+                            // then we continue the rest of the tokens.
+                            if compute_partial_match(
+                                partial,
+                                &self.tokens,
+                                &mut token_position,
+                                &mut word_position,
+                                &mut matches,
+                            ) {
+                                break;
                            }
                        }
                    }
@@ -186,6 +216,7 @@ impl<'t> Matcher<'t, '_> {
        self
    }

+    /// Returns boundaries of the words that match the query.
    pub fn matches(&mut self) -> Vec<MatchBounds> {
        match &self.matches {
            None => self.compute_matches().matches(),
@@ -199,30 +230,37 @@ impl<'t> Matcher<'t, '_> {
        }
    }

+    /// Returns token position of the window to crop around.
    fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
+        // if there is no match, we start from the beginning of the string by default.
        let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
        let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
        let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);

-        // TODO: buggy if no match and first token is a sepparator
+        // matches need to be counted in the crop length.
        let mut remaining_words =
            self.crop_size + first_match_word_position - last_match_word_position;
        // if first token is a word, then remove 1 to remaining_words.
        if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) {
            remaining_words -= 1;
        }
+
+        // we start from matches positions, then we expand the window in both sides.
        let mut first_token_position = first_match_token_position;
        let mut last_token_position = last_match_token_position;
-
        while remaining_words > 0 {
            match (
+                // try to expand left
                first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)),
+                // try to expand right
                last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)),
            ) {
+                // we can expand both sides.
                (Some(ft), Some(lt)) => {
                    match (ft.is_separator(), lt.is_separator()) {
-                        // if they are both separators and are the same kind then advance both
+                        // if they are both separators and are the same kind then advance both,
+                        // otherwise expand on the side of the soft separator.
                        (Some(f_kind), Some(s_kind)) => {
                            if f_kind == s_kind {
                                first_token_position -= 1;
@@ -233,17 +271,18 @@ impl<'t> Matcher<'t, '_> {
                                first_token_position -= 1;
                            }
                        }
-                        // left is a word, advance left
+                        // if one of the tokens is a word, we expand on the side of the word.
+                        // left is a word, advance left.
                        (None, Some(_)) => {
                            first_token_position -= 1;
                            remaining_words -= 1;
                        }
-                        // right is a word, advance right
+                        // right is a word, advance right.
                        (Some(_), None) => {
                            last_token_position += 1;
                            remaining_words -= 1;
                        }
-                        // both are words, advance left then right if remaining_word > 0
+                        // both are words, advance left then right if remaining_word > 0.
                        (None, None) => {
                            first_token_position -= 1;
                            remaining_words -= 1;
@@ -277,6 +316,10 @@ impl<'t> Matcher<'t, '_> {
        (first_token_position, last_token_position)
    }

+    /// Compute the score of a match interval:
+    /// 1) count unique matches
+    /// 2) calculate distance between matches
+    /// 3) count ordered matches
    fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
        let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
        let mut order_score = 0;
@@ -305,14 +348,20 @@ impl<'t> Matcher<'t, '_> {
        (uniq_score, distance_score, order_score)
    }

+    /// Returns the matches interval where the score computed by match_interval_score is maximal.
    fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] {
+        // we compute the matches interval if we have at least 2 matches.
        if matches.len() > 1 {
+            // positions of the first and the last match of the best matches interval in `matches`.
            let mut best_interval = (0, 0);
            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
+            // current interval positions.
            let mut interval_first = 0;
            let mut interval_last = 0;
            for (index, next_match) in matches.iter().enumerate().skip(1) {
-                // if next match would make interval gross more than crop_size
+                // if next match would make interval grow more than crop_size,
+                // we compare the current interval with the best one,
+                // then we increase `interval_first` until next match can be added.
                if next_match.word_position - matches[interval_first].word_position
                    >= self.crop_size
                {
@@ -325,7 +374,7 @@ impl<'t> Matcher<'t, '_> {
                        best_interval_score = interval_score;
                    }

-                    // advance start of the interval while interval is longer than crop_size
+                    // advance start of the interval while interval is longer than crop_size.
                    while next_match.word_position - matches[interval_first].word_position
                        >= self.crop_size
                    {
@@ -335,6 +384,7 @@ impl<'t> Matcher<'t, '_> {
                interval_last = index;
            }

+            // compute the last interval score and compare it to the best one.
            let interval_score =
                self.match_interval_score(&matches[interval_first..=interval_last]);
            if interval_score > best_interval_score {
@@ -347,6 +397,7 @@ impl<'t> Matcher<'t, '_> {
        }
    }

+    /// Returns the bounds in byte index of the crop window.
    fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
        let match_interval = self.find_best_match_interval(matches);

@@ -357,12 +408,13 @@ impl<'t> Matcher<'t, '_> {
        (byte_start, byte_end)
    }

+    /// Returns the formatted version of the original text.
    pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
        // If 0 it will be considered null and thus not crop the field
        // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
        let crop = crop && self.crop_size > 0;
        if !highlight && !crop {
-            // compute matches is not needed if no highlight or crop is requested.
+            // computing matches is not needed if neither highlight nor crop is requested.
            Cow::Borrowed(self.text)
        } else {
            match &self.matches {
@@ -397,12 +449,14 @@ impl<'t> Matcher<'t, '_> {
                                .char_indices()
                                .enumerate()
                                .find(|(i, _)| *i == m.match_len)
-                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start)
-                                .min(token.byte_end);
+                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
                            formatted.push(self.highlight_prefix);
                            formatted.push(&self.text[token.byte_start..highlight_byte_index]);
                            formatted.push(self.highlight_suffix);
-                            formatted.push(&self.text[highlight_byte_index..token.byte_end]);
+                            // if it's a prefix highlight, we put the end of the word after the highlight marker.
+                            if highlight_byte_index < token.byte_end {
+                                formatted.push(&self.text[highlight_byte_index..token.byte_end]);
+                            }

                            byte_index = token.byte_end;
                        }