mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 16:06:31 +00:00 
			
		
		
		
	add new highlighter
This commit is contained in:
		
							
								
								
									
										334
									
								
								milli/src/search/new/matches/matching_words.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										334
									
								
								milli/src/search/new/matches/matching_words.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,334 @@ | ||||
| use std::cmp::Reverse; | ||||
| use std::ops::RangeInclusive; | ||||
|  | ||||
| use charabia::Token; | ||||
|  | ||||
| use super::super::interner::Interned; | ||||
| use super::super::query_term::{ | ||||
|     Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm, | ||||
| }; | ||||
| use super::super::{DedupInterner, Phrase}; | ||||
| use crate::SearchContext; | ||||
|  | ||||
/// A phrase to match against document tokens,
/// together with the range of query-word ids it was extracted from.
pub struct LocatedMatchingPhrase {
    /// Interned phrase; resolve it through the phrase interner.
    pub value: Interned<Phrase>,
    /// Ids of the query words this phrase covers.
    pub positions: RangeInclusive<WordId>,
}
|  | ||||
/// A set of words to match against document tokens,
/// together with the range of query-word ids they were extracted from.
pub struct LocatedMatchingWords {
    /// Interned words; resolve them through the word interner.
    pub value: Vec<Interned<String>>,
    /// Ids of the query words these words cover.
    pub positions: RangeInclusive<WordId>,
    /// When true, a token only needs to start with one of the words to match.
    pub is_prefix: bool,
}
|  | ||||
/// Structure created from a query tree
/// referencing words that match the given query tree.
pub struct MatchingWords<'ctx> {
    /// Interners borrowed from the `SearchContext` used to resolve `Interned` handles.
    word_interner: &'ctx DedupInterner<String>,
    phrase_interner: &'ctx DedupInterner<Phrase>,
    /// Phrases to try first when matching a token (see `MatchesIter`).
    phrases: Vec<LocatedMatchingPhrase>,
    /// Single words, sorted so exact (non-prefix) and longer-range entries come first.
    words: Vec<LocatedMatchingWords>,
}
|  | ||||
| /// Extract and centralize the different phrases and words to match stored in a QueryTerm. | ||||
| fn extract_matching_terms(term: &QueryTerm) -> (Vec<Interned<Phrase>>, Vec<Interned<String>>) { | ||||
|     let mut matching_words = Vec::new(); | ||||
|     let mut matching_phrases = Vec::new(); | ||||
|  | ||||
|     // the structure is exhaustively extracted to ensure that no field is missing. | ||||
|     let QueryTerm { | ||||
|         original: _, | ||||
|         is_multiple_words: _, | ||||
|         max_nbr_typos: _, | ||||
|         is_prefix: _, | ||||
|         zero_typo, | ||||
|         one_typo, | ||||
|         two_typo, | ||||
|     } = term; | ||||
|  | ||||
|     // the structure is exhaustively extracted to ensure that no field is missing. | ||||
|     let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo; | ||||
|  | ||||
|     // zero typo | ||||
|     if let Some(phrase) = phrase { | ||||
|         matching_phrases.push(*phrase); | ||||
|     } | ||||
|     if let Some(zero_typo) = zero_typo { | ||||
|         matching_words.push(*zero_typo); | ||||
|     } | ||||
|     for synonym in synonyms { | ||||
|         matching_phrases.push(*synonym); | ||||
|     } | ||||
|  | ||||
|     // one typo | ||||
|     // the structure is exhaustively extracted to ensure that no field is missing. | ||||
|     if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo { | ||||
|         if let Some(split_words) = split_words { | ||||
|             matching_phrases.push(*split_words); | ||||
|         } | ||||
|         for one_typo in one_typo { | ||||
|             matching_words.push(*one_typo); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     // two typos | ||||
|     // the structure is exhaustively extracted to ensure that no field is missing. | ||||
|     if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo { | ||||
|         for two_typos in two_typos { | ||||
|             matching_words.push(*two_typos); | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     (matching_phrases, matching_words) | ||||
| } | ||||
|  | ||||
impl<'ctx> MatchingWords<'ctx> {
    /// Builds the matching structures from the located query terms,
    /// borrowing the word and phrase interners from the search context.
    pub fn new(ctx: &'ctx SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
        let mut phrases = Vec::new();
        let mut words = Vec::new();

        // Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms
        // and wrap them in dedicated structures.
        for located_term in located_terms {
            let term = ctx.term_interner.get(located_term.value);
            let (matching_phrases, matching_words) = extract_matching_terms(term);

            // Every phrase keeps the position range of the query term it was extracted from.
            for matching_phrase in matching_phrases {
                phrases.push(LocatedMatchingPhrase {
                    value: matching_phrase,
                    positions: located_term.positions.clone(),
                });
            }
            words.push(LocatedMatchingWords {
                value: matching_words,
                positions: located_term.positions.clone(),
                is_prefix: term.is_prefix,
            });
        }

        // Sort word to put prefixes at the bottom prioritizing the exact matches.
        // Key: (is_prefix, Reverse(range length)) — exact words first, then longer ranges first.
        words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));

        Self {
            phrases,
            words,
            word_interner: &ctx.word_interner,
            phrase_interner: &ctx.phrase_interner,
        }
    }

    /// Returns an iterator over terms that match or partially match the given token.
    pub fn match_token<'b>(&'ctx self, token: &'b Token<'b>) -> MatchesIter<'ctx, 'b> {
        MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
    }

    /// Try to match the token with one of the located_words.
    /// Returns the first `MatchType::Full` found, relying on the sort order of
    /// `self.words` (exact matches are tried before prefixes).
    fn match_unique_words(&'ctx self, token: &Token) -> Option<MatchType<'ctx>> {
        for located_words in &self.words {
            for word in &located_words.value {
                let word = self.word_interner.get(*word);
                // if the word is a prefix we match using starts_with.
                if located_words.is_prefix && token.lemma().starts_with(word) {
                    let char_len = token.original_lengths(word.len()).0;
                    let ids = &located_words.positions;
                    return Some(MatchType::Full { char_len, ids });
                // else we exact match the token.
                } else if token.lemma() == word {
                    let char_len = token.char_end - token.char_start;
                    let ids = &located_words.positions;
                    return Some(MatchType::Full { char_len, ids });
                }
            }
        }

        None
    }
}
|  | ||||
/// Iterator over terms that match the given token,
/// This allows to lazily evaluate matches.
pub struct MatchesIter<'a, 'b> {
    matching_words: &'a MatchingWords<'a>,
    /// Remaining phrases to try before falling back to unique words.
    phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
    /// The document token currently being matched.
    token: &'b Token<'b>,
}
|  | ||||
| impl<'a> Iterator for MatchesIter<'a, '_> { | ||||
|     type Item = MatchType<'a>; | ||||
|  | ||||
|     fn next(&mut self) -> Option<Self::Item> { | ||||
|         match self.phrases.next() { | ||||
|             // Try to match all the phrases first. | ||||
|             Some(located_phrase) => { | ||||
|                 let phrase = self.matching_words.phrase_interner.get(located_phrase.value); | ||||
|  | ||||
|                 // create a PartialMatch struct to make it compute the first match | ||||
|                 // instead of duplicating the code. | ||||
|                 let ids = &located_phrase.positions; | ||||
|                 // collect the references of words from the interner. | ||||
|                 let words = phrase | ||||
|                     .words | ||||
|                     .iter() | ||||
|                     .map(|word| { | ||||
|                         word.map(|word| self.matching_words.word_interner.get(word).as_str()) | ||||
|                     }) | ||||
|                     .collect(); | ||||
|                 let partial = PartialMatch { matching_words: words, ids, char_len: 0 }; | ||||
|  | ||||
|                 partial.match_token(self.token).or_else(|| self.next()) | ||||
|             } | ||||
|             // If no phrases matches, try to match uiques words. | ||||
|             None => self.matching_words.match_unique_words(self.token), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Id of a matching term corresponding to a word written by the end user.
pub type WordId = u16;
|  | ||||
/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases we need to match consecutively several tokens to consider that the match is full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
    /// The token fully matches; `char_len` is the matched length in chars and
    /// `ids` the range of query-word ids covered.
    Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
    /// The token starts a phrase; following tokens must be fed to the
    /// `PartialMatch` to complete (or break) the match.
    Partial(PartialMatch<'a>),
}
|  | ||||
/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
    /// Remaining words of the phrase; `None` entries stand for stop words.
    matching_words: Vec<Option<&'a str>>,
    /// Range of query-word ids covered by the phrase.
    ids: &'a RangeInclusive<WordId>,
    /// Char length of the token matched at the previous step.
    char_len: usize,
}
|  | ||||
| impl<'a> PartialMatch<'a> { | ||||
|     /// Returns: | ||||
|     /// - None if the given token breaks the partial match | ||||
|     /// - Partial if the given token matches the partial match but doesn't complete it | ||||
|     /// - Full if the given token completes the partial match | ||||
|     pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> { | ||||
|         let Self { mut matching_words, ids, .. } = self; | ||||
|  | ||||
|         let is_matching = match matching_words.first()? { | ||||
|             Some(word) => &token.lemma() == word, | ||||
|             // a None value in the phrase corresponds to a stop word, | ||||
|             // the walue is considered a match if the current token is categorized as a stop word. | ||||
|             None => token.is_stopword(), | ||||
|         }; | ||||
|  | ||||
|         let char_len = token.char_end - token.char_start; | ||||
|         // if there are remaining words to match in the phrase and the current token is matching, | ||||
|         // return a new Partial match allowing the highlighter to continue. | ||||
|         if is_matching && matching_words.len() > 1 { | ||||
|             matching_words.remove(0); | ||||
|             Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len })) | ||||
|         // if there is no remaining word to match in the phrase and the current token is matching, | ||||
|         // return a Full match. | ||||
|         } else if is_matching { | ||||
|             Some(MatchType::Full { char_len, ids }) | ||||
|         // if the current token doesn't match, return None to break the match sequence. | ||||
|         } else { | ||||
|             None | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn char_len(&self) -> usize { | ||||
|         self.char_len | ||||
|     } | ||||
| } | ||||
|  | ||||
#[cfg(test)]
pub(crate) mod tests {
    use std::borrow::Cow;

    use charabia::{TokenKind, TokenizerBuilder};

    use super::super::super::located_query_terms_from_string;
    use super::*;
    use crate::index::tests::TempIndex;

    /// Builds a temporary index with a single document containing
    /// the words used by the matching tests below.
    pub(crate) fn temp_index_with_documents() -> TempIndex {
        let temp_index = TempIndex::new();
        temp_index
            .add_documents(documents!([
                { "id": 1, "name": "split this world westfali westfalia the" },
            ]))
            .unwrap();
        temp_index
    }

    /// Builds `MatchingWords` from the query "split this world" and checks that:
    /// - exact words ("split", "world") yield a Full match with the right ids,
    /// - a token extending the last query word ("worlded") still fully matches,
    /// - unrelated ("nyc") or wrongly-extended ("thisnew") tokens do not match.
    #[test]
    fn matching_words() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let tokenizer = TokenizerBuilder::new().build();
        let tokens = tokenizer.tokenize("split this world");
        let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
        let matching_words = MatchingWords::new(&ctx, query_terms);

        // "split" is the first query word: Full match on ids 0..=0.
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("split"),
                    char_end: "split".chars().count(),
                    byte_end: "split".len(),
                    ..Default::default()
                })
                .next(),
            Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
        );
        // "nyc" is not in the query: no match.
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("nyc"),
                    char_end: "nyc".chars().count(),
                    byte_end: "nyc".len(),
                    ..Default::default()
                })
                .next(),
            None
        );
        // "world" is the third query word: Full match on ids 2..=2.
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("world"),
                    char_end: "world".chars().count(),
                    byte_end: "world".len(),
                    ..Default::default()
                })
                .next(),
            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
        );
        // "worlded" starts with "world": Full match, char_len limited to the word.
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("worlded"),
                    char_end: "worlded".chars().count(),
                    byte_end: "worlded".len(),
                    ..Default::default()
                })
                .next(),
            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
        );
        // "thisnew" extends "this", which is not the last query word: no match.
        assert_eq!(
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
                    lemma: Cow::Borrowed("thisnew"),
                    char_end: "thisnew".chars().count(),
                    byte_end: "thisnew".len(),
                    ..Default::default()
                })
                .next(),
            None
        );
    }
}
							
								
								
									
										848
									
								
								milli/src/search/new/matches/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										848
									
								
								milli/src/search/new/matches/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,848 @@ | ||||
| use std::borrow::Cow; | ||||
|  | ||||
| use charabia::{SeparatorKind, Token, Tokenizer}; | ||||
| use matching_words::{MatchType, MatchingWords, PartialMatch, WordId}; | ||||
| use serde::Serialize; | ||||
|  | ||||
| use super::query_term::LocatedQueryTerm; | ||||
| use crate::SearchContext; | ||||
|  | ||||
| pub mod matching_words; | ||||
|  | ||||
/// Marker inserted at the boundaries of the text when it is cropped.
const DEFAULT_CROP_MARKER: &str = "…";
/// Default opening tag wrapped around highlighted words.
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
/// Default closing tag wrapped around highlighted words.
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
|  | ||||
| /// Structure used to build a Matcher allowing to customize formating tags. | ||||
| pub struct MatcherBuilder<'a, 'ctx, A> { | ||||
|     matching_words: MatchingWords<'ctx>, | ||||
|     tokenizer: Tokenizer<'a, 'a, A>, | ||||
|     crop_marker: Option<String>, | ||||
|     highlight_prefix: Option<String>, | ||||
|     highlight_suffix: Option<String>, | ||||
| } | ||||
|  | ||||
| impl<'a, 'ctx, A> MatcherBuilder<'a, 'ctx, A> { | ||||
|     pub fn new( | ||||
|         ctx: &'ctx SearchContext, | ||||
|         located_terms: Vec<LocatedQueryTerm>, | ||||
|         tokenizer: Tokenizer<'a, 'a, A>, | ||||
|     ) -> Self { | ||||
|         let matching_words = MatchingWords::new(ctx, located_terms); | ||||
|         Self { | ||||
|             matching_words, | ||||
|             tokenizer, | ||||
|             crop_marker: None, | ||||
|             highlight_prefix: None, | ||||
|             highlight_suffix: None, | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     pub fn crop_marker(&mut self, marker: String) -> &Self { | ||||
|         self.crop_marker = Some(marker); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn highlight_prefix(&mut self, prefix: String) -> &Self { | ||||
|         self.highlight_prefix = Some(prefix); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn highlight_suffix(&mut self, suffix: String) -> &Self { | ||||
|         self.highlight_suffix = Some(suffix); | ||||
|         self | ||||
|     } | ||||
|  | ||||
|     pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { | ||||
|         let crop_marker = match &self.crop_marker { | ||||
|             Some(marker) => marker.as_str(), | ||||
|             None => DEFAULT_CROP_MARKER, | ||||
|         }; | ||||
|  | ||||
|         let highlight_prefix = match &self.highlight_prefix { | ||||
|             Some(marker) => marker.as_str(), | ||||
|             None => DEFAULT_HIGHLIGHT_PREFIX, | ||||
|         }; | ||||
|         let highlight_suffix = match &self.highlight_suffix { | ||||
|             Some(marker) => marker.as_str(), | ||||
|             None => DEFAULT_HIGHLIGHT_SUFFIX, | ||||
|         }; | ||||
|         Matcher { | ||||
|             text, | ||||
|             matching_words: &self.matching_words, | ||||
|             tokenizer: &self.tokenizer, | ||||
|             crop_marker, | ||||
|             highlight_prefix, | ||||
|             highlight_suffix, | ||||
|             matches: None, | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Formatting options requested for a field.
#[derive(Copy, Clone, Default)]
pub struct FormatOptions {
    /// Whether matched words must be wrapped in highlight tags.
    pub highlight: bool,
    /// Maximum number of words of the crop window; `None` disables cropping.
    pub crop: Option<usize>,
}

impl FormatOptions {
    /// Combines two sets of options: highlighting is enabled if either side
    /// enables it, and the crop size of `self` takes precedence over `other`'s.
    pub fn merge(self, other: Self) -> Self {
        let highlight = self.highlight || other.highlight;
        let crop = match self.crop {
            Some(size) => Some(size),
            None => other.crop,
        };
        Self { highlight, crop }
    }
}
|  | ||||
/// A single token of the analyzed text that matched the query.
#[derive(Clone, Debug)]
pub struct Match {
    /// Length (in chars) of the matched part of the token.
    match_len: usize,
    // ids of the query words that matches.
    ids: Vec<WordId>,
    // position of the word in the whole text.
    word_position: usize,
    // position of the token in the whole text.
    token_position: usize,
}
|  | ||||
/// Byte boundaries of a match in the source text, as exposed to the caller.
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
pub struct MatchBounds {
    /// Byte index where the match starts in the source text.
    pub start: usize,
    /// Length of the match (see `Match::match_len`).
    pub length: usize,
}
|  | ||||
/// Structure used to analyze a string, compute words that match,
/// and format the source string, returning a highlighted and cropped sub-string.
pub struct Matcher<'t, 'm, A> {
    /// The source text to analyze and format.
    text: &'t str,
    matching_words: &'m MatchingWords<'m>,
    tokenizer: &'m Tokenizer<'m, 'm, A>,
    crop_marker: &'m str,
    highlight_prefix: &'m str,
    highlight_suffix: &'m str,
    /// Lazily-computed tokens and matches of `text` (see `compute_matches`).
    matches: Option<(Vec<Token<'t>>, Vec<Match>)>,
}
|  | ||||
| impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { | ||||
    /// Iterates over tokens and save any of them that matches the query.
    ///
    /// Stores the tokenized text and the collected `Match`es in `self.matches`.
    fn compute_matches(&mut self) -> &mut Self {
        /// some words are counted as matches only if they are close together and in the good order,
        /// compute_partial_match peek into next words to validate if the match is complete.
        /// Returns true when the partial match was completed (and matches were recorded).
        fn compute_partial_match<'a>(
            mut partial: PartialMatch,
            token_position: usize,
            word_position: usize,
            words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
            matches: &mut Vec<Match>,
        ) -> bool {
            // candidate (token, word, char_len) triples; only promoted to real
            // matches if the phrase match eventually completes.
            let mut potential_matches = vec![(token_position, word_position, partial.char_len())];

            for (token_position, word_position, word) in words_positions {
                partial = match partial.match_token(word) {
                    // token matches the partial match, but the match is not full,
                    // we temporarily save the current token then we try to match the next one.
                    Some(MatchType::Partial(partial)) => {
                        potential_matches.push((token_position, word_position, partial.char_len()));
                        partial
                    }
                    // partial match is now full, we keep this matches and we advance positions
                    Some(MatchType::Full { char_len, ids }) => {
                        let ids: Vec<_> = ids.clone().into_iter().collect();
                        // save previously matched tokens as matches.
                        let iter = potential_matches.into_iter().map(
                            |(token_position, word_position, match_len)| Match {
                                match_len,
                                ids: ids.clone(),
                                word_position,
                                token_position,
                            },
                        );
                        matches.extend(iter);

                        // save the token that closes the partial match as a match.
                        matches.push(Match {
                            match_len: char_len,
                            ids,
                            word_position,
                            token_position,
                        });

                        // the match is complete, we return true.
                        return true;
                    }
                    // no match, continue to next match.
                    None => break,
                };
            }

            // the match is not complete, we return false.
            false
        }

        let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect();
        let mut matches = Vec::new();

        // Pair each non-separator token with its token position (index among all
        // tokens) and word position (index among non-separator tokens only).
        let mut words_positions = tokens
            .iter()
            .scan((0, 0), |(token_position, word_position), token| {
                let current_token_position = *token_position;
                let current_word_position = *word_position;
                *token_position += 1;
                if !token.is_separator() {
                    *word_position += 1;
                }

                Some((current_token_position, current_word_position, token))
            })
            .filter(|(_, _, token)| !token.is_separator());

        while let Some((token_position, word_position, word)) = words_positions.next() {
            for match_type in self.matching_words.match_token(word) {
                match match_type {
                    // we match, we save the current token as a match,
                    // then we continue the rest of the tokens.
                    MatchType::Full { char_len, ids } => {
                        let ids: Vec<_> = ids.clone().into_iter().collect();
                        matches.push(Match {
                            match_len: char_len,
                            ids,
                            word_position,
                            token_position,
                        });
                        break;
                    }
                    // we match partially, iterate over next tokens to check if we can complete the match.
                    MatchType::Partial(partial) => {
                        // if match is completed, we break the matching loop over the current token,
                        // then we continue the rest of the tokens.
                        // NOTE: the lookahead runs on a clone so a failed phrase
                        // attempt does not consume tokens from the main iterator.
                        let mut wp = words_positions.clone();
                        if compute_partial_match(
                            partial,
                            token_position,
                            word_position,
                            &mut wp,
                            &mut matches,
                        ) {
                            words_positions = wp;
                            break;
                        }
                    }
                }
            }
        }

        self.matches = Some((tokens, matches));
        self
    }
|  | ||||
|     /// Returns boundaries of the words that match the query. | ||||
|     pub fn matches(&mut self) -> Vec<MatchBounds> { | ||||
|         match &self.matches { | ||||
|             None => self.compute_matches().matches(), | ||||
|             Some((tokens, matches)) => matches | ||||
|                 .iter() | ||||
|                 .map(|m| MatchBounds { | ||||
|                     start: tokens[m.token_position].byte_start, | ||||
|                     length: m.match_len, | ||||
|                 }) | ||||
|                 .collect(), | ||||
|         } | ||||
|     } | ||||
|  | ||||
    /// Returns the bounds in byte index of the crop window.
    ///
    /// The window is centered on the span of matches (or the start of the text
    /// when there is none) and grown on both sides until it holds `crop_size` words.
    fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) {
        // if there is no match, we start from the beginning of the string by default.
        let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
        let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
        let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);

        // matches needs to be counted in the crop len.
        let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;

        // create the initial state of the crop window: 2 iterators starting from the matches positions,
        // a reverse iterator starting from the first match token position and going towards the beginning of the text,
        let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
        // an iterator starting from the last match token position and going towards the end of the text.
        let mut after_tokens = tokens[last_match_token_position..].iter().peekable();

        // grows the crop window peeking in both directions
        // until the window contains the good number of words:
        // (outer Option: is there a token left on that side;
        //  inner Option: Some(kind) for a separator token, None for a word)
        while remaining_words > 0 {
            let before_token = before_tokens.peek().map(|t| t.separator_kind());
            let after_token = after_tokens.peek().map(|t| t.separator_kind());

            match (before_token, after_token) {
                // we can expand both sides.
                (Some(before_token), Some(after_token)) => {
                    match (before_token, after_token) {
                        // if they are both separators and are the same kind then advance both,
                        // or expand in the soft separator separator side.
                        (Some(before_token_kind), Some(after_token_kind)) => {
                            if before_token_kind == after_token_kind {
                                before_tokens.next();

                                // this avoid having an ending separator before crop marker.
                                if remaining_words > 1 {
                                    after_tokens.next();
                                }
                            } else if before_token_kind == SeparatorKind::Hard {
                                after_tokens.next();
                            } else {
                                before_tokens.next();
                            }
                        }
                        // if one of the tokens is a word, we expend in the side of the word.
                        // left is a word, advance left.
                        (None, Some(_)) => {
                            before_tokens.next();
                            remaining_words -= 1;
                        }
                        // right is a word, advance right.
                        (Some(_), None) => {
                            after_tokens.next();
                            remaining_words -= 1;
                        }
                        // both are words, advance left then right if remaining_word > 0.
                        (None, None) => {
                            before_tokens.next();
                            remaining_words -= 1;

                            if remaining_words > 0 {
                                after_tokens.next();
                                remaining_words -= 1;
                            }
                        }
                    }
                }
                // the end of the text is reached, advance left.
                (Some(before_token), None) => {
                    before_tokens.next();
                    if before_token.is_none() {
                        remaining_words -= 1;
                    }
                }
                // the start of the text is reached, advance right.
                (None, Some(after_token)) => {
                    after_tokens.next();
                    if after_token.is_none() {
                        remaining_words -= 1;
                    }
                }
                // no more token to add.
                (None, None) => break,
            }
        }

        // finally, keep the byte index of each bound of the crop window.
        let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
        let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);

        (crop_byte_start, crop_byte_end)
    }
|  | ||||
|     /// Compute the score of a match interval: | ||||
|     /// 1) count unique matches | ||||
|     /// 2) calculate distance between matches | ||||
|     /// 3) count ordered matches | ||||
|     fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { | ||||
|         let mut ids: Vec<WordId> = Vec::with_capacity(matches.len()); | ||||
|         let mut order_score = 0; | ||||
|         let mut distance_score = 0; | ||||
|  | ||||
|         let mut iter = matches.iter().peekable(); | ||||
|         while let Some(m) = iter.next() { | ||||
|             if let Some(next_match) = iter.peek() { | ||||
|                 // if matches are ordered | ||||
|                 if next_match.ids.iter().min() > m.ids.iter().min() { | ||||
|                     order_score += 1; | ||||
|                 } | ||||
|  | ||||
|                 // compute distance between matches | ||||
|                 distance_score -= (next_match.word_position - m.word_position).min(7) as i16; | ||||
|             } | ||||
|  | ||||
|             ids.extend(m.ids.iter()); | ||||
|         } | ||||
|  | ||||
|         ids.sort_unstable(); | ||||
|         ids.dedup(); | ||||
|         let uniq_score = ids.len() as i16; | ||||
|  | ||||
|         // rank by unique match count, then by distance between matches, then by ordered match count. | ||||
|         (uniq_score, distance_score, order_score) | ||||
|     } | ||||
|  | ||||
    /// Returns the matches interval where the score computed by match_interval_score is the best.
    ///
    /// Slides a window over `matches`: a window is a run of matches whose word
    /// positions span less than `crop_size`, and each candidate window is
    /// ranked with `match_interval_score`.
    fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
        // we compute the matches interval if we have at least 2 matches.
        if matches.len() > 1 {
            // positions of the first and the last match of the best matches interval in `matches`.
            let mut best_interval = (0, 0);
            // best score seen so far, seeded with the interval made of the first match alone.
            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
            // current interval positions.
            let mut interval_first = 0;
            let mut interval_last = 0;
            for (index, next_match) in matches.iter().enumerate().skip(1) {
                // if next match would make interval grow more than crop_size,
                // we compare the current interval with the best one,
                // then we increase `interval_first` until next match can be added.
                if next_match.word_position - matches[interval_first].word_position >= crop_size {
                    let interval_score =
                        self.match_interval_score(&matches[interval_first..=interval_last]);

                    // keep interval if it's the best
                    if interval_score > best_interval_score {
                        best_interval = (interval_first, interval_last);
                        best_interval_score = interval_score;
                    }

                    // advance start of the interval while interval is longer than crop_size.
                    while next_match.word_position - matches[interval_first].word_position
                        >= crop_size
                    {
                        interval_first += 1;
                    }
                }
                interval_last = index;
            }

            // compute the last interval score and compare it to the best one.
            // (no need to update `best_interval_score` here: this is the final comparison.)
            let interval_score =
                self.match_interval_score(&matches[interval_first..=interval_last]);
            if interval_score > best_interval_score {
                best_interval = (interval_first, interval_last);
            }

            &matches[best_interval.0..=best_interval.1]
        } else {
            // 0 or 1 match: the best interval is trivially the whole slice.
            matches
        }
    }
|  | ||||
    /// Returns the formatted version of the original text,
    /// applying the requested highlighting and/or cropping.
    pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> {
        if !format_options.highlight && format_options.crop.is_none() {
            // compute matches is not needed if no highlight nor crop is requested.
            Cow::Borrowed(self.text)
        } else {
            match &self.matches {
                Some((tokens, matches)) => {
                    // If the text has to be cropped,
                    // compute the best interval to crop around.
                    let matches = match format_options.crop {
                        Some(crop_size) if crop_size > 0 => {
                            self.find_best_match_interval(matches, crop_size)
                        }
                        _ => matches,
                    };

                    // If the text has to be cropped,
                    // crop around the best interval.
                    let (byte_start, byte_end) = match format_options.crop {
                        Some(crop_size) if crop_size > 0 => {
                            self.crop_bounds(tokens, matches, crop_size)
                        }
                        _ => (0, self.text.len()),
                    };

                    // the output is assembled as a list of `&str` slices,
                    // concatenated only once at the end.
                    let mut formatted = Vec::new();

                    // push crop marker if it's not the start of the text.
                    if byte_start > 0 && !self.crop_marker.is_empty() {
                        formatted.push(self.crop_marker);
                    }

                    // byte offset of the end of the last slice pushed so far.
                    let mut byte_index = byte_start;

                    if format_options.highlight {
                        // insert highlight markers around matches.
                        for m in matches {
                            let token = &tokens[m.token_position];

                            // push the text between the previous match
                            // (or the crop start) and the current match.
                            if byte_index < token.byte_start {
                                formatted.push(&self.text[byte_index..token.byte_start]);
                            }

                            // `m.match_len` counts chars: convert it into the byte offset
                            // where the highlighted part of the token ends.
                            let highlight_byte_index = self.text[token.byte_start..]
                                .char_indices()
                                .enumerate()
                                .find(|(i, _)| *i == m.match_len)
                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
                            formatted.push(self.highlight_prefix);
                            formatted.push(&self.text[token.byte_start..highlight_byte_index]);
                            formatted.push(self.highlight_suffix);
                            // if it's a prefix highlight, we put the end of the word after the highlight marker.
                            if highlight_byte_index < token.byte_end {
                                formatted.push(&self.text[highlight_byte_index..token.byte_end]);
                            }

                            byte_index = token.byte_end;
                        }
                    }

                    // push the rest of the text between last match and the end of crop.
                    if byte_index < byte_end {
                        formatted.push(&self.text[byte_index..byte_end]);
                    }

                    // push crop marker if it's not the end of the text.
                    if byte_end < self.text.len() && !self.crop_marker.is_empty() {
                        formatted.push(self.crop_marker);
                    }

                    if formatted.len() == 1 {
                        // avoid concatenating if there is already 1 slice.
                        Cow::Borrowed(&self.text[byte_start..byte_end])
                    } else {
                        Cow::Owned(formatted.concat())
                    }
                }
                // matches were not computed yet: compute them, then format.
                None => self.compute_matches().format(format_options),
            }
        }
    }
| } | ||||
|  | ||||
#[cfg(test)]
mod tests {
    use charabia::TokenizerBuilder;
    use matching_words::tests::temp_index_with_documents;

    use super::super::located_query_terms_from_string;
    use super::*;

    impl<'a, 'ctx> MatcherBuilder<'a, 'ctx, &[u8]> {
        /// Test helper: builds a `MatcherBuilder` directly from a raw query string.
        pub fn new_test(ctx: &'ctx mut SearchContext, query: &'a str) -> Self {
            let tokenizer = TokenizerBuilder::new().build();
            let tokens = tokenizer.tokenize(query);
            let query_terms = located_query_terms_from_string(ctx, tokens, None).unwrap();
            Self::new(ctx, query_terms, TokenizerBuilder::new().build())
        }
    }

    /// With neither highlight nor crop, `format` must return the text untouched.
    #[test]
    fn format_identity() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");

        let format_options = FormatOptions { highlight: false, crop: None };

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text);
        // no crop and no highlight should return complete text.
        assert_eq!(&matcher.format(format_options), &text);

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text);
        // no crop and no highlight should return complete text.
        assert_eq!(&matcher.format(format_options), &text);

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text);
        // no crop and no highlight should return complete text.
        assert_eq!(&matcher.format(format_options), &text);
    }

    /// Highlight only (no crop): matches are wrapped in `<em>` markers,
    /// the rest of the text is preserved.
    #[test]
    fn format_highlight() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");

        let format_options = FormatOptions { highlight: true, crop: None };

        // empty text.
        let text = "";
        let mut matcher = builder.build(text);
        assert_eq!(&matcher.format(format_options), "");

        // text containing only separators.
        let text = ":-)";
        let mut matcher = builder.build(text);
        assert_eq!(&matcher.format(format_options), ":-)");

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text);
        // no crop should return complete text, because there is no matches.
        assert_eq!(&matcher.format(format_options), &text);

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text);
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
        );

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text);
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
        );
    }

    /// Highlighting must respect char boundaries of non-ASCII text
    /// (prefix matches stop at the right byte offset).
    #[test]
    fn highlight_unicode() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "world");
        let format_options = FormatOptions { highlight: true, crop: None };

        // Text containing prefix match.
        let text = "Ŵôřlḑôle";
        let mut matcher = builder.build(text);
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>Ŵôřlḑ</em>ôle"
        );

        // Text containing unicode match.
        let text = "Ŵôřlḑ";
        let mut matcher = builder.build(text);
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>Ŵôřlḑ</em>"
        );

        let builder = MatcherBuilder::new_test(&mut ctx, "westfali");
        let format_options = FormatOptions { highlight: true, crop: None };

        // Text containing unicode match.
        let text = "Westfália";
        let mut matcher = builder.build(text);
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"<em>Westfáli</em>a"
        );
    }

    /// Crop only (no highlight): the crop window is centered on the best
    /// match interval and bounded by crop markers (`…`) where text was cut.
    #[test]
    fn format_crop() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");

        let format_options = FormatOptions { highlight: false, crop: Some(10) };

        // empty text.
        let text = "";
        let mut matcher = builder.build(text);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @""
        );

        // text containing only separators.
        let text = ":-)";
        let mut matcher = builder.build(text);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @":-)"
        );

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text);
        // no highlight should return 10 first words with a marker at the end.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"A quick brown fox can not jump 32 feet, right…"
        );

        // Text without any match starting by a separator.
        let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
        let mut matcher = builder.build(text);
        // no highlight should return 10 first words with a marker at the end.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"(A quick brown fox can not jump 32 feet, right…"
        );

        // Test phrase propagation
        let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
        let mut matcher = builder.build(text);
        // should crop the phrase instead of cropping around the match.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"… Split The World is a book written by Emily Henry…"
        );

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text);
        // no highlight should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…future to build a world with the boy she loves…"
        );

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text);
        // no highlight should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…she loves. Emily Henry: The Love That Split The World."
        );

        // Text containing a match unordered and a match ordered.
        let text = "The world split void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text);
        // crop should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void split the world void void"
        );

        // Text containing matches with different density.
        let text = "split void the void void world void void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text);
        // crop should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void split the world void void"
        );

        // Text containing matches with same word.
        let text = "split split split split split split void void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text);
        // crop should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void split the world void void"
        );
    }

    /// Highlight and crop combined.
    #[test]
    fn format_highlight_crop() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");

        let format_options = FormatOptions { highlight: true, crop: Some(10) };

        // empty text.
        let text = "";
        let mut matcher = builder.build(text);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @""
        );

        // text containing only separators.
        let text = ":-)";
        let mut matcher = builder.build(text);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @":-)"
        );

        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let mut matcher = builder.build(text);
        // both should return 10 first words with a marker at the end.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"A quick brown fox can not jump 32 feet, right…"
        );

        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let mut matcher = builder.build(text);
        // both should return 10 last words with a marker at the start and highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…future to build a <em>world</em> with <em>the</em> boy she loves…"
        );

        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
        let mut matcher = builder.build(text);
        // both should return 10 last words with a marker at the start and highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
        );

        // Text containing a match unordered and a match ordered.
        let text = "The world split void void void void void void void void void split the world void void";
        let mut matcher = builder.build(text);
        // crop should return 10 last words with a marker at the start.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
        );
    }

    /// A crop size smaller than the query length still partially
    /// formats the matches instead of panicking or over-cropping.
    #[test]
    fn smaller_crop_size() {
        //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");

        let text = "void void split the world void void.";

        // set a smaller crop size
        let format_options = FormatOptions { highlight: false, crop: Some(2) };
        let mut matcher = builder.build(text);
        // because crop size < query size, partially format matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…split the…"
        );

        // set a smaller crop size
        let format_options = FormatOptions { highlight: false, crop: Some(1) };
        let mut matcher = builder.build(text);
        // because crop size < query size, partially format matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…split…"
        );

        // set crop size to 0
        let format_options = FormatOptions { highlight: false, crop: Some(0) };
        let mut matcher = builder.build(text);
        // because crop size is 0, crop is ignored.
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"void void split the world void void."
        );
    }

    /// Phrases in the query ("t he", "do or") may match split words in the
    /// document: each matched part is highlighted separately.
    #[test]
    fn partial_matches() {
        let temp_index = temp_index_with_documents();
        let rtxn = temp_index.read_txn().unwrap();
        let mut ctx = SearchContext::new(&temp_index, &rtxn);
        let mut builder = MatcherBuilder::new_test(&mut ctx, "the \"t he\" door \"do or\"");
        builder.highlight_prefix("_".to_string());
        builder.highlight_suffix("_".to_string());

        let format_options = FormatOptions { highlight: true, crop: None };

        let text = "the do or die can't be he do and or isn't he";
        let mut matcher = builder.build(text);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
        );
    }
}
| @@ -4,6 +4,7 @@ mod graph_based_ranking_rule; | ||||
| mod interner; | ||||
| mod limits; | ||||
| mod logger; | ||||
| mod matches; | ||||
| mod query_graph; | ||||
| mod query_term; | ||||
| mod ranking_rule_graph; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user