584: Chores: Enhance smart-crop code comments r=curquiza a=ManyTheFish

Enhance explanation around smart crop algorithms

Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Many the fish <many@meilisearch.com>
This commit is contained in:
bors[bot]
2022-07-19 07:08:14 +00:00
committed by GitHub

View File

@@ -11,6 +11,7 @@ const DEFAULT_CROP_MARKER: &'static str = "…";
const DEFAULT_HIGHLIGHT_PREFIX: &'static str = "<em>"; const DEFAULT_HIGHLIGHT_PREFIX: &'static str = "<em>";
const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = "</em>"; const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = "</em>";
/// Structure used to build a Matcher allowing to customize formating tags.
pub struct MatcherBuilder<'a, A> { pub struct MatcherBuilder<'a, A> {
matching_words: MatchingWords, matching_words: MatchingWords,
tokenizer: Tokenizer<'a, A>, tokenizer: Tokenizer<'a, A>,
@@ -100,6 +101,8 @@ pub struct MatchBounds {
pub length: usize, pub length: usize,
} }
/// Structure used to analize a string, compute words that match,
/// and format the source string, returning a highlighted and cropped sub-string.
pub struct Matcher<'t, 'm, A> { pub struct Matcher<'t, 'm, A> {
text: &'t str, text: &'t str,
matching_words: &'m MatchingWords, matching_words: &'m MatchingWords,
@@ -113,6 +116,8 @@ pub struct Matcher<'t, 'm, A> {
impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
/// Iterates over tokens and save any of them that matches the query. /// Iterates over tokens and save any of them that matches the query.
fn compute_matches(&mut self) -> &mut Self { fn compute_matches(&mut self) -> &mut Self {
/// some words are counted as matches only if they are close together and in the good order,
/// compute_partial_match peek into next words to validate if the match is complete.
fn compute_partial_match<'a>( fn compute_partial_match<'a>(
mut partial: PartialMatch, mut partial: PartialMatch,
token_position: usize, token_position: usize,
@@ -246,9 +251,14 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
// matches needs to be counted in the crop len. // matches needs to be counted in the crop len.
let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
// create the initial state of the crop window: 2 iterators starting from the matches positions,
// a reverse iterator starting from the first match token position and going towards the beginning of the text,
let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
// an iterator starting from the last match token position and going towards the end of the text.
let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
// grows the crop window peeking in both directions
// until the window contains the good number of words:
while remaining_words > 0 { while remaining_words > 0 {
let before_token = before_tokens.peek().map(|t| t.separator_kind()); let before_token = before_tokens.peek().map(|t| t.separator_kind());
let after_token = after_tokens.peek().map(|t| t.separator_kind()); let after_token = after_tokens.peek().map(|t| t.separator_kind());
@@ -315,6 +325,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
} }
} }
// finally, keep the byte index of each bound of the crop window.
let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
@@ -353,7 +364,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
(uniq_score, distance_score, order_score) (uniq_score, distance_score, order_score)
} }
/// Returns the matches interval where the score computed by match_interval_score is maximal. /// Returns the matches interval where the score computed by match_interval_score is the best.
fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
// we compute the matches interval if we have at least 2 matches. // we compute the matches interval if we have at least 2 matches.
if matches.len() > 1 { if matches.len() > 1 {
@@ -408,6 +419,8 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
} else { } else {
match &self.matches { match &self.matches {
Some((tokens, matches)) => { Some((tokens, matches)) => {
// If the text has to be cropped,
// compute the best interval to crop around.
let matches = match format_options.crop { let matches = match format_options.crop {
Some(crop_size) if crop_size > 0 => { Some(crop_size) if crop_size > 0 => {
self.find_best_match_interval(matches, crop_size) self.find_best_match_interval(matches, crop_size)
@@ -415,6 +428,8 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
_ => matches, _ => matches,
}; };
// If the text has to be cropped,
// crop around the best interval.
let (byte_start, byte_end) = match format_options.crop { let (byte_start, byte_end) = match format_options.crop {
Some(crop_size) if crop_size > 0 => { Some(crop_size) if crop_size > 0 => {
self.crop_bounds(tokens, matches, crop_size) self.crop_bounds(tokens, matches, crop_size)