Fix match count

2025-07-28 01:01:00 +00:00 · 2022-04-04 18:56:59 +02:00
parent 56e0edd621
commit 3bb1e35ada
4 changed files with 469 additions and 231 deletions
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@ -8,7 +8,8 @@ use meilisearch_tokenizer::TokenKind;
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;

-use crate::{Index, Result};
+use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
+use crate::{Index, MatchingWords, Result};

 type IsOptionalWord = bool;
 type IsPrefix = bool;
@ -233,7 +234,10 @@ impl<'a> QueryTreeBuilder<'a> {
    /// - if `authorize_typos` is set to `false` the query tree will be generated
    ///   forcing all query words to match documents without any typo
    ///   (the criterion `typo` will be ignored)
-    pub fn build(&self, query: TokenStream) -> Result<Option<(Operation, PrimitiveQuery)>> {
+    pub fn build(
+        &self,
+        query: TokenStream,
+    ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
        let stop_words = self.index.stop_words(self.rtxn)?;
        let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
        if !primitive_query.is_empty() {
@ -243,7 +247,9 @@ impl<'a> QueryTreeBuilder<'a> {
                self.authorize_typos,
                &primitive_query,
            )?;
-            Ok(Some((qt, primitive_query)))
+            let matching_words =
+                create_matching_words(self, self.authorize_typos, &primitive_query)?;
+            Ok(Some((qt, primitive_query, matching_words)))
        } else {
            Ok(None)
        }
@ -251,7 +257,7 @@ impl<'a> QueryTreeBuilder<'a> {
 }

 /// Split the word depending on the frequency of subwords in the database documents.
-fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<Operation>> {
+fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<(String, String)>> {
    let chars = word.char_indices().skip(1);
    let mut best = None;

@ -267,7 +273,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
        }
    }

-    Ok(best.map(|(_, left, right)| Operation::Phrase(vec![left.to_string(), right.to_string()])))
+    Ok(best.map(|(_, left, right)| (left.to_string(), right.to_string())))
 }

 #[derive(Clone)]
@ -336,8 +342,8 @@ fn create_query_tree(
            // 4. wrap all in an OR operation
            PrimitiveQueryPart::Word(word, prefix) => {
                let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
-                if let Some(child) = split_best_frequency(ctx, &word)? {
-                    children.push(child);
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                    children.push(Operation::Phrase(vec![left, right]));
                }
                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
                let exact_words = ctx.exact_words()?;
@ -464,6 +470,154 @@ fn create_query_tree(
    }
 }

+/// Main function that matchings words used for crop and highlight.
+fn create_matching_words(
+    ctx: &impl Context,
+    authorize_typos: bool,
+    query: &[PrimitiveQueryPart],
+) -> Result<MatchingWords> {
+    /// Matches on the `PrimitiveQueryPart` and create matchings words from it.
+    fn resolve_primitive_part(
+        ctx: &impl Context,
+        authorize_typos: bool,
+        part: PrimitiveQueryPart,
+        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        id: PrimitiveWordId,
+    ) -> Result<()> {
+        match part {
+            // 1. try to split word in 2
+            // 2. try to fetch synonyms
+            PrimitiveQueryPart::Word(word, prefix) => {
+                if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
+                    for synonym in synonyms {
+                        let synonym = synonym
+                            .into_iter()
+                            .map(|syn| MatchingWord::new(syn.to_string(), 0, false))
+                            .collect();
+                        matching_words.push((synonym, vec![id]));
+                    }
+                }
+
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                    let left = MatchingWord::new(left, 0, false);
+                    let right = MatchingWord::new(right, 0, false);
+                    matching_words.push((vec![left, right], vec![id]));
+                }
+
+                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
+                let exact_words = ctx.exact_words()?;
+                let config =
+                    TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
+
+                let matching_word = match typos(word, authorize_typos, config) {
+                    QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
+                    QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
+                };
+                matching_words.push((vec![matching_word], vec![id]));
+            }
+            // create a CONSECUTIVE matchings words wrapping all word in the phrase
+            PrimitiveQueryPart::Phrase(words) => {
+                let ids: Vec<_> =
+                    (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
+                let words =
+                    words.into_iter().map(|w| MatchingWord::new(w.to_string(), 0, false)).collect();
+                matching_words.push((words, ids));
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Create all ngrams 1..=3 generating query tree branches.
+    fn ngrams(
+        ctx: &impl Context,
+        authorize_typos: bool,
+        query: &[PrimitiveQueryPart],
+        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        mut id: PrimitiveWordId,
+    ) -> Result<()> {
+        const MAX_NGRAM: usize = 3;
+
+        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
+            for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
+                if let Some(group) = sub_query.get(..ngram) {
+                    let tail = &sub_query[ngram..];
+                    let is_last = tail.is_empty();
+
+                    match group {
+                        [part] => {
+                            resolve_primitive_part(
+                                ctx,
+                                authorize_typos,
+                                part.clone(),
+                                matching_words,
+                                id,
+                            )?;
+                        }
+                        words => {
+                            let is_prefix = words.last().map_or(false, |part| part.is_prefix());
+                            let words: Vec<_> = words
+                                .iter()
+                                .filter_map(|part| {
+                                    if let PrimitiveQueryPart::Word(word, _) = part {
+                                        Some(word.as_str())
+                                    } else {
+                                        None
+                                    }
+                                })
+                                .collect();
+                            let ids: Vec<_> = (0..words.len())
+                                .into_iter()
+                                .map(|i| id + i as PrimitiveWordId)
+                                .collect();
+
+                            if let Some(synonyms) = ctx.synonyms(&words)? {
+                                for synonym in synonyms {
+                                    let synonym = synonym
+                                        .into_iter()
+                                        .map(|syn| MatchingWord::new(syn.to_string(), 0, false))
+                                        .collect();
+                                    matching_words.push((synonym, ids.clone()));
+                                }
+                            }
+                            let word = words.concat();
+                            let (word_len_one_typo, word_len_two_typo) =
+                                ctx.min_word_len_for_typo()?;
+                            let exact_words = ctx.exact_words()?;
+                            let config = TypoConfig {
+                                max_typos: 1,
+                                word_len_one_typo,
+                                word_len_two_typo,
+                                exact_words,
+                            };
+                            let matching_word = match typos(word, authorize_typos, config) {
+                                QueryKind::Exact { word, .. } => {
+                                    MatchingWord::new(word, 0, is_prefix)
+                                }
+                                QueryKind::Tolerant { typo, word } => {
+                                    MatchingWord::new(word, typo, is_prefix)
+                                }
+                            };
+                            matching_words.push((vec![matching_word], ids));
+                        }
+                    }
+
+                    if !is_last {
+                        ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?;
+                    }
+                }
+            }
+            id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::<PrimitiveWordId>();
+        }
+
+        Ok(())
+    }
+
+    let mut matching_words = Vec::new();
+    ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?;
+    Ok(MatchingWords::new(matching_words))
+}
+
 pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;

 #[derive(Debug, Clone)]
@ -480,6 +634,13 @@ impl PrimitiveQueryPart {
    fn is_prefix(&self) -> bool {
        matches!(self, Self::Word(_, is_prefix) if *is_prefix)
    }
+
+    fn len(&self) -> usize {
+        match self {
+            Self::Phrase(words) => words.len(),
+            Self::Word(_, _) => 1,
+        }
+    }
 }

 /// Create primitive query from tokenized query string,