mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 01:46:28 +00:00 
			
		
		
		
	Matching words fixes
This commit is contained in:
		@@ -1,8 +1,6 @@
 | 
			
		||||
// #[cfg(test)]
 | 
			
		||||
pub mod detailed;
 | 
			
		||||
 | 
			
		||||
pub mod test_logger;
 | 
			
		||||
 | 
			
		||||
use roaring::RoaringBitmap;
 | 
			
		||||
 | 
			
		||||
use super::interner::{Interned, MappedInterner};
 | 
			
		||||
 
 | 
			
		||||
@@ -5,9 +5,7 @@ use std::ops::RangeInclusive;
 | 
			
		||||
use charabia::Token;
 | 
			
		||||
 | 
			
		||||
use super::super::interner::Interned;
 | 
			
		||||
use super::super::query_term::{
 | 
			
		||||
    Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm,
 | 
			
		||||
};
 | 
			
		||||
use super::super::query_term::LocatedQueryTerm;
 | 
			
		||||
use super::super::{DedupInterner, Phrase};
 | 
			
		||||
use crate::SearchContext;
 | 
			
		||||
 | 
			
		||||
@@ -33,68 +31,16 @@ pub struct MatchingWords {
 | 
			
		||||
    words: Vec<LocatedMatchingWords>,
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// Extract and centralize the different phrases and words to match stored in a QueryTerm.
 | 
			
		||||
fn extract_matching_terms(term: &QueryTerm) -> (Vec<Interned<Phrase>>, Vec<Interned<String>>) {
 | 
			
		||||
    let mut matching_words = Vec::new();
 | 
			
		||||
    let mut matching_phrases = Vec::new();
 | 
			
		||||
 | 
			
		||||
    // the structure is exhaustively extracted to ensure that no field is missing.
 | 
			
		||||
    let QueryTerm {
 | 
			
		||||
        original: _,
 | 
			
		||||
        is_multiple_words: _,
 | 
			
		||||
        max_nbr_typos: _,
 | 
			
		||||
        is_prefix: _,
 | 
			
		||||
        zero_typo,
 | 
			
		||||
        one_typo,
 | 
			
		||||
        two_typo,
 | 
			
		||||
    } = term;
 | 
			
		||||
 | 
			
		||||
    // the structure is exhaustively extracted to ensure that no field is missing.
 | 
			
		||||
    let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo;
 | 
			
		||||
 | 
			
		||||
    // zero typo
 | 
			
		||||
    if let Some(phrase) = phrase {
 | 
			
		||||
        matching_phrases.push(*phrase);
 | 
			
		||||
    }
 | 
			
		||||
    if let Some(zero_typo) = zero_typo {
 | 
			
		||||
        matching_words.push(*zero_typo);
 | 
			
		||||
    }
 | 
			
		||||
    for synonym in synonyms {
 | 
			
		||||
        matching_phrases.push(*synonym);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // one typo
 | 
			
		||||
    // the structure is exhaustively extracted to ensure that no field is missing.
 | 
			
		||||
    if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo {
 | 
			
		||||
        if let Some(split_words) = split_words {
 | 
			
		||||
            matching_phrases.push(*split_words);
 | 
			
		||||
        }
 | 
			
		||||
        for one_typo in one_typo {
 | 
			
		||||
            matching_words.push(*one_typo);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    // two typos
 | 
			
		||||
    // the structure is exhaustively extracted to ensure that no field is missing.
 | 
			
		||||
    if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo {
 | 
			
		||||
        for two_typos in two_typos {
 | 
			
		||||
            matching_words.push(*two_typos);
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    (matching_phrases, matching_words)
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl MatchingWords {
 | 
			
		||||
    pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
 | 
			
		||||
        let mut phrases = Vec::new();
 | 
			
		||||
        let mut words = Vec::new();
 | 
			
		||||
 | 
			
		||||
        // Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms
 | 
			
		||||
        // Extract and centralize the different phrases and words to match stored in a QueryTerm
 | 
			
		||||
        // and wrap them in dedicated structures.
 | 
			
		||||
        for located_term in located_terms {
 | 
			
		||||
            let term = ctx.term_interner.get(located_term.value);
 | 
			
		||||
            let (matching_phrases, matching_words) = extract_matching_terms(term);
 | 
			
		||||
            let (matching_words, matching_phrases) = term.all_computed_derivations();
 | 
			
		||||
 | 
			
		||||
            for matching_phrase in matching_phrases {
 | 
			
		||||
                phrases.push(LocatedMatchingPhrase {
 | 
			
		||||
@@ -106,8 +52,8 @@ impl MatchingWords {
 | 
			
		||||
            words.push(LocatedMatchingWords {
 | 
			
		||||
                value: matching_words,
 | 
			
		||||
                positions: located_term.positions.clone(),
 | 
			
		||||
                is_prefix: term.is_prefix,
 | 
			
		||||
                original_char_count: ctx.word_interner.get(term.original).chars().count(),
 | 
			
		||||
                is_prefix: term.is_cached_prefix(),
 | 
			
		||||
                original_char_count: term.original_word(&ctx).chars().count(),
 | 
			
		||||
            });
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -137,7 +137,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
 | 
			
		||||
                    }
 | 
			
		||||
                    // partial match is now full, we keep this matches and we advance positions
 | 
			
		||||
                    Some(MatchType::Full { char_len, ids }) => {
 | 
			
		||||
                        let ids: Vec<_> = ids.clone().into_iter().collect();
 | 
			
		||||
                        let ids: Vec<_> = ids.clone().collect();
 | 
			
		||||
                        // save previously matched tokens as matches.
 | 
			
		||||
                        let iter = potential_matches.into_iter().map(
 | 
			
		||||
                            |(token_position, word_position, match_len)| Match {
 | 
			
		||||
@@ -192,7 +192,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
 | 
			
		||||
                    // we match, we save the current token as a match,
 | 
			
		||||
                    // then we continue the rest of the tokens.
 | 
			
		||||
                    MatchType::Full { char_len, ids } => {
 | 
			
		||||
                        let ids: Vec<_> = ids.clone().into_iter().collect();
 | 
			
		||||
                        let ids: Vec<_> = ids.clone().collect();
 | 
			
		||||
                        matches.push(Match {
 | 
			
		||||
                            match_len: char_len,
 | 
			
		||||
                            ids,
 | 
			
		||||
 
 | 
			
		||||
@@ -35,20 +35,20 @@ pub use logger::detailed::DetailedSearchLogger;
 | 
			
		||||
pub use logger::{DefaultSearchLogger, SearchLogger};
 | 
			
		||||
use query_graph::{QueryGraph, QueryNode};
 | 
			
		||||
use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm};
 | 
			
		||||
use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
 | 
			
		||||
use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
 | 
			
		||||
use resolve_query_graph::PhraseDocIdsCache;
 | 
			
		||||
use roaring::RoaringBitmap;
 | 
			
		||||
use words::Words;
 | 
			
		||||
 | 
			
		||||
use self::bucket_sort::BucketSortOutput;
 | 
			
		||||
use self::exact_attribute::ExactAttribute;
 | 
			
		||||
use self::graph_based_ranking_rule::Exactness;
 | 
			
		||||
use self::interner::Interner;
 | 
			
		||||
use self::ranking_rules::{BoxRankingRule, RankingRule};
 | 
			
		||||
use self::resolve_query_graph::compute_query_graph_docids;
 | 
			
		||||
use self::sort::Sort;
 | 
			
		||||
use crate::search::new::distinct::apply_distinct_rule;
 | 
			
		||||
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
 | 
			
		||||
use bucket_sort::BucketSortOutput;
 | 
			
		||||
use exact_attribute::ExactAttribute;
 | 
			
		||||
use graph_based_ranking_rule::Exactness;
 | 
			
		||||
use interner::Interner;
 | 
			
		||||
use ranking_rules::{BoxRankingRule, RankingRule};
 | 
			
		||||
use resolve_query_graph::compute_query_graph_docids;
 | 
			
		||||
use sort::Sort;
 | 
			
		||||
 | 
			
		||||
/// A structure used throughout the execution of a search query.
 | 
			
		||||
pub struct SearchContext<'ctx> {
 | 
			
		||||
@@ -361,6 +361,7 @@ pub fn execute_search(
 | 
			
		||||
    Ok(PartialSearchResult {
 | 
			
		||||
        candidates: all_candidates,
 | 
			
		||||
        documents_ids: docids,
 | 
			
		||||
        located_query_terms,
 | 
			
		||||
    })
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
 
 | 
			
		||||
@@ -188,17 +188,35 @@ impl QueryTermSubset {
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        let original = ctx.term_interner.get_mut(self.original);
 | 
			
		||||
        if !self.zero_typo_subset.is_empty() {
 | 
			
		||||
            let ZeroTypoTerm {
 | 
			
		||||
                phrase: _,
 | 
			
		||||
                exact: zero_typo,
 | 
			
		||||
                prefix_of,
 | 
			
		||||
                synonyms: _,
 | 
			
		||||
                use_prefix_db: _,
 | 
			
		||||
            } = &original.zero_typo;
 | 
			
		||||
            result.extend(zero_typo.iter().copied());
 | 
			
		||||
            result.extend(prefix_of.iter().copied());
 | 
			
		||||
        };
 | 
			
		||||
        match &self.zero_typo_subset {
 | 
			
		||||
            NTypoTermSubset::All => {
 | 
			
		||||
                let ZeroTypoTerm {
 | 
			
		||||
                    phrase: _,
 | 
			
		||||
                    exact: zero_typo,
 | 
			
		||||
                    prefix_of,
 | 
			
		||||
                    synonyms: _,
 | 
			
		||||
                    use_prefix_db: _,
 | 
			
		||||
                } = &original.zero_typo;
 | 
			
		||||
                result.extend(zero_typo.iter().copied());
 | 
			
		||||
                result.extend(prefix_of.iter().copied());
 | 
			
		||||
            }
 | 
			
		||||
            NTypoTermSubset::Subset { words, phrases: _ } => {
 | 
			
		||||
                let ZeroTypoTerm {
 | 
			
		||||
                    phrase: _,
 | 
			
		||||
                    exact: zero_typo,
 | 
			
		||||
                    prefix_of,
 | 
			
		||||
                    synonyms: _,
 | 
			
		||||
                    use_prefix_db: _,
 | 
			
		||||
                } = &original.zero_typo;
 | 
			
		||||
                if let Some(zero_typo) = zero_typo {
 | 
			
		||||
                    if words.contains(zero_typo) {
 | 
			
		||||
                        result.insert(*zero_typo);
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
                result.extend(prefix_of.intersection(words).copied());
 | 
			
		||||
            }
 | 
			
		||||
            NTypoTermSubset::Nothing => {}
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        match &self.one_typo_subset {
 | 
			
		||||
            NTypoTermSubset::All => {
 | 
			
		||||
@@ -248,11 +266,24 @@ impl QueryTermSubset {
 | 
			
		||||
        result.extend(phrase.iter().copied());
 | 
			
		||||
        result.extend(synonyms.iter().copied());
 | 
			
		||||
 | 
			
		||||
        if !self.one_typo_subset.is_empty() {
 | 
			
		||||
            let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
 | 
			
		||||
                panic!();
 | 
			
		||||
            };
 | 
			
		||||
            result.extend(split_words.iter().copied());
 | 
			
		||||
        match &self.one_typo_subset {
 | 
			
		||||
            NTypoTermSubset::All => {
 | 
			
		||||
                let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
 | 
			
		||||
                    panic!();
 | 
			
		||||
                };
 | 
			
		||||
                result.extend(split_words.iter().copied());
 | 
			
		||||
            }
 | 
			
		||||
            NTypoTermSubset::Subset { phrases, .. } => {
 | 
			
		||||
                let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
 | 
			
		||||
                    panic!();
 | 
			
		||||
                };
 | 
			
		||||
                if let Some(split_words) = split_words {
 | 
			
		||||
                    if phrases.contains(split_words) {
 | 
			
		||||
                        result.insert(*split_words);
 | 
			
		||||
                    }
 | 
			
		||||
                }
 | 
			
		||||
            }
 | 
			
		||||
            NTypoTermSubset::Nothing => {}
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        Ok(result)
 | 
			
		||||
@@ -368,3 +399,34 @@ impl LocatedQueryTerm {
 | 
			
		||||
        interner.get(self.value).is_empty()
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl QueryTerm {
 | 
			
		||||
    pub fn is_cached_prefix(&self) -> bool {
 | 
			
		||||
        self.zero_typo.use_prefix_db.is_some()
 | 
			
		||||
    }
 | 
			
		||||
    pub fn original_word(&self, ctx: &SearchContext) -> String {
 | 
			
		||||
        ctx.word_interner.get(self.original).clone()
 | 
			
		||||
    }
 | 
			
		||||
    pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
 | 
			
		||||
        let mut words = BTreeSet::new();
 | 
			
		||||
        let mut phrases = BTreeSet::new();
 | 
			
		||||
 | 
			
		||||
        let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
 | 
			
		||||
            &self.zero_typo;
 | 
			
		||||
        words.extend(zero_typo.iter().copied());
 | 
			
		||||
        words.extend(prefix_of.iter().copied());
 | 
			
		||||
        phrases.extend(phrase.iter().copied());
 | 
			
		||||
        phrases.extend(synonyms.iter().copied());
 | 
			
		||||
 | 
			
		||||
        if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = &self.one_typo {
 | 
			
		||||
            words.extend(one_typo.iter().copied());
 | 
			
		||||
            phrases.extend(split_words.iter().copied());
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        if let Lazy::Init(TwoTypoTerm { two_typos }) = &self.two_typo {
 | 
			
		||||
            words.extend(two_typos.iter().copied());
 | 
			
		||||
        };
 | 
			
		||||
 | 
			
		||||
        (words.into_iter().collect(), phrases.into_iter().collect())
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user