mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-11-04 01:46:28 +00:00 
			
		
		
		
	Enhance word splitting strategy
This commit is contained in:
		
							
								
								
									
										53
									
								
								milli/src/search/query_tree.rs
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										53
									
								
								milli/src/search/query_tree.rs
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							@@ -1,6 +1,6 @@
 | 
			
		||||
use std::borrow::Cow;
 | 
			
		||||
use std::cmp::max;
 | 
			
		||||
use std::{cmp, fmt, mem};
 | 
			
		||||
use std::{fmt, mem};
 | 
			
		||||
 | 
			
		||||
use charabia::classifier::ClassifiedTokenIter;
 | 
			
		||||
use charabia::{SeparatorKind, TokenKind};
 | 
			
		||||
@@ -10,7 +10,7 @@ use slice_group_by::GroupBy;
 | 
			
		||||
 | 
			
		||||
use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
 | 
			
		||||
use crate::search::TermsMatchingStrategy;
 | 
			
		||||
use crate::{Index, MatchingWords, Result};
 | 
			
		||||
use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result};
 | 
			
		||||
 | 
			
		||||
type IsOptionalWord = bool;
 | 
			
		||||
type IsPrefix = bool;
 | 
			
		||||
@@ -146,6 +146,7 @@ impl fmt::Debug for Query {
 | 
			
		||||
 | 
			
		||||
trait Context {
 | 
			
		||||
    fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
 | 
			
		||||
    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
 | 
			
		||||
    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
 | 
			
		||||
    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
 | 
			
		||||
        match self.word_docids(word)? {
 | 
			
		||||
@@ -156,6 +157,12 @@ trait Context {
 | 
			
		||||
    /// Returns the minimum word len for 1 and 2 typos.
 | 
			
		||||
    fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
 | 
			
		||||
    fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
 | 
			
		||||
    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
 | 
			
		||||
        match self.word_pair_proximity_docids(right_word, left_word, proximity)? {
 | 
			
		||||
            Some(rb) => Ok(Some(rb.len())),
 | 
			
		||||
            None => Ok(None),
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/// The query tree builder is the interface to build a query tree.
 | 
			
		||||
@@ -173,6 +180,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
 | 
			
		||||
        self.index.word_docids.get(self.rtxn, word)
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
 | 
			
		||||
        self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity))
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
 | 
			
		||||
        self.index.words_synonyms(self.rtxn, words)
 | 
			
		||||
    }
 | 
			
		||||
@@ -190,6 +201,11 @@ impl<'a> Context for QueryTreeBuilder<'a> {
 | 
			
		||||
    fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
 | 
			
		||||
        self.exact_words.as_ref()
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> {
 | 
			
		||||
        let key = (left_word, right_word, proximity);
 | 
			
		||||
        self.index.word_pair_proximity_docids.remap_data_type::<CboRoaringBitmapLenCodec>().get(&self.rtxn, &key)
 | 
			
		||||
    }
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
impl<'a> QueryTreeBuilder<'a> {
 | 
			
		||||
@@ -274,12 +290,10 @@ fn split_best_frequency<'a>(
 | 
			
		||||
    for (i, _) in chars {
 | 
			
		||||
        let (left, right) = word.split_at(i);
 | 
			
		||||
 | 
			
		||||
        let left_freq = ctx.word_documents_count(left)?.unwrap_or(0);
 | 
			
		||||
        let right_freq = ctx.word_documents_count(right)?.unwrap_or(0);
 | 
			
		||||
        let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0);
 | 
			
		||||
 | 
			
		||||
        let min_freq = cmp::min(left_freq, right_freq);
 | 
			
		||||
        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
 | 
			
		||||
            best = Some((min_freq, left, right));
 | 
			
		||||
        if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) {
 | 
			
		||||
            best = Some((pair_freq, left, right));
 | 
			
		||||
        }
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
@@ -824,6 +838,11 @@ mod test {
 | 
			
		||||
            Ok(self.postings.get(word).cloned())
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, _: u8) -> heed::Result<Option<RoaringBitmap>> {
 | 
			
		||||
            let bitmap = self.postings.get(&format!("{} {}", left_word, right_word));
 | 
			
		||||
            Ok(bitmap.cloned())
 | 
			
		||||
        }
 | 
			
		||||
 | 
			
		||||
        fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
 | 
			
		||||
            let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
 | 
			
		||||
            Ok(self.synonyms.get(&words).cloned())
 | 
			
		||||
@@ -894,6 +913,9 @@ mod test {
 | 
			
		||||
                    String::from("this")            => random_postings(rng, 50_000),
 | 
			
		||||
                    String::from("good")            => random_postings(rng,   1250),
 | 
			
		||||
                    String::from("morning")         => random_postings(rng,    125),
 | 
			
		||||
                    String::from("word split")      => random_postings(rng,   5000),
 | 
			
		||||
                    String::from("quick brownfox")  => random_postings(rng,   7000),
 | 
			
		||||
                    String::from("quickbrown fox")  => random_postings(rng,   8000),
 | 
			
		||||
                },
 | 
			
		||||
                exact_words,
 | 
			
		||||
            }
 | 
			
		||||
@@ -1041,6 +1063,23 @@ mod test {
 | 
			
		||||
        "###);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    #[test]
 | 
			
		||||
    fn word_split_choose_pair_with_max_freq() {
 | 
			
		||||
        let query = "quickbrownfox";
 | 
			
		||||
        let tokens = query.tokenize();
 | 
			
		||||
 | 
			
		||||
        let (query_tree, _) = TestContext::default()
 | 
			
		||||
            .build(TermsMatchingStrategy::All, true, None, tokens)
 | 
			
		||||
            .unwrap()
 | 
			
		||||
            .unwrap();
 | 
			
		||||
 | 
			
		||||
        insta::assert_debug_snapshot!(query_tree, @r###"
 | 
			
		||||
        OR
 | 
			
		||||
          PHRASE ["quickbrown", "fox"]
 | 
			
		||||
          PrefixTolerant { word: "quickbrownfox", max typo: 2 }
 | 
			
		||||
        "###);
 | 
			
		||||
    }
 | 
			
		||||
 | 
			
		||||
    #[test]
 | 
			
		||||
    fn phrase() {
 | 
			
		||||
        let query = "\"hey friends\" \" \" \"wooop";
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user