mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 21:46:27 +00:00 
			
		
		
		
	Enhance word splitting strategy
This commit is contained in:
		
							
								
								
									
										79
									
								
								milli/src/search/query_tree.rs
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										79
									
								
								milli/src/search/query_tree.rs
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							| @@ -1,6 +1,6 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::cmp::max; | ||||
| use std::{cmp, fmt, mem}; | ||||
| use std::{fmt, mem}; | ||||
|  | ||||
| use charabia::classifier::ClassifiedTokenIter; | ||||
| use charabia::{SeparatorKind, TokenKind}; | ||||
| @@ -10,7 +10,7 @@ use slice_group_by::GroupBy; | ||||
|  | ||||
| use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId}; | ||||
| use crate::search::TermsMatchingStrategy; | ||||
| use crate::{Index, MatchingWords, Result}; | ||||
| use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result}; | ||||
|  | ||||
| type IsOptionalWord = bool; | ||||
| type IsPrefix = bool; | ||||
| @@ -146,6 +146,7 @@ impl fmt::Debug for Query { | ||||
|  | ||||
| trait Context { | ||||
|     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; | ||||
|     fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>; | ||||
|     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>; | ||||
|     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> { | ||||
|         match self.word_docids(word)? { | ||||
| @@ -156,6 +157,12 @@ trait Context { | ||||
|     /// Returns the minimum word len for 1 and 2 typos. | ||||
|     fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; | ||||
|     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>; | ||||
|     fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> { | ||||
|         match self.word_pair_proximity_docids(right_word, left_word, proximity)? { | ||||
|             Some(rb) => Ok(Some(rb.len())), | ||||
|             None => Ok(None), | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// The query tree builder is the interface to build a query tree. | ||||
| @@ -173,6 +180,10 @@ impl<'a> Context for QueryTreeBuilder<'a> { | ||||
|         self.index.word_docids.get(self.rtxn, word) | ||||
|     } | ||||
|  | ||||
|     fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> { | ||||
|         self.index.word_pair_proximity_docids.get(self.rtxn, &(left_word, right_word, proximity)) | ||||
|     } | ||||
|  | ||||
|     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> { | ||||
|         self.index.words_synonyms(self.rtxn, words) | ||||
|     } | ||||
| @@ -190,6 +201,11 @@ impl<'a> Context for QueryTreeBuilder<'a> { | ||||
|     fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> { | ||||
|         self.exact_words.as_ref() | ||||
|     } | ||||
|  | ||||
|     fn word_pair_frequency(&self, left_word: &str, right_word: &str, proximity: u8) -> heed::Result<Option<u64>> { | ||||
|         let key = (left_word, right_word, proximity); | ||||
|         self.index.word_pair_proximity_docids.remap_data_type::<CboRoaringBitmapLenCodec>().get(&self.rtxn, &key) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> QueryTreeBuilder<'a> { | ||||
| @@ -274,12 +290,10 @@ fn split_best_frequency<'a>( | ||||
|     for (i, _) in chars { | ||||
|         let (left, right) = word.split_at(i); | ||||
|  | ||||
|         let left_freq = ctx.word_documents_count(left)?.unwrap_or(0); | ||||
|         let right_freq = ctx.word_documents_count(right)?.unwrap_or(0); | ||||
|         let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0); | ||||
|  | ||||
|         let min_freq = cmp::min(left_freq, right_freq); | ||||
|         if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { | ||||
|             best = Some((min_freq, left, right)); | ||||
|         if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) { | ||||
|             best = Some((pair_freq, left, right)); | ||||
|         } | ||||
|     } | ||||
|  | ||||
| @@ -824,6 +838,11 @@ mod test { | ||||
|             Ok(self.postings.get(word).cloned()) | ||||
|         } | ||||
|  | ||||
|         fn word_pair_proximity_docids(&self, right_word: &str, left_word: &str, _: u8) -> heed::Result<Option<RoaringBitmap>> { | ||||
|             let bitmap = self.postings.get(&format!("{} {}", left_word, right_word)); | ||||
|             Ok(bitmap.cloned()) | ||||
|         } | ||||
|  | ||||
|         fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> { | ||||
|             let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); | ||||
|             Ok(self.synonyms.get(&words).cloned()) | ||||
| @@ -881,19 +900,22 @@ mod test { | ||||
|                     ], | ||||
|                 }, | ||||
|                 postings: hashmap! { | ||||
|                     String::from("hello")      => random_postings(rng,   1500), | ||||
|                     String::from("hi")         => random_postings(rng,   4000), | ||||
|                     String::from("word")       => random_postings(rng,   2500), | ||||
|                     String::from("split")      => random_postings(rng,    400), | ||||
|                     String::from("ngrams")     => random_postings(rng,   1400), | ||||
|                     String::from("world")      => random_postings(rng, 15_000), | ||||
|                     String::from("earth")      => random_postings(rng,   8000), | ||||
|                     String::from("2021")       => random_postings(rng,    100), | ||||
|                     String::from("2020")       => random_postings(rng,    500), | ||||
|                     String::from("is")         => random_postings(rng, 50_000), | ||||
|                     String::from("this")       => random_postings(rng, 50_000), | ||||
|                     String::from("good")       => random_postings(rng,   1250), | ||||
|                     String::from("morning")    => random_postings(rng,    125), | ||||
|                     String::from("hello")           => random_postings(rng,   1500), | ||||
|                     String::from("hi")              => random_postings(rng,   4000), | ||||
|                     String::from("word")            => random_postings(rng,   2500), | ||||
|                     String::from("split")           => random_postings(rng,    400), | ||||
|                     String::from("ngrams")          => random_postings(rng,   1400), | ||||
|                     String::from("world")           => random_postings(rng, 15_000), | ||||
|                     String::from("earth")           => random_postings(rng,   8000), | ||||
|                     String::from("2021")            => random_postings(rng,    100), | ||||
|                     String::from("2020")            => random_postings(rng,    500), | ||||
|                     String::from("is")              => random_postings(rng, 50_000), | ||||
|                     String::from("this")            => random_postings(rng, 50_000), | ||||
|                     String::from("good")            => random_postings(rng,   1250), | ||||
|                     String::from("morning")         => random_postings(rng,    125), | ||||
|                     String::from("word split")      => random_postings(rng,   5000), | ||||
|                     String::from("quick brownfox")  => random_postings(rng,   7000), | ||||
|                     String::from("quickbrown fox")  => random_postings(rng,   8000), | ||||
|                 }, | ||||
|                 exact_words, | ||||
|             } | ||||
| @@ -1041,6 +1063,23 @@ mod test { | ||||
|         "###); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn word_split_choose_pair_with_max_freq() { | ||||
|         let query = "quickbrownfox"; | ||||
|         let tokens = query.tokenize(); | ||||
|  | ||||
|         let (query_tree, _) = TestContext::default() | ||||
|             .build(TermsMatchingStrategy::All, true, None, tokens) | ||||
|             .unwrap() | ||||
|             .unwrap(); | ||||
|  | ||||
|         insta::assert_debug_snapshot!(query_tree, @r###" | ||||
|         OR | ||||
|           PHRASE ["quickbrown", "fox"] | ||||
|           PrefixTolerant { word: "quickbrownfox", max typo: 2 } | ||||
|         "###); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn phrase() { | ||||
|         let query = "\"hey friends\" \" \" \"wooop"; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user