mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Integrate the stop_words into the query tree
Remove stop words from the query tree, except when the word is a prefix or is typo-tolerant.
This commit is contained in:
		| @@ -1,6 +1,7 @@ | |||||||
| use std::collections::HashSet; | use std::collections::HashSet; | ||||||
| use std::{fmt, cmp, mem}; | use std::{fmt, cmp, mem}; | ||||||
|  |  | ||||||
|  | use fst::Set; | ||||||
| use levenshtein_automata::{DFA, Distance}; | use levenshtein_automata::{DFA, Distance}; | ||||||
| use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; | use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| @@ -154,6 +155,10 @@ impl fmt::Debug for Query { | |||||||
|  |  | ||||||
| trait Context { | trait Context { | ||||||
|     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; |     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; | ||||||
|  |     fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>>; | ||||||
|  |     fn is_stop_word(&self, word: &str) -> anyhow::Result<bool> { | ||||||
|  |         Ok(self.stop_words()?.map_or(false, |s| s.contains(word))) | ||||||
|  |     } | ||||||
|     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>; |     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>; | ||||||
|     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> { |     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> { | ||||||
|         match self.word_docids(word)? { |         match self.word_docids(word)? { | ||||||
| @@ -183,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> { | |||||||
|     fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> { |     fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> { | ||||||
|         Ok(None) |         Ok(None) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> { | ||||||
|  |         self.index.stop_words(self.rtxn) | ||||||
|  |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> QueryTreeBuilder<'a> { | impl<'a> QueryTreeBuilder<'a> { | ||||||
| @@ -331,8 +340,7 @@ fn create_query_tree( | |||||||
|     optional_words: bool, |     optional_words: bool, | ||||||
|     authorize_typos: bool, |     authorize_typos: bool, | ||||||
|     query: PrimitiveQuery, |     query: PrimitiveQuery, | ||||||
| ) -> anyhow::Result<Operation> | ) -> anyhow::Result<Operation> { | ||||||
| { |  | ||||||
|     /// Matches on the `PrimitiveQueryPart` and create an operation from it. |     /// Matches on the `PrimitiveQueryPart` and create an operation from it. | ||||||
|     fn resolve_primitive_part( |     fn resolve_primitive_part( | ||||||
|         ctx: &impl Context, |         ctx: &impl Context, | ||||||
| @@ -350,7 +358,12 @@ fn create_query_tree( | |||||||
|                 if let Some(child) = split_best_frequency(ctx, &word)? { |                 if let Some(child) = split_best_frequency(ctx, &word)? { | ||||||
|                     children.push(child); |                     children.push(child); | ||||||
|                 } |                 } | ||||||
|                 children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) })); |  | ||||||
|  |                 let is_stop_word = ctx.is_stop_word(&word)?; | ||||||
|  |                 let query = Query { prefix, kind: typos(word, authorize_typos) }; | ||||||
|  |                 if query.prefix || query.kind.is_tolerant() || !is_stop_word { | ||||||
|  |                     children.push(Operation::Query(query)); | ||||||
|  |                 } | ||||||
|                 Ok(Operation::or(false, children)) |                 Ok(Operation::or(false, children)) | ||||||
|             }, |             }, | ||||||
|             // create a CONSECUTIVE operation wrapping all word in the phrase |             // create a CONSECUTIVE operation wrapping all word in the phrase | ||||||
| @@ -365,12 +378,11 @@ fn create_query_tree( | |||||||
|         ctx: &impl Context, |         ctx: &impl Context, | ||||||
|         authorize_typos: bool, |         authorize_typos: bool, | ||||||
|         query: &[PrimitiveQueryPart], |         query: &[PrimitiveQueryPart], | ||||||
|     ) -> anyhow::Result<Operation> |     ) -> anyhow::Result<Operation> { | ||||||
|     { |  | ||||||
|         const MAX_NGRAM: usize = 3; |         const MAX_NGRAM: usize = 3; | ||||||
|         let mut op_children = Vec::new(); |         let mut op_children = Vec::new(); | ||||||
|  |  | ||||||
|         for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) { |         for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { | ||||||
|             let mut or_op_children = Vec::new(); |             let mut or_op_children = Vec::new(); | ||||||
|  |  | ||||||
|             for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { |             for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { | ||||||
| @@ -381,25 +393,33 @@ fn create_query_tree( | |||||||
|  |  | ||||||
|                     match group { |                     match group { | ||||||
|                         [part] => { |                         [part] => { | ||||||
|                             let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?; |                             let operation = | ||||||
|  |                                 resolve_primitive_part(ctx, authorize_typos, part.clone())?; | ||||||
|                             and_op_children.push(operation); |                             and_op_children.push(operation); | ||||||
|                         }, |                         } | ||||||
|                         words => { |                         words => { | ||||||
|                             let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false); |                             let is_prefix = words.last().map_or(false, |part| part.is_prefix()); | ||||||
|                             let words: Vec<_> = words.iter().filter_map(| part| { |                             let words: Vec<_> = words | ||||||
|  |                                 .iter() | ||||||
|  |                                 .filter_map(|part| { | ||||||
|                                     if let PrimitiveQueryPart::Word(word, _) = part { |                                     if let PrimitiveQueryPart::Word(word, _) = part { | ||||||
|                                         Some(word.as_str()) |                                         Some(word.as_str()) | ||||||
|                                     } else { |                                     } else { | ||||||
|                                         None |                                         None | ||||||
|                                     } |                                     } | ||||||
|                             }).collect(); |                                 }) | ||||||
|  |                                 .collect(); | ||||||
|                             let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); |                             let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); | ||||||
|                             let concat = words.concat(); |                             let concat = words.concat(); | ||||||
|  |  | ||||||
|  |                             let is_stop_word = ctx.is_stop_word(&concat)?; | ||||||
|                             let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; |                             let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; | ||||||
|  |                             if query.prefix || query.kind.is_tolerant() || !is_stop_word { | ||||||
|                                 operations.push(Operation::Query(query)); |                                 operations.push(Operation::Query(query)); | ||||||
|                                 and_op_children.push(Operation::or(false, operations)); |                                 and_op_children.push(Operation::or(false, operations)); | ||||||
|                             } |                             } | ||||||
|                         } |                         } | ||||||
|  |                     } | ||||||
|  |  | ||||||
|                     if !is_last { |                     if !is_last { | ||||||
|                         let ngrams = ngrams(ctx, authorize_typos, tail)?; |                         let ngrams = ngrams(ctx, authorize_typos, tail)?; | ||||||
| @@ -581,6 +601,10 @@ mod test { | |||||||
|             let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); |             let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); | ||||||
|             Ok(self.synonyms.get(&words).cloned()) |             Ok(self.synonyms.get(&words).cloned()) | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |         fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> { | ||||||
|  |             Ok(None) | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     impl Default for TestContext { |     impl Default for TestContext { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user