Integrate the stop_words in the querytree

Remove the stop_words from the querytree, except when the word is a prefix or its query tolerates typos
This commit is contained in:
tamo
2021-03-31 14:41:22 +02:00
parent a2f46029c7
commit 12fb509d84

View File

@@ -1,6 +1,7 @@
use std::collections::HashSet; use std::collections::HashSet;
use std::{fmt, cmp, mem}; use std::{fmt, cmp, mem};
use fst::Set;
use levenshtein_automata::{DFA, Distance}; use levenshtein_automata::{DFA, Distance};
use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@@ -154,6 +155,10 @@ impl fmt::Debug for Query {
trait Context { trait Context {
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>; fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>>;
/// Tells whether `word` belongs to the stop-word set exposed by `stop_words`.
///
/// Yields `Ok(false)` when `stop_words` reports no set at all, and forwards
/// any error returned by `stop_words`.
fn is_stop_word(&self, word: &str) -> anyhow::Result<bool> {
    match self.stop_words()? {
        Some(set) => Ok(set.contains(word)),
        None => Ok(false),
    }
}
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>; fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> { fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
match self.word_docids(word)? { match self.word_docids(word)? {
@@ -183,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> { fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
Ok(None) Ok(None)
} }
/// Returns the optional stop-word set by delegating to `self.index.stop_words`,
/// passing along this builder's `rtxn`.
fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
self.index.stop_words(self.rtxn)
}
} }
impl<'a> QueryTreeBuilder<'a> { impl<'a> QueryTreeBuilder<'a> {
@@ -331,8 +340,7 @@ fn create_query_tree(
optional_words: bool, optional_words: bool,
authorize_typos: bool, authorize_typos: bool,
query: PrimitiveQuery, query: PrimitiveQuery,
) -> anyhow::Result<Operation> ) -> anyhow::Result<Operation> {
{
/// Matches on the `PrimitiveQueryPart` and create an operation from it. /// Matches on the `PrimitiveQueryPart` and create an operation from it.
fn resolve_primitive_part( fn resolve_primitive_part(
ctx: &impl Context, ctx: &impl Context,
@@ -350,7 +358,12 @@ fn create_query_tree(
if let Some(child) = split_best_frequency(ctx, &word)? { if let Some(child) = split_best_frequency(ctx, &word)? {
children.push(child); children.push(child);
} }
children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
let is_stop_word = ctx.is_stop_word(&word)?;
let query = Query { prefix, kind: typos(word, authorize_typos) };
if query.prefix || query.kind.is_tolerant() || !is_stop_word {
children.push(Operation::Query(query));
}
Ok(Operation::or(false, children)) Ok(Operation::or(false, children))
}, },
// create a CONSECUTIVE operation wrapping all word in the phrase // create a CONSECUTIVE operation wrapping all word in the phrase
@@ -365,12 +378,11 @@ fn create_query_tree(
ctx: &impl Context, ctx: &impl Context,
authorize_typos: bool, authorize_typos: bool,
query: &[PrimitiveQueryPart], query: &[PrimitiveQueryPart],
) -> anyhow::Result<Operation> ) -> anyhow::Result<Operation> {
{
const MAX_NGRAM: usize = 3; const MAX_NGRAM: usize = 3;
let mut op_children = Vec::new(); let mut op_children = Vec::new();
for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) { for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
let mut or_op_children = Vec::new(); let mut or_op_children = Vec::new();
for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
@@ -381,23 +393,31 @@ fn create_query_tree(
match group { match group {
[part] => { [part] => {
let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?; let operation =
resolve_primitive_part(ctx, authorize_typos, part.clone())?;
and_op_children.push(operation); and_op_children.push(operation);
}, }
words => { words => {
let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false); let is_prefix = words.last().map_or(false, |part| part.is_prefix());
let words: Vec<_> = words.iter().filter_map(| part| { let words: Vec<_> = words
if let PrimitiveQueryPart::Word(word, _) = part { .iter()
Some(word.as_str()) .filter_map(|part| {
} else { if let PrimitiveQueryPart::Word(word, _) = part {
None Some(word.as_str())
} } else {
}).collect(); None
}
})
.collect();
let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
let concat = words.concat(); let concat = words.concat();
let is_stop_word = ctx.is_stop_word(&concat)?;
let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) }; let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
operations.push(Operation::Query(query)); if query.prefix || query.kind.is_tolerant() || !is_stop_word {
and_op_children.push(Operation::or(false, operations)); operations.push(Operation::Query(query));
and_op_children.push(Operation::or(false, operations));
}
} }
} }
@@ -581,6 +601,10 @@ mod test {
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
Ok(self.synonyms.get(&words).cloned()) Ok(self.synonyms.get(&words).cloned())
} }
/// Test stub: always succeeds with `None`, i.e. no stop-word set is provided.
fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
Ok(None)
}
} }
impl Default for TestContext { impl Default for TestContext {