[WIP] Fix phrase search containing stop words

Fixes #661 and meilisearch/meilisearch#2905
This commit is contained in:
Samyak S Sarnayak
2022-10-26 19:08:06 +05:30
parent 365f44c39b
commit 62816dddde
6 changed files with 43 additions and 16 deletions

View File

@ -579,6 +579,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
Phrase(words) => {
let queries = words
.iter()
.filter_map(|w| w.as_ref())
.map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
.collect();
vec![queries]

View File

@ -298,7 +298,7 @@ fn attribute_start_with_docids(
pos += 1;
}
Phrase(phrase) => {
for word in phrase {
for word in phrase.iter().filter_map(|w| w.as_ref()) {
let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc {
attribute_candidates_array.push(word_candidates);
@ -323,7 +323,7 @@ fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap {
/// One part of an exact query: either a phrase or a list of synonyms.
#[derive(Debug, Clone)]
pub enum ExactQueryPart {
// NOTE(review): the two `Phrase` lines below look like the before/after pair of a diff
// whose +/- markers were stripped; an enum cannot declare the same variant twice, so
// only one of them can exist in the real file — confirm which.
Phrase(Vec<String>),
/// Words of the phrase. An entry may be `None`; the `Phrase` arm shown just before this
/// enum iterates with `filter_map(|w| w.as_ref())`, i.e. it skips `None` entries.
/// Presumably `None` stands for a stop word dropped from the phrase (per the commit
/// title) — TODO confirm.
Phrase(Vec<Option<String>>),
/// Presumably the alternative words accepted for this query part — verify against the
/// code that builds and consumes this variant.
Synonyms(Vec<String>),
}

View File

@ -418,15 +418,21 @@ pub fn resolve_query_tree(
resolve_operation(ctx, query_tree, wdcache)
}
pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBitmap> {
pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option<String>]) -> Result<RoaringBitmap> {
let mut candidates = RoaringBitmap::new();
let mut first_iter = true;
let winsize = phrase.len().min(3);
for win in phrase.windows(winsize) {
// Get all the documents with the matching distance for each word pairs.
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
for (offset, s1) in win.iter().enumerate() {
for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
for (offset, s1) in win.iter().filter_map(|w| w.as_ref()).enumerate() {
for (dist, s2) in win.iter().skip(offset + 1).enumerate().filter_map(|(index, word)| {
if let Some(word) = word {
Some((index, word))
} else {
None
}
}) {
if dist == 0 {
match ctx.word_pair_proximity_docids(s1, s2, 1)? {
Some(m) => bitmaps.push(m),

View File

@ -188,9 +188,13 @@ fn resolve_candidates<'t>(
if proximity == 0 {
let most_left = words
.first()
.map(|o| o.as_ref())
.flatten()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let most_right = words
.last()
.map(|o| o.as_ref())
.flatten()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
match (most_left, most_right) {
@ -473,7 +477,7 @@ fn resolve_plane_sweep_candidates(
}
Phrase(words) => {
let mut groups_positions = Vec::with_capacity(words.len());
for word in words {
for word in words.iter().filter_map(|w| w.as_ref()) {
let positions = match words_positions.get(word) {
Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
None => return Ok(vec![]),

View File

@ -2,6 +2,7 @@ use std::borrow::Cow;
use std::collections::HashMap;
use std::mem::take;
use itertools::Itertools;
use log::debug;
use roaring::RoaringBitmap;
@ -259,8 +260,7 @@ fn resolve_candidates<'t>(
Phrase(words) => {
let mut candidates = RoaringBitmap::new();
let mut first_loop = true;
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
for (left, right) in words.iter().filter_map(|w| w.as_ref()).tuple_windows() {
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
if pair_docids.is_empty() {