664: Fix phrase search containing stop words r=ManyTheFish a=Samyak2

# Pull Request

This is a WIP draft PR that I created to let other potential contributors know that I'm working on this issue. I'll be completing it within a few hours of opening it.

## Related issue
Fixes #661 and works towards fixing meilisearch/meilisearch#2905

## What does this PR do?
- [x] Change Phrase Operation to use a `Vec<Option<String>>` instead of `Vec<String>` where `None` corresponds to a stop word
- [x] Update all other uses of phrase operation
- [x] Update `resolve_phrase`
- [x] Update `create_primitive_query`?
- [x] Add test

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?


Co-authored-by: Samyak S Sarnayak <samyak201@gmail.com>
Co-authored-by: Samyak Sarnayak <samyak201@gmail.com>
This commit is contained in:
bors[bot]
2022-10-29 13:42:52 +00:00
committed by GitHub
8 changed files with 156 additions and 58 deletions

View File

@ -579,6 +579,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
Phrase(words) => {
let queries = words
.iter()
.filter_map(|w| w.as_ref())
.map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }])
.collect();
vec![queries]

View File

@ -299,9 +299,11 @@ fn attribute_start_with_docids(
}
Phrase(phrase) => {
for word in phrase {
let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc {
attribute_candidates_array.push(word_candidates);
if let Some(word) = word {
let wc = ctx.word_position_docids(word, pos)?;
if let Some(word_candidates) = wc {
attribute_candidates_array.push(word_candidates);
}
}
pos += 1;
}
@ -323,7 +325,7 @@ fn intersection_of(mut rbs: Vec<&RoaringBitmap>) -> RoaringBitmap {
#[derive(Debug, Clone)]
pub enum ExactQueryPart {
Phrase(Vec<String>),
Phrase(Vec<Option<String>>),
Synonyms(Vec<String>),
}

View File

@ -418,15 +418,32 @@ pub fn resolve_query_tree(
resolve_operation(ctx, query_tree, wdcache)
}
pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result<RoaringBitmap> {
pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option<String>]) -> Result<RoaringBitmap> {
let mut candidates = RoaringBitmap::new();
let mut first_iter = true;
let winsize = phrase.len().min(3);
if phrase.is_empty() {
return Ok(candidates);
}
for win in phrase.windows(winsize) {
// Get all the documents with the matching distance for each word pairs.
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
for (offset, s1) in win.iter().enumerate() {
for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
for (offset, s1) in win.iter().enumerate().filter_map(|(index, word)| {
if let Some(word) = word {
Some((index, word))
} else {
None
}
}) {
for (dist, s2) in win.iter().skip(offset + 1).enumerate().filter_map(|(index, word)| {
if let Some(word) = word {
Some((index, word))
} else {
None
}
}) {
if dist == 0 {
match ctx.word_pair_proximity_docids(s1, s2, 1)? {
Some(m) => bitmaps.push(m),

View File

@ -4,6 +4,7 @@ use std::mem::take;
use log::debug;
use roaring::RoaringBitmap;
use slice_group_by::GroupBy;
use super::{
query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context,
@ -187,10 +188,15 @@ fn resolve_candidates<'t>(
Phrase(words) => {
if proximity == 0 {
let most_left = words
.first()
.iter()
.filter_map(|o| o.as_ref())
.next()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let most_right = words
.last()
.iter()
.rev()
.filter_map(|o| o.as_ref())
.next()
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
match (most_left, most_right) {
@ -473,14 +479,34 @@ fn resolve_plane_sweep_candidates(
}
Phrase(words) => {
let mut groups_positions = Vec::with_capacity(words.len());
for word in words {
let positions = match words_positions.get(word) {
Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
None => return Ok(vec![]),
};
groups_positions.push(positions);
// group stop_words together.
for words in words.linear_group_by_key(Option::is_none) {
// skip if it's a group of stop words.
if matches!(words.first(), None | Some(None)) {
continue;
}
// make a consecutive plane-sweep on the subgroup of words.
let mut subgroup = Vec::with_capacity(words.len());
for word in words.into_iter().map(|w| w.as_deref().unwrap()) {
match words_positions.get(word) {
Some(positions) => {
subgroup.push(positions.iter().map(|p| (p, 0, p)).collect())
}
None => return Ok(vec![]),
}
}
match subgroup.len() {
0 => {}
1 => groups_positions.push(subgroup.pop().unwrap()),
_ => groups_positions.push(plane_sweep(subgroup, true)?),
}
}
match groups_positions.len() {
0 => vec![],
1 => groups_positions.pop().unwrap(),
_ => plane_sweep(groups_positions, false)?,
}
plane_sweep(groups_positions, true)?
}
Or(_, ops) => {
let mut result = Vec::new();

View File

@ -9,6 +9,7 @@ use super::{
query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters,
CriterionResult,
};
use crate::search::criteria::resolve_phrase;
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
use crate::search::{word_derivations, WordDerivationsCache};
use crate::Result;
@ -256,27 +257,7 @@ fn resolve_candidates<'t>(
match query_tree {
And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache),
Phrase(words) => {
let mut candidates = RoaringBitmap::new();
let mut first_loop = true;
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
if pair_docids.is_empty() {
return Ok(RoaringBitmap::new());
} else if first_loop {
candidates = pair_docids;
first_loop = false;
} else {
candidates &= pair_docids;
}
}
None => return Ok(RoaringBitmap::new()),
}
}
Ok(candidates)
}
Phrase(words) => resolve_phrase(ctx, words),
Or(_, ops) => {
let mut candidates = RoaringBitmap::new();
for op in ops {