227: Replace Consecutive by Phrase in query tree r=Kerollmops a=ManyTheFish

Replace `Consecutive` by `Phrase` in the query tree in order to remove theoretical bugs,
due to the `Consecutive` enum type.

Co-authored-by: many <maxime@meilisearch.com>
Co-authored-by: Many <legendre.maxime.isn@gmail.com>
This commit is contained in:
bors[bot]
2021-06-10 09:31:39 +00:00
committed by GitHub
6 changed files with 130 additions and 104 deletions

View File

@@ -578,7 +578,6 @@ fn linear_compute_candidates(
fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
let mut min_rank = u64::max_value();
for branch in branches {
let branch_len = branch.len();
let mut branch_rank = Vec::with_capacity(branch_len);
for derivates in branch {
@@ -661,7 +660,7 @@ fn linear_compute_candidates(
// TODO can we keep refs of Query
fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
use crate::search::criteria::Operation::{And, Or, Consecutive};
use crate::search::criteria::Operation::{And, Or, Phrase};
fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree {
match tail.split_first() {
@@ -683,7 +682,7 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
fn recurse(op: &Operation) -> FlattenedQueryTree {
match op {
And(ops) | Consecutive(ops) => {
And(ops) => {
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))
},
Or(_, ops) => if ops.iter().all(|op| op.query().is_some()) {
@@ -691,6 +690,12 @@ fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
} else {
ops.iter().map(recurse).flatten().collect()
},
Phrase(words) => {
let queries = words.iter().map(|word| {
vec![Query {prefix: false, kind: QueryKind::exact(word.clone())}]
}).collect();
vec![queries]
}
Operation::Query(query) => vec![vec![vec![query.clone()]]],
}
}

View File

@@ -1,7 +1,6 @@
use std::collections::HashMap;
use std::borrow::Cow;
use anyhow::bail;
use roaring::RoaringBitmap;
use crate::{FieldId, TreeLevel, search::{word_derivations, WordDerivationsCache}};
@@ -239,7 +238,7 @@ pub fn resolve_query_tree<'t>(
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<RoaringBitmap>
{
use Operation::{And, Consecutive, Or, Query};
use Operation::{And, Phrase, Or, Query};
match query_tree {
And(ops) => {
@@ -261,26 +260,23 @@ pub fn resolve_query_tree<'t>(
}
Ok(candidates)
},
Consecutive(ops) => {
Phrase(words) => {
let mut candidates = RoaringBitmap::new();
let mut first_loop = true;
for slice in ops.windows(2) {
match (&slice[0], &slice[1]) {
(Operation::Query(left), Operation::Query(right)) => {
match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? {
pair_docids if pair_docids.is_empty() => {
return Ok(RoaringBitmap::new())
},
pair_docids if first_loop => {
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
if pair_docids.is_empty() {
return Ok(RoaringBitmap::new());
} else if first_loop {
candidates = pair_docids;
first_loop = false;
},
pair_docids => {
candidates.intersect_with(&pair_docids);
},
} else {
candidates &= pair_docids;
}
},
_ => bail!("invalid consecutive query type"),
None => return Ok(RoaringBitmap::new())
}
}
Ok(candidates)

View File

@@ -171,12 +171,33 @@ fn resolve_candidates<'t>(
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
{
use Operation::{And, Consecutive, Or, Query};
use Operation::{And, Phrase, Or};
let result = match query_tree {
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
Consecutive(ops) => if proximity == 0 {
mdfs(ctx, ops, 0, cache, wdcache)?
Phrase(words) => if proximity == 0 {
let most_left = words.first().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let most_right = words.last().map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
let mut candidates = None;
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
match candidates.as_mut() {
Some(candidates) => *candidates &= pair_docids,
None => candidates = Some(pair_docids),
}
},
None => {
candidates = None;
break;
}
}
}
match (most_left, most_right, candidates) {
(Some(l), Some(r), Some(c)) => vec![(l, r, c)],
_otherwise => Default::default(),
}
} else {
Default::default()
},
@@ -188,7 +209,7 @@ fn resolve_candidates<'t>(
}
output
},
Query(q) => if proximity == 0 {
Operation::Query(q) => if proximity == 0 {
let candidates = query_docids(ctx, q, wdcache)?;
vec![(q.clone(), q.clone(), candidates)]
} else {
@@ -306,14 +327,9 @@ fn resolve_plane_sweep_candidates(
) -> anyhow::Result<BTreeMap<u8, RoaringBitmap>>
{
/// FIXME may be buggy with query like "new new york"
fn plane_sweep<'a>(
ctx: &dyn Context,
operations: &'a [Operation],
docid: DocumentId,
fn plane_sweep(
groups_positions: Vec<Vec<(Position, u8, Position)>>,
consecutive: bool,
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>>
{
fn compute_groups_proximity(
@@ -362,13 +378,9 @@ fn resolve_plane_sweep_candidates(
}
}
let groups_len = operations.len();
let mut groups_positions = Vec::with_capacity(groups_len);
let groups_len = groups_positions.len();
for operation in operations {
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
groups_positions.push(positions.into_iter());
}
let mut groups_positions: Vec<_> = groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
// Pop top elements of each list.
let mut current = Vec::with_capacity(groups_len);
@@ -441,15 +453,32 @@ fn resolve_plane_sweep_candidates(
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>>
{
use Operation::{And, Consecutive, Or};
use Operation::{And, Phrase, Or};
if let Some(result) = rocache.get(query_tree) {
return Ok(result.clone());
}
let result = match query_tree {
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
And(ops) => {
let mut groups_positions = Vec::with_capacity(ops.len());
for operation in ops {
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
groups_positions.push(positions);
}
plane_sweep(groups_positions, false)?
},
Phrase(words) => {
let mut groups_positions = Vec::with_capacity(words.len());
for word in words {
let positions = match words_positions.get(word) {
Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
None => vec![],
};
groups_positions.push(positions);
}
plane_sweep(groups_positions, true)?
},
Or(_, ops) => {
let mut result = Vec::new();
for op in ops {

View File

@@ -1,6 +1,5 @@
use std::{borrow::Cow, collections::HashMap, mem::take};
use anyhow::bail;
use log::debug;
use roaring::RoaringBitmap;
@@ -13,7 +12,6 @@ use super::{
CriterionParameters,
CriterionResult,
query_docids,
query_pair_proximity_docids,
resolve_query_tree,
};
@@ -174,12 +172,14 @@ fn alterate_query_tree(
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<()>
{
use Operation::{And, Consecutive, Or};
use Operation::{And, Phrase, Or};
match operation {
And(ops) | Consecutive(ops) | Or(_, ops) => {
And(ops) | Or(_, ops) => {
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
},
// Because Phrases don't allow typos, no alteration can be done.
Phrase(_words) => return Ok(()),
Operation::Query(q) => {
if let QueryKind::Tolerant { typo, word } = &q.kind {
// if no typo is allowed we don't call word_derivations function,
@@ -228,32 +228,29 @@ fn resolve_candidates<'t>(
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<RoaringBitmap>
{
use Operation::{And, Consecutive, Or, Query};
use Operation::{And, Phrase, Or, Query};
match query_tree {
And(ops) => {
mdfs(ctx, ops, number_typos, cache, wdcache)
},
Consecutive(ops) => {
Phrase(words) => {
let mut candidates = RoaringBitmap::new();
let mut first_loop = true;
for slice in ops.windows(2) {
match (&slice[0], &slice[1]) {
(Operation::Query(left), Operation::Query(right)) => {
match query_pair_proximity_docids(ctx, left, right, 1, wdcache)? {
pair_docids if pair_docids.is_empty() => {
return Ok(RoaringBitmap::new())
},
pair_docids if first_loop => {
for slice in words.windows(2) {
let (left, right) = (&slice[0], &slice[1]);
match ctx.word_pair_proximity_docids(left, right, 1)? {
Some(pair_docids) => {
if pair_docids.is_empty() {
return Ok(RoaringBitmap::new());
} else if first_loop {
candidates = pair_docids;
first_loop = false;
},
pair_docids => {
candidates.intersect_with(&pair_docids);
},
} else {
candidates &= pair_docids;
}
},
_ => bail!("invalid consecutive query type"),
None => return Ok(RoaringBitmap::new())
}
}
Ok(candidates)

View File

@@ -52,13 +52,18 @@ impl MatchingWords {
fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
match tree {
Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
Operation::Or(_, ops) | Operation::And(ops) => {
ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
},
Operation::Query(Query { prefix, kind }) => {
let typo = if kind.is_exact() { 0 } else { kind.typo() };
out.insert((kind.word(), typo, *prefix));
},
Operation::Phrase(words) => {
for word in words {
out.insert((word, 0, false));
}
}
}
}

View File

@@ -15,7 +15,8 @@ type IsPrefix = bool;
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Operation {
And(Vec<Operation>),
Consecutive(Vec<Operation>),
// serie of consecutive non prefix and exact words
Phrase(Vec<String>),
Or(IsOptionalWord, Vec<Operation>),
Query(Query),
}
@@ -28,9 +29,8 @@ impl fmt::Debug for Operation {
writeln!(f, "{:1$}AND", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
},
Operation::Consecutive(children) => {
writeln!(f, "{:1$}CONSECUTIVE", "", depth * 2)?;
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
Operation::Phrase(children) => {
writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
},
Operation::Or(true, children) => {
writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
@@ -49,14 +49,6 @@ impl fmt::Debug for Operation {
}
impl Operation {
fn phrase(words: Vec<String>) -> Operation {
Operation::consecutive(
words.into_iter().map(|s| {
Operation::Query(Query { prefix: false, kind: QueryKind::exact(s) })
}).collect()
)
}
fn and(mut ops: Vec<Self>) -> Self {
if ops.len() == 1 {
ops.pop().unwrap()
@@ -73,11 +65,11 @@ impl Operation {
}
}
fn consecutive(mut ops: Vec<Self>) -> Self {
if ops.len() == 1 {
ops.pop().unwrap()
fn phrase(mut words: Vec<String>) -> Self {
if words.len() == 1 {
Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) })
} else {
Self::Consecutive(ops)
Self::Phrase(words)
}
}
@@ -256,10 +248,10 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
}
}
Ok(best.map(|(_, left, right)| Operation::Consecutive(
Ok(best.map(|(_, left, right)| Operation::Phrase(
vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact(left.to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact(right.to_string()) })
left.to_string(),
right.to_string()
]
)))
}
@@ -494,24 +486,26 @@ fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>, wo
/// Returns the maximum number of typos that this Operation allows.
pub fn maximum_typo(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Consecutive};
use Operation::{Or, And, Query, Phrase};
match operation {
Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
And(ops) | Consecutive(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
Query(q) => q.kind.typo() as usize,
// no typo allowed in phrases
Phrase(_) => 0,
}
}
/// Returns the maximum proximity that this Operation allows.
pub fn maximum_proximity(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Consecutive};
use Operation::{Or, And, Query, Phrase};
match operation {
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
And(ops) => {
ops.iter().map(maximum_proximity).sum::<usize>()
+ ops.len().saturating_sub(1) * 7
},
Query(_) | Consecutive(_) => 0,
Query(_) | Phrase(_) => 0,
}
}
@@ -765,9 +759,9 @@ mod test {
let expected = Operation::Or(false, vec![
Operation::And(vec![
Operation::Or(false, vec![
Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("word".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
Operation::Phrase(vec![
"word".to_string(),
"split".to_string(),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplit".to_string()) }),
]),
@@ -789,9 +783,9 @@ mod test {
let tokens = result.tokens();
let expected = Operation::And(vec![
Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
Operation::Phrase(vec![
"hey".to_string(),
"friends".to_string(),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
]);
@@ -809,13 +803,13 @@ mod test {
let tokens = result.tokens();
let expected = Operation::And(vec![
Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("friends".to_string()) }),
Operation::Phrase(vec![
"hey".to_string(),
"friends".to_string(),
]),
Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
Operation::Phrase(vec![
"wooop".to_string(),
"wooop".to_string(),
]),
]);
@@ -870,9 +864,9 @@ mod test {
let result = analyzer.analyze(query);
let tokens = result.tokens();
let expected = Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
let expected = Operation::Phrase(vec![
"hey".to_string(),
"my".to_string(),
]);
let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap();
@@ -940,9 +934,9 @@ mod test {
let tokens = result.tokens();
let expected = Operation::And(vec![
Operation::Consecutive(vec![
Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }),
Operation::Phrase(vec![
"hey".to_string(),
"my".to_string(),
]),
Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }),
]);