mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-26 00:01:00 +00:00
This bug is an old bug but was hidden by the proximity criterion, Phrase search were always returning an empty candidates list. Before the fix, we were trying to find any words[n] near words[n] instead of finding any words[n] near words[n+1], for example: for a phrase search '"Hello world"' we were searching for "hello" near "hello" first, instead of "hello" near "world".
585 lines
23 KiB
Rust
585 lines
23 KiB
Rust
use std::collections::btree_map::{self, BTreeMap};
|
|
use std::collections::hash_map::HashMap;
|
|
use std::mem::take;
|
|
|
|
use log::debug;
|
|
use roaring::RoaringBitmap;
|
|
|
|
use super::{
|
|
query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion,
|
|
CriterionParameters, CriterionResult,
|
|
};
|
|
use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind};
|
|
use crate::search::{build_dfa, WordDerivationsCache};
|
|
use crate::{Position, Result};
|
|
|
|
type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>;
|
|
|
|
/// Threshold on the number of candidates that will make
|
|
/// the system choose between one algorithm or another.
|
|
const CANDIDATES_THRESHOLD: u64 = 1000;
|
|
|
|
/// Threshold on the number of proximity that will make
|
|
/// the system choose between one algorithm or another.
|
|
const PROXIMITY_THRESHOLD: u8 = 0;
|
|
|
|
pub struct Proximity<'t> {
|
|
ctx: &'t dyn Context<'t>,
|
|
/// (max_proximity, query_tree, allowed_candidates)
|
|
state: Option<(u8, Operation, RoaringBitmap)>,
|
|
proximity: u8,
|
|
bucket_candidates: RoaringBitmap,
|
|
parent: Box<dyn Criterion + 't>,
|
|
candidates_cache: Cache,
|
|
plane_sweep_cache: Option<btree_map::IntoIter<u8, RoaringBitmap>>,
|
|
}
|
|
|
|
impl<'t> Proximity<'t> {
|
|
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
|
|
Proximity {
|
|
ctx,
|
|
state: None,
|
|
proximity: 0,
|
|
bucket_candidates: RoaringBitmap::new(),
|
|
parent,
|
|
candidates_cache: Cache::new(),
|
|
plane_sweep_cache: None,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl<'t> Criterion for Proximity<'t> {
|
|
#[logging_timer::time("Proximity::{}")]
|
|
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
|
|
// remove excluded candidates when next is called, instead of doing it in the loop.
|
|
if let Some((_, _, allowed_candidates)) = self.state.as_mut() {
|
|
*allowed_candidates -= params.excluded_candidates;
|
|
}
|
|
|
|
loop {
|
|
debug!(
|
|
"Proximity at iteration {} (max prox {:?}) ({:?})",
|
|
self.proximity,
|
|
self.state.as_ref().map(|(mp, _, _)| mp),
|
|
self.state.as_ref().map(|(_, _, cd)| cd),
|
|
);
|
|
|
|
match &mut self.state {
|
|
Some((max_prox, _, allowed_candidates))
|
|
if allowed_candidates.is_empty() || self.proximity > *max_prox =>
|
|
{
|
|
self.state = None; // reset state
|
|
}
|
|
Some((_, query_tree, allowed_candidates)) => {
|
|
let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD
|
|
&& self.proximity > PROXIMITY_THRESHOLD
|
|
{
|
|
if let Some(cache) = self.plane_sweep_cache.as_mut() {
|
|
match cache.next() {
|
|
Some((p, candidates)) => {
|
|
self.proximity = p;
|
|
candidates
|
|
}
|
|
None => {
|
|
self.state = None; // reset state
|
|
continue;
|
|
}
|
|
}
|
|
} else {
|
|
let cache = resolve_plane_sweep_candidates(
|
|
self.ctx,
|
|
query_tree,
|
|
allowed_candidates,
|
|
)?;
|
|
self.plane_sweep_cache = Some(cache.into_iter());
|
|
|
|
continue;
|
|
}
|
|
} else {
|
|
// use set theory based algorithm
|
|
resolve_candidates(
|
|
self.ctx,
|
|
&query_tree,
|
|
self.proximity,
|
|
&mut self.candidates_cache,
|
|
params.wdcache,
|
|
)?
|
|
};
|
|
|
|
new_candidates &= &*allowed_candidates;
|
|
*allowed_candidates -= &new_candidates;
|
|
self.proximity += 1;
|
|
|
|
return Ok(Some(CriterionResult {
|
|
query_tree: Some(query_tree.clone()),
|
|
candidates: Some(new_candidates),
|
|
filtered_candidates: None,
|
|
bucket_candidates: Some(take(&mut self.bucket_candidates)),
|
|
}));
|
|
}
|
|
None => match self.parent.next(params)? {
|
|
Some(CriterionResult {
|
|
query_tree: Some(query_tree),
|
|
candidates,
|
|
filtered_candidates,
|
|
bucket_candidates,
|
|
}) => {
|
|
let mut candidates = match candidates {
|
|
Some(candidates) => candidates,
|
|
None => {
|
|
resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
|
|
- params.excluded_candidates
|
|
}
|
|
};
|
|
|
|
if let Some(filtered_candidates) = filtered_candidates {
|
|
candidates &= filtered_candidates;
|
|
}
|
|
|
|
match bucket_candidates {
|
|
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
|
|
None => self.bucket_candidates |= &candidates,
|
|
}
|
|
|
|
let maximum_proximity = maximum_proximity(&query_tree);
|
|
self.state = Some((maximum_proximity as u8, query_tree, candidates));
|
|
self.proximity = 0;
|
|
self.plane_sweep_cache = None;
|
|
}
|
|
Some(CriterionResult {
|
|
query_tree: None,
|
|
candidates,
|
|
filtered_candidates,
|
|
bucket_candidates,
|
|
}) => {
|
|
return Ok(Some(CriterionResult {
|
|
query_tree: None,
|
|
candidates,
|
|
filtered_candidates,
|
|
bucket_candidates,
|
|
}));
|
|
}
|
|
None => return Ok(None),
|
|
},
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
fn resolve_candidates<'t>(
|
|
ctx: &'t dyn Context,
|
|
query_tree: &Operation,
|
|
proximity: u8,
|
|
cache: &mut Cache,
|
|
wdcache: &mut WordDerivationsCache,
|
|
) -> Result<RoaringBitmap> {
|
|
fn resolve_operation<'t>(
|
|
ctx: &'t dyn Context,
|
|
query_tree: &Operation,
|
|
proximity: u8,
|
|
cache: &mut Cache,
|
|
wdcache: &mut WordDerivationsCache,
|
|
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
|
|
use Operation::{And, Or, Phrase};
|
|
|
|
let result = match query_tree {
|
|
And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
|
|
Phrase(words) => {
|
|
if proximity == 0 {
|
|
let most_left = words
|
|
.first()
|
|
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
|
let most_right = words
|
|
.last()
|
|
.map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
|
|
let mut candidates = RoaringBitmap::new();
|
|
let mut first_iter = true;
|
|
let winsize = words.len().min(7);
|
|
|
|
for win in words.windows(winsize) {
|
|
// Get all the documents with the matching distance for each word pairs.
|
|
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
|
|
for (offset, s1) in win.iter().enumerate() {
|
|
for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
|
|
match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
|
|
Some(m) => bitmaps.push(m),
|
|
// If there are no document for this distance, there will be no
|
|
// results for the phrase query.
|
|
None => return Ok(Default::default()),
|
|
}
|
|
}
|
|
}
|
|
|
|
// We sort the bitmaps so that we perform the small intersections first, which is faster.
|
|
bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len()));
|
|
|
|
for bitmap in bitmaps {
|
|
if first_iter {
|
|
candidates = bitmap;
|
|
first_iter = false;
|
|
} else {
|
|
candidates &= bitmap;
|
|
}
|
|
// There will be no match, return early
|
|
if candidates.is_empty() {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
match (most_left, most_right) {
|
|
(Some(l), Some(r)) => vec![(l, r, candidates)],
|
|
_otherwise => Default::default(),
|
|
}
|
|
} else {
|
|
Default::default()
|
|
}
|
|
}
|
|
Or(_, ops) => {
|
|
let mut output = Vec::new();
|
|
for op in ops {
|
|
let result = resolve_operation(ctx, op, proximity, cache, wdcache)?;
|
|
output.extend(result);
|
|
}
|
|
output
|
|
}
|
|
Operation::Query(q) => {
|
|
if proximity == 0 {
|
|
let candidates = query_docids(ctx, q, wdcache)?;
|
|
vec![(q.clone(), q.clone(), candidates)]
|
|
} else {
|
|
Default::default()
|
|
}
|
|
}
|
|
};
|
|
|
|
Ok(result)
|
|
}
|
|
|
|
fn mdfs_pair<'t>(
|
|
ctx: &'t dyn Context,
|
|
left: &Operation,
|
|
right: &Operation,
|
|
proximity: u8,
|
|
cache: &mut Cache,
|
|
wdcache: &mut WordDerivationsCache,
|
|
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
|
|
fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
|
|
(0..=mana.min(left_max)).map(move |m| (m, mana - m))
|
|
}
|
|
|
|
let pair_max_proximity = 7;
|
|
|
|
let mut output = Vec::new();
|
|
|
|
for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) {
|
|
for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) {
|
|
let left_key = (left.clone(), left_p);
|
|
if !cache.contains_key(&left_key) {
|
|
let candidates = resolve_operation(ctx, left, left_p, cache, wdcache)?;
|
|
cache.insert(left_key.clone(), candidates);
|
|
}
|
|
|
|
let right_key = (right.clone(), right_p);
|
|
if !cache.contains_key(&right_key) {
|
|
let candidates = resolve_operation(ctx, right, right_p, cache, wdcache)?;
|
|
cache.insert(right_key.clone(), candidates);
|
|
}
|
|
|
|
let lefts = cache.get(&left_key).unwrap();
|
|
let rights = cache.get(&right_key).unwrap();
|
|
|
|
for (ll, lr, lcandidates) in lefts {
|
|
for (rl, rr, rcandidates) in rights {
|
|
let mut candidates =
|
|
query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?;
|
|
if lcandidates.len() < rcandidates.len() {
|
|
candidates &= lcandidates;
|
|
candidates &= rcandidates;
|
|
} else {
|
|
candidates &= rcandidates;
|
|
candidates &= lcandidates;
|
|
}
|
|
if !candidates.is_empty() {
|
|
output.push((ll.clone(), rr.clone(), candidates));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(output)
|
|
}
|
|
|
|
fn mdfs<'t>(
|
|
ctx: &'t dyn Context,
|
|
branches: &[Operation],
|
|
proximity: u8,
|
|
cache: &mut Cache,
|
|
wdcache: &mut WordDerivationsCache,
|
|
) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
|
|
// Extract the first two elements but gives the tail
|
|
// that is just after the first element.
|
|
let next =
|
|
branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t))));
|
|
|
|
match next {
|
|
Some((head1, Some((head2, [_])))) => {
|
|
mdfs_pair(ctx, head1, head2, proximity, cache, wdcache)
|
|
}
|
|
Some((head1, Some((head2, tail)))) => {
|
|
let mut output = Vec::new();
|
|
for p in 0..=proximity {
|
|
for (lhead, _, head_candidates) in
|
|
mdfs_pair(ctx, head1, head2, p, cache, wdcache)?
|
|
{
|
|
if !head_candidates.is_empty() {
|
|
for (_, rtail, mut candidates) in
|
|
mdfs(ctx, tail, proximity - p, cache, wdcache)?
|
|
{
|
|
candidates &= &head_candidates;
|
|
if !candidates.is_empty() {
|
|
output.push((lhead.clone(), rtail, candidates));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
Ok(output)
|
|
}
|
|
Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache),
|
|
None => Ok(Default::default()),
|
|
}
|
|
}
|
|
|
|
let mut candidates = RoaringBitmap::new();
|
|
for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? {
|
|
candidates |= cds;
|
|
}
|
|
Ok(candidates)
|
|
}
|
|
|
|
fn resolve_plane_sweep_candidates(
|
|
ctx: &dyn Context,
|
|
query_tree: &Operation,
|
|
allowed_candidates: &RoaringBitmap,
|
|
) -> Result<BTreeMap<u8, RoaringBitmap>> {
|
|
/// FIXME may be buggy with query like "new new york"
|
|
fn plane_sweep(
|
|
groups_positions: Vec<Vec<(Position, u8, Position)>>,
|
|
consecutive: bool,
|
|
) -> Result<Vec<(Position, u8, Position)>> {
|
|
fn compute_groups_proximity(
|
|
groups: &[(usize, (Position, u8, Position))],
|
|
consecutive: bool,
|
|
) -> Option<(Position, u8, Position)> {
|
|
// take the inner proximity of the first group as initial
|
|
let (_, (_, mut proximity, _)) = groups.first()?;
|
|
let (_, (left_most_pos, _, _)) = groups.first()?;
|
|
let (_, (_, _, right_most_pos)) =
|
|
groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?;
|
|
|
|
for pair in groups.windows(2) {
|
|
if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair {
|
|
// if two positions are equal, meaning that they share at least a word, we return None
|
|
if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 {
|
|
return None;
|
|
}
|
|
|
|
let pair_proximity = {
|
|
// if intervals are disjoint [..].(..)
|
|
if lpos2 > rpos1 {
|
|
lpos2 - rpos1
|
|
}
|
|
// if the second interval is a subset of the first [.(..).]
|
|
else if rpos2 < rpos1 {
|
|
(lpos2 - lpos1).min(rpos1 - rpos2)
|
|
}
|
|
// if intervals overlaps [.(..].)
|
|
else {
|
|
(lpos2 - lpos1).min(rpos2 - rpos1)
|
|
}
|
|
};
|
|
|
|
// if groups are in the good order (query order) we remove 1 to the proximity
|
|
// the proximity is clamped to 7
|
|
let pair_proximity =
|
|
if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) };
|
|
|
|
proximity += pair_proximity as u8 + prox2;
|
|
}
|
|
}
|
|
|
|
// if groups should be consecutives, we will only accept groups with a proximity of 0
|
|
if !consecutive || proximity == 0 {
|
|
Some((*left_most_pos, proximity, *right_most_pos))
|
|
} else {
|
|
None
|
|
}
|
|
}
|
|
|
|
let groups_len = groups_positions.len();
|
|
|
|
let mut groups_positions: Vec<_> =
|
|
groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
|
|
|
|
// Pop top elements of each list.
|
|
let mut current = Vec::with_capacity(groups_len);
|
|
for (i, positions) in groups_positions.iter_mut().enumerate() {
|
|
match positions.next() {
|
|
Some(p) => current.push((i, p)),
|
|
// if a group return None, it means that the document does not contain all the words,
|
|
// we return an empty result.
|
|
None => return Ok(Vec::new()),
|
|
}
|
|
}
|
|
|
|
// Sort k elements by their positions.
|
|
current.sort_unstable_by_key(|(_, p)| *p);
|
|
|
|
// Find leftmost and rightmost group and their positions.
|
|
let mut leftmost = *current.first().unwrap();
|
|
let mut rightmost = *current.last().unwrap();
|
|
|
|
let mut output = Vec::new();
|
|
loop {
|
|
// Find the position p of the next elements of a list of the leftmost group.
|
|
// If the list is empty, break the loop.
|
|
let p = groups_positions[leftmost.0].next().map(|p| (leftmost.0, p));
|
|
|
|
// let q be the position q of second group of the interval.
|
|
let q = current[1];
|
|
|
|
// If p > r, then the interval [l, r] is minimal and
|
|
// we insert it into the heap according to its size.
|
|
if p.map_or(true, |p| p.1 > rightmost.1) {
|
|
if let Some(group) = compute_groups_proximity(¤t, consecutive) {
|
|
output.push(group);
|
|
}
|
|
}
|
|
|
|
let p = match p {
|
|
Some(p) => p,
|
|
None => break,
|
|
};
|
|
|
|
// Replace the leftmost group P in the interval.
|
|
current[0] = p;
|
|
|
|
if p.1 > rightmost.1 {
|
|
// if [l, r] is minimal, let r = p and l = q.
|
|
rightmost = p;
|
|
leftmost = q;
|
|
} else {
|
|
// Ohterwise, let l = min{p,q}.
|
|
leftmost = if p.1 < q.1 { p } else { q };
|
|
}
|
|
|
|
// Then update the interval and order of groups_positions in the interval.
|
|
current.sort_unstable_by_key(|(_, p)| *p);
|
|
}
|
|
|
|
// Sort the list according to the size and the positions.
|
|
output.sort_unstable();
|
|
|
|
Ok(output)
|
|
}
|
|
|
|
fn resolve_operation<'a>(
|
|
query_tree: &'a Operation,
|
|
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
|
|
words_positions: &HashMap<String, RoaringBitmap>,
|
|
) -> Result<Vec<(Position, u8, Position)>> {
|
|
use Operation::{And, Or, Phrase};
|
|
|
|
if let Some(result) = rocache.get(query_tree) {
|
|
return Ok(result.clone());
|
|
}
|
|
|
|
let result = match query_tree {
|
|
And(ops) => {
|
|
let mut groups_positions = Vec::with_capacity(ops.len());
|
|
for operation in ops {
|
|
let positions = resolve_operation(operation, rocache, words_positions)?;
|
|
groups_positions.push(positions);
|
|
}
|
|
plane_sweep(groups_positions, false)?
|
|
}
|
|
Phrase(words) => {
|
|
let mut groups_positions = Vec::with_capacity(words.len());
|
|
for word in words {
|
|
let positions = match words_positions.get(word) {
|
|
Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
|
|
None => return Ok(vec![]),
|
|
};
|
|
groups_positions.push(positions);
|
|
}
|
|
plane_sweep(groups_positions, true)?
|
|
}
|
|
Or(_, ops) => {
|
|
let mut result = Vec::new();
|
|
for op in ops {
|
|
result.extend(resolve_operation(op, rocache, words_positions)?)
|
|
}
|
|
|
|
result.sort_unstable();
|
|
result
|
|
}
|
|
Operation::Query(Query { prefix, kind }) => {
|
|
let mut result = Vec::new();
|
|
match kind {
|
|
QueryKind::Exact { word, .. } => {
|
|
if *prefix {
|
|
let iter = word_derivations(word, true, 0, &words_positions)
|
|
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
|
result.extend(iter);
|
|
} else if let Some(positions) = words_positions.get(word) {
|
|
result.extend(positions.iter().map(|p| (p, 0, p)));
|
|
}
|
|
}
|
|
QueryKind::Tolerant { typo, word } => {
|
|
let iter = word_derivations(word, *prefix, *typo, &words_positions)
|
|
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
|
|
result.extend(iter);
|
|
}
|
|
}
|
|
|
|
result.sort_unstable();
|
|
result
|
|
}
|
|
};
|
|
|
|
rocache.insert(query_tree, result.clone());
|
|
Ok(result)
|
|
}
|
|
|
|
fn word_derivations<'a>(
|
|
word: &str,
|
|
is_prefix: bool,
|
|
max_typo: u8,
|
|
words_positions: &'a HashMap<String, RoaringBitmap>,
|
|
) -> impl Iterator<Item = &'a RoaringBitmap> {
|
|
let dfa = build_dfa(word, max_typo, is_prefix);
|
|
words_positions.iter().filter_map(move |(document_word, positions)| {
|
|
use levenshtein_automata::Distance;
|
|
match dfa.eval(document_word) {
|
|
Distance::Exact(_) => Some(positions),
|
|
Distance::AtLeast(_) => None,
|
|
}
|
|
})
|
|
}
|
|
|
|
let mut resolve_operation_cache = HashMap::new();
|
|
let mut candidates = BTreeMap::new();
|
|
for docid in allowed_candidates {
|
|
let words_positions = ctx.docid_words_positions(docid)?;
|
|
resolve_operation_cache.clear();
|
|
let positions =
|
|
resolve_operation(query_tree, &mut resolve_operation_cache, &words_positions)?;
|
|
let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);
|
|
let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7);
|
|
candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid);
|
|
}
|
|
|
|
Ok(candidates)
|
|
}
|