meilisearch/milli/src/search/criteria/proximity.rs

use std::collections::btree_map::{self, BTreeMap};
use std::collections::hash_map::HashMap;
use std::mem::take;

use log::debug;
use roaring::RoaringBitmap;

use super::{
    query_docids, query_pair_proximity_docids, resolve_query_tree, Context, Criterion,
    CriterionParameters, CriterionResult,
};
use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind};
use crate::search::{build_dfa, WordDerivationsCache};
use crate::{Position, Result};

type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>;

/// Threshold on the number of candidates that will make
/// the system choose between one algorithm or another.
const CANDIDATES_THRESHOLD: u64 = 1000;

/// Threshold on the number of proximity that will make
/// the system choose between one algorithm or another.
const PROXIMITY_THRESHOLD: u8 = 0;

pub struct Proximity<'t> {
    ctx: &'t dyn Context<'t>,
    /// (max_proximity, query_tree, allowed_candidates)
    state: Option<(u8, Operation, RoaringBitmap)>,
    proximity: u8,
    bucket_candidates: RoaringBitmap,
    parent: Box<dyn Criterion + 't>,
    candidates_cache: Cache,
    plane_sweep_cache: Option<btree_map::IntoIter<u8, RoaringBitmap>>,
}

impl<'t> Proximity<'t> {
    pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
        Proximity {
            ctx,
            state: None,
            proximity: 0,
            bucket_candidates: RoaringBitmap::new(),
            parent,
            candidates_cache: Cache::new(),
            plane_sweep_cache: None,
        }
    }
}

impl<'t> Criterion for Proximity<'t> {
    #[logging_timer::time("Proximity::{}")]
    fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
        // remove excluded candidates when next is called, instead of doing it in the loop.
        if let Some((_, _, allowed_candidates)) = self.state.as_mut() {
            *allowed_candidates -= params.excluded_candidates;
        }

        loop {
            debug!(
                "Proximity at iteration {} (max prox {:?}) ({:?})",
                self.proximity,
                self.state.as_ref().map(|(mp, _, _)| mp),
                self.state.as_ref().map(|(_, _, cd)| cd),
            );

            match &mut self.state {
                Some((max_prox, _, allowed_candidates))
                    if allowed_candidates.is_empty() || self.proximity > *max_prox =>
                {
                    self.state = None; // reset state
                }
                Some((_, query_tree, allowed_candidates)) => {
                    let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD
                        && self.proximity > PROXIMITY_THRESHOLD
                    {
                        if let Some(cache) = self.plane_sweep_cache.as_mut() {
                            match cache.next() {
                                Some((p, candidates)) => {
                                    self.proximity = p;
                                    candidates
                                }
                                None => {
                                    self.state = None; // reset state
                                    continue;
                                }
                            }
                        } else {
                            let cache = resolve_plane_sweep_candidates(
                                self.ctx,
                                query_tree,
                                allowed_candidates,
                            )?;
                            self.plane_sweep_cache = Some(cache.into_iter());

                            continue;
                        }
                    } else {
                        // use set theory based algorithm
                        resolve_candidates(
                            self.ctx,
                            &query_tree,
                            self.proximity,
                            &mut self.candidates_cache,
                            params.wdcache,
                        )?
                    };

                    new_candidates &= &*allowed_candidates;
                    *allowed_candidates -= &new_candidates;
                    self.proximity += 1;

                    return Ok(Some(CriterionResult {
                        query_tree: Some(query_tree.clone()),
                        candidates: Some(new_candidates),
                        filtered_candidates: None,
                        bucket_candidates: Some(take(&mut self.bucket_candidates)),
                    }));
                }
                None => match self.parent.next(params)? {
                    Some(CriterionResult {
                        query_tree: Some(query_tree),
                        candidates,
                        filtered_candidates,
                        bucket_candidates,
                    }) => {
                        let mut candidates = match candidates {
                            Some(candidates) => candidates,
                            None => {
                                resolve_query_tree(self.ctx, &query_tree, params.wdcache)?
                                    - params.excluded_candidates
                            }
                        };

                        if let Some(filtered_candidates) = filtered_candidates {
                            candidates &= filtered_candidates;
                        }

                        match bucket_candidates {
                            Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
                            None => self.bucket_candidates |= &candidates,
                        }

                        let maximum_proximity = maximum_proximity(&query_tree);
                        self.state = Some((maximum_proximity as u8, query_tree, candidates));
                        self.proximity = 0;
                        self.plane_sweep_cache = None;
                    }
                    Some(CriterionResult {
                        query_tree: None,
                        candidates,
                        filtered_candidates,
                        bucket_candidates,
                    }) => {
                        return Ok(Some(CriterionResult {
                            query_tree: None,
                            candidates,
                            filtered_candidates,
                            bucket_candidates,
                        }));
                    }
                    None => return Ok(None),
                },
            }
        }
    }
}

fn resolve_candidates<'t>(
    ctx: &'t dyn Context,
    query_tree: &Operation,
    proximity: u8,
    cache: &mut Cache,
    wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
    fn resolve_operation<'t>(
        ctx: &'t dyn Context,
        query_tree: &Operation,
        proximity: u8,
        cache: &mut Cache,
        wdcache: &mut WordDerivationsCache,
    ) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
        use Operation::{And, Or, Phrase};

        let result = match query_tree {
            And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?,
            Phrase(words) => {
                if proximity == 0 {
                    let most_left = words
                        .first()
                        .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
                    let most_right = words
                        .last()
                        .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) });
                    let mut candidates = RoaringBitmap::new();
                    let mut first_iter = true;
                    let winsize = words.len().min(7);

                    for win in words.windows(winsize) {
                        // Get all the documents with the matching distance for each word pairs.
                        let mut bitmaps = Vec::with_capacity(winsize.pow(2));
                        for (offset, s1) in win.iter().enumerate() {
                            for (dist, s2) in win.iter().skip(offset + 1).enumerate() {
                                match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? {
                                    Some(m) => bitmaps.push(m),
                                    // If there are no document for this distance, there will be no
                                    // results for the phrase query.
                                    None => return Ok(Default::default()),
                                }
                            }
                        }

                        // We sort the bitmaps so that we perform the small intersections first, which is faster.
                        bitmaps.sort_unstable_by(|a, b| a.len().cmp(&b.len()));

                        for bitmap in bitmaps {
                            if first_iter {
                                candidates = bitmap;
                                first_iter = false;
                            } else {
                                candidates &= bitmap;
                            }
                            // There will be no match, return early
                            if candidates.is_empty() {
                                break;
                            }
                        }
                    }
                    match (most_left, most_right) {
                        (Some(l), Some(r)) => vec![(l, r, candidates)],
                        _otherwise => Default::default(),
                    }
                } else {
                    Default::default()
                }
            }
            Or(_, ops) => {
                let mut output = Vec::new();
                for op in ops {
                    let result = resolve_operation(ctx, op, proximity, cache, wdcache)?;
                    output.extend(result);
                }
                output
            }
            Operation::Query(q) => {
                if proximity == 0 {
                    let candidates = query_docids(ctx, q, wdcache)?;
                    vec![(q.clone(), q.clone(), candidates)]
                } else {
                    Default::default()
                }
            }
        };

        Ok(result)
    }

    fn mdfs_pair<'t>(
        ctx: &'t dyn Context,
        left: &Operation,
        right: &Operation,
        proximity: u8,
        cache: &mut Cache,
        wdcache: &mut WordDerivationsCache,
    ) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
        fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
            (0..=mana.min(left_max)).map(move |m| (m, mana - m))
        }

        let pair_max_proximity = 7;

        let mut output = Vec::new();

        for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) {
            for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) {
                let left_key = (left.clone(), left_p);
                if !cache.contains_key(&left_key) {
                    let candidates = resolve_operation(ctx, left, left_p, cache, wdcache)?;
                    cache.insert(left_key.clone(), candidates);
                }

                let right_key = (right.clone(), right_p);
                if !cache.contains_key(&right_key) {
                    let candidates = resolve_operation(ctx, right, right_p, cache, wdcache)?;
                    cache.insert(right_key.clone(), candidates);
                }

                let lefts = cache.get(&left_key).unwrap();
                let rights = cache.get(&right_key).unwrap();

                for (ll, lr, lcandidates) in lefts {
                    for (rl, rr, rcandidates) in rights {
                        let mut candidates =
                            query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?;
                        if lcandidates.len() < rcandidates.len() {
                            candidates &= lcandidates;
                            candidates &= rcandidates;
                        } else {
                            candidates &= rcandidates;
                            candidates &= lcandidates;
                        }
                        if !candidates.is_empty() {
                            output.push((ll.clone(), rr.clone(), candidates));
                        }
                    }
                }
            }
        }

        Ok(output)
    }

    fn mdfs<'t>(
        ctx: &'t dyn Context,
        branches: &[Operation],
        proximity: u8,
        cache: &mut Cache,
        wdcache: &mut WordDerivationsCache,
    ) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
        // Extract the first two elements but gives the tail
        // that is just after the first element.
        let next =
            branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t))));

        match next {
            Some((head1, Some((head2, [_])))) => {
                mdfs_pair(ctx, head1, head2, proximity, cache, wdcache)
            }
            Some((head1, Some((head2, tail)))) => {
                let mut output = Vec::new();
                for p in 0..=proximity {
                    for (lhead, _, head_candidates) in
                        mdfs_pair(ctx, head1, head2, p, cache, wdcache)?
                    {
                        if !head_candidates.is_empty() {
                            for (_, rtail, mut candidates) in
                                mdfs(ctx, tail, proximity - p, cache, wdcache)?
                            {
                                candidates &= &head_candidates;
                                if !candidates.is_empty() {
                                    output.push((lhead.clone(), rtail, candidates));
                                }
                            }
                        }
                    }
                }
                Ok(output)
            }
            Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache),
            None => Ok(Default::default()),
        }
    }

    let mut candidates = RoaringBitmap::new();
    for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? {
        candidates |= cds;
    }
    Ok(candidates)
}

fn resolve_plane_sweep_candidates(
    ctx: &dyn Context,
    query_tree: &Operation,
    allowed_candidates: &RoaringBitmap,
) -> Result<BTreeMap<u8, RoaringBitmap>> {
    /// FIXME may be buggy with query like "new new york"
    fn plane_sweep(
        groups_positions: Vec<Vec<(Position, u8, Position)>>,
        consecutive: bool,
    ) -> Result<Vec<(Position, u8, Position)>> {
        fn compute_groups_proximity(
            groups: &[(usize, (Position, u8, Position))],
            consecutive: bool,
        ) -> Option<(Position, u8, Position)> {
            // take the inner proximity of the first group as initial
            let (_, (_, mut proximity, _)) = groups.first()?;
            let (_, (left_most_pos, _, _)) = groups.first()?;
            let (_, (_, _, right_most_pos)) =
                groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?;

            for pair in groups.windows(2) {
                if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair {
                    // if two positions are equal, meaning that they share at least a word, we return None
                    if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 {
                        return None;
                    }

                    let pair_proximity = {
                        // if intervals are disjoint [..].(..)
                        if lpos2 > rpos1 {
                            lpos2 - rpos1
                        }
                        // if the second interval is a subset of the first [.(..).]
                        else if rpos2 < rpos1 {
                            (lpos2 - lpos1).min(rpos1 - rpos2)
                        }
                        // if intervals overlaps [.(..].)
                        else {
                            (lpos2 - lpos1).min(rpos2 - rpos1)
                        }
                    };

                    // if groups are in the good order (query order) we remove 1 to the proximity
                    // the proximity is clamped to 7
                    let pair_proximity =
                        if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) };

                    proximity += pair_proximity as u8 + prox2;
                }
            }

            // if groups should be consecutives, we will only accept groups with a proximity of 0
            if !consecutive || proximity == 0 {
                Some((*left_most_pos, proximity, *right_most_pos))
            } else {
                None
            }
        }

        let groups_len = groups_positions.len();

        let mut groups_positions: Vec<_> =
            groups_positions.into_iter().map(|pos| pos.into_iter()).collect();

        // Pop top elements of each list.
        let mut current = Vec::with_capacity(groups_len);
        for (i, positions) in groups_positions.iter_mut().enumerate() {
            match positions.next() {
                Some(p) => current.push((i, p)),
                // if a group return None, it means that the document does not contain all the words,
                // we return an empty result.
                None => return Ok(Vec::new()),
            }
        }

        // Sort k elements by their positions.
        current.sort_unstable_by_key(|(_, p)| *p);

        // Find leftmost and rightmost group and their positions.
        let mut leftmost = *current.first().unwrap();
        let mut rightmost = *current.last().unwrap();

        let mut output = Vec::new();
        loop {
            // Find the position p of the next elements of a list of the leftmost group.
            // If the list is empty, break the loop.
            let p = groups_positions[leftmost.0].next().map(|p| (leftmost.0, p));

            // let q be the position q of second group of the interval.
            let q = current[1];

            // If p > r, then the interval [l, r] is minimal and
            // we insert it into the heap according to its size.
            if p.map_or(true, |p| p.1 > rightmost.1) {
                if let Some(group) = compute_groups_proximity(&current, consecutive) {
                    output.push(group);
                }
            }

            let p = match p {
                Some(p) => p,
                None => break,
            };

            // Replace the leftmost group P in the interval.
            current[0] = p;

            if p.1 > rightmost.1 {
                // if [l, r] is minimal, let r = p and l = q.
                rightmost = p;
                leftmost = q;
            } else {
                // Ohterwise, let l = min{p,q}.
                leftmost = if p.1 < q.1 { p } else { q };
            }

            // Then update the interval and order of groups_positions in the interval.
            current.sort_unstable_by_key(|(_, p)| *p);
        }

        // Sort the list according to the size and the positions.
        output.sort_unstable();

        Ok(output)
    }

    fn resolve_operation<'a>(
        query_tree: &'a Operation,
        rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
        words_positions: &HashMap<String, RoaringBitmap>,
    ) -> Result<Vec<(Position, u8, Position)>> {
        use Operation::{And, Or, Phrase};

        if let Some(result) = rocache.get(query_tree) {
            return Ok(result.clone());
        }

        let result = match query_tree {
            And(ops) => {
                let mut groups_positions = Vec::with_capacity(ops.len());
                for operation in ops {
                    let positions = resolve_operation(operation, rocache, words_positions)?;
                    groups_positions.push(positions);
                }
                plane_sweep(groups_positions, false)?
            }
            Phrase(words) => {
                let mut groups_positions = Vec::with_capacity(words.len());
                for word in words {
                    let positions = match words_positions.get(word) {
                        Some(positions) => positions.iter().map(|p| (p, 0, p)).collect(),
                        None => return Ok(vec![]),
                    };
                    groups_positions.push(positions);
                }
                plane_sweep(groups_positions, true)?
            }
            Or(_, ops) => {
                let mut result = Vec::new();
                for op in ops {
                    result.extend(resolve_operation(op, rocache, words_positions)?)
                }

                result.sort_unstable();
                result
            }
            Operation::Query(Query { prefix, kind }) => {
                let mut result = Vec::new();
                match kind {
                    QueryKind::Exact { word, .. } => {
                        if *prefix {
                            let iter = word_derivations(word, true, 0, &words_positions)
                                .flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
                            result.extend(iter);
                        } else if let Some(positions) = words_positions.get(word) {
                            result.extend(positions.iter().map(|p| (p, 0, p)));
                        }
                    }
                    QueryKind::Tolerant { typo, word } => {
                        let iter = word_derivations(word, *prefix, *typo, &words_positions)
                            .flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
                        result.extend(iter);
                    }
                }

                result.sort_unstable();
                result
            }
        };

        rocache.insert(query_tree, result.clone());
        Ok(result)
    }

    fn word_derivations<'a>(
        word: &str,
        is_prefix: bool,
        max_typo: u8,
        words_positions: &'a HashMap<String, RoaringBitmap>,
    ) -> impl Iterator<Item = &'a RoaringBitmap> {
        let dfa = build_dfa(word, max_typo, is_prefix);
        words_positions.iter().filter_map(move |(document_word, positions)| {
            use levenshtein_automata::Distance;
            match dfa.eval(document_word) {
                Distance::Exact(_) => Some(positions),
                Distance::AtLeast(_) => None,
            }
        })
    }

    let mut resolve_operation_cache = HashMap::new();
    let mut candidates = BTreeMap::new();
    for docid in allowed_candidates {
        let words_positions = ctx.docid_words_positions(docid)?;
        resolve_operation_cache.clear();
        let positions =
            resolve_operation(query_tree, &mut resolve_operation_cache, &words_positions)?;
        let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);
        let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7);
        candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid);
    }

    Ok(candidates)
}