Support ngram typos + splitwords and splitwords+synonyms in proximity

2025-11-09 20:36:28 +00:00 · 2023-03-13 17:21:29 +01:00
parent 14e8d0aaa2
commit 3004e281d7
9 changed files with 701 additions and 411 deletions
--- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
@@ -1,37 +1,43 @@
+#![allow(clippy::too_many_arguments)]
 use std::collections::BTreeMap;

-use itertools::Itertools;
-
 use super::ProximityEdge;
-use crate::search::new::interner::Interner;
-use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
+use crate::search::new::db_cache::DatabaseCache;
+use crate::search::new::interner::{Interned, Interner};
+use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
 use crate::search::new::ranking_rule_graph::proximity::WordPair;
 use crate::search::new::ranking_rule_graph::EdgeCondition;
 use crate::search::new::{QueryNode, SearchContext};
 use crate::Result;
+use heed::RoTxn;

 pub fn visit_from_node(
    ctx: &mut SearchContext,
    from_node: &QueryNode,
-) -> Result<Option<(WordDerivations, i8)>> {
-    Ok(Some(match from_node {
+) -> Result<Option<(Vec<(Option<Interned<Phrase>>, Interned<String>)>, i8)>> {
+    let SearchContext { derivations_interner, .. } = ctx;
+
+    let (left_phrase, left_derivations, left_end_position) = match from_node {
        QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => {
            match value1 {
                QueryTerm::Word { derivations } => {
-                    (ctx.derivations_interner.get(*derivations).clone(), *pos1.end())
+                    (None, derivations_interner.get(*derivations).clone(), *pos1.end())
                }
-                QueryTerm::Phrase { phrase: phrase1 } => {
-                    let phrase1 = ctx.phrase_interner.get(*phrase1);
-                    if let Some(original) = *phrase1.words.last().unwrap() {
+                QueryTerm::Phrase { phrase: phrase_interned } => {
+                    let phrase = ctx.phrase_interner.get(*phrase_interned);
+                    if let Some(original) = *phrase.words.last().unwrap() {
                        (
+                            Some(*phrase_interned),
                            WordDerivations {
                                original,
-                                zero_typo: Box::new([original]),
+                                zero_typo: Some(original),
                                one_typo: Box::new([]),
                                two_typos: Box::new([]),
-                                use_prefix_db: false,
+                                use_prefix_db: None,
                                synonyms: Box::new([]),
                                split_words: None,
+                                is_prefix: false,
+                                prefix_of: Box::new([]),
                            },
                            *pos1.end(),
                        )
@@ -42,190 +48,175 @@ pub fn visit_from_node(
                }
            }
        }
-        QueryNode::Start => (
-            WordDerivations {
-                original: ctx.word_interner.insert(String::new()),
-                zero_typo: Box::new([]),
-                one_typo: Box::new([]),
-                two_typos: Box::new([]),
-                use_prefix_db: false,
-                synonyms: Box::new([]),
-                split_words: None,
-            },
-            -100,
-        ),
+        QueryNode::Start => (None, WordDerivations::empty(&mut ctx.word_interner, ""), -1),
        _ => return Ok(None),
-    }))
+    };
+
+    // left term cannot be a prefix
+    assert!(left_derivations.use_prefix_db.is_none() && !left_derivations.is_prefix);
+
+    let last_word_left_phrase = if let Some(left_phrase_interned) = left_phrase {
+        let left_phrase = ctx.phrase_interner.get(left_phrase_interned);
+        left_phrase.words.last().copied().unwrap()
+    } else {
+        None
+    };
+    let left_single_word_iter: Vec<(Option<Interned<Phrase>>, Interned<String>)> = left_derivations
+        .all_single_word_derivations_except_prefix_db()
+        .chain(last_word_left_phrase.iter().copied())
+        .map(|w| (left_phrase, w))
+        .collect();
+    let left_phrase_iter: Vec<(Option<Interned<Phrase>>, Interned<String>)> = left_derivations
+        .all_phrase_derivations()
+        .map(|left_phrase_interned: Interned<Phrase>| {
+            let left_phrase = ctx.phrase_interner.get(left_phrase_interned);
+            let last_word_left_phrase: Interned<String> =
+                left_phrase.words.last().unwrap().unwrap();
+            let r: (Option<Interned<Phrase>>, Interned<String>) =
+                (Some(left_phrase_interned), last_word_left_phrase);
+            r
+        })
+        .collect();
+    let mut left_word_iter = left_single_word_iter;
+    left_word_iter.extend(left_phrase_iter);
+
+    Ok(Some((left_word_iter, left_end_position)))
 }

-pub fn visit_to_node<'ctx, 'from_data>(
+pub fn build_step_visit_destination_node<'ctx, 'from_data>(
    ctx: &mut SearchContext<'ctx>,
    conditions_interner: &mut Interner<ProximityEdge>,
+    from_node_data: &'from_data (Vec<(Option<Interned<Phrase>>, Interned<String>)>, i8),
    to_node: &QueryNode,
-    from_node_data: &'from_data (WordDerivations, i8),
 ) -> Result<Vec<(u8, EdgeCondition<ProximityEdge>)>> {
-    let SearchContext { index, txn, db_cache, word_interner, derivations_interner, .. } = ctx;
-
-    // IMPORTANT! TODO: split words support
-
-    let (derivations1, pos1) = from_node_data;
-    let term2 = match &to_node {
+    let SearchContext {
+        index,
+        txn,
+        db_cache,
+        word_interner,
+        phrase_interner,
+        derivations_interner,
+        query_term_docids: _,
+    } = ctx;
+    let right_term = match &to_node {
        QueryNode::End => return Ok(vec![(0, EdgeCondition::Unconditional)]),
        QueryNode::Deleted | QueryNode::Start => return Ok(vec![]),
        QueryNode::Term(term) => term,
    };
-    let LocatedQueryTerm { value: value2, positions: pos2 } = term2;
+    let LocatedQueryTerm { value: right_value, positions: right_positions } = right_term;

-    let (derivations2, pos2, ngram_len2) = match value2 {
-        QueryTerm::Word { derivations } => {
-            (derivations_interner.get(*derivations).clone(), *pos2.start(), pos2.len())
-        }
-        QueryTerm::Phrase { phrase: phrase2 } => {
-            let phrase2 = ctx.phrase_interner.get(*phrase2);
-            if let Some(original) = *phrase2.words.first().unwrap() {
-                (
-                    WordDerivations {
-                        original,
-                        zero_typo: Box::new([original]),
-                        one_typo: Box::new([]),
-                        two_typos: Box::new([]),
-                        use_prefix_db: false,
-                        synonyms: Box::new([]),
-                        split_words: None,
-                    },
-                    *pos2.start(),
-                    1,
-                )
-            } else {
-                // No word pairs if the phrase does not have a regular word as its first term
-                return Ok(vec![]);
+    let (right_phrase, right_derivations, right_start_position, right_ngram_length) =
+        match right_value {
+            QueryTerm::Word { derivations } => (
+                None,
+                derivations_interner.get(*derivations).clone(),
+                *right_positions.start(),
+                right_positions.len(),
+            ),
+            QueryTerm::Phrase { phrase: right_phrase_interned } => {
+                let right_phrase = phrase_interner.get(*right_phrase_interned);
+                if let Some(original) = *right_phrase.words.first().unwrap() {
+                    (
+                        Some(*right_phrase_interned),
+                        WordDerivations {
+                            original,
+                            zero_typo: Some(original),
+                            one_typo: Box::new([]),
+                            two_typos: Box::new([]),
+                            use_prefix_db: None,
+                            synonyms: Box::new([]),
+                            split_words: None,
+                            is_prefix: false,
+                            prefix_of: Box::new([]),
+                        },
+                        *right_positions.start(),
+                        1,
+                    )
+                } else {
+                    // No word pairs if the phrase does not have a regular word as its first term
+                    return Ok(vec![]);
+                }
            }
-        }
-    };
+        };

-    if pos1 + 1 != pos2 {
-        // TODO: how should this actually be handled?
-        // We want to effectively ignore this pair of terms
+    let (left_derivations, left_end_position) = from_node_data;
+
+    if left_end_position + 1 != right_start_position {
+        // We want to ignore this pair of terms
        // Unconditionally walk through the edge without computing the docids
-        // But also what should the cost be?
+        // This can happen when, in a query like `the sun flowers are beautiful`, the term
+        // `flowers` is removed by the words ranking rule due to the terms matching strategy.
+        // The remaining query graph represents `the sun .. are beautiful`
+        // but `sun` and `are` have no proximity condition between them
        return Ok(vec![(0, EdgeCondition::Unconditional)]);
    }

-    let updb1 = derivations1.use_prefix_db;
-    let updb2 = derivations2.use_prefix_db;
-
-    // left term cannot be a prefix
-    assert!(!updb1);
-
-    // TODO: IMPORTANT! split words and synonyms support
-    let derivations1 = derivations1.all_single_word_derivations_except_prefix_db();
-    // TODO: eventually, we want to get rid of the uses from `orginal`
    let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();

-    if updb2 {
-        for word1 in derivations1.clone() {
-            for proximity in 1..=(8 - ngram_len2) {
-                let cost = (proximity + ngram_len2 - 1) as u8;
-                // TODO: if we had access to the universe here, we could already check whether
-                // the bitmap corresponding to this word pair is disjoint with the universe or not
-                if db_cache
-                    .get_word_prefix_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        word1,
-                        derivations2.original,
-                        proximity as u8,
-                    )?
-                    .is_some()
-                {
-                    cost_proximity_word_pairs
-                        .entry(cost)
-                        .or_default()
-                        .entry(proximity as u8)
-                        .or_default()
-                        .push(WordPair::WordPrefix {
-                            left: word1,
-                            right_prefix: derivations2.original,
-                        });
-                }
-                if db_cache
-                    .get_prefix_word_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        derivations2.original,
-                        word1,
-                        proximity as u8 - 1,
-                    )?
-                    .is_some()
-                {
-                    cost_proximity_word_pairs
-                        .entry(cost)
-                        .or_default()
-                        .entry(proximity as u8)
-                        .or_default()
-                        .push(WordPair::WordPrefixSwapped {
-                            left_prefix: derivations2.original,
-                            right: word1,
-                        });
-                }
-            }
+    if let Some(right_prefix) = right_derivations.use_prefix_db {
+        for (left_phrase, left_word) in left_derivations.iter().copied() {
+            add_prefix_edges(
+                index,
+                txn,
+                db_cache,
+                word_interner,
+                right_ngram_length,
+                left_word,
+                right_prefix,
+                &mut cost_proximity_word_pairs,
+                left_phrase,
+            )?;
        }
    }

-    // TODO: important! support split words and synonyms as well
-    let derivations2 = derivations2.all_single_word_derivations_except_prefix_db();
    // TODO: add safeguard in case the cartesian product is too large!
    // even if we restrict the word derivations to a maximum of 100, the size of the
    // cartesian product could reach a maximum of 10_000 derivations, which is way too much.
    // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
    // reached
-    let product_derivations = derivations1.cartesian_product(derivations2);
+    let first_word_right_phrase = if let Some(right_phrase_interned) = right_phrase {
+        let right_phrase = phrase_interner.get(right_phrase_interned);
+        right_phrase.words.first().copied().unwrap()
+    } else {
+        None
+    };
+    let right_single_word_iter: Vec<(Option<Interned<Phrase>>, Interned<String>)> =
+        right_derivations
+            .all_single_word_derivations_except_prefix_db()
+            .chain(first_word_right_phrase.iter().copied())
+            .map(|w| (right_phrase, w))
+            .collect();
+    let right_phrase_iter: Vec<(Option<Interned<Phrase>>, Interned<String>)> = right_derivations
+        .all_phrase_derivations()
+        .map(|right_phrase_interned: Interned<Phrase>| {
+            let right_phrase = phrase_interner.get(right_phrase_interned);
+            let first_word_right_phrase: Interned<String> =
+                right_phrase.words.first().unwrap().unwrap();
+            let r: (Option<Interned<Phrase>>, Interned<String>) =
+                (Some(right_phrase_interned), first_word_right_phrase);
+            r
+        })
+        .collect();
+    let mut right_word_iter = right_single_word_iter;
+    right_word_iter.extend(right_phrase_iter);

-    for (word1, word2) in product_derivations {
-        for proximity in 1..=(8 - ngram_len2) {
-            let cost = (proximity + ngram_len2 - 1) as u8;
-            if db_cache
-                .get_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    word1,
-                    word2,
-                    proximity as u8,
-                )?
-                .is_some()
-            {
-                cost_proximity_word_pairs
-                    .entry(cost)
-                    .or_default()
-                    .entry(proximity as u8)
-                    .or_default()
-                    .push(WordPair::Words { left: word1, right: word2 });
-            }
-            if proximity > 1
-                && db_cache
-                    .get_word_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        word2,
-                        word1,
-                        proximity as u8 - 1,
-                    )?
-                    .is_some()
-            {
-                cost_proximity_word_pairs
-                    .entry(cost)
-                    .or_default()
-                    .entry(proximity as u8 - 1)
-                    .or_default()
-                    .push(WordPair::Words { left: word2, right: word1 });
-            }
+    for (left_phrase, left_word) in left_derivations.iter().copied() {
+        for (right_phrase, right_word) in right_word_iter.iter().copied() {
+            add_non_prefix_edges(
+                index,
+                txn,
+                db_cache,
+                word_interner,
+                right_ngram_length,
+                left_word,
+                right_word,
+                &mut cost_proximity_word_pairs,
+                &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
+            )?;
        }
    }
+
    let mut new_edges =
        cost_proximity_word_pairs
            .into_iter()
@@ -243,6 +234,124 @@ pub fn visit_to_node<'ctx, 'from_data>(
                edges
            })
            .collect::<Vec<_>>();
-    new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeCondition::Unconditional));
+    new_edges.push((8 + (right_ngram_length - 1) as u8, EdgeCondition::Unconditional));
    Ok(new_edges)
 }
+
+fn add_prefix_edges<'ctx>(
+    index: &mut &crate::Index,
+    txn: &'ctx RoTxn,
+    db_cache: &mut DatabaseCache<'ctx>,
+    word_interner: &mut Interner<String>,
+    right_ngram_length: usize,
+    left_word: Interned<String>,
+    right_prefix: Interned<String>,
+    cost_proximity_word_pairs: &mut BTreeMap<u8, BTreeMap<u8, Vec<WordPair>>>,
+    left_phrase: Option<Interned<Phrase>>,
+) -> Result<()> {
+    for proximity in 1..=(8 - right_ngram_length) {
+        let cost = (proximity + right_ngram_length - 1) as u8;
+        // TODO: if we had access to the universe here, we could already check whether
+        // the bitmap corresponding to this word pair is disjoint with the universe or not
+        if db_cache
+            .get_word_prefix_pair_proximity_docids(
+                index,
+                txn,
+                word_interner,
+                left_word,
+                right_prefix,
+                proximity as u8,
+            )?
+            .is_some()
+        {
+            cost_proximity_word_pairs
+                .entry(cost)
+                .or_default()
+                .entry(proximity as u8)
+                .or_default()
+                .push(WordPair::WordPrefix {
+                    phrases: left_phrase.into_iter().collect(),
+                    left: left_word,
+                    right_prefix,
+                });
+        }
+
+        // No swapping when computing the proximity between a phrase and a word
+        if left_phrase.is_none()
+            && db_cache
+                .get_prefix_word_pair_proximity_docids(
+                    index,
+                    txn,
+                    word_interner,
+                    right_prefix,
+                    left_word,
+                    proximity as u8 - 1,
+                )?
+                .is_some()
+        {
+            cost_proximity_word_pairs
+                .entry(cost)
+                .or_default()
+                .entry(proximity as u8)
+                .or_default()
+                .push(WordPair::WordPrefixSwapped { left_prefix: right_prefix, right: left_word });
+        }
+    }
+    Ok(())
+}
+
+fn add_non_prefix_edges<'ctx>(
+    index: &mut &crate::Index,
+    txn: &'ctx RoTxn,
+    db_cache: &mut DatabaseCache<'ctx>,
+    word_interner: &mut Interner<String>,
+    right_ngram_length: usize,
+    word1: Interned<String>,
+    word2: Interned<String>,
+    cost_proximity_word_pairs: &mut BTreeMap<u8, BTreeMap<u8, Vec<WordPair>>>,
+    phrases: &[Interned<Phrase>],
+) -> Result<()> {
+    for proximity in 1..=(8 - right_ngram_length) {
+        let cost = (proximity + right_ngram_length - 1) as u8;
+        if db_cache
+            .get_word_pair_proximity_docids(
+                index,
+                txn,
+                word_interner,
+                word1,
+                word2,
+                proximity as u8,
+            )?
+            .is_some()
+        {
+            cost_proximity_word_pairs
+                .entry(cost)
+                .or_default()
+                .entry(proximity as u8)
+                .or_default()
+                .push(WordPair::Words { phrases: phrases.to_vec(), left: word1, right: word2 });
+        }
+        if proximity > 1
+            // no swapping when either term is a phrase
+            && phrases.is_empty()
+            && db_cache
+                .get_word_pair_proximity_docids(
+                    index,
+                    txn,
+                    word_interner,
+                    word2,
+                    word1,
+                    proximity as u8 - 1,
+                )?
+                .is_some()
+        {
+            cost_proximity_word_pairs
+                .entry(cost)
+                .or_default()
+                .entry(proximity as u8 - 1)
+                .or_default()
+                .push(WordPair::Words { phrases: vec![], left: word2, right: word1 });
+        }
+    }
+    Ok(())
+}
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -13,24 +13,61 @@ pub fn compute_docids<'ctx>(
    let ProximityEdge { pairs, proximity } = edge;
    let mut pair_docids = RoaringBitmap::new();
    for pair in pairs.iter() {
-        let bytes = match pair {
-            WordPair::Words { left, right } => db_cache.get_word_pair_proximity_docids(
-                index,
-                txn,
-                word_interner,
-                *left,
-                *right,
-                *proximity,
-            ),
-            WordPair::WordPrefix { left, right_prefix } => db_cache
-                .get_word_prefix_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    *left,
-                    *right_prefix,
-                    *proximity,
-                ),
+        let pair = match pair {
+            WordPair::Words { phrases, left, right } => {
+                let mut docids = db_cache
+                    .get_word_pair_proximity_docids(
+                        index,
+                        txn,
+                        word_interner,
+                        *left,
+                        *right,
+                        *proximity,
+                    )?
+                    .map(CboRoaringBitmapCodec::deserialize_from)
+                    .transpose()?
+                    .unwrap_or_default();
+                if !docids.is_empty() {
+                    for phrase in phrases {
+                        docids &= ctx.query_term_docids.get_phrase_docids(
+                            index,
+                            txn,
+                            db_cache,
+                            word_interner,
+                            &ctx.phrase_interner,
+                            *phrase,
+                        )?;
+                    }
+                }
+                docids
+            }
+            WordPair::WordPrefix { phrases, left, right_prefix } => {
+                let mut docids = db_cache
+                    .get_word_prefix_pair_proximity_docids(
+                        index,
+                        txn,
+                        word_interner,
+                        *left,
+                        *right_prefix,
+                        *proximity,
+                    )?
+                    .map(CboRoaringBitmapCodec::deserialize_from)
+                    .transpose()?
+                    .unwrap_or_default();
+                if !docids.is_empty() {
+                    for phrase in phrases {
+                        docids &= ctx.query_term_docids.get_phrase_docids(
+                            index,
+                            txn,
+                            db_cache,
+                            word_interner,
+                            &ctx.phrase_interner,
+                            *phrase,
+                        )?;
+                    }
+                }
+                docids
+            }
            WordPair::WordPrefixSwapped { left_prefix, right } => db_cache
                .get_prefix_word_pair_proximity_docids(
                    index,
@@ -39,11 +76,13 @@ pub fn compute_docids<'ctx>(
                    *left_prefix,
                    *right,
                    *proximity,
-                ),
-        }?;
-        // TODO: deserialize bitmap within a universe, and (maybe) using a bump allocator?
-        let bitmap = universe
-            & bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default();
+                )?
+                .map(CboRoaringBitmapCodec::deserialize_from)
+                .transpose()?
+                .unwrap_or_default(),
+        };
+        // TODO: deserialize bitmap within a universe
+        let bitmap = universe & pair;
        pair_docids |= bitmap;
    }

--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@@ -7,16 +7,27 @@ use super::empty_paths_cache::EmptyPathsCache;
 use super::{EdgeCondition, RankingRuleGraphTrait};
 use crate::search::new::interner::{Interned, Interner};
 use crate::search::new::logger::SearchLogger;
-use crate::search::new::query_term::WordDerivations;
+use crate::search::new::query_term::Phrase;
 use crate::search::new::small_bitmap::SmallBitmap;
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;

 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum WordPair {
-    Words { left: Interned<String>, right: Interned<String> },
-    WordPrefix { left: Interned<String>, right_prefix: Interned<String> },
-    WordPrefixSwapped { left_prefix: Interned<String>, right: Interned<String> },
+    Words {
+        phrases: Vec<Interned<Phrase>>,
+        left: Interned<String>,
+        right: Interned<String>,
+    },
+    WordPrefix {
+        phrases: Vec<Interned<Phrase>>,
+        left: Interned<String>,
+        right_prefix: Interned<String>,
+    },
+    WordPrefixSwapped {
+        left_prefix: Interned<String>,
+        right: Interned<String>,
+    },
 }

 #[derive(Clone, PartialEq, Eq, Hash)]
@@ -29,7 +40,7 @@ pub enum ProximityGraph {}

 impl RankingRuleGraphTrait for ProximityGraph {
    type EdgeCondition = ProximityEdge;
-    type BuildVisitedFromNode = (WordDerivations, i8);
+    type BuildVisitedFromNode = (Vec<(Option<Interned<Phrase>>, Interned<String>)>, i8);

    fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String {
        let ProximityEdge { pairs, proximity } = edge;
@@ -54,10 +65,15 @@ impl RankingRuleGraphTrait for ProximityGraph {
    fn build_step_visit_destination_node<'from_data, 'ctx: 'from_data>(
        ctx: &mut SearchContext<'ctx>,
        conditions_interner: &mut Interner<Self::EdgeCondition>,
-        to_node: &QueryNode,
-        from_node_data: &'from_data Self::BuildVisitedFromNode,
+        dest_node: &QueryNode,
+        source_node_data: &'from_data Self::BuildVisitedFromNode,
    ) -> Result<Vec<(u8, EdgeCondition<Self::EdgeCondition>)>> {
-        build::visit_to_node(ctx, conditions_interner, to_node, from_node_data)
+        build::build_step_visit_destination_node(
+            ctx,
+            conditions_interner,
+            source_node_data,
+            dest_node,
+        )
    }

    fn log_state(