Compute edges of proximity graph lazily

Loïc Lecrenier
2023-03-21 10:44:40 +01:00
parent 272cd7ebbd
commit 83e5b4ed0d
12 changed files with 345 additions and 841 deletions

View File

@@ -1,19 +1,28 @@
use std::marker::PhantomData;
use fxhash::FxHashMap;
use fxhash::{FxHashMap, FxHashSet};
use roaring::RoaringBitmap;
use super::{RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::Interned;
use crate::search::new::query_term::Phrase;
use crate::search::new::SearchContext;
use crate::Result;
// TODO: give a generation to each universe, then be able to get the exact
// delta of docids between two universes of different generations!
#[derive(Default)]
pub struct ComputedCondition {
docids: RoaringBitmap,
universe_len: u64,
used_words: FxHashSet<Interned<String>>,
used_phrases: FxHashSet<Interned<Phrase>>,
}
/// A cache storing the document ids associated with each ranking rule edge
pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
pub cache: FxHashMap<Interned<G::Condition>, (u64, RoaringBitmap)>,
pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
_phantom: PhantomData<G>,
}
impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
@@ -22,6 +31,14 @@ impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
}
}
impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
pub fn get_condition_used_words_and_phrases(
&mut self,
interned_condition: Interned<G::Condition>,
) -> (&FxHashSet<Interned<String>>, &FxHashSet<Interned<Phrase>>) {
let ComputedCondition { used_words, used_phrases, .. } = &self.cache[&interned_condition];
(used_words, used_phrases)
}
/// Retrieve the document ids for the given edge condition.
///
/// If the cache does not yet contain these docids, they are computed
@@ -30,14 +47,14 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
&'s mut self,
ctx: &mut SearchContext<'ctx>,
interned_condition: Interned<G::Condition>,
graph: &RankingRuleGraph<G>,
// TODO: maybe universe doesn't belong here
graph: &mut RankingRuleGraph<G>,
universe: &RoaringBitmap,
) -> Result<&'s RoaringBitmap> {
if self.cache.contains_key(&interned_condition) {
// TODO: compare the length of the universe to the one stored in self;
// if it is smaller, then update the value
let (universe_len, docids) = self.cache.entry(interned_condition).or_default();
let ComputedCondition { docids, universe_len, .. } =
self.cache.entry(interned_condition).or_default();
if *universe_len == universe.len() {
return Ok(docids);
} else {
@@ -46,12 +63,13 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
return Ok(docids);
}
}
// TODO: maybe universe doesn't belong here
let condition = graph.conditions_interner.get(interned_condition);
// TODO: faster way to do this?
let docids = G::resolve_condition(ctx, condition, universe)?;
let _ = self.cache.insert(interned_condition, (universe.len(), docids));
let (_, docids) = &self.cache[&interned_condition];
let condition = graph.conditions_interner.get_mut(interned_condition);
let (docids, used_words, used_phrases) = G::resolve_condition(ctx, condition, universe)?;
let _ = self.cache.insert(
interned_condition,
ComputedCondition { docids, universe_len: universe.len(), used_words, used_phrases },
);
let ComputedCondition { docids, .. } = &self.cache[&interned_condition];
Ok(docids)
}
}
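For illustration, the caching scheme above can be reduced to a standalone sketch: each cache entry remembers the length of the universe its docids were computed against, and a lookup with a universe of a different size treats the entry as stale. All names below are illustrative (a plain `HashSet` stands in for `RoaringBitmap`, a closure for `G::resolve_condition`), and where the real code refines a stale entry (elided in the hunk above), this sketch simply recomputes it.

```rust
use std::collections::{HashMap, HashSet};

// Illustrative stand-ins for milli's types: a condition is just a key,
// and a "bitmap" of docids is a plain HashSet<u32>.
type ConditionId = usize;
type DocIds = HashSet<u32>;

struct ComputedCondition {
    docids: DocIds,
    // Length of the universe these docids were computed against.
    universe_len: u64,
}

#[derive(Default)]
struct ConditionCache {
    cache: HashMap<ConditionId, ComputedCondition>,
}

impl ConditionCache {
    // Return cached docids, recomputing only when the universe changed size.
    fn get_docids(
        &mut self,
        condition: ConditionId,
        universe: &DocIds,
        resolve: impl Fn(&DocIds) -> DocIds, // stand-in for G::resolve_condition
    ) -> &DocIds {
        let stale = match self.cache.get(&condition) {
            Some(c) => c.universe_len != universe.len() as u64,
            None => true,
        };
        if stale {
            let docids = resolve(universe);
            self.cache.insert(
                condition,
                ComputedCondition { docids, universe_len: universe.len() as u64 },
            );
        }
        &self.cache[&condition].docids
    }
}
```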

View File

@@ -15,11 +15,11 @@ mod proximity;
/// Implementation of the `typo` ranking rule
mod typo;
use std::collections::HashSet;
use std::hash::Hash;
pub use condition_docids_cache::ConditionDocIdsCache;
pub use dead_ends_cache::DeadEndsCache;
use fxhash::FxHashSet;
pub use proximity::{ProximityCondition, ProximityGraph};
use roaring::RoaringBitmap;
pub use typo::{TypoCondition, TypoGraph};
@@ -80,23 +80,13 @@ pub trait RankingRuleGraphTrait: Sized {
condition: &Self::Condition,
) -> Result<String>;
fn words_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<String>>>;
fn phrases_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<Phrase>>>;
/// Compute the document ids associated with the given edge condition,
/// restricted to the given universe.
fn resolve_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap>;
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>;
/// Return the costs and conditions of the edges going from the source node to the destination node
fn build_edges<'ctx>(

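The trait change above folds the two side-channel methods (`words_used_by_condition`, `phrases_used_by_condition`) into the return value of `resolve_condition`, so the words and phrases an edge actually uses are collected in the same pass that computes its docids. A toy model of that shape, with simplified types in place of milli's interners:

```rust
use std::collections::HashSet;

type DocIds = HashSet<u32>;

// Toy condition: "documents containing this word".
struct WordCondition {
    word: String,
}

// One pass returns the restricted docids *and* the words actually used,
// replacing the separate words/phrases walks of the old trait methods.
fn resolve_condition(
    cond: &WordCondition,
    universe: &DocIds,
    postings: impl Fn(&str) -> DocIds, // stand-in for the word -> docids lookup
) -> (DocIds, HashSet<String>) {
    let docids: DocIds = postings(&cond.word).intersection(universe).copied().collect();
    let mut used_words = HashSet::new();
    if !docids.is_empty() {
        used_words.insert(cond.word.clone());
    }
    (docids, used_words)
}
```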
View File

@@ -1,56 +1,18 @@
#![allow(clippy::too_many_arguments)]
use std::collections::BTreeMap;
use heed::RoTxn;
use super::ProximityCondition;
use crate::search::new::db_cache::DatabaseCache;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm};
use crate::search::new::ranking_rule_graph::proximity::WordPair;
use crate::search::new::query_term::LocatedQueryTerm;
use crate::search::new::{QueryNode, SearchContext};
use crate::Result;
fn last_word_of_term_iter<'t>(
t: &'t QueryTerm,
phrase_interner: &'t DedupInterner<Phrase>,
) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
move |p| {
let phrase = phrase_interner.get(p);
phrase.words.last().unwrap().map(|last| (Some(p), last))
},
))
}
fn first_word_of_term_iter<'t>(
t: &'t QueryTerm,
phrase_interner: &'t DedupInterner<Phrase>,
) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
move |p| {
let phrase = phrase_interner.get(p);
phrase.words.first().unwrap().map(|first| (first, Some(p)))
},
))
}
pub fn build_edges<'ctx>(
ctx: &mut SearchContext<'ctx>,
_ctx: &mut SearchContext<'ctx>,
conditions_interner: &mut DedupInterner<ProximityCondition>,
from_node: &QueryNode,
to_node: &QueryNode,
) -> Result<Vec<(u8, Option<Interned<ProximityCondition>>)>> {
let SearchContext {
index,
txn,
db_cache,
word_interner,
phrase_interner,
term_interner,
term_docids: _,
} = ctx;
let right_term = match &to_node.data {
QueryNodeData::End => return Ok(vec![(0, None)]),
QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
@@ -59,13 +21,11 @@ pub fn build_edges<'ctx>(
let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
let (right_term, right_start_position, right_ngram_length) =
(term_interner.get(*right_term_interned), *right_positions.start(), right_positions.len());
let (right_start_position, right_ngram_length) =
(*right_positions.start(), right_positions.len());
let (left_term, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
(term_interner.get(*value), *positions.end())
}
let (left_term_interned, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()),
QueryNodeData::Deleted => return Ok(vec![]),
QueryNodeData::Start => {
return Ok(vec![(
@@ -94,175 +54,24 @@ pub fn build_edges<'ctx>(
)]);
}
let mut cost_word_pairs = BTreeMap::<u8, Vec<WordPair>>::new();
if let Some(right_prefix) = right_term.use_prefix_db {
for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
add_prefix_edges(
index,
txn,
db_cache,
word_interner,
right_ngram_length,
left_word,
right_prefix,
&mut cost_word_pairs,
left_phrase,
)?;
}
}
// TODO: add safeguard in case the cartesian product is too large!
// even if we restrict the word derivations to a maximum of 100, the size of the
// cartesian product could reach a maximum of 10_000 derivations, which is way too much.
// Maybe prioritise the product of zero-typo derivations, then the product of zero-typo/one-typo
// + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
// reached
for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
add_non_prefix_edges(
index,
txn,
db_cache,
word_interner,
right_ngram_length,
left_word,
right_word,
&mut cost_word_pairs,
&[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
)?;
}
}
let mut new_edges = cost_word_pairs
.into_iter()
.map(|(cost, word_pairs)| {
(
let mut conditions = vec![];
for cost in right_ngram_length..(7 + right_ngram_length) {
let cost = cost as u8;
conditions.push((
cost,
Some(conditions_interner.insert(ProximityCondition::Uninit {
left_term: left_term_interned,
right_term: *right_term_interned,
right_term_ngram_len: right_ngram_length as u8,
cost,
Some(
conditions_interner
.insert(ProximityCondition::Pairs { pairs: word_pairs.into_boxed_slice() }),
),
)
})
.collect::<Vec<_>>();
new_edges.push((
8 + (right_ngram_length - 1) as u8,
})),
))
}
conditions.push((
(7 + right_ngram_length) as u8,
Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })),
));
Ok(new_edges)
}
fn add_prefix_edges<'ctx>(
index: &mut &crate::Index,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &mut DedupInterner<String>,
right_ngram_length: usize,
left_word: Interned<String>,
right_prefix: Interned<String>,
cost_proximity_word_pairs: &mut BTreeMap<u8, Vec<WordPair>>,
left_phrase: Option<Interned<Phrase>>,
) -> Result<()> {
for proximity in 1..=(8 - right_ngram_length) {
let cost = (proximity + right_ngram_length - 1) as u8;
// TODO: if we had access to the universe here, we could already check whether
// the bitmap corresponding to this word pair is disjoint with the universe or not
if db_cache
.get_word_prefix_pair_proximity_docids(
index,
txn,
word_interner,
left_word,
right_prefix,
proximity as u8,
)?
.is_some()
{
cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefix {
phrases: left_phrase.into_iter().collect(),
left: left_word,
right_prefix,
proximity: proximity as u8,
});
}
// No swapping when computing the proximity between a phrase and a word
if left_phrase.is_none()
&& db_cache
.get_prefix_word_pair_proximity_docids(
index,
txn,
word_interner,
right_prefix,
left_word,
proximity as u8 - 1,
)?
.is_some()
{
cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefixSwapped {
left_prefix: right_prefix,
right: left_word,
proximity: proximity as u8 - 1,
});
}
}
Ok(())
}
fn add_non_prefix_edges<'ctx>(
index: &mut &crate::Index,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &mut DedupInterner<String>,
right_ngram_length: usize,
word1: Interned<String>,
word2: Interned<String>,
cost_proximity_word_pairs: &mut BTreeMap<u8, Vec<WordPair>>,
phrases: &[Interned<Phrase>],
) -> Result<()> {
for proximity in 1..=(8 - right_ngram_length) {
let cost = (proximity + right_ngram_length - 1) as u8;
if db_cache
.get_word_pair_proximity_docids(
index,
txn,
word_interner,
word1,
word2,
proximity as u8,
)?
.is_some()
{
cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words {
phrases: phrases.to_vec(),
left: word1,
right: word2,
proximity: proximity as u8,
});
}
if proximity > 1
// no swapping when either term is a phrase
&& phrases.is_empty()
&& db_cache
.get_word_pair_proximity_docids(
index,
txn,
word_interner,
word2,
word1,
proximity as u8 - 1,
)?
.is_some()
{
cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words {
phrases: vec![],
left: word2,
right: word1,
proximity: proximity as u8 - 1,
});
}
}
Ok(())
Ok(conditions)
}
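With word pairs no longer resolved at graph-build time, `build_edges` reduces to enumerating one placeholder condition per possible cost, plus a final existence-only edge. A sketch of that enumeration, with `u32` standing in for `Interned<QueryTerm>`:

```rust
enum Condition {
    // Placeholder: the docids for this (left, right, cost) triple
    // are only computed when the edge is actually traversed.
    Uninit { left_term: u32, right_term: u32, right_term_ngram_len: u8, cost: u8 },
    // Fallback edge: the right term merely exists in the document.
    Term { term: u32 },
}

fn build_edges(
    left_term: u32,
    right_term: u32,
    right_ngram_length: usize,
) -> Vec<(u8, Condition)> {
    let mut conditions = Vec::new();
    // One cheap placeholder edge per cost; nothing touches the database here.
    for cost in right_ngram_length..(7 + right_ngram_length) {
        let cost = cost as u8;
        conditions.push((
            cost,
            Condition::Uninit {
                left_term,
                right_term,
                right_term_ngram_len: right_ngram_length as u8,
                cost,
            },
        ));
    }
    // Highest cost: no proximity constraint, just "the right term exists".
    conditions.push(((7 + right_ngram_length) as u8, Condition::Term { term: right_term }));
    conditions
}
```

No database access happens here anymore; the docids behind each `Uninit` condition are only fetched when the edge is traversed, which is the point of the commit.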

View File

@@ -1,6 +1,15 @@
#![allow(clippy::too_many_arguments)]
use std::iter::FromIterator;
use fxhash::FxHashSet;
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::{ProximityCondition, WordPair};
use super::ProximityCondition;
use crate::search::new::db_cache::DatabaseCache;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::{Phrase, QueryTerm};
use crate::search::new::SearchContext;
use crate::{CboRoaringBitmapCodec, Result};
@@ -8,7 +17,7 @@ pub fn compute_docids<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &ProximityCondition,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
let SearchContext {
index,
txn,
@@ -18,96 +27,238 @@ pub fn compute_docids<'ctx>(
phrase_interner,
term_interner,
} = ctx;
let pairs = match condition {
ProximityCondition::Term { term } => {
return term_docids
.get_query_term_docids(
index,
txn,
db_cache,
word_interner,
term_interner,
phrase_interner,
*term,
)
.cloned()
let (left_term, right_term, right_term_ngram_len, cost) = match condition {
ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
(*left_term, *right_term, *right_term_ngram_len, *cost)
}
ProximityCondition::Term { term } => {
let term_v = term_interner.get(*term);
return Ok((
term_docids
.get_query_term_docids(
index,
txn,
db_cache,
word_interner,
term_interner,
phrase_interner,
*term,
)?
.clone(),
FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()),
FxHashSet::from_iter(term_v.all_phrases()),
));
}
ProximityCondition::Pairs { pairs } => pairs,
};
let mut pair_docids = RoaringBitmap::new();
for pair in pairs.iter() {
let pair = match pair {
WordPair::Words { phrases, left, right, proximity } => {
let mut docids = db_cache
.get_word_pair_proximity_docids(
index,
txn,
word_interner,
*left,
*right,
*proximity,
)?
.map(CboRoaringBitmapCodec::deserialize_from)
.transpose()?
.unwrap_or_default();
if !docids.is_empty() {
for phrase in phrases {
docids &= ctx.term_docids.get_phrase_docids(
index,
txn,
db_cache,
word_interner,
&ctx.phrase_interner,
*phrase,
)?;
}
}
docids
}
WordPair::WordPrefix { phrases, left, right_prefix, proximity } => {
let mut docids = db_cache
.get_word_prefix_pair_proximity_docids(
index,
txn,
word_interner,
*left,
*right_prefix,
*proximity,
)?
.map(CboRoaringBitmapCodec::deserialize_from)
.transpose()?
.unwrap_or_default();
if !docids.is_empty() {
for phrase in phrases {
docids &= ctx.term_docids.get_phrase_docids(
index,
txn,
db_cache,
word_interner,
&ctx.phrase_interner,
*phrase,
)?;
}
}
docids
}
WordPair::WordPrefixSwapped { left_prefix, right, proximity } => db_cache
.get_prefix_word_pair_proximity_docids(
index,
txn,
word_interner,
*left_prefix,
*right,
*proximity,
)?
.map(CboRoaringBitmapCodec::deserialize_from)
.transpose()?
.unwrap_or_default(),
};
// TODO: deserialize bitmap within a universe
let bitmap = universe & pair;
pair_docids |= bitmap;
let left_term = term_interner.get(left_term);
let right_term = term_interner.get(right_term);
// e.g. for the simple words `sun .. flower`
// the cost is 5
// the forward proximity is 5
// the backward proximity is 4
//
// for the 2gram `the sunflower`
// the cost is 5
// the forward proximity is 4
// the backward proximity is 3
let forward_proximity = 1 + cost - right_term_ngram_len;
let backward_proximity = cost - right_term_ngram_len;
let mut used_words = FxHashSet::default();
let mut used_phrases = FxHashSet::default();
let mut docids = RoaringBitmap::new();
if let Some(right_prefix) = right_term.use_prefix_db {
for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
compute_prefix_edges(
index,
txn,
db_cache,
word_interner,
left_word,
right_prefix,
left_phrase,
forward_proximity,
backward_proximity,
&mut docids,
universe,
&mut used_words,
&mut used_phrases,
)?;
}
}
Ok(pair_docids)
// TODO: add safeguard in case the cartesian product is too large!
// even if we restrict the word derivations to a maximum of 100, the size of the
// cartesian product could reach a maximum of 10_000 derivations, which is way too much.
// Maybe prioritise the product of zero-typo derivations, then the product of zero-typo/one-typo
// + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
// reached
for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
compute_non_prefix_edges(
index,
txn,
db_cache,
word_interner,
left_word,
right_word,
&[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
forward_proximity,
backward_proximity,
&mut docids,
universe,
&mut used_words,
&mut used_phrases,
)?;
}
}
Ok((docids, used_words, used_phrases))
}
fn compute_prefix_edges<'ctx>(
index: &mut &crate::Index,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &mut DedupInterner<String>,
left_word: Interned<String>,
right_prefix: Interned<String>,
left_phrase: Option<Interned<Phrase>>,
forward_proximity: u8,
backward_proximity: u8,
docids: &mut RoaringBitmap,
universe: &RoaringBitmap,
used_words: &mut FxHashSet<Interned<String>>,
used_phrases: &mut FxHashSet<Interned<Phrase>>,
) -> Result<()> {
if let Some(phrase) = left_phrase {
// TODO: compute the phrase, take the intersection between
// the phrase and the docids
used_phrases.insert(phrase); // This is not fully correct
}
if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids(
index,
txn,
word_interner,
left_word,
right_prefix,
forward_proximity,
)? {
let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(left_word);
used_words.insert(right_prefix);
*docids |= new_docids;
}
}
// No swapping when computing the proximity between a phrase and a word
if left_phrase.is_none() {
if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids(
index,
txn,
word_interner,
right_prefix,
left_word,
backward_proximity,
)? {
let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(left_word);
used_words.insert(right_prefix);
*docids |= new_docids;
}
}
}
Ok(())
}
fn compute_non_prefix_edges<'ctx>(
index: &mut &crate::Index,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &mut DedupInterner<String>,
word1: Interned<String>,
word2: Interned<String>,
phrases: &[Interned<Phrase>],
forward_proximity: u8,
backward_proximity: u8,
docids: &mut RoaringBitmap,
universe: &RoaringBitmap,
used_words: &mut FxHashSet<Interned<String>>,
used_phrases: &mut FxHashSet<Interned<Phrase>>,
) -> Result<()> {
if !phrases.is_empty() {
// TODO: compute the docids associated with these phrases
// take their intersection with the new docids
used_phrases.extend(phrases); // This is not fully correct
}
if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
index,
txn,
word_interner,
word1,
word2,
forward_proximity,
)? {
let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(word1);
used_words.insert(word2);
*docids |= new_docids;
}
}
if backward_proximity >= 1
// no swapping when either term is a phrase
&& phrases.is_empty()
{
if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
index,
txn,
word_interner,
word2,
word1,
backward_proximity,
)? {
let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(word1);
used_words.insert(word2);
*docids |= new_docids;
}
}
}
Ok(())
}
fn last_word_of_term_iter<'t>(
t: &'t QueryTerm,
phrase_interner: &'t DedupInterner<Phrase>,
) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
move |p| {
let phrase = phrase_interner.get(p);
phrase.words.last().unwrap().map(|last| (Some(p), last))
},
))
}
fn first_word_of_term_iter<'t>(
t: &'t QueryTerm,
phrase_interner: &'t DedupInterner<Phrase>,
) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
move |p| {
let phrase = phrase_interner.get(p);
phrase.words.first().unwrap().map(|first| (first, Some(p)))
},
))
}
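The proximity arithmetic in the comment above is easy to verify: for an edge of cost `cost` whose right term is an ngram of length `right_term_ngram_len`, the forward proximity is `1 + cost - right_term_ngram_len` and the backward proximity is `cost - right_term_ngram_len`. Checking both examples from the comment:

```rust
// Forward and backward proximity, as defined in compute_docids above.
fn proximities(cost: u8, right_term_ngram_len: u8) -> (u8, u8) {
    (1 + cost - right_term_ngram_len, cost - right_term_ngram_len)
}

fn main() {
    // `sun .. flower`: two plain words (ngram length 1), cost 5
    assert_eq!(proximities(5, 1), (5, 4));
    // `the sunflower`: a 2gram, cost 5
    assert_eq!(proximities(5, 2), (4, 3));
}
```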

View File

@@ -1,9 +1,7 @@
pub mod build;
pub mod compute_docids;
use std::collections::HashSet;
use std::iter::FromIterator;
use fxhash::FxHashSet;
use roaring::RoaringBitmap;
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
@@ -13,31 +11,17 @@ use crate::search::new::query_term::{Phrase, QueryTerm};
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum WordPair {
Words {
phrases: Vec<Interned<Phrase>>,
left: Interned<String>,
right: Interned<String>,
proximity: u8,
},
WordPrefix {
phrases: Vec<Interned<Phrase>>,
left: Interned<String>,
right_prefix: Interned<String>,
proximity: u8,
},
WordPrefixSwapped {
left_prefix: Interned<String>,
right: Interned<String>,
proximity: u8,
},
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum ProximityCondition {
Term { term: Interned<QueryTerm> },
Pairs { pairs: Box<[WordPair]> },
Uninit {
left_term: Interned<QueryTerm>,
right_term: Interned<QueryTerm>,
right_term_ngram_len: u8,
cost: u8,
},
Term {
term: Interned<QueryTerm>,
},
}
pub enum ProximityGraph {}
@@ -49,7 +33,8 @@ impl RankingRuleGraphTrait for ProximityGraph {
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<roaring::RoaringBitmap> {
) -> Result<(roaring::RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>
{
compute_docids::compute_docids(ctx, condition, universe)
}
@@ -79,107 +64,14 @@ impl RankingRuleGraphTrait for ProximityGraph {
condition: &Self::Condition,
) -> Result<String> {
match condition {
ProximityCondition::Uninit { cost, .. } => {
// TODO
Ok(format!("{cost}: cost"))
}
ProximityCondition::Term { term } => {
let term = ctx.term_interner.get(*term);
Ok(format!("{} : exists", ctx.word_interner.get(term.original)))
}
ProximityCondition::Pairs { pairs } => {
let mut s = String::new();
for pair in pairs.iter() {
match pair {
WordPair::Words { phrases, left, right, proximity } => {
let left = ctx.word_interner.get(*left);
let right = ctx.word_interner.get(*right);
if !phrases.is_empty() {
s.push_str(&format!("{} phrases + ", phrases.len()));
}
s.push_str(&format!("\"{left} {right}\": {proximity}\n"));
}
WordPair::WordPrefix { phrases, left, right_prefix, proximity } => {
let left = ctx.word_interner.get(*left);
let right = ctx.word_interner.get(*right_prefix);
if !phrases.is_empty() {
s.push_str(&format!("{} phrases + ", phrases.len()));
}
s.push_str(&format!("\"{left} {right}...\" : {proximity}\n"));
}
WordPair::WordPrefixSwapped { left_prefix, right, proximity } => {
let left = ctx.word_interner.get(*left_prefix);
let right = ctx.word_interner.get(*right);
s.push_str(&format!("\"{left}... {right}\" : {proximity}\n"));
}
}
}
Ok(s)
}
}
}
fn words_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<String>>> {
match condition {
ProximityCondition::Term { term } => {
let term = ctx.term_interner.get(*term);
Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
}
ProximityCondition::Pairs { pairs } => {
let mut set = HashSet::new();
for pair in pairs.iter() {
match pair {
WordPair::Words { phrases: _, left, right, proximity: _ } => {
set.insert(*left);
set.insert(*right);
}
WordPair::WordPrefix { phrases: _, left, right_prefix, proximity: _ } => {
set.insert(*left);
// TODO: this is not correct, there should be another trait method for collecting the prefixes
// to be used with the prefix DBs
set.insert(*right_prefix);
}
WordPair::WordPrefixSwapped { left_prefix, right, proximity: _ } => {
// TODO: this is not correct, there should be another trait method for collecting the prefixes
// to be used with the prefix DBs
set.insert(*left_prefix);
set.insert(*right);
}
}
}
Ok(set)
}
}
}
fn phrases_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<Phrase>>> {
match condition {
ProximityCondition::Term { term } => {
let term = ctx.term_interner.get(*term);
Ok(HashSet::from_iter(term.all_phrases()))
}
ProximityCondition::Pairs { pairs } => {
let mut set = HashSet::new();
for pair in pairs.iter() {
match pair {
WordPair::Words { phrases, left: _, right: _, proximity: _ } => {
set.extend(phrases.iter().copied());
}
WordPair::WordPrefix {
phrases,
left: _,
right_prefix: _,
proximity: _,
} => {
set.extend(phrases.iter().copied());
}
WordPair::WordPrefixSwapped { left_prefix: _, right: _, proximity: _ } => {}
}
}
Ok(set)
}
}
}
}
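The reworked enum encodes the laziness in the type itself: an edge starts as `Uninit`, carrying only interned term ids and a cost, and the expensive pair-proximity lookups happen when `resolve_condition` is first called on it; `Term` is the highest-cost fallback that only requires the right term to exist. A dispatch sketch with simplified types (`pair_docids` and `term_docids` are hypothetical stand-ins for the database lookups; the real `compute_docids` also handles prefixes, phrases, and the backward direction):

```rust
use std::collections::HashSet;

type DocIds = HashSet<u32>;

enum ProximityCondition {
    Uninit { left_term: u32, right_term: u32, right_term_ngram_len: u8, cost: u8 },
    Term { term: u32 },
}

fn resolve(
    cond: &ProximityCondition,
    universe: &DocIds,
    pair_docids: impl Fn(u32, u32, u8) -> DocIds, // hypothetical pair-proximity lookup
    term_docids: impl Fn(u32) -> DocIds,          // hypothetical term-existence lookup
) -> DocIds {
    match cond {
        // First traversal of this edge: only now are pair-proximity docids fetched.
        ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
            let forward_proximity = 1 + cost - right_term_ngram_len;
            pair_docids(*left_term, *right_term, forward_proximity)
                .intersection(universe)
                .copied()
                .collect()
        }
        // Fallback edge: the right term merely occurs in the document.
        ProximityCondition::Term { term } => {
            term_docids(*term).intersection(universe).copied().collect()
        }
    }
}
```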

View File

@@ -1,7 +1,8 @@
use std::collections::HashSet;
// use std::collections::HashSet;
use std::fmt::Write;
use std::iter::FromIterator;
use fxhash::FxHashSet;
use roaring::RoaringBitmap;
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
@@ -26,7 +27,7 @@ impl RankingRuleGraphTrait for TypoGraph {
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
let SearchContext {
index,
txn,
@@ -48,7 +49,12 @@ impl RankingRuleGraphTrait for TypoGraph {
condition.term,
)?;
Ok(docids)
let term = term_interner.get(condition.term);
Ok((
docids,
FxHashSet::from_iter(term.all_single_words_except_prefix_db()),
FxHashSet::from_iter(term.all_phrases()),
))
}
fn build_edges<'ctx>(
@@ -202,21 +208,21 @@ impl RankingRuleGraphTrait for TypoGraph {
Ok(s)
}
fn words_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<String>>> {
let TypoCondition { term, .. } = condition;
let term = ctx.term_interner.get(*term);
Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
}
// fn words_used_by_condition<'ctx>(
// ctx: &mut SearchContext<'ctx>,
// condition: &Self::Condition,
// ) -> Result<HashSet<Interned<String>>> {
// let TypoCondition { term, .. } = condition;
// let term = ctx.term_interner.get(*term);
// Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
// }
fn phrases_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<Phrase>>> {
let TypoCondition { term, .. } = condition;
let term = ctx.term_interner.get(*term);
Ok(HashSet::from_iter(term.all_phrases()))
}
// fn phrases_used_by_condition<'ctx>(
// ctx: &mut SearchContext<'ctx>,
// condition: &Self::Condition,
// ) -> Result<HashSet<Interned<Phrase>>> {
// let TypoCondition { term, .. } = condition;
// let term = ctx.term_interner.get(*term);
// Ok(HashSet::from_iter(term.all_phrases()))
// }
}