Remove comments and add documentation

2025-11-09 12:26:30 +00:00 · 2023-06-06 11:31:26 +02:00
parent 4829348d6e
commit 2da86b31a6
19 changed files with 117 additions and 93 deletions
--- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -1,5 +1,48 @@
-#![allow(clippy::too_many_arguments)]
+/*! Implements a "PathVisitor" which finds all paths of a certain cost
+from the START to END node of a ranking rule graph.

+A path is a list of conditions. A condition is the data associated with
+an edge, given by the ranking rule. Some edges don't have a condition associated
+with them, they are "unconditional". These kinds of edges are used to "skip" a node.
+
+The algorithm uses a depth-first search. It benefits from two main optimisations:
+- The list of all possible costs to go from any node to the END node is precomputed
+- The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges
+untraversable depending on what other edges were selected.
+
+These two optimisations are meant to avoid traversing edges that wouldn't lead
+to a valid path. In practically all cases, we avoid the exponential complexity
+that is inherent to depth-first search in a large ranking rule graph.
+
+The DeadEndsCache is a sort of prefix tree which associates a list of forbidden
+conditions with each prefix of traversed conditions.
+For example, the DeadEndsCache could say the following:
+- Immediately, from the start, the conditions `[a,b]` are forbidden
+    - if we take the condition `c`, then the conditions `[e]` are also forbidden
+        - and if after that, we take `f`, then `[h,i]` are also forbidden
+            - etc.
+    - if we take `g`, then `[f]` is also forbidden
+        - etc.
+    - etc.
+As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden
+conditions in memory. Then, we know to avoid all edges which have a condition that is forbidden.
+
+When a path is found from START to END, we give it to the `visit` closure.
+This closure takes a mutable reference to the `DeadEndsCache`. This means that
+the caller can update this cache. Therefore, we must handle the case where the
+DeadEndsCache has been updated. This means potentially backtracking up to the point
+where the traversed conditions are all allowed by the new DeadEndsCache.
+
+The algorithm also implements the `TermsMatchingStrategy` logic.
+Some edges are augmented with a list of "nodes_to_skip". Skipping
+a node means "reaching this node through an unconditional edge". If we have
+already traversed (i.e. not skipped) a node that is in this list, then we know that we
+can't traverse this edge. Otherwise, we traverse the edge but make sure to skip any
+future node that was present in the "nodes_to_skip" list.
+
+The caller can decide to stop the path finding algorithm
+by returning a `ControlFlow::Break` from the `visit` closure.
+*/
 use std::collections::{BTreeSet, VecDeque};
 use std::iter::FromIterator;
 use std::ops::ControlFlow;
@@ -12,30 +55,41 @@ use crate::search::new::query_graph::QueryNode;
 use crate::search::new::small_bitmap::SmallBitmap;
 use crate::Result;

+/// Closure which processes a path found by the `PathVisitor`
 type VisitFn<'f, G> = &'f mut dyn FnMut(
+    // the path as a list of conditions
    &[Interned<<G as RankingRuleGraphTrait>::Condition>],
    &mut RankingRuleGraph<G>,
+    // a mutable reference to the DeadEndsCache, to update it in case the given
+    // path doesn't resolve to any valid document ids
    &mut DeadEndsCache<<G as RankingRuleGraphTrait>::Condition>,
 ) -> Result<ControlFlow<()>>;

+/// A structure which is kept but not updated during the traversal of the graph.
+/// It can however be updated by the `visit` closure once a valid path has been found.
 struct VisitorContext<'a, G: RankingRuleGraphTrait> {
    graph: &'a mut RankingRuleGraph<G>,
    all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
    dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
 }

+/// The internal state of the traversal algorithm
 struct VisitorState<G: RankingRuleGraphTrait> {
+    /// Budget from the current node to the end node
    remaining_cost: u64,
-
+    /// Previously visited conditions, in order.
    path: Vec<Interned<G::Condition>>,
-
+    /// Previously visited conditions, as an efficient and compact set.
    visited_conditions: SmallBitmap<G::Condition>,
+    /// Previously visited (i.e. not skipped) nodes, as an efficient and compact set.
    visited_nodes: SmallBitmap<QueryNode>,
-
+    /// The conditions that cannot be visited anymore
    forbidden_conditions: SmallBitmap<G::Condition>,
-    forbidden_conditions_to_nodes: SmallBitmap<QueryNode>,
+    /// The nodes that cannot be visited anymore (they must be skipped)
+    nodes_to_skip: SmallBitmap<QueryNode>,
 }

+/// See module documentation
 pub struct PathVisitor<'a, G: RankingRuleGraphTrait> {
    state: VisitorState<G>,
    ctx: VisitorContext<'a, G>,
@@ -56,14 +110,13 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
                forbidden_conditions: SmallBitmap::for_interned_values_in(
                    &graph.conditions_interner,
                ),
-                forbidden_conditions_to_nodes: SmallBitmap::for_interned_values_in(
-                    &graph.query_graph.nodes,
-                ),
+                nodes_to_skip: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
            },
            ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache },
        }
    }

+    /// See module documentation
    pub fn visit_paths(mut self, visit: VisitFn<G>) -> Result<()> {
        let _ =
            self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?;
@@ -72,22 +125,31 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
 }

 impl<G: RankingRuleGraphTrait> VisitorState<G> {
+    /// Visits a node: traverse all its valid conditional and unconditional edges.
+    ///
+    /// Returns ControlFlow::Break if the path finding algorithm should stop.
+    /// Returns whether a valid path was found from this node otherwise.
    fn visit_node(
        &mut self,
        from_node: Interned<QueryNode>,
        visit: VisitFn<G>,
        ctx: &mut VisitorContext<G>,
    ) -> Result<ControlFlow<(), bool>> {
+        // Whether any valid path was found from this node.
+        // If a valid path was found, then we know that the DeadEndsCache may have been updated,
+        // and we will need to do more work to potentially backtrack.
        let mut any_valid = false;

        let edges = ctx.graph.edges_of_node.get(from_node).clone();
        for edge_idx in edges.iter() {
+            // Could be `None` if the edge was deleted
            let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue };

            if self.remaining_cost < edge.cost as u64 {
                continue;
            }
            self.remaining_cost -= edge.cost as u64;
+
            let cf = match edge.condition {
                Some(condition) => self.visit_condition(
                    condition,
@@ -119,6 +181,10 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
        Ok(ControlFlow::Continue(any_valid))
    }

+    /// Visits an unconditional edge.
+    ///
+    /// Returns ControlFlow::Break if the path finding algorithm should stop.
+    /// Returns whether a valid path was found from this node otherwise.
    fn visit_no_condition(
        &mut self,
        dest_node: Interned<QueryNode>,
@@ -134,20 +200,29 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
        {
            return Ok(ControlFlow::Continue(false));
        }
+        // We've reached the END node!
        if dest_node == ctx.graph.query_graph.end_node {
            let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?;
+            // We could change the return type of the visit closure such that the caller
+            // tells us whether the dead ends cache was updated or not.
+            // Alternatively, maybe the DeadEndsCache should have a generation number
+            // attached to it, so that we don't need to play with these booleans at all.
            match control_flow {
                ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)),
                ControlFlow::Break(_) => Ok(ControlFlow::Break(())),
            }
        } else {
-            let old_fbct = self.forbidden_conditions_to_nodes.clone();
-            self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip);
+            let old_fbct = self.nodes_to_skip.clone();
+            self.nodes_to_skip.union(edge_new_nodes_to_skip);
            let cf = self.visit_node(dest_node, visit, ctx)?;
-            self.forbidden_conditions_to_nodes = old_fbct;
+            self.nodes_to_skip = old_fbct;
            Ok(cf)
        }
    }
+    /// Visits a conditional edge.
+    ///
+    /// Returns ControlFlow::Break if the path finding algorithm should stop.
+    /// Returns whether a valid path was found from this node otherwise.
    fn visit_condition(
        &mut self,
        condition: Interned<G::Condition>,
@@ -159,7 +234,7 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
        assert!(dest_node != ctx.graph.query_graph.end_node);

        if self.forbidden_conditions.contains(condition)
-            || self.forbidden_conditions_to_nodes.contains(dest_node)
+            || self.nodes_to_skip.contains(dest_node)
            || edge_new_nodes_to_skip.intersects(&self.visited_nodes)
        {
            return Ok(ControlFlow::Continue(false));
@@ -180,19 +255,19 @@ impl<G: RankingRuleGraphTrait> VisitorState<G> {
        self.visited_nodes.insert(dest_node);
        self.visited_conditions.insert(condition);

-        let old_fc = self.forbidden_conditions.clone();
+        let old_forb_cond = self.forbidden_conditions.clone();
        if let Some(next_forbidden) =
            ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied())
        {
            self.forbidden_conditions.union(&next_forbidden);
        }
-        let old_fctn = self.forbidden_conditions_to_nodes.clone();
-        self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip);
+        let old_nodes_to_skip = self.nodes_to_skip.clone();
+        self.nodes_to_skip.union(edge_new_nodes_to_skip);

        let cf = self.visit_node(dest_node, visit, ctx)?;

-        self.forbidden_conditions_to_nodes = old_fctn;
-        self.forbidden_conditions = old_fc;
+        self.nodes_to_skip = old_nodes_to_skip;
+        self.forbidden_conditions = old_forb_cond;

        self.visited_conditions.remove(condition);
        self.visited_nodes.remove(dest_node);
--- a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs
+++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs
@@ -9,12 +9,8 @@ use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::SearchContext;
 use crate::Result;

-// TODO: give a generation to each universe, then be able to get the exact
-// delta of docids between two universes of different generations!
-
 /// A cache storing the document ids associated with each ranking rule edge
 pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
-    // TOOD: should be a mapped interner?
    pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
    _phantom: PhantomData<G>,
 }
@@ -54,7 +50,7 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
        }
        let condition = graph.conditions_interner.get_mut(interned_condition);
        let computed = G::resolve_condition(ctx, condition, universe)?;
-        // TODO: if computed.universe_len != universe.len() ?
+        // Can we put an assert here for computed.universe_len == universe.len() ?
        let _ = self.cache.insert(interned_condition, computed);
        let computed = &self.cache[&interned_condition];
        Ok(computed)
--- a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs
+++ b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs
@@ -2,6 +2,7 @@ use crate::search::new::interner::{FixedSizeInterner, Interned};
 use crate::search::new::small_bitmap::SmallBitmap;

 pub struct DeadEndsCache<T> {
+    // conditions and next could/should be part of the same vector
    conditions: Vec<Interned<T>>,
    next: Vec<Self>,
    pub forbidden: SmallBitmap<T>,
@@ -27,7 +28,7 @@ impl<T> DeadEndsCache<T> {
        self.forbidden.insert(condition);
    }

-    pub fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
+    fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
        if let Some(idx) = self.conditions.iter().position(|c| *c == condition) {
            Some(&mut self.next[idx])
        } else {
--- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs
@@ -69,14 +69,9 @@ impl RankingRuleGraphTrait for FidGraph {

        let mut edges = vec![];
        for fid in all_fields {
-            // TODO: We can improve performances and relevancy by storing
-            //       the term subsets associated to each field ids fetched.
            edges.push((
-                fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
-                conditions_interner.insert(FidCondition {
-                    term: term.clone(), // TODO remove this ugly clone
-                    fid,
-                }),
+                fid as u32 * term.term_ids.len() as u32,
+                conditions_interner.insert(FidCondition { term: term.clone(), fid }),
            ));
        }

--- a/milli/src/search/new/ranking_rule_graph/position/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs
@@ -94,14 +94,9 @@ impl RankingRuleGraphTrait for PositionGraph {
        let mut edges = vec![];

        for (cost, positions) in positions_for_costs {
-            // TODO: We can improve performances and relevancy by storing
-            //       the term subsets associated to each position fetched
            edges.push((
                cost,
-                conditions_interner.insert(PositionCondition {
-                    term: term.clone(), // TODO remove this ugly clone
-                    positions,
-                }),
+                conditions_interner.insert(PositionCondition { term: term.clone(), positions }),
            ));
        }

--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -65,13 +65,6 @@ pub fn compute_docids(
        }
    }

-    // TODO: add safeguard in case the cartesian product is too large!
-    // even if we restrict the word derivations to a maximum of 100, the size of the
-    // caterisan product could reach a maximum of 10_000 derivations, which is way too much.
-    // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
-    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
-    // reached
-
    for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
        // Before computing the edges, check that the left word and left phrase
        // aren't disjoint with the universe, but only do it if there is more than
@@ -111,8 +104,6 @@ pub fn compute_docids(
    Ok(ComputedCondition {
        docids,
        universe_len: universe.len(),
-        // TODO: think about whether we want to reduce the subset,
-        // we probably should!
        start_term_subset: Some(left_term.clone()),
        end_term_subset: right_term.clone(),
    })
@@ -203,12 +194,7 @@ fn compute_non_prefix_edges(
            *docids |= new_docids;
        }
    }
-    if backward_proximity >= 1
-            // TODO: for now, we don't do any swapping when either term is a phrase
-            // but maybe we should. We'd need to look at the first/last word of the phrase
-            // depending on the context.
-            && left_phrase.is_none() && right_phrase.is_none()
-    {
+    if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() {
        if let Some(new_docids) =
            ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)?
        {