Add some documentation and use bitmaps instead of hashmaps when possible

2025-08-02 11:50:03 +00:00 · 2023-02-21 12:33:32 +01:00
parent 132191360b
commit 66d0c63694
10 changed files with 298 additions and 232 deletions
--- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs
@@ -1,6 +1,9 @@
 use std::collections::{BTreeMap, HashSet};

 use itertools::Itertools;
+use roaring::RoaringBitmap;
+
+use crate::new::NodeIndex;

 use super::{
    empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, EdgeIndex, RankingRuleGraph,
@@ -14,18 +17,11 @@ pub struct Path {
 }

 struct DijkstraState {
-    unvisited: HashSet<usize>, // should be a small bitset
-    distances: Vec<u64>,       // or binary heap (f64, usize)
+    unvisited: RoaringBitmap, // should be a small bitset?
+    distances: Vec<u64>,      // or binary heap, or btreemap? (f64, usize)
    edges: Vec<EdgeIndex>,
    edge_costs: Vec<u8>,
-    paths: Vec<Option<usize>>,
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub struct PathEdgeId<Id> {
-    pub from: usize,
-    pub to: usize,
-    pub id: Id,
+    paths: Vec<Option<NodeIndex>>,
 }

 pub struct KCheapestPathsState {
@@ -127,9 +123,10 @@ impl KCheapestPathsState {
            // for all the paths already found that share a common prefix with the root path
            // we delete the edge from the spur node to the next one
            for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) {
-                let was_removed = graph.node_edges[*spur_node].remove(&edge_index_to_remove.0);
+                let was_removed =
+                    graph.node_edges[spur_node.0 as usize].remove(edge_index_to_remove.0 as u32);
                if was_removed {
-                    tmp_removed_edges.push(edge_index_to_remove.0);
+                    tmp_removed_edges.push(edge_index_to_remove.0 as u32);
                }
            }

@@ -137,7 +134,7 @@ impl KCheapestPathsState {
            // we will combine it with the root path to get a potential kth cheapest path
            let spur_path = graph.cheapest_path_to_end(*spur_node);
            // restore the temporarily removed edges
-            graph.node_edges[*spur_node].extend(tmp_removed_edges);
+            graph.node_edges[spur_node.0 as usize].extend(tmp_removed_edges);

            let Some(spur_path) = spur_path else { continue; };
            let total_cost = root_cost + spur_path.cost;
@@ -182,68 +179,73 @@ impl KCheapestPathsState {
 }

 impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
-    fn cheapest_path_to_end(&self, from: usize) -> Option<Path> {
+    fn cheapest_path_to_end(&self, from: NodeIndex) -> Option<Path> {
        let mut dijkstra = DijkstraState {
-            unvisited: (0..self.query_graph.nodes.len()).collect(),
+            unvisited: (0..self.query_graph.nodes.len() as u32).collect(),
            distances: vec![u64::MAX; self.query_graph.nodes.len()],
            edges: vec![EdgeIndex(usize::MAX); self.query_graph.nodes.len()],
            edge_costs: vec![u8::MAX; self.query_graph.nodes.len()],
            paths: vec![None; self.query_graph.nodes.len()],
        };
-        dijkstra.distances[from] = 0;
+        dijkstra.distances[from.0 as usize] = 0;

-        // TODO: could use a binary heap here to store the distances
-        while let Some(&cur_node) =
-            dijkstra.unvisited.iter().min_by_key(|&&n| dijkstra.distances[n])
+        // TODO: could use a binary heap here to store the distances, or a btreemap
+        while let Some(cur_node) =
+            dijkstra.unvisited.iter().min_by_key(|&n| dijkstra.distances[n as usize])
        {
-            let cur_node_dist = dijkstra.distances[cur_node];
+            let cur_node_dist = dijkstra.distances[cur_node as usize];
            if cur_node_dist == u64::MAX {
                return None;
            }
-            if cur_node == self.query_graph.end_node {
+            if cur_node == self.query_graph.end_node.0 {
                break;
            }

-            let succ_cur_node: HashSet<_> = self.node_edges[cur_node]
-                .iter()
-                .map(|e| self.all_edges[*e].as_ref().unwrap().to_node)
-                .collect();
+            // this is expensive, but shouldn't
+            // ideally I could quickly get a bitmap of all a node's successors
+            // then take the intersection with unvisited
+            let succ_cur_node: &RoaringBitmap = &self.successors[cur_node as usize];
+            // .iter()
+            // .map(|e| self.all_edges[e as usize].as_ref().unwrap().to_node.0)
+            // .collect();
            // TODO: this intersection may be slow but shouldn't be,
            // can use a bitmap intersection instead
-            let unvisited_succ_cur_node = succ_cur_node.intersection(&dijkstra.unvisited);
-            for &succ in unvisited_succ_cur_node {
-                let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(cur_node, succ) else {
+            let unvisited_succ_cur_node = succ_cur_node & &dijkstra.unvisited;
+            for succ in unvisited_succ_cur_node {
+                // cheapest_edge() is also potentially too expensive
+                let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(NodeIndex(cur_node), NodeIndex(succ)) else {
                    continue
                };

                // println!("cur node dist {cur_node_dist}");
-                let old_dist_succ = &mut dijkstra.distances[succ];
+                let old_dist_succ = &mut dijkstra.distances[succ as usize];
                let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64;
                if new_potential_distance < *old_dist_succ {
                    *old_dist_succ = new_potential_distance;
-                    dijkstra.edges[succ] = cheapest_edge;
-                    dijkstra.edge_costs[succ] = cheapest_edge_cost;
-                    dijkstra.paths[succ] = Some(cur_node);
+                    dijkstra.edges[succ as usize] = cheapest_edge;
+                    dijkstra.edge_costs[succ as usize] = cheapest_edge_cost;
+                    dijkstra.paths[succ as usize] = Some(NodeIndex(cur_node));
                }
            }
-            dijkstra.unvisited.remove(&cur_node);
+            dijkstra.unvisited.remove(cur_node);
        }

        let mut cur = self.query_graph.end_node;
        // let mut edge_costs = vec![];
        // let mut distances = vec![];
        let mut path_edges = vec![];
-        while let Some(n) = dijkstra.paths[cur] {
-            path_edges.push(dijkstra.edges[cur]);
+        while let Some(n) = dijkstra.paths[cur.0 as usize] {
+            path_edges.push(dijkstra.edges[cur.0 as usize]);
            cur = n;
        }
        path_edges.reverse();
-        Some(Path { edges: path_edges, cost: dijkstra.distances[self.query_graph.end_node] })
+        Some(Path {
+            edges: path_edges,
+            cost: dijkstra.distances[self.query_graph.end_node.0 as usize],
+        })
    }

-    // TODO: this implementation is VERY fragile, as we assume that the edges are ordered by cost
-    // already. Change it.
-    pub fn cheapest_edge(&self, cur_node: usize, succ: usize) -> Option<(EdgeIndex, u8)> {
+    pub fn cheapest_edge(&self, cur_node: NodeIndex, succ: NodeIndex) -> Option<(EdgeIndex, u8)> {
        self.visit_edges(cur_node, succ, |edge_idx, edge| {
            std::ops::ControlFlow::Break((edge_idx, edge.cost))
        })