Add some documentation and use bitmaps instead of hashmaps when possible

This commit is contained in:
Loïc Lecrenier
2023-02-21 12:33:32 +01:00
parent 132191360b
commit 66d0c63694
10 changed files with 298 additions and 232 deletions

View File

@ -1,10 +1,11 @@
use std::collections::{BTreeSet, HashMap, HashSet};
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
use crate::new::QueryGraph;
use crate::new::{NodeIndex, QueryGraph};
use crate::{Index, Result};
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
@ -14,29 +15,38 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
db_cache: &mut DatabaseCache<'transaction>,
query_graph: QueryGraph,
) -> Result<Self> {
let mut ranking_rule_graph = Self { query_graph, all_edges: vec![], node_edges: vec![] };
let mut ranking_rule_graph =
Self { query_graph, all_edges: vec![], node_edges: vec![], successors: vec![] };
for (node_idx, node) in ranking_rule_graph.query_graph.nodes.iter().enumerate() {
ranking_rule_graph.node_edges.push(BTreeSet::new());
ranking_rule_graph.node_edges.push(RoaringBitmap::new());
ranking_rule_graph.successors.push(RoaringBitmap::new());
let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap();
let new_successors = ranking_rule_graph.successors.last_mut().unwrap();
let Some(from_node_data) = G::build_visit_from_node(index, txn, db_cache, node)? else { continue };
for &successor_idx in ranking_rule_graph.query_graph.edges[node_idx].outgoing.iter() {
let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx];
let Some(edges) = G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)? else { continue };
for successor_idx in ranking_rule_graph.query_graph.edges[node_idx].successors.iter() {
let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx as usize];
let mut edges =
G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)?;
if edges.is_empty() {
continue;
}
edges.sort_by_key(|e| e.0);
for (cost, details) in edges {
ranking_rule_graph.all_edges.push(Some(Edge {
from_node: node_idx,
to_node: successor_idx,
from_node: NodeIndex(node_idx as u32),
to_node: NodeIndex(successor_idx),
cost,
details,
}));
new_edges.insert(ranking_rule_graph.all_edges.len() - 1);
new_edges.insert(ranking_rule_graph.all_edges.len() as u32 - 1);
new_successors.insert(successor_idx);
}
}
}
ranking_rule_graph.simplify();
// ranking_rule_graph.simplify();
Ok(ranking_rule_graph)
}

View File

@ -1,6 +1,9 @@
use std::collections::{BTreeMap, HashSet};
use itertools::Itertools;
use roaring::RoaringBitmap;
use crate::new::NodeIndex;
use super::{
empty_paths_cache::EmptyPathsCache, paths_map::PathsMap, Edge, EdgeIndex, RankingRuleGraph,
@ -14,18 +17,11 @@ pub struct Path {
}
struct DijkstraState {
unvisited: HashSet<usize>, // should be a small bitset
distances: Vec<u64>, // or binary heap (f64, usize)
unvisited: RoaringBitmap, // should be a small bitset?
distances: Vec<u64>, // or binary heap, or btreemap? (f64, usize)
edges: Vec<EdgeIndex>,
edge_costs: Vec<u8>,
paths: Vec<Option<usize>>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct PathEdgeId<Id> {
pub from: usize,
pub to: usize,
pub id: Id,
paths: Vec<Option<NodeIndex>>,
}
pub struct KCheapestPathsState {
@ -127,9 +123,10 @@ impl KCheapestPathsState {
// for all the paths already found that share a common prefix with the root path
// we delete the edge from the spur node to the next one
for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) {
let was_removed = graph.node_edges[*spur_node].remove(&edge_index_to_remove.0);
let was_removed =
graph.node_edges[spur_node.0 as usize].remove(edge_index_to_remove.0 as u32);
if was_removed {
tmp_removed_edges.push(edge_index_to_remove.0);
tmp_removed_edges.push(edge_index_to_remove.0 as u32);
}
}
@ -137,7 +134,7 @@ impl KCheapestPathsState {
// we will combine it with the root path to get a potential kth cheapest path
let spur_path = graph.cheapest_path_to_end(*spur_node);
// restore the temporarily removed edges
graph.node_edges[*spur_node].extend(tmp_removed_edges);
graph.node_edges[spur_node.0 as usize].extend(tmp_removed_edges);
let Some(spur_path) = spur_path else { continue; };
let total_cost = root_cost + spur_path.cost;
@ -182,68 +179,73 @@ impl KCheapestPathsState {
}
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
fn cheapest_path_to_end(&self, from: usize) -> Option<Path> {
fn cheapest_path_to_end(&self, from: NodeIndex) -> Option<Path> {
let mut dijkstra = DijkstraState {
unvisited: (0..self.query_graph.nodes.len()).collect(),
unvisited: (0..self.query_graph.nodes.len() as u32).collect(),
distances: vec![u64::MAX; self.query_graph.nodes.len()],
edges: vec![EdgeIndex(usize::MAX); self.query_graph.nodes.len()],
edge_costs: vec![u8::MAX; self.query_graph.nodes.len()],
paths: vec![None; self.query_graph.nodes.len()],
};
dijkstra.distances[from] = 0;
dijkstra.distances[from.0 as usize] = 0;
// TODO: could use a binary heap here to store the distances
while let Some(&cur_node) =
dijkstra.unvisited.iter().min_by_key(|&&n| dijkstra.distances[n])
// TODO: could use a binary heap here to store the distances, or a btreemap
while let Some(cur_node) =
dijkstra.unvisited.iter().min_by_key(|&n| dijkstra.distances[n as usize])
{
let cur_node_dist = dijkstra.distances[cur_node];
let cur_node_dist = dijkstra.distances[cur_node as usize];
if cur_node_dist == u64::MAX {
return None;
}
if cur_node == self.query_graph.end_node {
if cur_node == self.query_graph.end_node.0 {
break;
}
let succ_cur_node: HashSet<_> = self.node_edges[cur_node]
.iter()
.map(|e| self.all_edges[*e].as_ref().unwrap().to_node)
.collect();
// this is expensive, but shouldn't be
// ideally I could quickly get a bitmap of all a node's successors
// then take the intersection with unvisited
let succ_cur_node: &RoaringBitmap = &self.successors[cur_node as usize];
// .iter()
// .map(|e| self.all_edges[e as usize].as_ref().unwrap().to_node.0)
// .collect();
// TODO: this intersection may be slow but shouldn't be,
// can use a bitmap intersection instead
let unvisited_succ_cur_node = succ_cur_node.intersection(&dijkstra.unvisited);
for &succ in unvisited_succ_cur_node {
let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(cur_node, succ) else {
let unvisited_succ_cur_node = succ_cur_node & &dijkstra.unvisited;
for succ in unvisited_succ_cur_node {
// cheapest_edge() is also potentially too expensive
let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(NodeIndex(cur_node), NodeIndex(succ)) else {
continue
};
// println!("cur node dist {cur_node_dist}");
let old_dist_succ = &mut dijkstra.distances[succ];
let old_dist_succ = &mut dijkstra.distances[succ as usize];
let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64;
if new_potential_distance < *old_dist_succ {
*old_dist_succ = new_potential_distance;
dijkstra.edges[succ] = cheapest_edge;
dijkstra.edge_costs[succ] = cheapest_edge_cost;
dijkstra.paths[succ] = Some(cur_node);
dijkstra.edges[succ as usize] = cheapest_edge;
dijkstra.edge_costs[succ as usize] = cheapest_edge_cost;
dijkstra.paths[succ as usize] = Some(NodeIndex(cur_node));
}
}
dijkstra.unvisited.remove(&cur_node);
dijkstra.unvisited.remove(cur_node);
}
let mut cur = self.query_graph.end_node;
// let mut edge_costs = vec![];
// let mut distances = vec![];
let mut path_edges = vec![];
while let Some(n) = dijkstra.paths[cur] {
path_edges.push(dijkstra.edges[cur]);
while let Some(n) = dijkstra.paths[cur.0 as usize] {
path_edges.push(dijkstra.edges[cur.0 as usize]);
cur = n;
}
path_edges.reverse();
Some(Path { edges: path_edges, cost: dijkstra.distances[self.query_graph.end_node] })
Some(Path {
edges: path_edges,
cost: dijkstra.distances[self.query_graph.end_node.0 as usize],
})
}
// TODO: this implementation is VERY fragile, as we assume that the edges are ordered by cost
// already. Change it.
pub fn cheapest_edge(&self, cur_node: usize, succ: usize) -> Option<(EdgeIndex, u8)> {
pub fn cheapest_edge(&self, cur_node: NodeIndex, succ: NodeIndex) -> Option<(EdgeIndex, u8)> {
self.visit_edges(cur_node, succ, |edge_idx, edge| {
std::ops::ControlFlow::Break((edge_idx, edge.cost))
})

View File

@ -9,6 +9,12 @@ use crate::new::db_cache::DatabaseCache;
use crate::new::BitmapOrAllRef;
use crate::{Index, Result};
// TODO: the cache should have a G::EdgeDetails as key
// but then it means that we should have a quick way of
// computing their hash and comparing them
// which can be done...
// by using a pointer (real, Rc, bumpalo, or in a vector)???
pub struct EdgeDocidsCache<G: RankingRuleGraphTrait> {
pub cache: HashMap<EdgeIndex, RoaringBitmap>,

View File

@ -13,7 +13,7 @@ use heed::RoTxn;
use roaring::RoaringBitmap;
use super::db_cache::DatabaseCache;
use super::{QueryGraph, QueryNode};
use super::{NodeIndex, QueryGraph, QueryNode};
use crate::{Index, Result};
#[derive(Debug, Clone)]
@ -24,8 +24,8 @@ pub enum EdgeDetails<E> {
#[derive(Debug, Clone)]
pub struct Edge<E> {
from_node: usize,
to_node: usize,
from_node: NodeIndex,
to_node: NodeIndex,
cost: u8,
details: EdgeDetails<E>,
}
@ -38,22 +38,20 @@ pub struct EdgePointer<'graph, E> {
/// Index of an edge, wrapping a plain `usize`.
///
/// The wrapped value is used to index `RankingRuleGraph::all_edges`
/// (see `RankingRuleGraph::get_edge`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct EdgeIndex(pub usize);
// {
// // TODO: they could all be u16 instead
// // There may be a way to store all the edge indices in a u32 as well,
// // if the edges are in a vector
// // then we can store sets of edges in a bitmap efficiently
// pub from: usize,
// pub to: usize,
// pub edge_idx: usize,
// }
pub trait RankingRuleGraphTrait {
/// The details of an edge connecting two query nodes. These details
/// should be sufficient to compute the edge's cost and associated document ids
/// in [`compute_docids`](RankingRuleGraphTrait).
type EdgeDetails: Sized;
type BuildVisitedFromNode;
fn edge_details_dot_label(edge: &Self::EdgeDetails) -> String;
/// Return the label of the given edge details, to be used when visualising
/// the ranking rule graph using GraphViz.
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String;
/// Compute the document ids associated with the given edge.
fn compute_docids<'transaction>(
index: &Index,
txn: &'transaction RoTxn,
@ -61,6 +59,10 @@ pub trait RankingRuleGraphTrait {
edge_details: &Self::EdgeDetails,
) -> Result<RoaringBitmap>;
/// Prepare to build the edges outgoing from `from_node`.
///
/// This call is followed by zero, one or more calls to [`build_visit_to_node`](RankingRuleGraphTrait::build_visit_to_node),
/// which builds the actual edges.
fn build_visit_from_node<'transaction>(
index: &Index,
txn: &'transaction RoTxn,
@ -68,39 +70,59 @@ pub trait RankingRuleGraphTrait {
from_node: &QueryNode,
) -> Result<Option<Self::BuildVisitedFromNode>>;
/// Return the cost and details of the edges going from the previously visited node
/// (with [`build_visit_from_node`](RankingRuleGraphTrait::build_visit_from_node)) to `to_node`.
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
to_node: &QueryNode,
from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Option<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>>>;
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>>;
}
pub struct RankingRuleGraph<G: RankingRuleGraphTrait> {
pub query_graph: QueryGraph,
// pub edges: Vec<HashMap<usize, Vec<Edge<G::EdgeDetails>>>>,
pub all_edges: Vec<Option<Edge<G::EdgeDetails>>>,
pub node_edges: Vec<BTreeSet<usize>>,
pub node_edges: Vec<RoaringBitmap>,
pub successors: Vec<RoaringBitmap>,
// to get the edges between two nodes:
// 1. get node_outgoing_edges[from]
// 2. get node_incoming_edges[to]
// 3. take intersection between the two
// TODO: node edges could be different I guess
// something like:
// pub node_edges: Vec<BitSet>
// where each index is the result of:
// the successor index in the top 16 bits, the edge index in the bottom 16 bits
// TODO:
// node_successors?
// pub removed_edges: HashSet<EdgeIndex>,
// pub tmp_removed_edges: HashSet<EdgeIndex>,
}
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
/// Return a reference to the slot of `all_edges` located at `edge_index`.
///
/// The slot is an `Option`, so callers must handle the `None` case themselves.
///
/// # Panics
///
/// Panics if `edge_index.0` is out of bounds for `all_edges`, since this is
/// plain indexing.
// NOTE: returns the edge even if it was removed
// NOTE(review): `remove_edge` resets the slot to `None`, so "removed" above
// presumably refers to edges that were only taken out of `node_edges` — confirm.
pub fn get_edge(&self, edge_index: EdgeIndex) -> &Option<Edge<G::EdgeDetails>> {
&self.all_edges[edge_index.0]
}
// Visit all edges between the two given nodes in order of increasing cost.
pub fn visit_edges<'graph, O>(
&'graph self,
from: usize,
to: usize,
from: NodeIndex,
to: NodeIndex,
mut visit: impl FnMut(EdgeIndex, &'graph Edge<G::EdgeDetails>) -> ControlFlow<O>,
) -> Option<O> {
let from_edges = &self.node_edges[from];
for &edge_idx in from_edges {
let edge = self.all_edges[edge_idx].as_ref().unwrap();
let from_edges = &self.node_edges[from.0 as usize];
for edge_idx in from_edges {
let edge = self.all_edges[edge_idx as usize].as_ref().unwrap();
if edge.to_node == to {
let cf = visit(EdgeIndex(edge_idx), edge);
let cf = visit(EdgeIndex(edge_idx as usize), edge);
match cf {
ControlFlow::Continue(_) => continue,
ControlFlow::Break(o) => return Some(o),
@ -113,54 +135,61 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
fn remove_edge(&mut self, edge_index: EdgeIndex) {
let edge_opt = &mut self.all_edges[edge_index.0];
let Some(Edge { from_node, to_node, cost, details }) = &edge_opt else { return };
let node_edges = &mut self.node_edges[*from_node];
node_edges.remove(&edge_index.0);
let Some(edge) = &edge_opt else { return };
let (from_node, to_node) = (edge.from_node, edge.to_node);
*edge_opt = None;
}
pub fn remove_nodes(&mut self, nodes: &[usize]) {
for &node in nodes {
let edge_indices = &mut self.node_edges[node];
for edge_index in edge_indices.iter() {
self.all_edges[*edge_index] = None;
}
edge_indices.clear();
let preds = &self.query_graph.edges[node].incoming;
for pred in preds {
let edge_indices = &mut self.node_edges[*pred];
for edge_index in edge_indices.iter() {
let edge_opt = &mut self.all_edges[*edge_index];
let Some(edge) = edge_opt else { continue; };
if edge.to_node == node {
*edge_opt = None;
}
}
panic!("remove nodes is incorrect at the moment");
edge_indices.clear();
}
}
self.query_graph.remove_nodes(nodes);
}
pub fn simplify(&mut self) {
loop {
let mut nodes_to_remove = vec![];
for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
if !matches!(node, QueryNode::End | QueryNode::Deleted)
&& self.node_edges[node_idx].is_empty()
{
nodes_to_remove.push(node_idx);
}
}
if nodes_to_remove.is_empty() {
break;
} else {
self.remove_nodes(&nodes_to_remove);
}
let from_node_edges = &mut self.node_edges[from_node.0 as usize];
from_node_edges.remove(edge_index.0 as u32);
let mut new_successors_from_node = RoaringBitmap::new();
for edge in from_node_edges.iter() {
let Edge { to_node, .. } = &self.all_edges[edge as usize].as_ref().unwrap();
new_successors_from_node.insert(to_node.0);
}
self.successors[from_node.0 as usize] = new_successors_from_node;
}
// pub fn remove_nodes(&mut self, nodes: &[usize]) {
// for &node in nodes {
// let edge_indices = &mut self.node_edges[node];
// for edge_index in edge_indices.iter() {
// self.all_edges[*edge_index] = None;
// }
// edge_indices.clear();
// let preds = &self.query_graph.edges[node].incoming;
// for pred in preds {
// let edge_indices = &mut self.node_edges[*pred];
// for edge_index in edge_indices.iter() {
// let edge_opt = &mut self.all_edges[*edge_index];
// let Some(edge) = edge_opt else { continue; };
// if edge.to_node == node {
// *edge_opt = None;
// }
// }
// panic!("remove nodes is incorrect at the moment");
// edge_indices.clear();
// }
// }
// self.query_graph.remove_nodes(nodes);
// }
// pub fn simplify(&mut self) {
// loop {
// let mut nodes_to_remove = vec![];
// for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
// if !matches!(node, QueryNode::End | QueryNode::Deleted)
// && self.node_edges[node_idx].is_empty()
// {
// nodes_to_remove.push(node_idx);
// }
// }
// if nodes_to_remove.is_empty() {
// break;
// } else {
// self.remove_nodes(&nodes_to_remove);
// }
// }
// }
// fn is_removed_edge(&self, edge: EdgeIndex) -> bool {
// self.removed_edges.contains(&edge) || self.tmp_removed_edges.contains(&edge)
// }
@ -174,9 +203,9 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
continue;
}
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
if node_idx == self.query_graph.root_node {
if node_idx == self.query_graph.root_node.0 as usize {
desc.push_str("[color = blue]");
} else if node_idx == self.query_graph.end_node {
} else if node_idx == self.query_graph.end_node.0 as usize {
desc.push_str("[color = red]");
}
desc.push_str(";\n");
@ -195,7 +224,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
desc.push_str(&format!(
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\"];\n",
cost = edge.cost,
edge_label = G::edge_details_dot_label(details)
edge_label = G::graphviz_edge_details_label(details)
));
}
}

View File

@ -235,9 +235,9 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
continue;
}
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
if node_idx == self.query_graph.root_node {
if node_idx == self.query_graph.root_node.0 as usize {
desc.push_str("[color = blue]");
} else if node_idx == self.query_graph.end_node {
} else if node_idx == self.query_graph.end_node.0 as usize {
desc.push_str("[color = red]");
}
desc.push_str(";\n");
@ -262,7 +262,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
desc.push_str(&format!(
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n",
cost = edge.cost,
edge_label = G::edge_details_dot_label(details),
edge_label = G::graphviz_edge_details_label(details),
));
}
}

View File

@ -51,11 +51,11 @@ pub fn visit_to_node<'transaction, 'from_data>(
db_cache: &mut DatabaseCache<'transaction>,
to_node: &QueryNode,
from_node_data: &'from_data (WordDerivations, i8),
) -> Result<Option<Vec<(u8, EdgeDetails<ProximityEdge>)>>> {
) -> Result<Vec<(u8, EdgeDetails<ProximityEdge>)>> {
let (derivations1, pos1) = from_node_data;
let term2 = match &to_node {
QueryNode::End => return Ok(Some(vec![(0, EdgeDetails::Unconditional)])),
QueryNode::Deleted | QueryNode::Start => return Ok(None),
QueryNode::End => return Ok(vec![(0, EdgeDetails::Unconditional)]),
QueryNode::Deleted | QueryNode::Start => return Ok(vec![]),
QueryNode::Term(term) => term,
};
let LocatedQueryTerm { value: value2, positions: pos2 } = term2;
@ -86,7 +86,7 @@ pub fn visit_to_node<'transaction, 'from_data>(
// We want to effectively ignore this pair of terms
// Unconditionally walk through the edge without computing the docids
// But also what should the cost be?
return Ok(Some(vec![(0, EdgeDetails::Unconditional)]));
return Ok(vec![(0, EdgeDetails::Unconditional)]);
}
let updb1 = derivations1.use_prefix_db;
@ -161,5 +161,5 @@ pub fn visit_to_node<'transaction, 'from_data>(
})
.collect::<Vec<_>>();
new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeDetails::Unconditional));
Ok(Some(new_edges))
Ok(new_edges)
}

View File

@ -26,7 +26,7 @@ impl RankingRuleGraphTrait for ProximityGraph {
type EdgeDetails = ProximityEdge;
type BuildVisitedFromNode = (WordDerivations, i8);
fn edge_details_dot_label(edge: &Self::EdgeDetails) -> String {
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String {
let ProximityEdge { pairs, proximity } = edge;
format!(", prox {proximity}, {} pairs", pairs.len())
}
@ -55,7 +55,7 @@ impl RankingRuleGraphTrait for ProximityGraph {
db_cache: &mut DatabaseCache<'transaction>,
to_node: &QueryNode,
from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Option<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>>> {
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
build::visit_to_node(index, txn, db_cache, to_node, from_node_data)
}
}