Apply a few optimisations for graph-based ranking rules

This commit is contained in:
Loïc Lecrenier
2023-03-07 14:42:58 +01:00
parent e8c76cf7bf
commit 9051065c22
19 changed files with 682 additions and 438 deletions

View File

@ -1,40 +1,54 @@
use std::collections::HashSet;
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::small_bitmap::SmallBitmap;
use crate::new::{QueryGraph, SearchContext};
use crate::Result;
use roaring::RoaringBitmap;
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
/// Builds the ranking rule graph on top of `query_graph`.
///
/// For every node of the query graph, `G::build_visit_from_node` is asked for
/// the data needed to leave that node (nodes for which it returns `None` get no
/// outgoing edge), then `G::build_visit_to_node` produces the `(cost, details)`
/// pairs for each successor. Every pair becomes one entry of `all_edges`; its
/// index is recorded in the per-node edge set and the successor in the per-node
/// successor set.
///
/// NOTE: this listing is a diff rendering without +/- markers, so the previous
/// (`RoaringBitmap` / `u32`, mutating `ranking_rule_graph` in place) and the
/// current (`HashSet` collected into `SmallBitmap` / `u16`) statements appear
/// side by side.
///
/// # Errors
/// Propagates any error returned by the two `G::build_visit_*` callbacks.
pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result<Self> {
let mut ranking_rule_graph =
Self { query_graph, all_edges: vec![], node_edges: vec![], successors: vec![] };
let QueryGraph { nodes: graph_nodes, edges: graph_edges, .. } = &query_graph;
for (node_idx, node) in ranking_rule_graph.query_graph.nodes.iter().enumerate() {
ranking_rule_graph.node_edges.push(RoaringBitmap::new());
ranking_rule_graph.successors.push(RoaringBitmap::new());
let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap();
let new_successors = ranking_rule_graph.successors.last_mut().unwrap();
// Accumulators for the three vectors that end up in the returned graph.
let mut all_edges = vec![];
let mut node_edges = vec![];
let mut successors = vec![];
for (node_idx, node) in graph_nodes.iter().enumerate() {
// One (initially empty) set per node, so both vectors are indexed by node index.
node_edges.push(HashSet::new());
successors.push(HashSet::new());
let new_edges = node_edges.last_mut().unwrap();
let new_successors = successors.last_mut().unwrap();
let Some(from_node_data) = G::build_visit_from_node(ctx, node)? else { continue };
for successor_idx in ranking_rule_graph.query_graph.edges[node_idx].successors.iter() {
let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx as usize];
for successor_idx in graph_edges[node_idx].successors.iter() {
let to_node = &graph_nodes[successor_idx as usize];
let mut edges = G::build_visit_to_node(ctx, to_node, &from_node_data)?;
if edges.is_empty() {
continue;
}
// Sort by cost (first tuple field) so edges to the same successor are
// appended to `all_edges` in increasing cost order.
edges.sort_by_key(|e| e.0);
for (cost, details) in edges {
ranking_rule_graph.all_edges.push(Some(Edge {
from_node: node_idx as u32,
all_edges.push(Some(Edge {
// NOTE(review): `as u16` truncates silently if there are more than
// u16::MAX nodes — confirm the query graph size is bounded upstream.
from_node: node_idx as u16,
to_node: successor_idx,
cost,
details,
}));
// The edge just pushed lives at index `len - 1`.
new_edges.insert(ranking_rule_graph.all_edges.len() as u32 - 1);
new_edges.insert(all_edges.len() as u16 - 1);
new_successors.insert(successor_idx);
}
}
}
Ok(ranking_rule_graph)
// Convert the temporary hash sets into bitmaps; the second argument passed to
// `SmallBitmap::from_iter` is the total number of edges.
let node_edges = node_edges
.into_iter()
.map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16))
.collect();
// NOTE(review): these sets contain *node* indices, yet the bitmap is also given
// `all_edges.len()` as its second argument — confirm this is large enough when
// a graph has more nodes than edges.
let successors = successors
.into_iter()
.map(|edges| SmallBitmap::from_iter(edges.into_iter(), all_edges.len() as u16))
.collect();
Ok(RankingRuleGraph { query_graph, all_edges, node_edges, successors })
}
}

View File

@ -2,124 +2,146 @@
use super::empty_paths_cache::EmptyPathsCache;
use super::{RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::small_bitmap::SmallBitmap;
use crate::Result;
use std::collections::VecDeque;
// A path through a ranking rule graph together with its cost.
// NOTE(review): `edges` presumably holds indices into `RankingRuleGraph::all_edges`
// (the path-visiting code in this file passes edge indices around in the same
// shape) — confirm against the users of `Path`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Path {
pub edges: Vec<u32>,
pub edges: Vec<u16>,
pub cost: u64,
}
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
// Entry point of the path enumeration for a given `cost`, starting at node `from`.
//
// Two signatures are interleaved in this listing (diff rendering without +/-
// markers):
// - `paths_of_cost` collects every path into a `Vec<Vec<u32>>` and returns it;
// - `visit_paths_of_cost` instead calls `visit` on each path found and returns
//   `Result<()>`, forwarding any error raised by `visit`.
//
// Both delegate to the recursive helper with an empty edge stack. The
// `visit_*` variant additionally passes an empty `SmallBitmap` for the current
// path and seeds the forbidden edges with a clone of
// `empty_paths_cache.empty_edges`.
pub fn paths_of_cost(
&self,
pub fn visit_paths_of_cost(
&mut self,
from: usize,
cost: u64,
all_distances: &[Vec<u64>],
empty_paths_cache: &EmptyPathsCache,
) -> Vec<Vec<u32>> {
let mut paths = vec![];
self.paths_of_cost_rec(
cost: u16,
all_distances: &[Vec<u16>],
empty_paths_cache: &mut EmptyPathsCache,
mut visit: impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>,
) -> Result<()> {
// The boolean returned by the recursive helper (whether any path was visited)
// is deliberately discarded here.
let _ = self.visit_paths_of_cost_rec(
from,
all_distances,
cost,
&mut vec![],
&mut paths,
&vec![false; self.all_edges.len()],
all_distances,
empty_paths_cache,
);
paths
&mut visit,
&mut vec![],
&mut SmallBitmap::new(self.all_edges.len() as u16),
empty_paths_cache.empty_edges.clone(),
)?;
Ok(())
}
// Recursive worker behind the path enumeration.
//
// This listing interleaves two versions of the function (diff rendering
// without +/- markers):
// - `paths_of_cost_rec` (u32/u64, `&[bool]` forbidden edges) pushes complete
//   paths into `paths` and panics if `cost` is not one of the distances
//   recorded for `from`;
// - `visit_paths_of_cost_rec` (u16, `SmallBitmap`) calls `visit` on each
//   complete path and returns `Ok(true)` when at least one path ending at
//   `query_graph.end_node` was visited below this call.
//
// In both versions an outgoing edge is followed only if its cost does not
// exceed the remaining `cost`, it is not forbidden, and the remaining cost
// after taking it is one of the distances listed for its destination node in
// `all_distances`.
pub fn paths_of_cost_rec(
&self,
pub fn visit_paths_of_cost_rec(
&mut self,
from: usize,
all_distances: &[Vec<u64>],
cost: u64,
prev_edges: &mut Vec<u32>,
paths: &mut Vec<Vec<u32>>,
forbidden_edges: &[bool],
empty_paths_cache: &EmptyPathsCache,
) {
let distances = &all_distances[from];
if !distances.contains(&cost) {
panic!();
}
let tos = &self.query_graph.edges[from].successors;
let mut valid_edges = vec![];
for to in tos {
self.visit_edges::<()>(from as u32, to, |edge_idx, edge| {
if cost >= edge.cost as u64
&& all_distances[to as usize].contains(&(cost - edge.cost as u64))
&& !forbidden_edges[edge_idx as usize]
{
valid_edges.push((edge_idx, edge.cost, to));
}
std::ops::ControlFlow::Continue(())
});
}
cost: u16,
// TODO: replace all_distances with a Vec<SmallBitmap> where the SmallBitmap contains true if the cost exists and false otherwise
all_distances: &[Vec<u16>],
empty_paths_cache: &mut EmptyPathsCache,
visit: &mut impl FnMut(&[u16], &mut Self, &mut EmptyPathsCache) -> Result<()>,
// TODO: replace `prev_edges` by:
// (1) a small bitmap representing the path
// (2) a pointer within the EmptyPathsCache::forbidden_prefixes structure
prev_edges: &mut Vec<u16>,
cur_path: &mut SmallBitmap,
mut forbidden_edges: SmallBitmap,
) -> Result<bool> {
let mut any_valid = false;
for (edge_idx, edge_cost, to) in valid_edges {
prev_edges.push(edge_idx);
if empty_paths_cache.empty_prefixes.contains_prefix_of_path(prev_edges) {
// The edge set of `from` is cloned so that iterating over it does not keep
// `self.node_edges` borrowed while `self` is handed to `visit` / the recursion.
let edges = self.node_edges[from].clone();
for edge_idx in edges.iter() {
// Slots set to `None` in `all_edges` are skipped.
let Some(edge) = self.all_edges[edge_idx as usize].as_ref() else { continue };
// The `cost < edge.cost` test comes first so that the subtraction in the last
// operand is only evaluated when it cannot underflow.
if cost < edge.cost as u16
|| forbidden_edges.contains(edge_idx)
|| !all_distances[edge.to_node as usize].contains(&(cost - edge.cost as u16))
{
continue;
}
let mut new_forbidden_edges = forbidden_edges.to_vec();
for edge_idx in empty_paths_cache.empty_couple_edges[edge_idx as usize].iter() {
new_forbidden_edges[*edge_idx as usize] = true;
}
for edge_idx in empty_paths_cache.empty_prefixes.final_edges_ater_prefix(prev_edges) {
new_forbidden_edges[edge_idx as usize] = true;
}
// Extend the current path with this edge, in both representations.
cur_path.insert(edge_idx);
prev_edges.push(edge_idx);
if to == self.query_graph.end_node {
paths.push(prev_edges.clone());
// Forbidden set for the sub-search: the current one, plus the edges recorded
// as an empty couple with `edge_idx`, plus the edges that would complete a
// forbidden prefix after the current path.
let mut new_forbidden_edges = forbidden_edges.clone();
new_forbidden_edges.union(&empty_paths_cache.empty_couple_edges[edge_idx as usize]);
empty_paths_cache.empty_prefixes.final_edges_after_prefix(prev_edges, &mut |x| {
new_forbidden_edges.insert(x);
});
let next_any_valid = if edge.to_node == self.query_graph.end_node {
// Reached the end node: the path is complete, hand it to the visitor.
any_valid = true;
visit(prev_edges, self, empty_paths_cache)?;
true
} else {
self.paths_of_cost_rec(
to as usize,
self.visit_paths_of_cost_rec(
edge.to_node as usize,
cost - edge.cost as u16,
all_distances,
cost - edge_cost as u64,
prev_edges,
paths,
&new_forbidden_edges,
empty_paths_cache,
)
}
visit,
prev_edges,
cur_path,
new_forbidden_edges,
)?
};
any_valid |= next_any_valid;
// Backtrack: remove the edge from both path representations.
cur_path.remove(edge_idx);
prev_edges.pop();
if next_any_valid {
// `visit` receives `&mut EmptyPathsCache`, so the cache may have changed:
// stop exploring from here if the current prefix is now known to be empty,
// otherwise refresh the forbidden edges from the cache.
if empty_paths_cache.path_is_empty(prev_edges, cur_path) {
return Ok(any_valid);
}
forbidden_edges.union(&empty_paths_cache.empty_edges);
for edge in prev_edges.iter() {
forbidden_edges.union(&empty_paths_cache.empty_couple_edges[*edge as usize]);
}
empty_paths_cache.empty_prefixes.final_edges_after_prefix(prev_edges, &mut |x| {
forbidden_edges.insert(x);
});
}
// NOTE(review): this repeats the emptiness check already made at the top of the
// `if next_any_valid` branch just above — confirm whether both are intended.
if next_any_valid && empty_paths_cache.path_is_empty(prev_edges, cur_path) {
return Ok(any_valid);
}
}
Ok(any_valid)
}
pub fn initialize_distances_cheapest(&self) -> Vec<Vec<u64>> {
let mut distances_to_end: Vec<Vec<u64>> = vec![vec![]; self.query_graph.nodes.len()];
let mut enqueued = vec![false; self.query_graph.nodes.len()];
pub fn initialize_distances_cheapest(&self) -> Vec<Vec<u16>> {
let mut distances_to_end: Vec<Vec<u16>> = vec![vec![]; self.query_graph.nodes.len()];
let mut enqueued = SmallBitmap::new(self.query_graph.nodes.len() as u16);
let mut node_stack = VecDeque::new();
distances_to_end[self.query_graph.end_node as usize] = vec![0];
for prev_node in
self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter()
{
node_stack.push_back(prev_node as usize);
enqueued[prev_node as usize] = true;
enqueued.insert(prev_node);
}
while let Some(cur_node) = node_stack.pop_front() {
let mut self_distances = vec![];
for succ_node in self.query_graph.edges[cur_node].successors.iter() {
let cur_node_edges = &self.node_edges[cur_node];
for edge_idx in cur_node_edges.iter() {
let edge = self.all_edges[edge_idx as usize].as_ref().unwrap();
let succ_node = edge.to_node;
let succ_distances = &distances_to_end[succ_node as usize];
let _ = self.visit_edges::<()>(cur_node as u32, succ_node, |_, edge| {
for succ_distance in succ_distances {
self_distances.push(edge.cost as u64 + succ_distance);
}
std::ops::ControlFlow::Continue(())
});
for succ_distance in succ_distances {
self_distances.push(edge.cost as u16 + succ_distance);
}
}
self_distances.sort_unstable();
self_distances.dedup();
distances_to_end[cur_node] = self_distances;
for prev_node in self.query_graph.edges[cur_node].predecessors.iter() {
if !enqueued[prev_node as usize] {
if !enqueued.contains(prev_node) {
node_stack.push_back(prev_node as usize);
enqueued[prev_node as usize] = true;
enqueued.insert(prev_node);
}
}
}

View File

@ -11,9 +11,20 @@ use roaring::RoaringBitmap;
// computing their hash and comparing them
// which can be done...
// by using a pointer (real, Rc, bumpalo, or in a vector)???
//
// But actually, the edge details' docids are a subset of the universe at the
// moment they were computed,
// and the universes of two iterations of a ranking rule are completely different.
// Thus, there is no point in doing this,
// UNLESS...
// we compute the whole docids corresponding to the edge details (potentially
// expensive in time and memory in the common case).
//
// But we could still benefit within a single iteration for requests like
// `a a a a a a a a a`, where the same edge details are repeated many times.
// Cache of the document ids computed for the edges of a ranking rule graph.
pub struct EdgeDocidsCache<G: RankingRuleGraphTrait> {
// Keyed by edge index; `get_edge_docids` stores `universe & G::compute_docids(..)`
// under the index of the edge it was asked about.
pub cache: FxHashMap<u32, RoaringBitmap>,
pub cache: FxHashMap<u16, RoaringBitmap>,
// Ties the cache to one graph type `G` without storing a value of that type.
_phantom: PhantomData<G>,
}
impl<G: RankingRuleGraphTrait> Default for EdgeDocidsCache<G> {
@ -25,7 +36,7 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
pub fn get_edge_docids<'s, 'search>(
&'s mut self,
ctx: &mut SearchContext<'search>,
edge_index: u32,
edge_index: u16,
graph: &RankingRuleGraph<G>,
// TODO: maybe universe doesn't belong here
universe: &RoaringBitmap,
@ -41,7 +52,7 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
}
// TODO: maybe universe doesn't belong here
let docids = universe & G::compute_docids(ctx, details)?;
let docids = universe & G::compute_docids(ctx, details, universe)?;
let _ = self.cache.insert(edge_index, docids);
let docids = &self.cache[&edge_index];
Ok(BitmapOrAllRef::Bitmap(docids))

View File

@ -1,60 +1,48 @@
use crate::new::small_bitmap::SmallBitmap;
use super::paths_map::PathsMap;
// Records which edges, edge prefixes and couples of edges have been marked as
// forbidden (see `forbid_edge`, `forbid_prefix` and `forbid_couple_edges`), so
// that `path_is_empty` can reject a path without recomputing anything.
#[derive(Clone)]
pub struct EmptyPathsCache {
// Set of forbidden edge indices.
pub empty_edges: Vec<bool>,
pub empty_edges: SmallBitmap,
// Forbidden path prefixes, stored as sequences of edge indices.
pub empty_prefixes: PathsMap<()>,
// Indexed by a first edge; holds the second edges forbidden together with it.
pub empty_couple_edges: Vec<Vec<u32>>,
pub empty_couple_edges: Vec<SmallBitmap>,
}
// NOTE: this listing is a diff rendering without +/- markers; each method shows
// its `Vec<bool>` / `Vec<u32>` statements next to the `SmallBitmap` / `u16` ones.
impl EmptyPathsCache {
/// Creates a cache with nothing forbidden, sized for `all_edges_len` edges.
pub fn new(all_edges_len: usize) -> Self {
pub fn new(all_edges_len: u16) -> Self {
Self {
empty_edges: vec![false; all_edges_len],
empty_edges: SmallBitmap::new(all_edges_len),
empty_prefixes: PathsMap::default(),
empty_couple_edges: vec![vec![]; all_edges_len],
// One bitmap per edge, each able to address every edge.
empty_couple_edges: vec![SmallBitmap::new(all_edges_len); all_edges_len as usize],
}
}
/// Marks `edge_idx` as forbidden and removes it from the prefix and couple
/// records.
pub fn forbid_edge(&mut self, edge_idx: u32) {
self.empty_edges[edge_idx as usize] = true;
self.empty_couple_edges[edge_idx as usize] = vec![];
pub fn forbid_edge(&mut self, edge_idx: u16) {
self.empty_edges.insert(edge_idx);
self.empty_couple_edges[edge_idx as usize].clear();
self.empty_prefixes.remove_edge(&edge_idx);
// Also drop the edge from every other edge's couple set.
for edges2 in self.empty_couple_edges.iter_mut() {
if let Some(edge2_pos) = edges2.iter().position(|e| *e == edge_idx) {
edges2.swap_remove(edge2_pos);
}
edges2.remove(edge_idx);
}
}
/// Records the edge sequence `prefix` as forbidden.
pub fn forbid_prefix(&mut self, prefix: &[u32]) {
pub fn forbid_prefix(&mut self, prefix: &[u16]) {
self.empty_prefixes.insert(prefix.iter().copied(), ());
}
/// Records `edge2` as forbidden together with `edge1`. Only `edge1`'s entry is
/// updated, so the relation is stored in one direction.
pub fn forbid_couple_edges(&mut self, edge1: u32, edge2: u32) {
assert!(!self.empty_couple_edges[edge1 as usize].contains(&edge2));
self.empty_couple_edges[edge1 as usize].push(edge2);
pub fn forbid_couple_edges(&mut self, edge1: u16, edge2: u16) {
self.empty_couple_edges[edge1 as usize].insert(edge2);
}
/// Returns `true` if `path` is ruled out by the cache: it contains a forbidden
/// edge, it contains a forbidden couple of edges, or it starts with a forbidden
/// prefix.
///
/// In the `SmallBitmap` signature, `path_bitmap` is tested in place of scanning
/// `path` for membership. NOTE(review): this assumes the caller passes a bitmap
/// holding exactly the edges of `path` — confirm at the call sites.
pub fn path_is_empty(&self, path: &[u32]) -> bool {
for edge in path {
if self.empty_edges[*edge as usize] {
pub fn path_is_empty(&self, path: &[u16], path_bitmap: &SmallBitmap) -> bool {
if path_bitmap.intersects(&self.empty_edges) {
return true;
}
for edge in path.iter() {
let forbidden_other_edges = &self.empty_couple_edges[*edge as usize];
if path_bitmap.intersects(forbidden_other_edges) {
return true;
}
}
if self.empty_prefixes.contains_prefix_of_path(path) {
return true;
}
for (edge1, edges2) in self.empty_couple_edges.iter().enumerate() {
if let Some(pos_edge1) = path.iter().position(|e| *e == edge1 as u32) {
if path[pos_edge1..].iter().any(|e| edges2.contains(e)) {
return true;
}
}
}
// for (edge1, edge2) in self.empty_couple_edges.iter() {
// if path.contains(edge1) && path.contains(edge2) {
// return true;
// }
// }
// if self.empty_prefixes.contains_prefix_of_path(path) {
// return true;
// }
false
}
}

View File

@ -4,17 +4,16 @@ mod edge_docids_cache;
mod empty_paths_cache;
mod paths_map;
mod proximity;
mod resolve_paths;
mod typo;
use super::logger::SearchLogger;
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
pub use edge_docids_cache::EdgeDocidsCache;
pub use empty_paths_cache::EmptyPathsCache;
pub use proximity::ProximityGraph;
use roaring::RoaringBitmap;
use std::ops::ControlFlow;
pub use typo::TypoGraph;
#[derive(Debug, Clone)]
@ -25,15 +24,15 @@ pub enum EdgeDetails<E> {
// One edge of a ranking rule graph.
#[derive(Debug, Clone)]
pub struct Edge<E> {
// Indices of the source and destination nodes in the query graph
// (`RankingRuleGraph::build` fills them from the node index and successor index).
pub from_node: u32,
pub to_node: u32,
pub from_node: u16,
pub to_node: u16,
// Cost of taking this edge, as returned by `G::build_visit_to_node`.
pub cost: u8,
// Rule-specific payload attached to the edge.
pub details: EdgeDetails<E>,
}
// A borrowed edge paired with an index.
// NOTE(review): `index` is presumably the position of `edge` in
// `RankingRuleGraph::all_edges` — confirm against the code that builds it.
#[derive(Debug, Clone)]
pub struct EdgePointer<'graph, E> {
pub index: u32,
pub index: u16,
pub edge: &'graph Edge<E>,
}
@ -95,6 +94,7 @@ pub trait RankingRuleGraphTrait: Sized {
fn compute_docids<'search>(
ctx: &mut SearchContext<'search>,
edge_details: &Self::EdgeDetails,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap>;
/// Prepare to build the edges outgoing from `from_node`.
@ -116,11 +116,11 @@ pub trait RankingRuleGraphTrait: Sized {
fn log_state(
graph: &RankingRuleGraph<Self>,
paths: &[Vec<u32>],
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: &[Vec<u64>],
cost: u64,
distances: &[Vec<u16>],
cost: u16,
logger: &mut dyn SearchLogger<QueryGraph>,
);
}
@ -130,9 +130,9 @@ pub struct RankingRuleGraph<G: RankingRuleGraphTrait> {
// pub edges: Vec<HashMap<usize, Vec<Edge<G::EdgeDetails>>>>,
pub all_edges: Vec<Option<Edge<G::EdgeDetails>>>,
pub node_edges: Vec<RoaringBitmap>,
pub node_edges: Vec<SmallBitmap>,
pub successors: Vec<RoaringBitmap>,
pub successors: Vec<SmallBitmap>,
// TODO: to get the edges between two nodes:
// 1. get node_outgoing_edges[from]
// 2. get node_incoming_edges[to]
@ -149,29 +149,7 @@ impl<G: RankingRuleGraphTrait> Clone for RankingRuleGraph<G> {
}
}
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
// Visit all edges between the two given nodes in order of increasing cost.
//
// Walks the edge indices stored for `from` and calls `visit(edge_idx, edge)` on
// every edge whose destination is `to`. Returning `ControlFlow::Break(o)` from
// `visit` stops the walk and makes the function return `Some(o)`; if `visit`
// never breaks, the result is `None`.
//
// Panics if an index listed in `node_edges[from]` points at a `None` slot of
// `all_edges`.
pub fn visit_edges<'graph, O>(
&'graph self,
from: u32,
to: u32,
mut visit: impl FnMut(u32, &'graph Edge<G::EdgeDetails>) -> ControlFlow<O>,
) -> Option<O> {
let from_edges = &self.node_edges[from as usize];
for edge_idx in from_edges {
let edge = self.all_edges[edge_idx as usize].as_ref().unwrap();
// Edges leaving `from` towards other nodes are ignored.
if edge.to_node == to {
let cf = visit(edge_idx, edge);
match cf {
ControlFlow::Continue(_) => continue,
ControlFlow::Break(o) => return Some(o),
}
}
}
None
}
pub fn remove_edge(&mut self, edge_index: u32) {
pub fn remove_edge(&mut self, edge_index: u16) {
let edge_opt = &mut self.all_edges[edge_index as usize];
let Some(edge) = &edge_opt else { return };
let (from_node, _to_node) = (edge.from_node, edge.to_node);
@ -180,9 +158,10 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
let from_node_edges = &mut self.node_edges[from_node as usize];
from_node_edges.remove(edge_index);
let mut new_successors_from_node = RoaringBitmap::new();
let mut new_successors_from_node = SmallBitmap::new(self.all_edges.len() as u16);
let all_edges = &self.all_edges;
for from_node_edge in from_node_edges.iter() {
let Edge { to_node, .. } = &self.all_edges[from_node_edge as usize].as_ref().unwrap();
let Edge { to_node, .. } = &all_edges[from_node_edge as usize].as_ref().unwrap();
new_successors_from_node.insert(*to_node);
}
self.successors[from_node as usize] = new_successors_from_node;

View File

@ -1,9 +1,4 @@
use roaring::RoaringBitmap;
use crate::new::small_bitmap::SmallBitmap;
use super::cheapest_paths::Path;
// What is PathsMap used for?
@ -13,7 +8,7 @@ use super::cheapest_paths::Path;
// A recursive map from sequences of edges to values: each entry of `nodes`
// pairs an edge with the sub-map holding the sequences that continue after it.
#[derive(Debug, Clone)]
pub struct PathsMap<V> {
pub nodes: Vec<(u32, PathsMap<V>)>,
pub nodes: Vec<(u16, PathsMap<V>)>,
// `Some` when a sequence ending exactly at this node has been inserted
// (`insert` sets it once its edge iterator is exhausted).
pub value: Option<V>,
}
impl<V> Default for PathsMap<V> {
@ -39,7 +34,7 @@ impl<V> PathsMap<V> {
self.nodes.is_empty() && self.value.is_none()
}
pub fn insert(&mut self, mut edges: impl Iterator<Item = u32>, value: V) {
pub fn insert(&mut self, mut edges: impl Iterator<Item = u16>, value: V) {
match edges.next() {
None => {
self.value = Some(value);
@ -57,7 +52,7 @@ impl<V> PathsMap<V> {
}
}
}
fn remove_first_rec(&mut self, cur: &mut Vec<u32>) -> (bool, V) {
fn remove_first_rec(&mut self, cur: &mut Vec<u16>) -> (bool, V) {
let Some((first_edge, rest)) = self.nodes.first_mut() else {
// The PathsMap has to be correct by construction here, otherwise
// the unwrap() will crash
@ -72,7 +67,7 @@ impl<V> PathsMap<V> {
(false, value)
}
}
pub fn remove_first(&mut self) -> Option<(Vec<u32>, V)> {
pub fn remove_first(&mut self) -> Option<(Vec<u16>, V)> {
if self.is_empty() {
return None;
}
@ -81,7 +76,7 @@ impl<V> PathsMap<V> {
let (_, value) = self.remove_first_rec(&mut result);
Some((result, value))
}
pub fn iterate_rec(&self, cur: &mut Vec<u32>, visit: &mut impl FnMut(&Vec<u32>, &V)) {
pub fn iterate_rec(&self, cur: &mut Vec<u16>, visit: &mut impl FnMut(&Vec<u16>, &V)) {
if let Some(value) = &self.value {
visit(cur, value);
}
@ -91,7 +86,7 @@ impl<V> PathsMap<V> {
cur.pop();
}
}
// Calls `visit` through `iterate_rec`, starting from an empty edge sequence.
pub fn iterate(&self, mut visit: impl FnMut(&Vec<u32>, &V)) {
pub fn iterate(&self, mut visit: impl FnMut(&Vec<u16>, &V)) {
self.iterate_rec(&mut vec![], &mut visit)
}
@ -100,7 +95,7 @@ impl<V> PathsMap<V> {
self.remove_prefix(prefix);
});
}
pub fn remove_edges(&mut self, forbidden_edges: &RoaringBitmap) {
pub fn remove_edges(&mut self, forbidden_edges: &SmallBitmap) {
let mut i = 0;
while i < self.nodes.len() {
let should_remove = if forbidden_edges.contains(self.nodes[i].0) {
@ -118,7 +113,7 @@ impl<V> PathsMap<V> {
}
}
}
pub fn remove_edge(&mut self, forbidden_edge: &u32) {
pub fn remove_edge(&mut self, forbidden_edge: &u16) {
let mut i = 0;
while i < self.nodes.len() {
let should_remove = if &self.nodes[i].0 == forbidden_edge {
@ -136,7 +131,7 @@ impl<V> PathsMap<V> {
}
}
}
pub fn remove_prefix(&mut self, forbidden_prefix: &[u32]) {
pub fn remove_prefix(&mut self, forbidden_prefix: &[u16]) {
let [first_edge, remaining_prefix @ ..] = forbidden_prefix else {
self.nodes.clear();
self.value = None;
@ -160,25 +155,23 @@ impl<V> PathsMap<V> {
}
}
// Finds the edges that, appended to `prefix`, complete a stored sequence, i.e.
// the children of the node reached by `prefix` that carry a value.
//
// Two versions are interleaved in this listing (diff rendering without +/-
// markers): `final_edges_ater_prefix` collects those edges into a `Vec`, while
// `final_edges_after_prefix` reports each of them through `visit` instead.
// If `prefix` does not lead to a node of the map, nothing is reported (the
// `Vec` version returns an empty vector).
pub fn final_edges_ater_prefix(&self, prefix: &[u32]) -> Vec<u32> {
pub fn final_edges_after_prefix(&self, prefix: &[u16], visit: &mut impl FnMut(u16)) {
// An empty `prefix` means the target node has been reached: report its
// value-carrying children.
let [first_edge, remaining_prefix @ ..] = prefix else {
return self.nodes.iter().filter_map(|n| {
if n.1.value.is_some() {
Some(n.0)
} else {
None
for node in self.nodes.iter() {
if node.1.value.is_some() {
visit(node.0)
}
}).collect();
}
return
};
// Otherwise descend into the child matching the first edge of the prefix.
for (edge, rest) in self.nodes.iter() {
if edge == first_edge {
return rest.final_edges_ater_prefix(remaining_prefix);
return rest.final_edges_after_prefix(remaining_prefix, visit);
}
}
vec![]
}
pub fn edge_indices_after_prefix(&self, prefix: &[u32]) -> Vec<u32> {
pub fn edge_indices_after_prefix(&self, prefix: &[u16]) -> Vec<u16> {
let [first_edge, remaining_prefix @ ..] = prefix else {
return self.nodes.iter().map(|n| n.0).collect();
};
@ -190,7 +183,7 @@ impl<V> PathsMap<V> {
vec![]
}
pub fn contains_prefix_of_path(&self, path: &[u32]) -> bool {
pub fn contains_prefix_of_path(&self, path: &[u16]) -> bool {
if self.value.is_some() {
return true;
}

View File

@ -111,6 +111,8 @@ pub fn visit_to_node<'search, 'from_data>(
for word1 in derivations1.clone() {
for proximity in 1..=(8 - ngram_len2) {
let cost = (proximity + ngram_len2 - 1) as u8;
// TODO: if we had access to the universe here, we could already check whether
// the bitmap corresponding to this word pair is disjoint with the universe or not
if ctx
.get_word_prefix_pair_proximity_docids(
word1,
@ -183,8 +185,13 @@ pub fn visit_to_node<'search, 'from_data>(
.flat_map(|(cost, proximity_word_pairs)| {
let mut edges = vec![];
for (proximity, word_pairs) in proximity_word_pairs {
edges
.push((cost, EdgeDetails::Data(ProximityEdge { pairs: word_pairs, proximity })))
edges.push((
cost,
EdgeDetails::Data(ProximityEdge {
pairs: word_pairs.into_boxed_slice(),
proximity,
}),
))
}
edges
})

View File

@ -1,14 +1,15 @@
use super::{ProximityEdge, WordPair};
use crate::new::SearchContext;
use crate::{CboRoaringBitmapCodec, Result};
use roaring::{MultiOps, RoaringBitmap};
use roaring::RoaringBitmap;
pub fn compute_docids<'search>(
ctx: &mut SearchContext<'search>,
edge: &ProximityEdge,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
let ProximityEdge { pairs, proximity } = edge;
let mut pair_docids = vec![];
let mut pair_docids = RoaringBitmap::new();
for pair in pairs.iter() {
let bytes = match pair {
WordPair::Words { left, right } => {
@ -21,10 +22,11 @@ pub fn compute_docids<'search>(
ctx.get_prefix_word_pair_proximity_docids(*left_prefix, *right, *proximity)
}
}?;
let bitmap =
bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default();
pair_docids.push(bitmap);
// TODO: deserialize bitmap within a universe, and (maybe) using a bump allocator?
let bitmap = universe
& bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default();
pair_docids |= bitmap;
}
let docids = MultiOps::union(pair_docids);
Ok(docids)
Ok(pair_docids)
}

View File

@ -10,7 +10,7 @@ use crate::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
use roaring::RoaringBitmap;
// TODO: intern the strings, refer to them by their pointer?
// TODO: intern the proximity edges as well?
#[derive(Clone)]
pub enum WordPair {
@ -21,8 +21,7 @@ pub enum WordPair {
// Details of one proximity edge: a set of word pairs and the proximity at which
// they are looked up. `compute_docids` unions the docids of every pair at this
// proximity.
#[derive(Clone)]
pub struct ProximityEdge {
// TODO: use a list of pointers to the word pairs instead?
pairs: Vec<WordPair>,
pairs: Box<[WordPair]>,
proximity: u8,
}
@ -40,8 +39,9 @@ impl RankingRuleGraphTrait for ProximityGraph {
fn compute_docids<'search>(
ctx: &mut SearchContext<'search>,
edge: &Self::EdgeDetails,
universe: &RoaringBitmap,
) -> Result<roaring::RoaringBitmap> {
compute_docids::compute_docids(ctx, edge)
compute_docids::compute_docids(ctx, edge, universe)
}
fn build_visit_from_node<'search>(
@ -61,11 +61,11 @@ impl RankingRuleGraphTrait for ProximityGraph {
fn log_state(
graph: &super::RankingRuleGraph<Self>,
paths: &[Vec<u32>],
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: &[Vec<u64>],
cost: u64,
distances: &[Vec<u16>],
cost: u16,
logger: &mut dyn SearchLogger<QueryGraph>,
) {
logger.log_proximity_state(

View File

@ -1,97 +0,0 @@
#![allow(clippy::too_many_arguments)]
use super::edge_docids_cache::EdgeDocidsCache;
use super::empty_paths_cache::EmptyPathsCache;
use super::{RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::{BitmapOrAllRef, SearchContext};
use crate::Result;
use roaring::{MultiOps, RoaringBitmap};
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
// TODO: reduce the universe after computing each path
// TODO: deserialize roaring bitmap within a universe
/// Computes the union of the document ids matched by each of the given `paths`,
/// restricted to `universe`.
///
/// The docids of a path are the intersection of `universe` with the docids of
/// each of its edges. While resolving, anything found to match no document is
/// recorded in `empty_paths_cache` (single edge, path prefix, or couple of
/// edges), an empty edge is also removed from the graph and from
/// `edge_docids_cache`, and the remaining paths are re-filtered against the
/// cache before the next one is processed.
///
/// # Errors
/// Propagates any error returned by `EdgeDocidsCache::get_edge_docids`.
pub fn resolve_paths<'search>(
&mut self,
ctx: &mut SearchContext<'search>,
edge_docids_cache: &mut EdgeDocidsCache<G>,
empty_paths_cache: &mut EmptyPathsCache,
universe: &RoaringBitmap,
mut paths: Vec<Vec<u32>>,
) -> Result<RoaringBitmap> {
// Paths are consumed from the back of the sorted vector (see `paths.pop()`).
paths.sort_unstable();
// let mut needs_filtering_empty_edges = false;
// let mut needs_filtering_empty_prefix = false;
// let mut needs_filtering_empty_couple_edges = false;
// Set whenever the cache gained an entry since the last filtering pass.
let mut needs_filtering = false;
let mut path_bitmaps = vec![];
'path_loop: loop {
// TODO: distinguish between empty_edges, empty_prefix, and empty_couple_edges filtering
if needs_filtering {
// Paths now known to be empty are cleared in place (not removed) and are
// skipped below once popped.
for path in paths.iter_mut() {
if empty_paths_cache.path_is_empty(path) {
path.clear();
}
}
needs_filtering = false;
}
let Some(edge_indexes) = paths.pop() else {
break;
};
if edge_indexes.is_empty() {
continue;
}
// Running intersection for this path, starting from the whole universe.
let mut path_bitmap = universe.clone();
// Edges of this path seen so far, and the docids of those that had a bitmap.
let mut visited_edges = vec![];
let mut cached_edge_docids = vec![];
'edge_loop: for edge_index in edge_indexes {
visited_edges.push(edge_index);
let edge_docids =
edge_docids_cache.get_edge_docids(ctx, edge_index, self, universe)?;
match edge_docids {
BitmapOrAllRef::Bitmap(edge_docids) => {
// Clone the bitmap so that the borrow of `edge_docids_cache` ends here and the
// cache can be mutated below.
cached_edge_docids.push((edge_index, edge_docids.clone()));
let (_, edge_docids) = cached_edge_docids.last().unwrap();
if edge_docids.is_disjoint(universe) {
// 1. Store in the cache that this edge is empty for this universe
empty_paths_cache.forbid_edge(edge_index);
// 2. remove this edge from the proximity graph
self.remove_edge(edge_index);
edge_docids_cache.cache.remove(&edge_index);
needs_filtering = true;
// needs_filtering_empty_edges = true;
// 3. continue executing this function again on the remaining paths
continue 'path_loop;
} else {
path_bitmap &= edge_docids;
if path_bitmap.is_disjoint(universe) {
// needs_filtering_empty_prefix = true;
needs_filtering = true;
empty_paths_cache.forbid_prefix(&visited_edges);
// if the intersection between this edge and any
// previous one is disjoint with the universe,
// then we add these two edges to the empty_path_cache
for (edge_index2, edge_docids2) in
cached_edge_docids[..cached_edge_docids.len() - 1].iter()
{
let intersection = edge_docids & edge_docids2;
if intersection.is_disjoint(universe) {
// needs_filtering_empty_couple_edges = true;
empty_paths_cache
.forbid_couple_edges(*edge_index2, edge_index);
}
}
continue 'path_loop;
}
}
}
// An edge reported as `All` does not restrict the path's docids.
BitmapOrAllRef::All => continue 'edge_loop,
}
}
// Every edge was resolved without emptying the path: keep its docids.
path_bitmaps.push(path_bitmap);
}
Ok(MultiOps::union(path_bitmaps))
}
}

View File

@ -31,6 +31,7 @@ impl RankingRuleGraphTrait for TypoGraph {
fn compute_docids<'db_cache, 'search>(
ctx: &mut SearchContext<'search>,
edge: &Self::EdgeDetails,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
match edge {
TypoEdge::Phrase { phrase } => resolve_phrase(ctx, *phrase),
@ -44,14 +45,17 @@ impl RankingRuleGraphTrait for TypoGraph {
let mut docids = RoaringBitmap::new();
for word in words.iter().copied() {
let Some(bytes) = ctx.get_word_docids(word)? else { continue };
let bitmap =
RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
// TODO: deserialize bitmap within a universe
let bitmap = universe
& RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
docids |= bitmap;
}
if *nbr_typos == 0 {
if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? {
let bitmap =
RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
// TODO: deserialize bitmap within a universe
let bitmap = universe
& RoaringBitmapCodec::bytes_decode(bytes)
.ok_or(heed::Error::Decoding)?;
docids |= bitmap;
}
}
@ -116,11 +120,11 @@ impl RankingRuleGraphTrait for TypoGraph {
fn log_state(
graph: &RankingRuleGraph<Self>,
paths: &[Vec<u32>],
paths: &[Vec<u16>],
empty_paths_cache: &EmptyPathsCache,
universe: &RoaringBitmap,
distances: &[Vec<u64>],
cost: u64,
distances: &[Vec<u16>],
cost: u16,
logger: &mut dyn SearchLogger<QueryGraph>,
) {
logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances.to_vec(), cost);