mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-08-01 03:10:04 +00:00
Rewrite cheapest path algorithm and empty path cache
It is now much simpler and has much better performance.
This commit is contained in:
@ -1,10 +1,8 @@
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
#![allow(clippy::too_many_arguments)]
|
||||
|
||||
use super::empty_paths_cache::EmptyPathsCache;
|
||||
use super::paths_map::PathsMap;
|
||||
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use super::{RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use std::collections::VecDeque;
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
|
||||
pub struct Path {
|
||||
@ -12,226 +10,119 @@ pub struct Path {
|
||||
pub cost: u64,
|
||||
}
|
||||
|
||||
/// Scratch state used by the Dijkstra search in `cheapest_path_to_end`.
///
/// Every vector is created with one slot per query-graph node and is indexed
/// by node id.
struct DijkstraState {
|
||||
// Node ids that have not been removed from the search frontier yet.
unvisited: RoaringBitmap, // should be a small bitset?
|
||||
// Best known distance from the start node; `u64::MAX` until a node is reached.
distances: Vec<u64>, // or binary heap, or btreemap? (f64, usize)
|
||||
// Index of the edge used to reach each node on its best known path
// (`u32::MAX` until one is recorded).
edges: Vec<u32>,
|
||||
// Cost of the edge stored in `edges` (`u8::MAX` until one is recorded).
edge_costs: Vec<u8>,
|
||||
// Predecessor node on the best known path; `None` when no predecessor
// has been recorded (e.g. for the start node).
paths: Vec<Option<u32>>,
|
||||
}
|
||||
|
||||
/// State of the k-cheapest-paths enumeration over a ranking rule graph.
pub struct KCheapestPathsState {
|
||||
// Paths kept so far, each stored with its cost; seeded by `new` with the
// single cheapest root-to-end path.
cheapest_paths: PathsMap<u64>,
|
||||
// Candidate paths grouped by their total cost (the map key).
potential_cheapest_paths: BTreeMap<u64, PathsMap<u64>>,
|
||||
// The current path; `next_cost` reports its cost.
pub kth_cheapest_path: Path,
|
||||
}
|
||||
|
||||
impl KCheapestPathsState {
|
||||
/// Returns the cost of the current `kth_cheapest_path`.
pub fn next_cost(&self) -> u64 {
|
||||
self.kth_cheapest_path.cost
|
||||
}
|
||||
|
||||
/// Builds the initial state from the cheapest path between the query graph's
/// root node and its end node.
///
/// Returns `None` when `cheapest_path_to_end` finds no such path.
pub fn new<G: RankingRuleGraphTrait>(
|
||||
graph: &RankingRuleGraph<G>,
|
||||
) -> Option<KCheapestPathsState> {
|
||||
let Some(cheapest_path) = graph.cheapest_path_to_end(graph.query_graph.root_node) else {
|
||||
return None
|
||||
};
|
||||
// The cheapest path is both the only known path and the current one.
let cheapest_paths = PathsMap::from_paths(&[cheapest_path.clone()]);
|
||||
// No candidate paths have been computed yet.
let potential_cheapest_paths = BTreeMap::new();
|
||||
Some(KCheapestPathsState {
|
||||
cheapest_paths,
|
||||
potential_cheapest_paths,
|
||||
kth_cheapest_path: cheapest_path,
|
||||
})
|
||||
}
|
||||
|
||||
/// Removes from `cheapest_paths` and from every candidate set the edges and
/// prefixes that `empty_paths_cache` lists as empty, then drops the candidate
/// cost buckets that became empty.
///
/// NOTE: this function is unfinished. After the pruning it reaches `todo!()`,
/// so every call panics and no value is ever returned.
pub fn remove_empty_paths(mut self, empty_paths_cache: &EmptyPathsCache) -> Option<Self> {
|
||||
self.cheapest_paths.remove_edges(&empty_paths_cache.empty_edges);
|
||||
self.cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
|
||||
|
||||
// Costs are collected first and removed afterwards because the map is
// mutably borrowed while it is being iterated.
let mut costs_to_delete = HashSet::new();
|
||||
for (cost, potential_cheapest_paths) in self.potential_cheapest_paths.iter_mut() {
|
||||
potential_cheapest_paths.remove_edges(&empty_paths_cache.empty_edges);
|
||||
potential_cheapest_paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
|
||||
if potential_cheapest_paths.is_empty() {
|
||||
costs_to_delete.insert(*cost);
|
||||
}
|
||||
}
|
||||
for cost in costs_to_delete {
|
||||
self.potential_cheapest_paths.remove(&cost);
|
||||
}
|
||||
|
||||
// Placeholder: the case where no cheapest path is left is not handled yet.
if self.cheapest_paths.is_empty() {}
|
||||
|
||||
todo!()
|
||||
}
|
||||
|
||||
pub fn compute_paths_of_next_lowest_cost<G: RankingRuleGraphTrait>(
|
||||
mut self,
|
||||
graph: &mut RankingRuleGraph<G>,
|
||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
pub fn paths_of_cost(
|
||||
&self,
|
||||
from: usize,
|
||||
cost: u64,
|
||||
all_distances: &[Vec<u64>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
into_map: &mut PathsMap<u64>,
|
||||
) -> Option<Self> {
|
||||
if !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges) {
|
||||
into_map.add_path(&self.kth_cheapest_path);
|
||||
) -> Vec<Vec<u32>> {
|
||||
let mut paths = vec![];
|
||||
self.paths_of_cost_rec(
|
||||
from,
|
||||
all_distances,
|
||||
cost,
|
||||
&mut vec![],
|
||||
&mut paths,
|
||||
&vec![false; self.all_edges.len()],
|
||||
empty_paths_cache,
|
||||
);
|
||||
paths
|
||||
}
|
||||
pub fn paths_of_cost_rec(
|
||||
&self,
|
||||
from: usize,
|
||||
all_distances: &[Vec<u64>],
|
||||
cost: u64,
|
||||
prev_edges: &mut Vec<u32>,
|
||||
paths: &mut Vec<Vec<u32>>,
|
||||
forbidden_edges: &[bool],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
) {
|
||||
let distances = &all_distances[from];
|
||||
if !distances.contains(&cost) {
|
||||
panic!();
|
||||
}
|
||||
let cur_cost = self.kth_cheapest_path.cost;
|
||||
while self.kth_cheapest_path.cost <= cur_cost {
|
||||
if let Some(next_self) = self.compute_next_cheapest_paths(graph, empty_paths_cache) {
|
||||
self = next_self;
|
||||
if self.kth_cheapest_path.cost == cur_cost
|
||||
&& !empty_paths_cache.path_is_empty(&self.kth_cheapest_path.edges)
|
||||
let tos = &self.query_graph.edges[from].successors;
|
||||
let mut valid_edges = vec![];
|
||||
for to in tos {
|
||||
self.visit_edges::<()>(from as u32, to, |edge_idx, edge| {
|
||||
if cost >= edge.cost as u64
|
||||
&& all_distances[to as usize].contains(&(cost - edge.cost as u64))
|
||||
&& !forbidden_edges[edge_idx as usize]
|
||||
{
|
||||
into_map.add_path(&self.kth_cheapest_path);
|
||||
} else {
|
||||
break;
|
||||
valid_edges.push((edge_idx, edge.cost, to));
|
||||
}
|
||||
} else {
|
||||
return None;
|
||||
}
|
||||
std::ops::ControlFlow::Continue(())
|
||||
});
|
||||
}
|
||||
Some(self)
|
||||
}
|
||||
|
||||
fn compute_next_cheapest_paths<G: RankingRuleGraphTrait>(
|
||||
mut self,
|
||||
graph: &mut RankingRuleGraph<G>,
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
) -> Option<KCheapestPathsState> {
|
||||
// for all nodes in the last cheapest path (called spur_node), except last one...
|
||||
for (i, edge_idx) in self.kth_cheapest_path.edges[..self.kth_cheapest_path.edges.len() - 1]
|
||||
.iter()
|
||||
.enumerate()
|
||||
{
|
||||
let Some(edge) = graph.all_edges[*edge_idx as usize].as_ref() else { continue; };
|
||||
let Edge { from_node: spur_node, .. } = edge;
|
||||
|
||||
let root_path = &self.kth_cheapest_path.edges[..i];
|
||||
if empty_paths_cache.path_is_empty(root_path) {
|
||||
for (edge_idx, edge_cost, to) in valid_edges {
|
||||
prev_edges.push(edge_idx);
|
||||
if empty_paths_cache.empty_prefixes.contains_prefix_of_path(prev_edges) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let root_cost = root_path.iter().fold(0, |sum, next| {
|
||||
sum + graph.all_edges[*next as usize].as_ref().unwrap().cost as u64
|
||||
});
|
||||
|
||||
let mut tmp_removed_edges = vec![];
|
||||
// for all the paths already found that share a common prefix with the root path
|
||||
// we delete the edge from the spur node to the next one
|
||||
for edge_index_to_remove in self.cheapest_paths.edge_indices_after_prefix(root_path) {
|
||||
let was_removed =
|
||||
graph.node_edges[*spur_node as usize].remove(edge_index_to_remove);
|
||||
if was_removed {
|
||||
tmp_removed_edges.push(edge_index_to_remove);
|
||||
}
|
||||
let mut new_forbidden_edges = forbidden_edges.to_vec();
|
||||
for edge_idx in empty_paths_cache.empty_couple_edges[edge_idx as usize].iter() {
|
||||
new_forbidden_edges[*edge_idx as usize] = true;
|
||||
}
|
||||
for edge_idx in empty_paths_cache.empty_prefixes.final_edges_ater_prefix(prev_edges) {
|
||||
new_forbidden_edges[edge_idx as usize] = true;
|
||||
}
|
||||
|
||||
// Compute the cheapest path from the spur node to the destination
|
||||
// we will combine it with the root path to get a potential kth cheapest path
|
||||
let spur_path = graph.cheapest_path_to_end(*spur_node);
|
||||
// restore the temporarily removed edges
|
||||
graph.node_edges[*spur_node as usize].extend(tmp_removed_edges);
|
||||
|
||||
let Some(spur_path) = spur_path else { continue; };
|
||||
let total_cost = root_cost + spur_path.cost;
|
||||
let total_path = Path {
|
||||
edges: root_path.iter().chain(spur_path.edges.iter()).cloned().collect(),
|
||||
cost: total_cost,
|
||||
};
|
||||
let entry = self.potential_cheapest_paths.entry(total_cost).or_default();
|
||||
entry.add_path(&total_path);
|
||||
if to == self.query_graph.end_node {
|
||||
paths.push(prev_edges.clone());
|
||||
} else {
|
||||
self.paths_of_cost_rec(
|
||||
to as usize,
|
||||
all_distances,
|
||||
cost - edge_cost as u64,
|
||||
prev_edges,
|
||||
paths,
|
||||
&new_forbidden_edges,
|
||||
empty_paths_cache,
|
||||
)
|
||||
}
|
||||
prev_edges.pop();
|
||||
}
|
||||
while let Some(mut next_cheapest_paths_entry) = self.potential_cheapest_paths.first_entry()
|
||||
}
|
||||
|
||||
pub fn initialize_distances_cheapest(&self) -> Vec<Vec<u64>> {
|
||||
let mut distances_to_end: Vec<Vec<u64>> = vec![vec![]; self.query_graph.nodes.len()];
|
||||
let mut enqueued = vec![false; self.query_graph.nodes.len()];
|
||||
|
||||
let mut node_stack = VecDeque::new();
|
||||
|
||||
distances_to_end[self.query_graph.end_node as usize] = vec![0];
|
||||
for prev_node in
|
||||
self.query_graph.edges[self.query_graph.end_node as usize].predecessors.iter()
|
||||
{
|
||||
let cost = *next_cheapest_paths_entry.key();
|
||||
let next_cheapest_paths = next_cheapest_paths_entry.get_mut();
|
||||
node_stack.push_back(prev_node as usize);
|
||||
enqueued[prev_node as usize] = true;
|
||||
}
|
||||
|
||||
while let Some((next_cheapest_path, cost2)) = next_cheapest_paths.remove_first() {
|
||||
assert_eq!(cost, cost2);
|
||||
// NOTE: it is important not to discard the paths that are forbidden due to a
|
||||
// forbidden prefix, because the cheapest path algorithm (Dijkstra) cannot take
|
||||
// this property into account.
|
||||
if next_cheapest_path
|
||||
.iter()
|
||||
.any(|edge_index| graph.all_edges[*edge_index as usize].is_none())
|
||||
{
|
||||
continue;
|
||||
} else {
|
||||
self.cheapest_paths.insert(next_cheapest_path.iter().copied(), cost);
|
||||
|
||||
if next_cheapest_paths.is_empty() {
|
||||
next_cheapest_paths_entry.remove();
|
||||
while let Some(cur_node) = node_stack.pop_front() {
|
||||
let mut self_distances = vec![];
|
||||
for succ_node in self.query_graph.edges[cur_node].successors.iter() {
|
||||
let succ_distances = &distances_to_end[succ_node as usize];
|
||||
let _ = self.visit_edges::<()>(cur_node as u32, succ_node, |_, edge| {
|
||||
for succ_distance in succ_distances {
|
||||
self_distances.push(edge.cost as u64 + succ_distance);
|
||||
}
|
||||
self.kth_cheapest_path = Path { edges: next_cheapest_path, cost };
|
||||
|
||||
return Some(self);
|
||||
std::ops::ControlFlow::Continue(())
|
||||
});
|
||||
}
|
||||
self_distances.sort_unstable();
|
||||
self_distances.dedup();
|
||||
distances_to_end[cur_node] = self_distances;
|
||||
for prev_node in self.query_graph.edges[cur_node].predecessors.iter() {
|
||||
if !enqueued[prev_node as usize] {
|
||||
node_stack.push_back(prev_node as usize);
|
||||
enqueued[prev_node as usize] = true;
|
||||
}
|
||||
}
|
||||
let _ = next_cheapest_paths_entry.remove_entry();
|
||||
}
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
/// Runs Dijkstra's algorithm from node `from` towards
/// `self.query_graph.end_node`.
///
/// Between two nodes only the edge returned by `cheapest_edge` is considered.
/// Returns `None` when the closest remaining unvisited node is unreachable
/// (its distance is still `u64::MAX`) before the end node has been selected.
/// Otherwise returns a `Path` whose `edges` are the edge indices ordered from
/// `from` to the end node and whose `cost` is the distance of the end node.
fn cheapest_path_to_end(&self, from: u32) -> Option<Path> {
|
||||
let mut dijkstra = DijkstraState {
|
||||
unvisited: (0..self.query_graph.nodes.len() as u32).collect(),
|
||||
distances: vec![u64::MAX; self.query_graph.nodes.len()],
|
||||
edges: vec![u32::MAX; self.query_graph.nodes.len()],
|
||||
edge_costs: vec![u8::MAX; self.query_graph.nodes.len()],
|
||||
paths: vec![None; self.query_graph.nodes.len()],
|
||||
};
|
||||
dijkstra.distances[from as usize] = 0;
|
||||
|
||||
// TODO: could use a binary heap here to store the distances, or a btreemap
|
||||
// Pick the unvisited node with the smallest tentative distance by scanning
// the whole unvisited set.
while let Some(cur_node) =
|
||||
dijkstra.unvisited.iter().min_by_key(|&n| dijkstra.distances[n as usize])
|
||||
{
|
||||
let cur_node_dist = dijkstra.distances[cur_node as usize];
|
||||
// The closest unvisited node is unreachable, so the end node (still
// unvisited at this point) is unreachable too.
if cur_node_dist == u64::MAX {
|
||||
return None;
|
||||
}
|
||||
// The end node was selected: stop and rebuild the path below.
if cur_node == self.query_graph.end_node {
|
||||
break;
|
||||
}
|
||||
|
||||
let succ_cur_node = &self.successors[cur_node as usize];
|
||||
// Only successors that are still unvisited can be improved.
let unvisited_succ_cur_node = succ_cur_node & &dijkstra.unvisited;
|
||||
for succ in unvisited_succ_cur_node {
|
||||
let Some((cheapest_edge, cheapest_edge_cost)) = self.cheapest_edge(cur_node, succ) else {
|
||||
continue
|
||||
};
|
||||
|
||||
let old_dist_succ = &mut dijkstra.distances[succ as usize];
|
||||
let new_potential_distance = cur_node_dist + cheapest_edge_cost as u64;
|
||||
// Relax: record the shorter distance together with the edge and the
// predecessor that produced it.
if new_potential_distance < *old_dist_succ {
|
||||
*old_dist_succ = new_potential_distance;
|
||||
dijkstra.edges[succ as usize] = cheapest_edge;
|
||||
dijkstra.edge_costs[succ as usize] = cheapest_edge_cost;
|
||||
dijkstra.paths[succ as usize] = Some(cur_node);
|
||||
}
|
||||
}
|
||||
dijkstra.unvisited.remove(cur_node);
|
||||
}
|
||||
|
||||
// Follow the predecessor links back from the end node; edges are collected
// in reverse order and flipped afterwards.
let mut cur = self.query_graph.end_node;
|
||||
let mut path_edges = vec![];
|
||||
while let Some(n) = dijkstra.paths[cur as usize] {
|
||||
path_edges.push(dijkstra.edges[cur as usize]);
|
||||
cur = n;
|
||||
}
|
||||
path_edges.reverse();
|
||||
Some(Path {
|
||||
edges: path_edges,
|
||||
cost: dijkstra.distances[self.query_graph.end_node as usize],
|
||||
})
|
||||
}
|
||||
|
||||
pub fn cheapest_edge(&self, cur_node: u32, succ: u32) -> Option<(u32, u8)> {
|
||||
self.visit_edges(cur_node, succ, |edge_idx, edge| {
|
||||
std::ops::ControlFlow::Break((edge_idx, edge.cost))
|
||||
})
|
||||
distances_to_end
|
||||
}
|
||||
}
|
||||
|
@ -32,16 +32,19 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
|
||||
db_cache: &mut DatabaseCache<'transaction>,
|
||||
edge_index: u32,
|
||||
graph: &RankingRuleGraph<G>,
|
||||
// TODO: maybe universe doesn't belong here
|
||||
universe: &RoaringBitmap,
|
||||
) -> Result<BitmapOrAllRef<'s>> {
|
||||
if self.cache.contains_key(&edge_index) {
|
||||
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
|
||||
}
|
||||
let edge = graph.all_edges[edge_index as usize].as_ref().unwrap();
|
||||
|
||||
match &edge.details {
|
||||
EdgeDetails::Unconditional => Ok(BitmapOrAllRef::All),
|
||||
EdgeDetails::Data(details) => {
|
||||
let docids = G::compute_docids(index, txn, db_cache, details)?;
|
||||
if self.cache.contains_key(&edge_index) {
|
||||
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
|
||||
}
|
||||
// TODO: maybe universe doesn't belong here
|
||||
let docids = universe & G::compute_docids(index, txn, db_cache, details)?;
|
||||
|
||||
let _ = self.cache.insert(edge_index, docids);
|
||||
let docids = &self.cache[&edge_index];
|
||||
|
@ -1,26 +1,60 @@
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::paths_map::PathsMap;
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
#[derive(Clone)]
|
||||
pub struct EmptyPathsCache {
|
||||
pub empty_edges: RoaringBitmap,
|
||||
pub empty_edges: Vec<bool>,
|
||||
pub empty_prefixes: PathsMap<()>,
|
||||
pub empty_couple_edges: Vec<Vec<u32>>,
|
||||
}
|
||||
impl EmptyPathsCache {
|
||||
/// Creates a cache in which nothing is marked as empty.
///
/// `empty_edges` and `empty_couple_edges` get one slot per edge
/// (`all_edges_len` slots each), and `empty_prefixes` starts out as the
/// default `PathsMap`.
pub fn new(all_edges_len: usize) -> Self {
|
||||
Self {
|
||||
empty_edges: vec![false; all_edges_len],
|
||||
empty_prefixes: PathsMap::default(),
|
||||
empty_couple_edges: vec![vec![]; all_edges_len],
|
||||
}
|
||||
}
|
||||
pub fn forbid_edge(&mut self, edge_idx: u32) {
|
||||
self.empty_edges.insert(edge_idx);
|
||||
self.empty_edges[edge_idx as usize] = true;
|
||||
self.empty_couple_edges[edge_idx as usize] = vec![];
|
||||
self.empty_prefixes.remove_edge(&edge_idx);
|
||||
for edges2 in self.empty_couple_edges.iter_mut() {
|
||||
if let Some(edge2_pos) = edges2.iter().position(|e| *e == edge_idx) {
|
||||
edges2.swap_remove(edge2_pos);
|
||||
}
|
||||
}
|
||||
}
|
||||
/// Records `prefix` (a sequence of edge indices) in `empty_prefixes`, with
/// the unit value attached.
pub fn forbid_prefix(&mut self, prefix: &[u32]) {
|
||||
self.empty_prefixes.insert(prefix.iter().copied(), ());
|
||||
}
|
||||
/// Records the couple (`edge1`, `edge2`) by appending `edge2` to
/// `empty_couple_edges[edge1]`.
///
/// # Panics
///
/// Panics if the couple was already recorded, or if `edge1` is out of bounds
/// for `empty_couple_edges`.
pub fn forbid_couple_edges(&mut self, edge1: u32, edge2: u32) {
|
||||
assert!(!self.empty_couple_edges[edge1 as usize].contains(&edge2));
|
||||
self.empty_couple_edges[edge1 as usize].push(edge2);
|
||||
}
|
||||
pub fn path_is_empty(&self, path: &[u32]) -> bool {
|
||||
for edge in path {
|
||||
if self.empty_edges.contains(*edge) {
|
||||
if self.empty_edges[*edge as usize] {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if self.empty_prefixes.contains_prefix_of_path(path) {
|
||||
return true;
|
||||
}
|
||||
for (edge1, edges2) in self.empty_couple_edges.iter().enumerate() {
|
||||
if let Some(pos_edge1) = path.iter().position(|e| *e == edge1 as u32) {
|
||||
if path[pos_edge1..].iter().any(|e| edges2.contains(e)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
// for (edge1, edge2) in self.empty_couple_edges.iter() {
|
||||
// if path.contains(edge1) && path.contains(edge2) {
|
||||
// return true;
|
||||
// }
|
||||
// }
|
||||
// if self.empty_prefixes.contains_prefix_of_path(path) {
|
||||
// return true;
|
||||
// }
|
||||
false
|
||||
}
|
||||
}
|
||||
|
@ -13,7 +13,6 @@ use heed::RoTxn;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use self::empty_paths_cache::EmptyPathsCache;
|
||||
use self::paths_map::PathsMap;
|
||||
|
||||
use super::db_cache::DatabaseCache;
|
||||
use super::logger::SearchLogger;
|
||||
@ -83,8 +82,11 @@ pub trait RankingRuleGraphTrait: Sized {
|
||||
|
||||
fn log_state(
|
||||
graph: &RankingRuleGraph<Self>,
|
||||
paths: &PathsMap<u64>,
|
||||
paths: &[Vec<u32>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
universe: &RoaringBitmap,
|
||||
distances: &[Vec<u64>],
|
||||
cost: u64,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
);
|
||||
}
|
||||
@ -135,7 +137,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
None
|
||||
}
|
||||
|
||||
fn remove_edge(&mut self, edge_index: u32) {
|
||||
pub fn remove_edge(&mut self, edge_index: u32) {
|
||||
let edge_opt = &mut self.all_edges[edge_index as usize];
|
||||
let Some(edge) = &edge_opt else { return };
|
||||
let (from_node, _to_node) = (edge.from_node, edge.to_node);
|
||||
@ -151,44 +153,4 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
}
|
||||
self.successors[from_node as usize] = new_successors_from_node;
|
||||
}
|
||||
|
||||
/// Renders the ranking rule graph as a Graphviz `digraph` description.
///
/// Emits one record-shaped node per query node that is not
/// `QueryNode::Deleted` (root node in blue, end node in red) and one labelled
/// arrow per existing edge showing its cost.
pub fn graphviz(&self) -> String {
|
||||
let mut desc = String::new();
|
||||
desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n");
|
||||
|
||||
// Nodes: labelled with the node's `Debug` output.
for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
|
||||
if matches!(node, QueryNode::Deleted) {
|
||||
continue;
|
||||
}
|
||||
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
|
||||
if node_idx == self.query_graph.root_node as usize {
|
||||
desc.push_str("[color = blue]");
|
||||
} else if node_idx == self.query_graph.end_node as usize {
|
||||
desc.push_str("[color = red]");
|
||||
}
|
||||
desc.push_str(";\n");
|
||||
}
|
||||
// Edges: `flatten` skips the `None` slots of `all_edges`.
for edge in self.all_edges.iter().flatten() {
|
||||
let Edge { from_node, to_node, details, .. } = edge;
|
||||
|
||||
match &details {
|
||||
EdgeDetails::Unconditional => {
|
||||
desc.push_str(&format!(
|
||||
"{from_node} -> {to_node} [label = \"always cost {cost}\"];\n",
|
||||
cost = edge.cost,
|
||||
));
|
||||
}
|
||||
EdgeDetails::Data(details) => {
|
||||
desc.push_str(&format!(
|
||||
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\"];\n",
|
||||
cost = edge.cost,
|
||||
edge_label = G::graphviz_edge_details_label(details)
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
desc.push('}');
|
||||
desc
|
||||
}
|
||||
}
|
||||
|
@ -1,12 +1,11 @@
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::fmt::Write;
|
||||
use std::hash::{Hash, Hasher};
|
||||
|
||||
|
||||
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::cheapest_paths::Path;
|
||||
use super::{Edge, EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::new::QueryNode;
|
||||
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct PathsMap<V> {
|
||||
@ -157,6 +156,24 @@ impl<V> PathsMap<V> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the edges that directly follow `prefix` in this map and whose
/// child node carries a value.
///
/// The map is descended one edge of `prefix` at a time. Once the prefix is
/// exhausted, the edges of the current level whose child has `value` set are
/// returned. An empty vector is returned when `prefix` is not present.
// NOTE(review): a child with a value presumably marks the end of a stored
// path — confirm against `insert`.
pub fn final_edges_ater_prefix(&self, prefix: &[u32]) -> Vec<u32> {
|
||||
// Empty prefix: answer from the current level.
let [first_edge, remaining_prefix @ ..] = prefix else {
|
||||
return self.nodes.iter().filter_map(|n| {
|
||||
if n.1.value.is_some() {
|
||||
Some(n.0)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}).collect();
|
||||
};
|
||||
// Otherwise follow the child reached through `first_edge`, if any.
for (edge, rest) in self.nodes.iter() {
|
||||
if edge == first_edge {
|
||||
return rest.final_edges_ater_prefix(remaining_prefix);
|
||||
}
|
||||
}
|
||||
vec![]
|
||||
}
|
||||
|
||||
pub fn edge_indices_after_prefix(&self, prefix: &[u32]) -> Vec<u32> {
|
||||
let [first_edge, remaining_prefix @ ..] = prefix else {
|
||||
return self.nodes.iter().map(|n| n.0).collect();
|
||||
@ -185,88 +202,4 @@ impl<V> PathsMap<V> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Renders this paths map as a Graphviz `digraph` description.
///
/// The body is produced by `graphviz_rec`, starting from an empty path;
/// `graph` is used to look up the edges referenced by the map.
pub fn graphviz<G: RankingRuleGraphTrait>(&self, graph: &RankingRuleGraph<G>) -> String {
|
||||
let mut desc = String::new();
|
||||
desc.push_str("digraph G {\n");
|
||||
self.graphviz_rec(&mut desc, vec![], graph);
|
||||
desc.push_str("\n}\n");
|
||||
desc
|
||||
}
|
||||
/// Appends to `desc` the Graphviz statements for this level of the map and,
/// recursively, for all levels below it.
///
/// `path_from` holds the hashes of the edge indices followed to reach this
/// level. The hash of that vector is used as the Graphviz id of the parent
/// node. Edges whose slot in `graph.all_edges` is `None` are skipped together
/// with everything below them.
fn graphviz_rec<G: RankingRuleGraphTrait>(
|
||||
&self,
|
||||
desc: &mut String,
|
||||
path_from: Vec<u64>,
|
||||
graph: &RankingRuleGraph<G>,
|
||||
) {
|
||||
// Graphviz id of the node representing the path followed so far.
let id_from = {
|
||||
let mut h = DefaultHasher::new();
|
||||
path_from.hash(&mut h);
|
||||
h.finish()
|
||||
};
|
||||
for (edge_idx, rest) in self.nodes.iter() {
|
||||
let Some(Edge { from_node, to_node, cost, .. }) = graph.all_edges[*edge_idx as usize].as_ref() else {
|
||||
continue;
|
||||
};
|
||||
// Extend the path with the hash of this edge index.
let mut path_to = path_from.clone();
|
||||
path_to.push({
|
||||
let mut h = DefaultHasher::new();
|
||||
edge_idx.hash(&mut h);
|
||||
h.finish()
|
||||
});
|
||||
// Graphviz id of the node representing the extended path.
let id_to = {
|
||||
let mut h = DefaultHasher::new();
|
||||
path_to.hash(&mut h);
|
||||
h.finish()
|
||||
};
|
||||
writeln!(desc, "{id_to} [label = \"{from_node}→{to_node} [{cost}]\"];").unwrap();
|
||||
writeln!(desc, "{id_from} -> {id_to};").unwrap();
|
||||
|
||||
rest.graphviz_rec(desc, path_to, graph);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
/// Renders the ranking rule graph as a Graphviz `digraph` description,
/// highlighting `path`.
///
/// Nodes are emitted for every query node that is not `QueryNode::Deleted`
/// (root node in blue, end node in red). Every existing edge is drawn in red
/// when its index is listed in `path.edges` and in green otherwise.
pub fn graphviz_with_path(&self, path: &Path) -> String {
|
||||
let mut desc = String::new();
|
||||
desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n");
|
||||
|
||||
// Nodes: labelled with the node's `Debug` output.
for (node_idx, node) in self.query_graph.nodes.iter().enumerate() {
|
||||
if matches!(node, QueryNode::Deleted) {
|
||||
continue;
|
||||
}
|
||||
desc.push_str(&format!("{node_idx} [label = {:?}]", node));
|
||||
if node_idx == self.query_graph.root_node as usize {
|
||||
desc.push_str("[color = blue]");
|
||||
} else if node_idx == self.query_graph.end_node as usize {
|
||||
desc.push_str("[color = red]");
|
||||
}
|
||||
desc.push_str(";\n");
|
||||
}
|
||||
|
||||
// Edges: the index is needed to test membership in `path.edges`.
for (edge_idx, edge) in self.all_edges.iter().enumerate() {
|
||||
let Some(edge) = edge else { continue };
|
||||
let Edge { from_node, to_node, .. } = edge;
|
||||
let color = if path.edges.contains(&(edge_idx as u32)) { "red" } else { "green" };
|
||||
match &edge.details {
|
||||
EdgeDetails::Unconditional => {
|
||||
desc.push_str(&format!(
|
||||
"{from_node} -> {to_node} [label = \"cost {cost}\", color = {color}];\n",
|
||||
cost = edge.cost,
|
||||
));
|
||||
}
|
||||
EdgeDetails::Data(details) => {
|
||||
desc.push_str(&format!(
|
||||
"{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n",
|
||||
cost = edge.cost,
|
||||
edge_label = G::graphviz_edge_details_label(details),
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
desc.push('}');
|
||||
desc
|
||||
}
|
||||
}
|
||||
|
@ -16,9 +16,9 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
|
||||
QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => {
|
||||
match value1 {
|
||||
QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()),
|
||||
QueryTerm::Phrase(phrase1) => {
|
||||
QueryTerm::Phrase { phrase: phrase1 } => {
|
||||
// TODO: remove second unwrap
|
||||
let original = phrase1.last().unwrap().as_ref().unwrap().clone();
|
||||
let original = phrase1.words.last().unwrap().as_ref().unwrap().clone();
|
||||
(
|
||||
WordDerivations {
|
||||
original: original.clone(),
|
||||
@ -26,6 +26,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
|
||||
one_typo: vec![],
|
||||
two_typos: vec![],
|
||||
use_prefix_db: false,
|
||||
synonyms: vec![],
|
||||
split_words: None,
|
||||
},
|
||||
*pos1.end(),
|
||||
)
|
||||
@ -39,6 +41,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
|
||||
one_typo: vec![],
|
||||
two_typos: vec![],
|
||||
use_prefix_db: false,
|
||||
synonyms: vec![],
|
||||
split_words: None,
|
||||
},
|
||||
-100,
|
||||
),
|
||||
@ -63,9 +67,9 @@ pub fn visit_to_node<'transaction, 'from_data>(
|
||||
|
||||
let (derivations2, pos2, ngram_len2) = match value2 {
|
||||
QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()),
|
||||
QueryTerm::Phrase(phrase2) => {
|
||||
QueryTerm::Phrase { phrase: phrase2 } => {
|
||||
// TODO: remove second unwrap
|
||||
let original = phrase2.last().unwrap().as_ref().unwrap().clone();
|
||||
let original = phrase2.words.last().unwrap().as_ref().unwrap().clone();
|
||||
(
|
||||
WordDerivations {
|
||||
original: original.clone(),
|
||||
@ -73,6 +77,8 @@ pub fn visit_to_node<'transaction, 'from_data>(
|
||||
one_typo: vec![],
|
||||
two_typos: vec![],
|
||||
use_prefix_db: false,
|
||||
synonyms: vec![],
|
||||
split_words: None,
|
||||
},
|
||||
*pos2.start(),
|
||||
1,
|
||||
|
@ -2,18 +2,21 @@ pub mod build;
|
||||
pub mod compute_docids;
|
||||
|
||||
use heed::RoTxn;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::empty_paths_cache::EmptyPathsCache;
|
||||
use super::paths_map::PathsMap;
|
||||
|
||||
use super::{EdgeDetails, RankingRuleGraphTrait};
|
||||
use crate::new::db_cache::DatabaseCache;
|
||||
use crate::new::logger::SearchLogger;
|
||||
use crate::new::query_term::WordDerivations;
|
||||
use crate::new::QueryNode;
|
||||
use crate::new::{QueryGraph, QueryNode};
|
||||
use crate::{Index, Result};
|
||||
|
||||
// TODO: intern the strings, refer to them by their pointer?
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub enum WordPair {
|
||||
// TODO: add WordsSwapped and WordPrefixSwapped case
|
||||
Words { left: String, right: String },
|
||||
WordsSwapped { left: String, right: String },
|
||||
WordPrefix { left: String, right_prefix: String },
|
||||
@ -22,6 +25,7 @@ pub enum WordPair {
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct ProximityEdge {
|
||||
// TODO: use a list of pointers to the word pairs instead?
|
||||
pairs: Vec<WordPair>,
|
||||
proximity: u8,
|
||||
}
|
||||
@ -67,10 +71,20 @@ impl RankingRuleGraphTrait for ProximityGraph {
|
||||
|
||||
fn log_state(
|
||||
graph: &super::RankingRuleGraph<Self>,
|
||||
paths: &PathsMap<u64>,
|
||||
paths: &[Vec<u32>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
logger: &mut dyn crate::new::logger::SearchLogger<crate::new::QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
distances: &[Vec<u64>],
|
||||
cost: u64,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) {
|
||||
logger.log_proximity_state(graph, paths, empty_paths_cache);
|
||||
logger.log_proximity_state(
|
||||
graph,
|
||||
paths,
|
||||
empty_paths_cache,
|
||||
universe,
|
||||
distances.to_vec(),
|
||||
cost,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -5,7 +5,7 @@ use roaring::{MultiOps, RoaringBitmap};
|
||||
|
||||
use super::edge_docids_cache::EdgeDocidsCache;
|
||||
use super::empty_paths_cache::EmptyPathsCache;
|
||||
use super::paths_map::PathsMap;
|
||||
|
||||
use super::{RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::new::db_cache::DatabaseCache;
|
||||
|
||||
@ -21,44 +21,65 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
edge_docids_cache: &mut EdgeDocidsCache<G>,
|
||||
empty_paths_cache: &mut EmptyPathsCache,
|
||||
universe: &RoaringBitmap,
|
||||
mut paths: PathsMap<u64>,
|
||||
mut paths: Vec<Vec<u32>>,
|
||||
) -> Result<RoaringBitmap> {
|
||||
paths.sort_unstable();
|
||||
let mut needs_filtering = false;
|
||||
let mut path_bitmaps = vec![];
|
||||
'path_loop: loop {
|
||||
if needs_filtering {
|
||||
for path in paths.iter_mut() {
|
||||
if empty_paths_cache.path_is_empty(path) {
|
||||
path.clear();
|
||||
}
|
||||
}
|
||||
needs_filtering = false;
|
||||
}
|
||||
let Some(edge_indexes) = paths.pop() else {
|
||||
break;
|
||||
};
|
||||
|
||||
paths.remove_edges(&empty_paths_cache.empty_edges);
|
||||
paths.remove_prefixes(&empty_paths_cache.empty_prefixes);
|
||||
if edge_indexes.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
'path_loop: while let Some((edge_indexes, _)) = paths.remove_first() {
|
||||
// if path is excluded, continue...
|
||||
let mut processed_edges = vec![];
|
||||
let mut path_bitmap = universe.clone();
|
||||
let mut visited_edges = vec![];
|
||||
let mut cached_edge_docids = vec![];
|
||||
'edge_loop: for edge_index in edge_indexes {
|
||||
processed_edges.push(edge_index);
|
||||
let edge_docids =
|
||||
edge_docids_cache.get_edge_docids(index, txn, db_cache, edge_index, self)?;
|
||||
visited_edges.push(edge_index);
|
||||
let edge_docids = edge_docids_cache
|
||||
.get_edge_docids(index, txn, db_cache, edge_index, self, universe)?;
|
||||
match edge_docids {
|
||||
BitmapOrAllRef::Bitmap(edge_docids) => {
|
||||
cached_edge_docids.push((edge_index, edge_docids.clone()));
|
||||
let (_, edge_docids) = cached_edge_docids.last().unwrap();
|
||||
if edge_docids.is_disjoint(universe) {
|
||||
// 1. Store in the cache that this edge is empty for this universe
|
||||
empty_paths_cache.forbid_edge(edge_index);
|
||||
// 2. remove all the paths that contain this edge for this universe
|
||||
paths.remove_edge(&edge_index);
|
||||
// 3. remove this edge from the proximity graph
|
||||
|
||||
// 2. remove this edge from the proximity graph
|
||||
self.remove_edge(edge_index);
|
||||
|
||||
// 4. continue executing this function again on the remaining paths
|
||||
edge_docids_cache.cache.remove(&edge_index);
|
||||
needs_filtering = true;
|
||||
// 3. continue executing this function again on the remaining paths
|
||||
continue 'path_loop;
|
||||
} else {
|
||||
path_bitmap &= edge_docids;
|
||||
if path_bitmap.is_disjoint(universe) {
|
||||
// 1. Store in the cache that this prefix is empty for this universe
|
||||
empty_paths_cache
|
||||
.empty_prefixes
|
||||
.insert(processed_edges.iter().copied(), ());
|
||||
// 2. remove all the paths beginning with this prefix
|
||||
paths.remove_prefix(&processed_edges);
|
||||
// 3. continue executing this function again on the remaining paths?
|
||||
needs_filtering = true;
|
||||
empty_paths_cache.forbid_prefix(&visited_edges);
|
||||
// if the intersection between this edge and any
|
||||
// previous one is disjoint with the universe,
|
||||
// then we add these two edges to the empty_path_cache
|
||||
for (edge_index2, edge_docids2) in
|
||||
cached_edge_docids[..cached_edge_docids.len() - 1].iter()
|
||||
{
|
||||
let intersection = edge_docids & edge_docids2;
|
||||
if intersection.is_disjoint(universe) {
|
||||
empty_paths_cache
|
||||
.forbid_couple_edges(*edge_index2, edge_index);
|
||||
}
|
||||
}
|
||||
continue 'path_loop;
|
||||
}
|
||||
}
|
||||
@ -68,6 +89,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
|
||||
}
|
||||
path_bitmaps.push(path_bitmap);
|
||||
}
|
||||
|
||||
Ok(MultiOps::union(path_bitmaps))
|
||||
}
|
||||
}
|
||||
|
@ -2,16 +2,18 @@ use heed::{BytesDecode, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::empty_paths_cache::EmptyPathsCache;
|
||||
use super::paths_map::PathsMap;
|
||||
use super::{EdgeDetails, RankingRuleGraphTrait};
|
||||
|
||||
use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
|
||||
use crate::new::db_cache::DatabaseCache;
|
||||
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
|
||||
use crate::new::QueryNode;
|
||||
use crate::new::logger::SearchLogger;
|
||||
use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
|
||||
use crate::new::resolve_query_graph::resolve_phrase;
|
||||
use crate::new::{QueryGraph, QueryNode};
|
||||
use crate::{Index, Result, RoaringBitmapCodec};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum TypoEdge {
|
||||
Phrase,
|
||||
Phrase { phrase: Phrase },
|
||||
Word { derivations: WordDerivations, nbr_typos: u8 },
|
||||
}
|
||||
|
||||
@ -23,7 +25,7 @@ impl RankingRuleGraphTrait for TypoGraph {
|
||||
|
||||
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String {
|
||||
match edge {
|
||||
TypoEdge::Phrase => format!(", 0 typos"),
|
||||
TypoEdge::Phrase { .. } => ", 0 typos".to_owned(),
|
||||
TypoEdge::Word { nbr_typos, .. } => format!(", {nbr_typos} typos"),
|
||||
}
|
||||
}
|
||||
@ -33,9 +35,9 @@ impl RankingRuleGraphTrait for TypoGraph {
|
||||
txn: &'transaction RoTxn,
|
||||
db_cache: &mut DatabaseCache<'transaction>,
|
||||
edge: &Self::EdgeDetails,
|
||||
) -> Result<roaring::RoaringBitmap> {
|
||||
) -> Result<RoaringBitmap> {
|
||||
match edge {
|
||||
TypoEdge::Phrase => todo!(),
|
||||
TypoEdge::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase),
|
||||
TypoEdge::Word { derivations, nbr_typos } => {
|
||||
let words = match nbr_typos {
|
||||
0 => &derivations.zero_typo,
|
||||
@ -68,21 +70,23 @@ impl RankingRuleGraphTrait for TypoGraph {
|
||||
_index: &Index,
|
||||
_txn: &'transaction RoTxn,
|
||||
_db_cache: &mut DatabaseCache<'transaction>,
|
||||
from_node: &QueryNode,
|
||||
_from_node: &QueryNode,
|
||||
) -> Result<Option<Self::BuildVisitedFromNode>> {
|
||||
Ok(Some(()))
|
||||
}
|
||||
|
||||
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
|
||||
index: &Index,
|
||||
txn: &'transaction RoTxn,
|
||||
db_cache: &mut DatabaseCache<'transaction>,
|
||||
_index: &Index,
|
||||
_txn: &'transaction RoTxn,
|
||||
_db_cache: &mut DatabaseCache<'transaction>,
|
||||
to_node: &QueryNode,
|
||||
from_node_data: &'from_data Self::BuildVisitedFromNode,
|
||||
_from_node_data: &'from_data Self::BuildVisitedFromNode,
|
||||
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
|
||||
match to_node {
|
||||
QueryNode::Term(LocatedQueryTerm { value, .. }) => match value {
|
||||
QueryTerm::Phrase(_) => Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase))]),
|
||||
QueryTerm::Phrase { phrase } => {
|
||||
Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase: phrase.clone() }))])
|
||||
}
|
||||
QueryTerm::Word { derivations } => {
|
||||
let mut edges = vec![];
|
||||
if !derivations.zero_typo.is_empty() || derivations.use_prefix_db {
|
||||
@ -121,11 +125,14 @@ impl RankingRuleGraphTrait for TypoGraph {
|
||||
}
|
||||
|
||||
fn log_state(
|
||||
graph: &super::RankingRuleGraph<Self>,
|
||||
paths: &PathsMap<u64>,
|
||||
graph: &RankingRuleGraph<Self>,
|
||||
paths: &[Vec<u32>],
|
||||
empty_paths_cache: &EmptyPathsCache,
|
||||
logger: &mut dyn crate::new::logger::SearchLogger<crate::new::QueryGraph>,
|
||||
universe: &RoaringBitmap,
|
||||
distances: &[Vec<u64>],
|
||||
cost: u64,
|
||||
logger: &mut dyn SearchLogger<QueryGraph>,
|
||||
) {
|
||||
logger.log_typo_state(graph, paths, empty_paths_cache);
|
||||
logger.log_typo_state(graph, paths, empty_paths_cache, universe, distances.to_vec(), cost);
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user