From 864f6410ed492a315838717c4e46b23a626556f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 21 Feb 2023 09:46:49 +0100 Subject: [PATCH] Introduce a structure to represent a set of graph paths efficiently --- .../new/ranking_rule_graph/paths_map.rs | 427 ++++++++++++++++++ 1 file changed, 427 insertions(+) create mode 100644 milli/src/search/new/ranking_rule_graph/paths_map.rs diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs new file mode 100644 index 000000000..589a1a52f --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -0,0 +1,427 @@ +use std::collections::hash_map::DefaultHasher; +use std::collections::HashSet; +use std::fmt::Write; +use std::hash::{Hash, Hasher}; + +use super::cheapest_paths::Path; +use super::{EdgeDetails, EdgeIndex, RankingRuleGraph, RankingRuleGraphTrait, Edge}; +use crate::new::QueryNode; + + +#[derive(Debug)] +pub struct PathsMap { + nodes: Vec<(EdgeIndex, PathsMap)>, + value: Option +} +impl Default for PathsMap { + fn default() -> Self { + Self { nodes: vec![], value: None } + } +} + +impl PathsMap { + pub fn from_paths(paths: &[Path]) -> Self { + let mut result = Self::default(); + for p in paths { + result.add_path(p); + } + result + } + pub fn add_path(&mut self, path: &Path) { + self.insert(path.edges.iter().copied(), path.cost); + } +} +impl PathsMap { + pub fn is_empty(&self) -> bool { + self.nodes.is_empty() && self.value.is_none() + } + + pub fn insert(&mut self, mut edges: impl Iterator, value: V) { + match edges.next() { + None => { + self.value = Some(value); + } + Some(first_edge) => { + // comment + for (edge, next_node) in &mut self.nodes { + if edge == &first_edge { + return next_node.insert(edges, value); + } + } + let mut rest = PathsMap::default(); + rest.insert(edges, value); + self.nodes.push((first_edge, rest)); + } + } + } + fn remove_first_rec(&mut self, cur: &mut Vec) -> (bool, V) { + let Some((first_edge, rest)) = self.nodes.first_mut() else { + // The PathsMap has to be correct by construction here, otherwise + // the unwrap() will crash + return (true, self.value.take().unwrap()) + }; + cur.push(*first_edge); + let (rest_is_empty, value) = rest.remove_first_rec(cur); + if rest_is_empty { + self.nodes.remove(0); + (self.nodes.is_empty(), value) + } else { + (false, value) + } + } + pub fn remove_first(&mut self) -> Option<(Vec, V)> { + if self.is_empty() { + return None + } + + let mut result = vec![]; + let (_, value) = self.remove_first_rec(&mut result); + Some((result, value)) + } + pub fn iterate_rec(&self, cur: &mut Vec, visit: &mut impl FnMut(&Vec, &V)) { + if let Some(value) = &self.value { + visit(cur, value); + } + for (first_edge, rest) in self.nodes.iter() { + cur.push(*first_edge); + rest.iterate_rec(cur, visit); + cur.pop(); + } + } + pub fn iterate(&self, mut visit: impl FnMut(&Vec, &V)) { + self.iterate_rec(&mut vec![], &mut visit) + } + + pub fn remove_prefixes(&mut self, prefixes: &PathsMap) { + prefixes.iterate(|prefix, _v| { + self.remove_prefix(prefix); + }); + } + pub fn remove_edges(&mut self, forbidden_edges: &HashSet) { + let mut i = 0; + while i < self.nodes.len() { + let should_remove = if forbidden_edges.contains(&self.nodes[i].0) { + true + } else if !self.nodes[i].1.nodes.is_empty() { + self.nodes[i].1.remove_edges(forbidden_edges); + self.nodes[i].1.nodes.is_empty() + } else { + false + }; + if should_remove { + self.nodes.remove(i); + } else { + i += 1; + } + } + } + pub fn remove_edge(&mut self, forbidden_edge: &EdgeIndex) { + let mut i = 0; + while i < self.nodes.len() { + let should_remove = if &self.nodes[i].0 == forbidden_edge { + true + } else if !self.nodes[i].1.nodes.is_empty() { + self.nodes[i].1.remove_edge(forbidden_edge); + self.nodes[i].1.nodes.is_empty() + } else { + false + }; + if should_remove { + self.nodes.remove(i); + } else { + i += 1; + } + } + } + pub fn remove_prefix(&mut self, forbidden_prefix: &[EdgeIndex]) { + let [first_edge, remaining_prefix @ ..] = forbidden_prefix else { + self.nodes.clear(); + self.value = None; + return; + }; + + let mut i = 0; + while i < self.nodes.len() { + let edge = self.nodes[i].0; + let should_remove = if edge == *first_edge { + self.nodes[i].1.remove_prefix(remaining_prefix); + self.nodes[i].1.nodes.is_empty() + } else { + false + }; + if should_remove { + self.nodes.remove(i); + } else { + i += 1; + } + } + } + + pub fn edge_indices_after_prefix(&self, prefix: &[EdgeIndex]) -> Vec { + let [first_edge, remaining_prefix @ ..] = prefix else { + return self.nodes.iter().map(|n| n.0).collect(); + }; + for (edge, rest) in self.nodes.iter(){ + if edge == first_edge { + return rest.edge_indices_after_prefix(remaining_prefix); + } + } + vec![] + } + + pub fn contains_prefix_of_path(&self, path: &[EdgeIndex]) -> bool { + if self.value.is_some() { + return true + } + match path { + [] => { + false + } + [first_edge, remaining_path @ ..] => { + for (edge, rest) in self.nodes.iter(){ + if edge == first_edge { + return rest.contains_prefix_of_path(remaining_path); + } + } + false + } + } + } + + pub fn graphviz(&self, graph: &RankingRuleGraph) -> String { + let mut desc = String::new(); + desc.push_str("digraph G {\n"); + self.graphviz_rec(&mut desc, vec![], graph); + desc.push_str("\n}\n"); + desc + } + fn graphviz_rec(&self, desc: &mut String, path_from: Vec, graph: &RankingRuleGraph) { + let id_from = { + let mut h = DefaultHasher::new(); + path_from.hash(&mut h); + h.finish() + }; + for (edge_idx, rest) in self.nodes.iter() { + let Some(Edge { from_node, to_node, cost, details }) = graph.get_edge(*edge_idx).as_ref() else { + continue; + }; + let mut path_to = path_from.clone(); + path_to.push({ + let mut h = DefaultHasher::new(); + edge_idx.hash(&mut h); + h.finish() + }); + let id_to = { + let mut h = DefaultHasher::new(); + path_to.hash(&mut h); + h.finish() + }; + writeln!(desc, "{id_to} [label = \"{from_node}→{to_node} [{cost}]\"];").unwrap(); + writeln!(desc, "{id_from} -> {id_to};").unwrap(); + + rest.graphviz_rec(desc, path_to, graph); + } + } +} + +impl RankingRuleGraph { + + pub fn graphviz_with_path(&self, path: &Path) -> String { + let mut desc = String::new(); + desc.push_str("digraph G {\nrankdir = LR;\nnode [shape = \"record\"]\n"); + + for (node_idx, node) in self.query_graph.nodes.iter().enumerate() { + if matches!(node, QueryNode::Deleted) { + continue; + } + desc.push_str(&format!("{node_idx} [label = {:?}]", node)); + if node_idx == self.query_graph.root_node { + desc.push_str("[color = blue]"); + } else if node_idx == self.query_graph.end_node { + desc.push_str("[color = red]"); + } + desc.push_str(";\n"); + } + + for (edge_idx, edge) in self.all_edges.iter().enumerate() { + let Some(edge) = edge else { continue }; + let Edge { from_node, to_node, cost, details } = edge; + let color = if path.edges.contains(&EdgeIndex(edge_idx)) { + "red" + } else { + "green" + }; + match &edge.details { + EdgeDetails::Unconditional => { + desc.push_str(&format!( + "{from_node} -> {to_node} [label = \"cost {cost}\", color = {color}];\n", + cost = edge.cost, + )); + } + EdgeDetails::Data(details) => { + desc.push_str(&format!( + "{from_node} -> {to_node} [label = \"cost {cost} {edge_label}\", color = {color}];\n", + cost = edge.cost, + edge_label = G::edge_details_dot_label(details), + )); + } + } + } + + desc.push('}'); + desc + } + +} + +#[cfg(test)] +mod tests { + use super::PathsMap; + use crate::db_snap; + use crate::index::tests::TempIndex; + use crate::new::db_cache::DatabaseCache; + use crate::new::ranking_rule_graph::cheapest_paths::KCheapestPathsState; + use crate::new::ranking_rule_graph::empty_paths_cache::EmptyPathsCache; + use crate::new::ranking_rule_graph::proximity::ProximityGraph; + use crate::new::ranking_rule_graph::{RankingRuleGraph, EdgeIndex}; + use crate::search::new::query_term::{word_derivations, LocatedQueryTerm}; + use crate::search::new::QueryGraph; + use charabia::Tokenize; + + #[test] + fn paths_tree() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + index + .update_settings(|s| { + s.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "text": "0 1 2 3 4 5" + }, + { + "text": "0 a 1 b 2 3 4 5" + }, + { + "text": "0 a 1 b 3 a 4 b 5" + }, + { + "text": "0 a a 1 b 2 3 4 5" + }, + { + "text": "0 a a a a 1 b 3 45" + }, + ])) + .unwrap(); + + db_snap!(index, word_pair_proximity_docids, @"679d1126b569b3e8b10dd937c3faedf9"); + + let txn = index.read_txn().unwrap(); + let mut db_cache = DatabaseCache::default(); + let fst = index.words_fst(&txn).unwrap(); + let query = + LocatedQueryTerm::from_query("0 1 2 3 4 5".tokenize(), None, |word, is_prefix| { + word_derivations(&index, &txn, word, if word.len() < 3 { + 0 + } else if word.len() < 6 { + 1 + } else { + 2 + },is_prefix, &fst) + }) + .unwrap(); + let graph = QueryGraph::from_query(&index, &txn, &mut db_cache, query).unwrap(); + let empty_paths_cache = EmptyPathsCache::default(); + let mut db_cache = DatabaseCache::default(); + + let mut prox_graph = + RankingRuleGraph::::build(&index, &txn, &mut db_cache, graph).unwrap(); + + println!("{}", prox_graph.graphviz()); + + let mut state = KCheapestPathsState::new(&prox_graph).unwrap(); + + let mut path_tree = PathsMap::default(); + while state.next_cost() <= 6 { + let next_state = state.compute_paths_of_next_lowest_cost(&mut prox_graph, &empty_paths_cache, &mut path_tree); + if let Some(next_state) = next_state { + state = next_state; + } else { + break; + } + } + + let desc = path_tree.graphviz(&prox_graph); + println!("{desc}"); + + // let path = vec![EdgeIndex { from: 0, to: 2, edge_idx: 0 }, EdgeIndex { from: 2, to: 3, edge_idx: 0 }, EdgeIndex { from: 3, to: 4, edge_idx: 0 }, EdgeIndex { from: 4, to: 5, edge_idx: 0 }, EdgeIndex { from: 5, to: 8, edge_idx: 0 }, EdgeIndex { from: 8, to: 1, edge_idx: 0 }, EdgeIndex { from: 1, to: 10, edge_idx: 0 }]; + // println!("{}", psath_tree.contains_prefix_of_path(&path)); + + + // let path = vec![EdgeIndex { from: 0, to: 2, edge_idx: 0 }, EdgeIndex { from: 2, to: 3, edge_idx: 0 }, EdgeIndex { from: 3, to: 4, edge_idx: 0 }, EdgeIndex { from: 4, to: 5, edge_idx: 0 }, EdgeIndex { from: 5, to: 6, edge_idx: 0 }, EdgeIndex { from: 6, to: 7, edge_idx: 0 }, EdgeIndex { from: 7, to: 1, edge_idx: 0 }]; + + + // path_tree.iterate(|path, cost| { + // println!("cost {cost} for path: {path:?}"); + // }); + + // path_tree.remove_forbidden_prefix(&[ + // EdgeIndex { from: 0, to: 2, edge_idx: 0 }, + // EdgeIndex { from: 2, to: 3, edge_idx: 2 }, + // ]); + // let desc = path_tree.graphviz(); + // println!("{desc}"); + + // path_tree.remove_forbidden_edge(&EdgeIndex { from: 5, to: 6, cost: 1 }); + + // let desc = path_tree.graphviz(); + // println!("AFTER REMOVING 5-6 [1]:\n{desc}"); + + // path_tree.remove_forbidden_edge(&EdgeIndex { from: 3, to: 4, cost: 1 }); + + // let desc = path_tree.graphviz(); + // println!("AFTER REMOVING 3-4 [1]:\n{desc}"); + + // let p = path_tree.remove_first(); + // println!("PATH: {p:?}"); + // let desc = path_tree.graphviz(); + // println!("AFTER REMOVING: {desc}"); + + // let p = path_tree.remove_first(); + // println!("PATH: {p:?}"); + // let desc = path_tree.graphviz(); + // println!("AFTER REMOVING: {desc}"); + + // path_tree.remove_all_containing_edge(&EdgeIndex { from: 5, to: 6, cost: 2 }); + + // let desc = path_tree.graphviz(); + // println!("{desc}"); + + // let first_edges = path_tree.remove_first().unwrap(); + // println!("{first_edges:?}"); + // let desc = path_tree.graphviz(); + // println!("{desc}"); + + // let first_edges = path_tree.remove_first().unwrap(); + // println!("{first_edges:?}"); + // let desc = path_tree.graphviz(); + // println!("{desc}"); + + // let first_edges = path_tree.remove_first().unwrap(); + // println!("{first_edges:?}"); + // let desc = path_tree.graphviz(); + // println!("{desc}"); + + // println!("{path_tree:?}"); + } + + + #[test] + fn test_contains_prefix_of_path() { + + } +}