Move crates under a sub folder to clean up the code

Clément Renault
2024-10-21 08:18:43 +02:00
parent 30f3c30389
commit 9c1e54a2c8
1062 changed files with 19 additions and 20 deletions


@@ -0,0 +1,92 @@
use std::collections::HashSet;
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, MappedInterner};
use crate::search::new::query_graph::{QueryNode, QueryNodeData};
use crate::search::new::small_bitmap::SmallBitmap;
use crate::search::new::{QueryGraph, SearchContext};
use crate::Result;
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
/// Build the ranking rule graph from the given query graph
pub fn build(
ctx: &mut SearchContext<'_>,
query_graph: QueryGraph,
cost_of_ignoring_node: MappedInterner<QueryNode, Option<(u32, SmallBitmap<QueryNode>)>>,
) -> Result<Self> {
let QueryGraph { nodes: graph_nodes, .. } = &query_graph;
let mut conditions_interner = DedupInterner::default();
let mut edges_store = DedupInterner::default();
let mut edges_of_node = query_graph.nodes.map(|_| HashSet::new());
for (source_id, source_node) in graph_nodes.iter() {
let new_edges = edges_of_node.get_mut(source_id);
for dest_idx in source_node.successors.iter() {
let src_term = match &source_node.data {
QueryNodeData::Term(t) => Some(t),
QueryNodeData::Start => None,
QueryNodeData::Deleted | QueryNodeData::End => panic!(),
};
let dest_node = graph_nodes.get(dest_idx);
let dest_term = match &dest_node.data {
QueryNodeData::Term(t) => t,
QueryNodeData::End => {
let new_edge_id = edges_store.insert(Some(Edge {
source_node: source_id,
dest_node: dest_idx,
cost: 0,
condition: None,
nodes_to_skip: SmallBitmap::for_interned_values_in(graph_nodes),
}));
new_edges.insert(new_edge_id);
continue;
}
QueryNodeData::Deleted | QueryNodeData::Start => panic!(),
};
if let Some((cost_of_ignoring, forbidden_nodes)) =
cost_of_ignoring_node.get(dest_idx)
{
let dest = graph_nodes.get(dest_idx);
let dest_size = match &dest.data {
QueryNodeData::Term(term) => term.term_ids.len(),
_ => panic!(),
};
let new_edge_id = edges_store.insert(Some(Edge {
source_node: source_id,
dest_node: dest_idx,
cost: *cost_of_ignoring * dest_size as u32,
condition: None,
nodes_to_skip: forbidden_nodes.clone(),
}));
new_edges.insert(new_edge_id);
}
let edges = G::build_edges(ctx, &mut conditions_interner, src_term, dest_term)?;
if edges.is_empty() {
continue;
}
for (cost, condition) in edges {
let new_edge_id = edges_store.insert(Some(Edge {
source_node: source_id,
dest_node: dest_idx,
cost,
condition: Some(condition),
nodes_to_skip: SmallBitmap::for_interned_values_in(graph_nodes),
}));
new_edges.insert(new_edge_id);
}
}
}
let edges_store = edges_store.freeze();
let edges_of_node =
edges_of_node.map(|edges| SmallBitmap::from_iter(edges.iter().copied(), &edges_store));
let conditions_interner = conditions_interner.freeze();
Ok(RankingRuleGraph { query_graph, edges_store, edges_of_node, conditions_interner })
}
}


@@ -0,0 +1,400 @@
/** Implements a "PathVisitor" which finds all paths of a certain cost
from the START node to the END node of a ranking rule graph.
A path is a list of conditions. A condition is the data associated with
an edge, given by the ranking rule. Some edges don't have a condition associated
with them; they are "unconditional". These kinds of edges are used to "skip" a node.
The algorithm uses a depth-first search. It benefits from two main optimisations:
- The list of all possible costs to go from any node to the END node is precomputed
- The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges
untraversable depending on what other edges were selected.
These two optimisations are meant to avoid traversing edges that wouldn't lead
to a valid path. In practically all cases, we avoid the exponential complexity
that is inherent to depth-first search in a large ranking rule graph.
The DeadEndsCache is a sort of prefix tree which associates a list of forbidden
conditions with a list of traversed conditions.
For example, the DeadEndsCache could say the following:
- Immediately, from the start, the conditions `[a,b]` are forbidden
- if we take the condition `c`, then the conditions `[e]` are also forbidden
- and if after that, we take `f`, then `[h,i]` are also forbidden
- etc.
- if we take `g`, then `[f]` is also forbidden
- etc.
- etc.
As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden
conditions in memory. Then, we know to avoid all edges which have a condition that is forbidden.
When a path is found from START to END, we give it to the `visit` closure.
This closure takes a mutable reference to the `DeadEndsCache`. This means that
the caller can update this cache. Therefore, we must handle the case where the
DeadEndsCache has been updated. This means potentially backtracking up to the point
where the traversed conditions are all allowed by the new DeadEndsCache.
The algorithm also implements the `TermsMatchingStrategy` logic.
Some edges are augmented with a list of "nodes_to_skip". Skipping
a node means "reaching this node through an unconditional edge". If we have
already traversed (i.e. not skipped) a node that is in this list, then we know that we
can't traverse this edge. Otherwise, we traverse the edge but make sure to skip any
future node that was present in the "nodes_to_skip" list.
The caller can decide to stop the path finding algorithm
by returning a `ControlFlow::Break` from the `visit` closure.
*/
use std::collections::{BTreeSet, VecDeque};
use std::iter::FromIterator;
use std::ops::ControlFlow;
use fxhash::FxHashSet;
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::{Interned, MappedInterner};
use crate::search::new::query_graph::QueryNode;
use crate::search::new::small_bitmap::SmallBitmap;
use crate::Result;
/// Closure which processes a path found by the `PathVisitor`
type VisitFn<'f, G> = &'f mut dyn FnMut(
// the path as a list of conditions
&[Interned<<G as RankingRuleGraphTrait>::Condition>],
&mut RankingRuleGraph<G>,
// a mutable reference to the DeadEndsCache, to update it in case the given
// path doesn't resolve to any valid document ids
&mut DeadEndsCache<<G as RankingRuleGraphTrait>::Condition>,
) -> Result<ControlFlow<()>>;
/// A structure which is not updated by the traversal algorithm itself.
/// It can, however, be updated by the `visit` closure once a valid path has been found.
struct VisitorContext<'a, G: RankingRuleGraphTrait> {
graph: &'a mut RankingRuleGraph<G>,
all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
}
/// The internal state of the traversal algorithm
struct VisitorState<G: RankingRuleGraphTrait> {
/// Budget from the current node to the end node
remaining_cost: u64,
/// Previously visited conditions, in order.
path: Vec<Interned<G::Condition>>,
/// Previously visited conditions, as an efficient and compact set.
visited_conditions: SmallBitmap<G::Condition>,
/// Previously visited (i.e. not skipped) nodes, as an efficient and compact set.
visited_nodes: SmallBitmap<QueryNode>,
/// The conditions that cannot be visited anymore
forbidden_conditions: SmallBitmap<G::Condition>,
/// The nodes that cannot be visited anymore (they must be skipped)
nodes_to_skip: SmallBitmap<QueryNode>,
}
/// See module documentation
pub struct PathVisitor<'a, G: RankingRuleGraphTrait> {
state: VisitorState<G>,
ctx: VisitorContext<'a, G>,
}
impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> {
pub fn new(
cost: u64,
graph: &'a mut RankingRuleGraph<G>,
all_costs_from_node: &'a MappedInterner<QueryNode, Vec<u64>>,
dead_ends_cache: &'a mut DeadEndsCache<G::Condition>,
) -> Self {
Self {
state: VisitorState {
remaining_cost: cost,
path: vec![],
visited_conditions: SmallBitmap::for_interned_values_in(&graph.conditions_interner),
visited_nodes: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
forbidden_conditions: SmallBitmap::for_interned_values_in(
&graph.conditions_interner,
),
nodes_to_skip: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes),
},
ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache },
}
}
/// See module documentation
pub fn visit_paths(mut self, visit: VisitFn<'_, G>) -> Result<()> {
let _ =
self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?;
Ok(())
}
}
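// A minimal, std-only sketch of the pruning idea described in the module documentation
// (hypothetical names, not used by the rest of this module): an edge is only followed when
// its cost fits in the remaining budget *and* the destination can still reach END with
// exactly that budget, using precomputed cost sets, which is the role played by
// `all_costs_from_node` in the real visitor.
#[cfg(test)]
mod budgeted_dfs_sketch {
    /// `edges[n]` lists `(dest, cost)` pairs; `costs_to_end[n]` is the sorted list of
    /// possible costs from `n` to END.
    fn visit(
        node: usize,
        remaining: u64,
        end: usize,
        edges: &[Vec<(usize, u64)>],
        costs_to_end: &[Vec<u64>],
        path: &mut Vec<usize>,
        out: &mut Vec<Vec<usize>>,
    ) {
        if node == end {
            out.push(path.clone());
            return;
        }
        for &(dest, cost) in &edges[node] {
            // Prune: skip the edge if no path of cost `remaining - cost` exists from `dest`.
            if cost > remaining || costs_to_end[dest].binary_search(&(remaining - cost)).is_err()
            {
                continue;
            }
            path.push(dest);
            visit(dest, remaining - cost, end, edges, costs_to_end, path, out);
            path.pop();
        }
    }

    #[test]
    fn only_paths_of_the_requested_cost_are_visited() {
        // Nodes 0 (START), 1, 2 (END); edges: 0 -> 1 (cost 1), 1 -> 2 (cost 1), 0 -> 2 (cost 3).
        let edges = vec![vec![(1, 1), (2, 3)], vec![(2, 1)], vec![]];
        let costs_to_end = vec![vec![2, 3], vec![1], vec![0]];
        let mut paths = vec![];
        visit(0, 2, 2, &edges, &costs_to_end, &mut vec![], &mut paths);
        assert_eq!(paths, vec![vec![1, 2]]);
    }
}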
impl<G: RankingRuleGraphTrait> VisitorState<G> {
/// Visits a node: traverse all its valid conditional and unconditional edges.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_node(
&mut self,
from_node: Interned<QueryNode>,
visit: VisitFn<'_, G>,
ctx: &mut VisitorContext<'_, G>,
) -> Result<ControlFlow<(), bool>> {
// Whether any valid path was found from this point.
// If a valid path was found, then the DeadEndsCache may have been updated,
// and we will need to do more work to potentially backtrack.
let mut any_valid = false;
let edges = ctx.graph.edges_of_node.get(from_node).clone();
for edge_idx in edges.iter() {
// Could be `None` if the edge was deleted.
let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue };
if self.remaining_cost < edge.cost as u64 {
continue;
}
self.remaining_cost -= edge.cost as u64;
let cf = match edge.condition {
Some(condition) => self.visit_condition(
condition,
edge.dest_node,
&edge.nodes_to_skip,
visit,
ctx,
)?,
None => self.visit_no_condition(edge.dest_node, &edge.nodes_to_skip, visit, ctx)?,
};
self.remaining_cost += edge.cost as u64;
let ControlFlow::Continue(next_any_valid) = cf else {
return Ok(ControlFlow::Break(()));
};
any_valid |= next_any_valid;
if next_any_valid {
// backtrack as much as possible if a valid path was found and the dead_ends_cache
// was updated such that the current prefix is now invalid
self.forbidden_conditions = ctx
.dead_ends_cache
.forbidden_conditions_for_all_prefixes_up_to(self.path.iter().copied());
if self.visited_conditions.intersects(&self.forbidden_conditions) {
return Ok(ControlFlow::Continue(true));
}
}
}
Ok(ControlFlow::Continue(any_valid))
}
/// Visits an unconditional edge.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_no_condition(
&mut self,
dest_node: Interned<QueryNode>,
edge_new_nodes_to_skip: &SmallBitmap<QueryNode>,
visit: VisitFn<'_, G>,
ctx: &mut VisitorContext<'_, G>,
) -> Result<ControlFlow<(), bool>> {
if !ctx
.all_costs_from_node
.get(dest_node)
.iter()
.any(|next_cost| *next_cost == self.remaining_cost)
{
return Ok(ControlFlow::Continue(false));
}
// We've reached the END node!
if dest_node == ctx.graph.query_graph.end_node {
let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?;
// We could change the return type of the visit closure such that the caller
// tells us whether the dead ends cache was updated or not.
// Alternatively, maybe the DeadEndsCache should have a generation number
// attached to it, so that we don't need to play with these booleans at all.
match control_flow {
ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)),
ControlFlow::Break(_) => Ok(ControlFlow::Break(())),
}
} else {
let old_fbct = self.nodes_to_skip.clone();
self.nodes_to_skip.union(edge_new_nodes_to_skip);
let cf = self.visit_node(dest_node, visit, ctx)?;
self.nodes_to_skip = old_fbct;
Ok(cf)
}
}
/// Visits a conditional edge.
///
/// Returns ControlFlow::Break if the path finding algorithm should stop.
/// Returns whether a valid path was found from this node otherwise.
fn visit_condition(
&mut self,
condition: Interned<G::Condition>,
dest_node: Interned<QueryNode>,
edge_new_nodes_to_skip: &SmallBitmap<QueryNode>,
visit: VisitFn<'_, G>,
ctx: &mut VisitorContext<'_, G>,
) -> Result<ControlFlow<(), bool>> {
assert!(dest_node != ctx.graph.query_graph.end_node);
if self.forbidden_conditions.contains(condition)
|| self.nodes_to_skip.contains(dest_node)
|| edge_new_nodes_to_skip.intersects(&self.visited_nodes)
{
return Ok(ControlFlow::Continue(false));
}
// Checking that from the destination node, there is at least
// one cost that we can visit that corresponds to our remaining budget.
if !ctx
.all_costs_from_node
.get(dest_node)
.iter()
.any(|next_cost| *next_cost == self.remaining_cost)
{
return Ok(ControlFlow::Continue(false));
}
self.path.push(condition);
self.visited_nodes.insert(dest_node);
self.visited_conditions.insert(condition);
let old_forb_cond = self.forbidden_conditions.clone();
if let Some(next_forbidden) =
ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied())
{
self.forbidden_conditions.union(&next_forbidden);
}
let old_nodes_to_skip = self.nodes_to_skip.clone();
self.nodes_to_skip.union(edge_new_nodes_to_skip);
let cf = self.visit_node(dest_node, visit, ctx)?;
self.nodes_to_skip = old_nodes_to_skip;
self.forbidden_conditions = old_forb_cond;
self.visited_conditions.remove(condition);
self.visited_nodes.remove(dest_node);
self.path.pop();
Ok(cf)
}
}
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
pub fn find_all_costs_to_end(&self) -> MappedInterner<QueryNode, Vec<u64>> {
let mut costs_to_end = self.query_graph.nodes.map(|_| vec![]);
self.traverse_breadth_first_backward(self.query_graph.end_node, |cur_node| {
if cur_node == self.query_graph.end_node {
*costs_to_end.get_mut(self.query_graph.end_node) = vec![0];
return;
}
let mut self_costs = Vec::<u64>::new();
let cur_node_edges = &self.edges_of_node.get(cur_node);
for edge_idx in cur_node_edges.iter() {
let edge = self.edges_store.get(edge_idx).as_ref().unwrap();
let succ_node = edge.dest_node;
let succ_costs = costs_to_end.get(succ_node);
for succ_cost in succ_costs {
self_costs.push(edge.cost as u64 + succ_cost);
}
}
self_costs.sort_unstable();
self_costs.dedup();
*costs_to_end.get_mut(cur_node) = self_costs;
});
costs_to_end
}
pub fn update_all_costs_before_node(
&self,
node_with_removed_outgoing_conditions: Interned<QueryNode>,
costs: &mut MappedInterner<QueryNode, Vec<u64>>,
) {
// Traverse the graph backward from the target node, recomputing the cost for each of its predecessors.
// We first check that no other node is contributing the same total cost to a predecessor before removing
// the cost from the predecessor.
self.traverse_breadth_first_backward(node_with_removed_outgoing_conditions, |cur_node| {
let mut costs_to_remove = FxHashSet::default();
costs_to_remove.extend(costs.get(cur_node).iter().copied());
let cur_node_edges = &self.edges_of_node.get(cur_node);
for edge_idx in cur_node_edges.iter() {
let edge = self.edges_store.get(edge_idx).as_ref().unwrap();
for cost in costs.get(edge.dest_node).iter() {
costs_to_remove.remove(&(*cost + edge.cost as u64));
if costs_to_remove.is_empty() {
return;
}
}
}
if costs_to_remove.is_empty() {
return;
}
let mut new_costs = BTreeSet::from_iter(costs.get(cur_node).iter().copied());
for c in costs_to_remove {
new_costs.remove(&c);
}
*costs.get_mut(cur_node) = new_costs.into_iter().collect();
});
}
/// Traverse the graph backwards from the given node such that every time
/// a node is visited, we are guaranteed that all its successors either:
/// 1. have already been visited; OR
/// 2. were not reachable from the given node
pub fn traverse_breadth_first_backward(
&self,
from: Interned<QueryNode>,
mut visit: impl FnMut(Interned<QueryNode>),
) {
let mut reachable = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
{
// go backward to get the set of all reachable nodes from the given node
// the nodes that are not reachable will be set as `visited`
let mut stack = VecDeque::new();
let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
enqueued.insert(from);
stack.push_back(from);
while let Some(n) = stack.pop_front() {
if reachable.contains(n) {
continue;
}
reachable.insert(n);
for prev_node in self.query_graph.nodes.get(n).predecessors.iter() {
if !enqueued.contains(prev_node) && !reachable.contains(prev_node) {
stack.push_back(prev_node);
enqueued.insert(prev_node);
}
}
}
};
let mut unreachable_or_visited =
SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
for (n, _) in self.query_graph.nodes.iter() {
if !reachable.contains(n) {
unreachable_or_visited.insert(n);
}
}
let mut enqueued = SmallBitmap::for_interned_values_in(&self.query_graph.nodes);
let mut stack = VecDeque::new();
enqueued.insert(from);
stack.push_back(from);
while let Some(cur_node) = stack.pop_front() {
if !self.query_graph.nodes.get(cur_node).successors.is_subset(&unreachable_or_visited) {
stack.push_back(cur_node);
continue;
}
unreachable_or_visited.insert(cur_node);
visit(cur_node);
for prev_node in self.query_graph.nodes.get(cur_node).predecessors.iter() {
if !enqueued.contains(prev_node) && !unreachable_or_visited.contains(prev_node) {
stack.push_back(prev_node);
enqueued.insert(prev_node);
}
}
}
}
}
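// A minimal, std-only sketch of the invariant documented on `traverse_breadth_first_backward`
// (hypothetical names, not used above), on a tiny DAG with `usize` node ids: a node is only
// visited once all of its successors have been visited, by re-enqueueing it otherwise. Unlike
// the real method, this sketch assumes every node is backward-reachable from `from`; the real
// code pre-marks unreachable nodes as visited.
#[cfg(test)]
mod backward_traversal_sketch {
    use std::collections::{HashSet, VecDeque};

    /// `succ[n]` / `pred[n]` list the successors / predecessors of node `n`.
    fn traverse_backward(succ: &[Vec<usize>], pred: &[Vec<usize>], from: usize) -> Vec<usize> {
        let mut visited = HashSet::new();
        let mut enqueued = HashSet::from([from]);
        let mut stack = VecDeque::from([from]);
        let mut order = vec![];
        while let Some(n) = stack.pop_front() {
            // Requeue the node if one of its successors has not been visited yet.
            if succ[n].iter().any(|s| !visited.contains(s)) {
                stack.push_back(n);
                continue;
            }
            visited.insert(n);
            order.push(n);
            for &p in &pred[n] {
                if enqueued.insert(p) {
                    stack.push_back(p);
                }
            }
        }
        order
    }

    #[test]
    fn successors_are_visited_first() {
        // 0 -> 1 -> 3 and 0 -> 2 -> 3, traversed backward from node 3.
        let succ = vec![vec![1, 2], vec![3], vec![3], vec![]];
        let pred = vec![vec![], vec![0], vec![0], vec![1, 2]];
        let order = traverse_backward(&succ, &pred, 3);
        assert_eq!(order[0], 3);
        assert_eq!(*order.last().unwrap(), 0);
    }
}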


@@ -0,0 +1,58 @@
use std::marker::PhantomData;
use fxhash::FxHashMap;
use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::Interned;
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::SearchContext;
use crate::Result;
/// A cache storing the document ids associated with each ranking rule edge
pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
_phantom: PhantomData<G>,
}
impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
fn default() -> Self {
Self { cache: Default::default(), _phantom: Default::default() }
}
}
impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
pub fn get_subsets_used_by_condition(
&mut self,
interned_condition: Interned<G::Condition>,
) -> (&Option<LocatedQueryTermSubset>, &LocatedQueryTermSubset) {
let c = &self.cache[&interned_condition];
(&c.start_term_subset, &c.end_term_subset)
}
/// Retrieve the document ids for the given edge condition.
///
/// If the cache does not yet contain these docids, they are computed
/// and inserted in the cache.
pub fn get_computed_condition<'s>(
&'s mut self,
ctx: &mut SearchContext<'_>,
interned_condition: Interned<G::Condition>,
graph: &mut RankingRuleGraph<G>,
universe: &RoaringBitmap,
) -> Result<&'s ComputedCondition> {
if self.cache.contains_key(&interned_condition) {
let computed = self.cache.get_mut(&interned_condition).unwrap();
if computed.universe_len == universe.len() {
return Ok(computed);
} else {
computed.docids &= universe;
computed.universe_len = universe.len();
return Ok(computed);
}
}
let condition = graph.conditions_interner.get_mut(interned_condition);
let computed = G::resolve_condition(ctx, condition, universe)?;
// Can we put an assert here for computed.universe_len == universe.len() ?
let _ = self.cache.insert(interned_condition, computed);
let computed = &self.cache[&interned_condition];
Ok(computed)
}
}
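// A minimal sketch of the universe-shrinking behaviour implemented above (hypothetical types,
// not used by the cache): once a condition has been computed for a larger universe, a later
// call with a smaller universe only intersects the stored docids instead of recomputing them.
#[cfg(test)]
mod shrinking_cache_sketch {
    use roaring::RoaringBitmap;

    struct ToyCachedCondition {
        docids: RoaringBitmap,
        universe_len: u64,
    }

    /// Mirrors the cached branch of `get_computed_condition`.
    fn refresh(cached: &mut ToyCachedCondition, universe: &RoaringBitmap) {
        if cached.universe_len != universe.len() {
            cached.docids &= universe;
            cached.universe_len = universe.len();
        }
    }

    #[test]
    fn docids_stay_within_the_universe() {
        let mut cached = ToyCachedCondition { docids: (0..10).collect(), universe_len: 100 };
        let universe: RoaringBitmap = (5..50).collect();
        refresh(&mut cached, &universe);
        assert_eq!(cached.docids, (5..10).collect::<RoaringBitmap>());
        assert_eq!(cached.universe_len, universe.len());
    }
}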


@@ -0,0 +1,100 @@
use crate::search::new::interner::{FixedSizeInterner, Interned};
use crate::search::new::small_bitmap::SmallBitmap;
pub struct DeadEndsCache<T> {
// conditions and next could/should be part of the same vector
conditions: Vec<Interned<T>>,
next: Vec<Self>,
pub forbidden: SmallBitmap<T>,
}
impl<T> Clone for DeadEndsCache<T> {
fn clone(&self) -> Self {
Self {
conditions: self.conditions.clone(),
next: self.next.clone(),
forbidden: self.forbidden.clone(),
}
}
}
impl<T> DeadEndsCache<T> {
pub fn new(for_interner: &FixedSizeInterner<T>) -> Self {
Self {
conditions: vec![],
next: vec![],
forbidden: SmallBitmap::for_interned_values_in(for_interner),
}
}
pub fn forbid_condition(&mut self, condition: Interned<T>) {
self.forbidden.insert(condition);
}
fn advance(&mut self, condition: Interned<T>) -> Option<&mut Self> {
if let Some(idx) = self.conditions.iter().position(|c| *c == condition) {
Some(&mut self.next[idx])
} else {
None
}
}
pub fn forbidden_conditions_for_all_prefixes_up_to(
&mut self,
prefix: impl Iterator<Item = Interned<T>>,
) -> SmallBitmap<T> {
let mut forbidden = self.forbidden.clone();
let mut cursor = self;
for c in prefix {
if let Some(next) = cursor.advance(c) {
cursor = next;
forbidden.union(&cursor.forbidden);
} else {
break;
}
}
forbidden
}
pub fn forbidden_conditions_after_prefix(
&mut self,
prefix: impl Iterator<Item = Interned<T>>,
) -> Option<SmallBitmap<T>> {
let mut cursor = self;
for c in prefix {
if let Some(next) = cursor.advance(c) {
cursor = next;
} else {
return None;
}
}
Some(cursor.forbidden.clone())
}
pub fn forbid_condition_after_prefix(
&mut self,
mut prefix: impl Iterator<Item = Interned<T>>,
forbidden: Interned<T>,
) {
match prefix.next() {
None => {
self.forbidden.insert(forbidden);
}
Some(first_condition) => {
if let Some(idx) = self.conditions.iter().position(|c| *c == first_condition) {
return self.next[idx].forbid_condition_after_prefix(prefix, forbidden);
}
let mut rest = DeadEndsCache {
conditions: vec![],
next: vec![],
forbidden: SmallBitmap::new(self.forbidden.universe_length()),
};
rest.forbid_condition_after_prefix(prefix, forbidden);
self.conditions.push(first_condition);
self.next.push(rest);
}
}
}
// pub fn debug_print(&self, indent: usize) {
// println!("{} {:?}", " ".repeat(indent), self.forbidden.iter().collect::<Vec<_>>());
// for (condition, next) in self.conditions.iter().zip(self.next.iter()) {
// println!("{} {condition}:", " ".repeat(indent));
// next.debug_print(indent + 2);
// }
// }
}
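// A minimal, std-only sketch of the same prefix-tree idea with plain `u8` condition ids
// (hypothetical names, not used above), mirroring the example from the `cheapest_paths`
// module documentation: after taking `c` and then `f`, `[h, i]` are forbidden in addition
// to everything forbidden by the shorter prefixes.
#[cfg(test)]
mod prefix_tree_sketch {
    use std::collections::{BTreeSet, HashMap};

    #[derive(Default)]
    struct ToyDeadEnds {
        forbidden: BTreeSet<u8>,
        next: HashMap<u8, ToyDeadEnds>,
    }

    impl ToyDeadEnds {
        /// Like `forbid_condition_after_prefix`.
        fn forbid_after_prefix(&mut self, prefix: &[u8], forbidden: u8) {
            match prefix.split_first() {
                None => {
                    self.forbidden.insert(forbidden);
                }
                Some((first, rest)) => {
                    self.next.entry(*first).or_default().forbid_after_prefix(rest, forbidden)
                }
            }
        }

        /// Like `forbidden_conditions_for_all_prefixes_up_to`: the union of the forbidden
        /// sets of every prefix of `path`.
        fn forbidden_up_to(&self, path: &[u8]) -> BTreeSet<u8> {
            let mut out = self.forbidden.clone();
            let mut cursor = self;
            for c in path {
                match cursor.next.get(c) {
                    Some(next) => {
                        cursor = next;
                        out.extend(cursor.forbidden.iter().copied());
                    }
                    None => break,
                }
            }
            out
        }
    }

    #[test]
    fn follows_the_module_doc_example() {
        let (a, b, c, e, f, h, i) = (0u8, 1, 2, 4, 5, 7, 8);
        let mut cache = ToyDeadEnds::default();
        cache.forbidden.extend([a, b]);
        cache.forbid_after_prefix(&[c], e);
        cache.forbid_after_prefix(&[c, f], h);
        cache.forbid_after_prefix(&[c, f], i);
        assert_eq!(cache.forbidden_up_to(&[c, f]), BTreeSet::from([a, b, e, h, i]));
    }
}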


@@ -0,0 +1,92 @@
use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::score_details::{self, Rank, ScoreDetails};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::search::new::Word;
use crate::{Result, SearchContext};
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum ExactnessCondition {
ExactInAttribute(LocatedQueryTermSubset),
Any(LocatedQueryTermSubset),
}
pub enum ExactnessGraph {}
fn compute_docids(
ctx: &mut SearchContext<'_>,
dest_node: &LocatedQueryTermSubset,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) {
exact_term
} else {
return Ok(Default::default());
};
let candidates = match exact_term {
// TODO: I moved the intersection with the universe here
ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)? & universe,
ExactTerm::Word(word) => {
ctx.word_docids(Some(universe), Word::Original(word))?.unwrap_or_default()
}
};
Ok(candidates)
}
impl RankingRuleGraphTrait for ExactnessGraph {
type Condition = ExactnessCondition;
#[tracing::instrument(level = "trace", skip_all, target = "search::exactness")]
fn resolve_condition(
ctx: &mut SearchContext<'_>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition> {
let (docids, end_term_subset) = match condition {
ExactnessCondition::ExactInAttribute(dest_node) => {
let mut end_term_subset = dest_node.clone();
end_term_subset.term_subset.keep_only_exact_term(ctx);
end_term_subset.term_subset.make_mandatory();
(compute_docids(ctx, dest_node, universe)?, end_term_subset)
}
ExactnessCondition::Any(dest_node) => {
let docids =
compute_query_term_subset_docids(ctx, Some(universe), &dest_node.term_subset)?;
(docids, dest_node.clone())
}
};
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
start_term_subset: None,
end_term_subset,
})
}
#[tracing::instrument(level = "trace", skip_all, target = "search::exactness")]
fn build_edges(
_ctx: &mut SearchContext<'_>,
conditions_interner: &mut DedupInterner<Self::Condition>,
_source_node: Option<&LocatedQueryTermSubset>,
dest_node: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone());
let exact_condition = conditions_interner.insert(exact_condition);
let skip_condition = ExactnessCondition::Any(dest_node.clone());
let skip_condition = conditions_interner.insert(skip_condition);
Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)])
}
#[tracing::instrument(level = "trace", skip_all, target = "search::exactness")]
fn rank_to_score(rank: Rank) -> ScoreDetails {
ScoreDetails::ExactWords(score_details::ExactWords::from_rank(rank))
}
}


@@ -0,0 +1,112 @@
use fxhash::FxHashSet;
use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::score_details::{Rank, ScoreDetails};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_field_id;
use crate::search::new::SearchContext;
use crate::{FieldId, InternalError, Result};
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct FidCondition {
term: LocatedQueryTermSubset,
fid: Option<FieldId>,
}
pub enum FidGraph {}
impl RankingRuleGraphTrait for FidGraph {
type Condition = FidCondition;
#[tracing::instrument(level = "trace", skip_all, target = "search::fid")]
fn resolve_condition(
ctx: &mut SearchContext<'_>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition> {
let FidCondition { term, .. } = condition;
let docids = if let Some(fid) = condition.fid {
compute_query_term_subset_docids_within_field_id(
ctx,
Some(universe),
&term.term_subset,
fid,
)?
} else {
RoaringBitmap::new()
};
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
start_term_subset: None,
end_term_subset: term.clone(),
})
}
#[tracing::instrument(level = "trace", skip_all, target = "search::fid")]
fn build_edges(
ctx: &mut SearchContext<'_>,
conditions_interner: &mut DedupInterner<Self::Condition>,
_from: Option<&LocatedQueryTermSubset>,
to_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
let term = to_term;
let mut all_fields = FxHashSet::default();
for word in term.term_subset.all_single_words_except_prefix_db(ctx)? {
let fields = ctx.get_db_word_fids(word.interned())?;
all_fields.extend(fields);
}
for phrase in term.term_subset.all_phrases(ctx)? {
for &word in phrase.words(ctx).iter().flatten() {
let fields = ctx.get_db_word_fids(word)?;
all_fields.extend(fields);
}
}
if let Some(word_prefix) = term.term_subset.use_prefix_db(ctx) {
let fields = ctx.get_db_word_prefix_fids(word_prefix.interned())?;
all_fields.extend(fields);
}
let weights_map = ctx.index.fieldids_weights_map(ctx.txn)?;
let mut edges = vec![];
for fid in all_fields.iter().copied() {
let weight = weights_map
.weight(fid)
.ok_or(InternalError::FieldidsWeightsMapMissingEntry { key: fid })?;
edges.push((
weight as u32 * term.term_ids.len() as u32,
conditions_interner.insert(FidCondition { term: term.clone(), fid: Some(fid) }),
));
}
// Always look up the max_fid if we haven't already, and add an artificial condition for max scoring
let max_weight: Option<u16> = weights_map.max_weight();
if let Some(max_weight) = max_weight {
if !all_fields.contains(&max_weight) {
edges.push((
max_weight as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10.
conditions_interner.insert(FidCondition {
term: term.clone(), // TODO remove this ugly clone
fid: None,
}),
));
}
}
Ok(edges)
}
#[tracing::instrument(level = "trace", skip_all, target = "search::fid")]
fn rank_to_score(rank: Rank) -> ScoreDetails {
ScoreDetails::Fid(rank)
}
}


@@ -0,0 +1,160 @@
/*! Module implementing the graph used for the graph-based ranking rules
and its related algorithms.
A ranking rule graph is built on top of the [`QueryGraph`]: the nodes stay
the same but the edges are replaced.
*/
mod build;
mod cheapest_paths;
mod condition_docids_cache;
mod dead_ends_cache;
/// Implementation of the `exactness` ranking rule
mod exactness;
/// Implementation of the `attribute` ranking rule
mod fid;
/// Implementation of the `position` ranking rule
mod position;
/// Implementation of the `proximity` ranking rule
mod proximity;
/// Implementation of the `typo` ranking rule
mod typo;
/// Implementation of the `words` ranking rule
mod words;
use std::collections::BTreeSet;
use std::hash::Hash;
pub use cheapest_paths::PathVisitor;
pub use condition_docids_cache::ConditionDocIdsCache;
pub use dead_ends_cache::DeadEndsCache;
pub use exactness::ExactnessGraph;
pub use fid::{FidCondition, FidGraph};
pub use position::{PositionCondition, PositionGraph};
pub use proximity::{ProximityCondition, ProximityGraph};
use roaring::RoaringBitmap;
pub use typo::{TypoCondition, TypoGraph};
pub use words::{WordsCondition, WordsGraph};
use super::interner::{DedupInterner, FixedSizeInterner, Interned, MappedInterner};
use super::query_term::LocatedQueryTermSubset;
use super::small_bitmap::SmallBitmap;
use super::{QueryGraph, QueryNode, SearchContext};
use crate::score_details::{Rank, ScoreDetails};
use crate::Result;
pub struct ComputedCondition {
pub docids: RoaringBitmap,
pub universe_len: u64,
pub start_term_subset: Option<LocatedQueryTermSubset>,
pub end_term_subset: LocatedQueryTermSubset,
}
/// An edge in the ranking rule graph.
///
/// It contains:
/// 1. The source and destination nodes
/// 2. The cost of traversing this edge
/// 3. The condition associated with it
/// 4. The list of nodes that have to be skipped
/// if this edge is traversed.
#[derive(Clone)]
pub struct Edge<E> {
pub source_node: Interned<QueryNode>,
pub dest_node: Interned<QueryNode>,
pub cost: u32,
pub condition: Option<Interned<E>>,
pub nodes_to_skip: SmallBitmap<QueryNode>,
}
impl<E> Hash for Edge<E> {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.source_node.hash(state);
self.dest_node.hash(state);
self.cost.hash(state);
self.condition.hash(state);
}
}
impl<E> Eq for Edge<E> {}
impl<E> PartialEq for Edge<E> {
fn eq(&self, other: &Self) -> bool {
self.source_node == other.source_node
&& self.dest_node == other.dest_node
&& self.cost == other.cost
&& self.condition == other.condition
}
}
/// A trait to be implemented by a marker type to build a graph-based ranking rule.
///
/// It mostly describes how to:
/// 1. Retrieve the set of edges (their cost and condition) between two nodes.
/// 2. Compute the document ids satisfying a condition
pub trait RankingRuleGraphTrait: Sized + 'static {
type Condition: Sized + Clone + PartialEq + Eq + Hash;
/// Compute the document ids associated with the given edge condition,
/// restricted to the given universe.
fn resolve_condition(
ctx: &mut SearchContext<'_>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition>;
/// Return the costs and conditions of the edges going from the source node to the destination node
fn build_edges(
ctx: &mut SearchContext<'_>,
conditions_interner: &mut DedupInterner<Self::Condition>,
source_node: Option<&LocatedQueryTermSubset>,
dest_node: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<Self::Condition>)>>;
/// Convert the rank of a path to its corresponding score for the ranking rule
fn rank_to_score(rank: Rank) -> ScoreDetails;
}
/// The graph used by graph-based ranking rules.
///
/// It is built on top of a [`QueryGraph`], keeping the same nodes
/// but replacing the edges.
pub struct RankingRuleGraph<G: RankingRuleGraphTrait> {
pub query_graph: QueryGraph,
pub edges_store: FixedSizeInterner<Option<Edge<G::Condition>>>,
pub edges_of_node: MappedInterner<QueryNode, SmallBitmap<Option<Edge<G::Condition>>>>,
pub conditions_interner: FixedSizeInterner<G::Condition>,
}
impl<G: RankingRuleGraphTrait> Clone for RankingRuleGraph<G> {
fn clone(&self) -> Self {
Self {
query_graph: self.query_graph.clone(),
edges_store: self.edges_store.clone(),
edges_of_node: self.edges_of_node.clone(),
conditions_interner: self.conditions_interner.clone(),
}
}
}
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
/// Remove all edges with the given condition
/// Return a set of all the source nodes of the removed edges
pub fn remove_edges_with_condition(
&mut self,
condition_to_remove: Interned<G::Condition>,
) -> BTreeSet<Interned<QueryNode>> {
let mut source_nodes = BTreeSet::new();
for (edge_id, edge_opt) in self.edges_store.iter_mut() {
let Some(edge) = edge_opt.as_mut() else { continue };
let Some(condition) = edge.condition else { continue };
if condition == condition_to_remove {
let (source_node, _dest_node) = (edge.source_node, edge.dest_node);
*edge_opt = None;
self.edges_of_node.get_mut(source_node).remove(edge_id);
source_nodes.insert(source_node);
}
}
source_nodes
}
}


@@ -0,0 +1,143 @@
use fxhash::{FxHashMap, FxHashSet};
use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::score_details::{Rank, ScoreDetails};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids_within_position;
use crate::search::new::SearchContext;
use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct PositionCondition {
term: LocatedQueryTermSubset,
positions: Vec<u16>,
}
pub enum PositionGraph {}
impl RankingRuleGraphTrait for PositionGraph {
type Condition = PositionCondition;
#[tracing::instrument(level = "trace", skip_all, target = "search::position")]
fn resolve_condition(
ctx: &mut SearchContext<'_>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition> {
let PositionCondition { term, positions } = condition;
let mut docids = RoaringBitmap::new();
// TODO use MultiOps to do the big union
for position in positions {
// maybe compute_query_term_subset_docids_within_position should accept a universe as argument
docids |= compute_query_term_subset_docids_within_position(
ctx,
Some(universe),
&term.term_subset,
*position,
)?;
}
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
start_term_subset: None,
end_term_subset: term.clone(),
})
}
#[tracing::instrument(level = "trace", skip_all, target = "search::position")]
fn build_edges(
ctx: &mut SearchContext<'_>,
conditions_interner: &mut DedupInterner<Self::Condition>,
_from: Option<&LocatedQueryTermSubset>,
to_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
let term = to_term;
let mut all_positions = FxHashSet::default();
for word in term.term_subset.all_single_words_except_prefix_db(ctx)? {
let positions = ctx.get_db_word_positions(word.interned())?;
all_positions.extend(positions);
}
for phrase in term.term_subset.all_phrases(ctx)? {
// Only check the position of the first word in the phrase.
// This is not correct, but it is the best we can do, since
// it is difficult/impossible to know the expected position
// of a word in a phrase.
// There is probably a more correct way to do it, though.
if let Some(word) = phrase.words(ctx).iter().flatten().next() {
let positions = ctx.get_db_word_positions(*word)?;
all_positions.extend(positions);
}
}
if let Some(word_prefix) = term.term_subset.use_prefix_db(ctx) {
let positions = ctx.get_db_word_prefix_positions(word_prefix.interned())?;
all_positions.extend(positions);
}
let mut positions_for_costs = FxHashMap::<u32, Vec<u16>>::default();
for position in all_positions {
// FIXME: bucketed position???
let distance = position.abs_diff(*term.positions.start());
let cost = {
let mut cost = 0;
for i in 0..term.term_ids.len() {
// This is actually not fully correct and slightly penalises ngrams unfairly,
// because if two words are in the same bucketed position (e.g. 32) and consecutive,
// then their position cost will be 32+32=64, but an ngram of these two words at the
// same position will have a cost of 32+32+1=65.
cost += cost_from_distance(distance as u32 + i as u32);
}
cost
};
positions_for_costs.entry(cost).or_default().push(position);
}
let max_cost = term.term_ids.len() as u32 * 10;
let max_cost_exists = positions_for_costs.contains_key(&max_cost);
let mut edges = vec![];
for (cost, positions) in positions_for_costs {
edges.push((
cost,
conditions_interner.insert(PositionCondition { term: term.clone(), positions }),
));
}
if !max_cost_exists {
// artificial empty condition for computing max cost
edges.push((
max_cost,
conditions_interner
.insert(PositionCondition { term: term.clone(), positions: Vec::default() }),
));
}
Ok(edges)
}
#[tracing::instrument(level = "trace", skip_all, target = "search::position")]
fn rank_to_score(rank: Rank) -> ScoreDetails {
ScoreDetails::Position(rank)
}
}
fn cost_from_distance(distance: u32) -> u32 {
match distance {
0 => 0,
1 => 1,
2..=4 => 2,
5..=7 => 3,
8..=11 => 4,
12..=16 => 5,
17..=24 => 6,
25..=64 => 7,
65..=256 => 8,
257..=1024 => 9,
_ => 10,
}
}
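// A minimal check of the bucketing above (hypothetical test module): the cost grows
// monotonically with the distance and caps at 10.
#[cfg(test)]
mod cost_from_distance_sketch {
    use super::cost_from_distance;

    #[test]
    fn bucketing() {
        assert_eq!(cost_from_distance(0), 0);
        assert_eq!(cost_from_distance(3), 2);
        assert_eq!(cost_from_distance(16), 5);
        assert_eq!(cost_from_distance(64), 7);
        assert_eq!(cost_from_distance(100_000), 10);
    }
}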


@@ -0,0 +1,56 @@
#![allow(clippy::too_many_arguments)]
use super::ProximityCondition;
use crate::proximity::MAX_DISTANCE;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::SearchContext;
use crate::Result;
pub fn build_edges(
_ctx: &mut SearchContext<'_>,
conditions_interner: &mut DedupInterner<ProximityCondition>,
left_term: Option<&LocatedQueryTermSubset>,
right_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<ProximityCondition>)>> {
let right_ngram_max = right_term.term_ids.len().saturating_sub(1);
let Some(left_term) = left_term else {
return Ok(vec![(
right_ngram_max as u32,
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
)]);
};
if left_term.positions.end() + 1 != *right_term.positions.start() {
// We want to ignore this pair of terms and unconditionally walk through the
// edge without computing the docids.
// This can happen when, in a query like `the sun flowers are beautiful`, the term
// `flowers` is removed by the `words` ranking rule.
// The remaining query graph represents `the sun .. are beautiful`,
// but `sun` and `are` have no proximity condition between them.
return Ok(vec![(
right_ngram_max as u32,
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
)]);
}
let mut conditions = vec![];
for cost in right_ngram_max..(((MAX_DISTANCE as usize) - 1) + right_ngram_max) {
conditions.push((
cost as u32,
conditions_interner.insert(ProximityCondition::Uninit {
left_term: left_term.clone(),
right_term: right_term.clone(),
cost: (cost + 1) as u8,
}),
))
}
conditions.push((
((MAX_DISTANCE - 1) + (right_ngram_max as u32)),
conditions_interner.insert(ProximityCondition::Term { term: right_term.clone() }),
));
Ok(conditions)
}
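// A minimal, std-only sketch of the shape of the edge costs produced above (hypothetical
// names): one conditional edge per cost in `right_ngram_max..(MAX_DISTANCE - 1) + right_ngram_max`,
// plus a final unconditional edge at the upper bound. The example assumes, purely for
// illustration, a right term of ngram length 2 (so `right_ngram_max == 1`) and a maximum
// distance of 8.
#[cfg(test)]
mod proximity_costs_sketch {
    fn edge_costs(right_ngram_max: u32, max_distance: u32) -> Vec<u32> {
        let mut out: Vec<u32> = (right_ngram_max..(max_distance - 1) + right_ngram_max).collect();
        out.push((max_distance - 1) + right_ngram_max);
        out
    }

    #[test]
    fn one_cost_per_distance_plus_an_unconditional_edge() {
        assert_eq!(edge_costs(1, 8), vec![1, 2, 3, 4, 5, 6, 7, 8]);
    }
}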


@@ -0,0 +1,251 @@
#![allow(clippy::too_many_arguments)]
use std::collections::BTreeSet;
use roaring::RoaringBitmap;
use super::ProximityCondition;
use crate::search::new::interner::Interned;
use crate::search::new::query_term::{Phrase, QueryTermSubset};
use crate::search::new::ranking_rule_graph::ComputedCondition;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::search::new::{SearchContext, Word};
use crate::Result;
pub fn compute_docids(
ctx: &mut SearchContext<'_>,
condition: &ProximityCondition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition> {
let (left_term, right_term, cost) = match condition {
ProximityCondition::Uninit { left_term, right_term, cost } => {
(left_term, right_term, *cost)
}
ProximityCondition::Term { term } => {
return Ok(ComputedCondition {
docids: compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?,
universe_len: universe.len(),
start_term_subset: None,
end_term_subset: term.clone(),
});
}
};
let right_term_ngram_len = right_term.term_ids.len() as u8;
// e.g. for the simple words `sun .. flower`
// the cost is 5
// the forward proximity is 5
// the backward proximity is 4
//
// for the 2gram `the sunflower`
// the cost is 5
// the forward proximity is 4
// the backward proximity is 3
let forward_proximity = 1 + cost - right_term_ngram_len;
let backward_proximity = cost - right_term_ngram_len;
let mut docids = RoaringBitmap::new();
if let Some(right_prefix) = right_term.term_subset.use_prefix_db(ctx) {
for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)?
{
compute_prefix_edges(
ctx,
left_word.interned(),
right_prefix.interned(),
left_phrase,
forward_proximity,
backward_proximity,
&mut docids,
universe,
)?;
}
}
for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? {
// Before computing the edges, check that the left word and left phrase
// aren't disjoint from the universe, but only do it if there is more than
// one word derivation to the right.
//
// This is an optimisation to avoid checking for an excessive number of
// pairs.
let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?;
if right_derivs.len() > 1 {
let universe = &universe;
if let Some(left_phrase) = left_phrase {
if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) {
continue;
}
} else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? {
if left_word_docids.is_empty() {
continue;
}
}
}
for (right_word, right_phrase) in right_derivs {
compute_non_prefix_edges(
ctx,
left_word.interned(),
right_word,
left_phrase,
right_phrase,
forward_proximity,
backward_proximity,
&mut docids,
universe,
)?;
}
}
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
start_term_subset: Some(left_term.clone()),
end_term_subset: right_term.clone(),
})
}
fn compute_prefix_edges(
ctx: &mut SearchContext<'_>,
left_word: Interned<String>,
right_prefix: Interned<String>,
left_phrase: Option<Interned<Phrase>>,
forward_proximity: u8,
backward_proximity: u8,
docids: &mut RoaringBitmap,
universe: &RoaringBitmap,
) -> Result<()> {
let mut used_left_words = BTreeSet::new();
let mut used_left_phrases = BTreeSet::new();
let mut used_right_prefix = BTreeSet::new();
let mut universe = universe.clone();
if let Some(phrase) = left_phrase {
// TODO we can clearly give the universe to this method
// Unfortunately, it is deserializing/computing stuff and
// keeping the result as a materialized bitmap.
let phrase_docids = ctx.get_phrase_docids(phrase)?;
if !phrase_docids.is_empty() {
used_left_phrases.insert(phrase);
}
universe &= phrase_docids;
if universe.is_empty() {
return Ok(());
}
}
if let Some(new_docids) = ctx.get_db_word_prefix_pair_proximity_docids(
Some(&universe),
left_word,
right_prefix,
forward_proximity,
)? {
if !new_docids.is_empty() {
used_left_words.insert(left_word);
used_right_prefix.insert(right_prefix);
*docids |= new_docids;
}
}
// No swapping when computing the proximity between a phrase and a word
if left_phrase.is_none() {
if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
Some(&universe),
right_prefix,
left_word,
backward_proximity,
)? {
if !new_docids.is_empty() {
used_left_words.insert(left_word);
used_right_prefix.insert(right_prefix);
*docids |= new_docids;
}
}
}
Ok(())
}
fn compute_non_prefix_edges(
ctx: &mut SearchContext<'_>,
word1: Interned<String>,
word2: Interned<String>,
left_phrase: Option<Interned<Phrase>>,
right_phrase: Option<Interned<Phrase>>,
forward_proximity: u8,
backward_proximity: u8,
docids: &mut RoaringBitmap,
universe: &RoaringBitmap,
) -> Result<()> {
let mut universe = universe.clone();
for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
universe &= ctx.get_phrase_docids(phrase)?;
if universe.is_empty() {
return Ok(());
}
}
if let Some(new_docids) =
ctx.get_db_word_pair_proximity_docids(Some(&universe), word1, word2, forward_proximity)?
{
if !new_docids.is_empty() {
*docids |= new_docids;
}
}
if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() {
if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(
Some(&universe),
word2,
word1,
backward_proximity,
)? {
if !new_docids.is_empty() {
*docids |= new_docids;
}
}
}
Ok(())
}
fn last_words_of_term_derivations(
ctx: &mut SearchContext<'_>,
t: &QueryTermSubset,
) -> Result<BTreeSet<(Option<Interned<Phrase>>, Word)>> {
let mut result = BTreeSet::new();
for w in t.all_single_words_except_prefix_db(ctx)? {
result.insert((None, w));
}
for p in t.all_phrases(ctx)? {
let phrase = ctx.phrase_interner.get(p);
let last_term_of_phrase = phrase.words.last().unwrap();
if let Some(last_word) = last_term_of_phrase {
result.insert((Some(p), Word::Original(*last_word)));
}
}
Ok(result)
}
fn first_word_of_term_iter(
ctx: &mut SearchContext<'_>,
t: &QueryTermSubset,
) -> Result<BTreeSet<(Interned<String>, Option<Interned<Phrase>>)>> {
let mut result = BTreeSet::new();
let all_words = t.all_single_words_except_prefix_db(ctx)?;
for w in all_words {
result.insert((w.interned(), None));
}
for p in t.all_phrases(ctx)? {
let phrase = ctx.phrase_interner.get(p);
let first_term_of_phrase = phrase.words.first().unwrap();
if let Some(first_word) = first_term_of_phrase {
result.insert((*first_word, Some(p)));
}
}
Ok(result)
}
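// A minimal check of the proximity arithmetic used in `compute_docids` (hypothetical helper,
// extracted so the two worked examples from the comment can be verified).
#[cfg(test)]
mod proximity_arithmetic_sketch {
    /// Returns `(forward_proximity, backward_proximity)` for a given cost and right ngram length.
    fn proximities(cost: u8, right_term_ngram_len: u8) -> (u8, u8) {
        (1 + cost - right_term_ngram_len, cost - right_term_ngram_len)
    }

    #[test]
    fn matches_the_worked_examples() {
        // `sun .. flower`: a single word on the right, cost 5.
        assert_eq!(proximities(5, 1), (5, 4));
        // `the sunflower` as a 2gram on the right, cost 5.
        assert_eq!(proximities(5, 2), (4, 3));
    }
}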


@@ -0,0 +1,47 @@
pub mod build;
pub mod compute_docids;
use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::score_details::{Rank, ScoreDetails};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::SearchContext;
use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum ProximityCondition {
Uninit { left_term: LocatedQueryTermSubset, right_term: LocatedQueryTermSubset, cost: u8 },
Term { term: LocatedQueryTermSubset },
}
pub enum ProximityGraph {}
impl RankingRuleGraphTrait for ProximityGraph {
type Condition = ProximityCondition;
#[tracing::instrument(level = "trace", skip_all, target = "search::proximity")]
fn resolve_condition(
ctx: &mut SearchContext<'_>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition> {
compute_docids::compute_docids(ctx, condition, universe)
}
#[tracing::instrument(level = "trace", skip_all, target = "search::proximity")]
fn build_edges(
ctx: &mut SearchContext<'_>,
conditions_interner: &mut DedupInterner<Self::Condition>,
source_term: Option<&LocatedQueryTermSubset>,
dest_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
build::build_edges(ctx, conditions_interner, source_term, dest_term)
}
#[tracing::instrument(level = "trace", skip_all, target = "search::proximity")]
fn rank_to_score(rank: Rank) -> ScoreDetails {
ScoreDetails::Proximity(rank)
}
}


@@ -0,0 +1,85 @@
use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::score_details::{self, Rank, ScoreDetails};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::search::new::SearchContext;
use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct TypoCondition {
term: LocatedQueryTermSubset,
nbr_typos: u8,
}
pub enum TypoGraph {}
impl RankingRuleGraphTrait for TypoGraph {
type Condition = TypoCondition;
#[tracing::instrument(level = "trace", skip_all, target = "search::typo")]
fn resolve_condition(
ctx: &mut SearchContext<'_>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition> {
let TypoCondition { term, .. } = condition;
// maybe compute_query_term_subset_docids should accept a universe as argument
let docids = compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?;
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
start_term_subset: None,
end_term_subset: term.clone(),
})
}
#[tracing::instrument(level = "trace", skip_all, target = "search::typo")]
fn build_edges(
ctx: &mut SearchContext<'_>,
conditions_interner: &mut DedupInterner<Self::Condition>,
_from: Option<&LocatedQueryTermSubset>,
to_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
let term = to_term;
let mut edges = vec![];
// Ngrams have a base typo cost
// 2-gram -> equivalent to 1 typo
// 3-gram -> equivalent to 2 typos
let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) {
let mut term = term.clone();
match nbr_typos {
0 => {
term.term_subset.clear_one_typo_subset();
term.term_subset.clear_two_typo_subset();
}
1 => {
term.term_subset.clear_zero_typo_subset();
term.term_subset.clear_two_typo_subset();
}
2 => {
term.term_subset.clear_zero_typo_subset();
term.term_subset.clear_one_typo_subset();
}
_ => panic!(),
};
edges.push((
nbr_typos as u32 + base_cost,
conditions_interner.insert(TypoCondition { term, nbr_typos }),
));
}
Ok(edges)
}
#[tracing::instrument(level = "trace", skip_all, target = "search::typo")]
fn rank_to_score(rank: Rank) -> ScoreDetails {
ScoreDetails::Typo(score_details::Typo::from_rank(rank))
}
}
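// A minimal sketch of the typo edge costs produced above (hypothetical helper): the cost of an
// edge is the number of typos plus the base cost, which is 0 for a single word and the ngram
// length otherwise. The 2gram example assumes, purely for illustration, that a single typo is
// allowed.
#[cfg(test)]
mod typo_costs_sketch {
    fn typo_costs(term_ids_len: u32, max_typo_cost: u32) -> Vec<u32> {
        let base_cost = if term_ids_len == 1 { 0 } else { term_ids_len };
        (0..=max_typo_cost).map(|nbr_typos| nbr_typos + base_cost).collect()
    }

    #[test]
    fn base_cost_shifts_the_costs() {
        assert_eq!(typo_costs(1, 2), vec![0, 1, 2]);
        assert_eq!(typo_costs(2, 1), vec![2, 3]);
    }
}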


@@ -0,0 +1,53 @@
use roaring::RoaringBitmap;
use super::{ComputedCondition, RankingRuleGraphTrait};
use crate::score_details::{self, Rank, ScoreDetails};
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::LocatedQueryTermSubset;
use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
use crate::search::new::SearchContext;
use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct WordsCondition {
term: LocatedQueryTermSubset,
}
pub enum WordsGraph {}
impl RankingRuleGraphTrait for WordsGraph {
type Condition = WordsCondition;
#[tracing::instrument(level = "trace", skip_all, target = "search::words")]
fn resolve_condition(
ctx: &mut SearchContext<'_>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition> {
let WordsCondition { term, .. } = condition;
// maybe compute_query_term_subset_docids should accept a universe as argument
let docids = compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?;
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
start_term_subset: None,
end_term_subset: term.clone(),
})
}
#[tracing::instrument(level = "trace", skip_all, target = "search::words")]
fn build_edges(
_ctx: &mut SearchContext<'_>,
conditions_interner: &mut DedupInterner<Self::Condition>,
_from: Option<&LocatedQueryTermSubset>,
to_term: &LocatedQueryTermSubset,
) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
Ok(vec![(0, conditions_interner.insert(WordsCondition { term: to_term.clone() }))])
}
#[tracing::instrument(level = "trace", skip_all, target = "search::words")]
fn rank_to_score(rank: Rank) -> ScoreDetails {
ScoreDetails::Words(score_details::Words::from_rank(rank))
}
}