Fix bug in the proximity ranking rule for queries with ngrams

This commit is contained in:
Loïc Lecrenier
2023-03-15 12:52:40 +01:00
parent e9cf58d584
commit c0cdaf9f53
9 changed files with 132 additions and 68 deletions

View File

@ -2,6 +2,7 @@
use std::collections::btree_map::Entry;
use std::collections::{BTreeMap, VecDeque};
use std::ops::ControlFlow;
use super::empty_paths_cache::DeadEndPathCache;
use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait};
@ -23,7 +24,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
cost: u16,
all_distances: &MappedInterner<Vec<(u16, SmallBitmap<G::EdgeCondition>)>, QueryNode>,
empty_paths_cache: &mut DeadEndPathCache<G>,
mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<()>,
mut visit: impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<ControlFlow<()>>,
) -> Result<()> {
let _ = self.visit_paths_of_cost_rec(
from,
@ -43,7 +44,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
cost: u16,
all_distances: &MappedInterner<Vec<(u16, SmallBitmap<G::EdgeCondition>)>, QueryNode>,
empty_paths_cache: &mut DeadEndPathCache<G>,
visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<()>,
visit: &mut impl FnMut(&[u16], &mut Self, &mut DeadEndPathCache<G>) -> Result<ControlFlow<()>>,
prev_conditions: &mut Vec<u16>,
cur_path: &mut SmallBitmap<G::EdgeCondition>,
forbidden_conditions: &mut SmallBitmap<G::EdgeCondition>,
@ -60,7 +61,11 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
EdgeCondition::Unconditional => {
if edge.dest_node == self.query_graph.end_node {
any_valid = true;
visit(prev_conditions, self, empty_paths_cache)?;
let control_flow = visit(prev_conditions, self, empty_paths_cache)?;
match control_flow {
ControlFlow::Continue(_) => {}
ControlFlow::Break(_) => return Ok(true),
}
true
} else {
self.visit_paths_of_cost_rec(
@ -101,7 +106,11 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
);
let next_any_valid = if edge.dest_node == self.query_graph.end_node {
any_valid = true;
visit(prev_conditions, self, empty_paths_cache)?;
let control_flow = visit(prev_conditions, self, empty_paths_cache)?;
match control_flow {
ControlFlow::Continue(_) => {}
ControlFlow::Break(_) => return Ok(true),
}
true
} else {
self.visit_paths_of_cost_rec(

View File

@ -20,7 +20,7 @@ use std::hash::Hash;
pub use edge_docids_cache::EdgeConditionDocIdsCache;
pub use empty_paths_cache::DeadEndPathCache;
pub use proximity::{ProximityEdge, ProximityGraph};
pub use proximity::{ProximityCondition, ProximityGraph};
use roaring::RoaringBitmap;
pub use typo::{TypoEdge, TypoGraph};

View File

@ -1,7 +1,7 @@
#![allow(clippy::too_many_arguments)]
use std::collections::BTreeMap;
use super::ProximityEdge;
use super::ProximityCondition;
use crate::search::new::db_cache::DatabaseCache;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_graph::QueryNodeData;
@ -37,10 +37,10 @@ fn first_word_of_term_iter<'t>(
pub fn build_edges<'ctx>(
ctx: &mut SearchContext<'ctx>,
conditions_interner: &mut DedupInterner<ProximityEdge>,
conditions_interner: &mut DedupInterner<ProximityCondition>,
from_node: &QueryNode,
to_node: &QueryNode,
) -> Result<Vec<(u8, EdgeCondition<ProximityEdge>)>> {
) -> Result<Vec<(u8, EdgeCondition<ProximityCondition>)>> {
let SearchContext {
index,
txn,
@ -51,24 +51,33 @@ pub fn build_edges<'ctx>(
term_docids: _,
} = ctx;
let (left_term, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
(term_interner.get(*value), *positions.end())
}
QueryNodeData::Deleted => return Ok(vec![]),
QueryNodeData::Start => return Ok(vec![(0, EdgeCondition::Unconditional)]),
QueryNodeData::End => return Ok(vec![]),
};
let right_term = match &to_node.data {
QueryNodeData::End => return Ok(vec![(0, EdgeCondition::Unconditional)]),
QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
QueryNodeData::Term(term) => term,
};
let LocatedQueryTerm { value: right_value, positions: right_positions } = right_term;
let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
let (right_term, right_start_position, right_ngram_length) =
(term_interner.get(*right_value), *right_positions.start(), right_positions.len());
(term_interner.get(*right_term_interned), *right_positions.start(), right_positions.len());
let (left_term, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
(term_interner.get(*value), *positions.end())
}
QueryNodeData::Deleted => return Ok(vec![]),
QueryNodeData::Start => {
return Ok(vec![(
(right_ngram_length - 1) as u8,
EdgeCondition::Conditional(
conditions_interner
.insert(ProximityCondition::Term { term: *right_term_interned }),
),
)])
}
QueryNodeData::End => return Ok(vec![]),
};
if left_end_position + 1 != right_start_position {
// We want to ignore this pair of terms
@ -77,7 +86,12 @@ pub fn build_edges<'ctx>(
// `flowers` is removed by the `words` ranking rule.
// The remaining query graph represents `the sun .. are beautiful`
// but `sun` and `are` have no proximity condition between them
return Ok(vec![(0, EdgeCondition::Unconditional)]);
return Ok(vec![(
(right_ngram_length - 1) as u8,
EdgeCondition::Conditional(
conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }),
),
)]);
}
let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();
@ -121,24 +135,30 @@ pub fn build_edges<'ctx>(
}
}
let mut new_edges =
cost_proximity_word_pairs
.into_iter()
.flat_map(|(cost, proximity_word_pairs)| {
let mut edges = vec![];
for (proximity, word_pairs) in proximity_word_pairs {
edges.push((
cost,
EdgeCondition::Conditional(conditions_interner.insert(ProximityEdge {
let mut new_edges = cost_proximity_word_pairs
.into_iter()
.flat_map(|(cost, proximity_word_pairs)| {
let mut edges = vec![];
for (proximity, word_pairs) in proximity_word_pairs {
edges.push((
cost,
EdgeCondition::Conditional(conditions_interner.insert(
ProximityCondition::Pairs {
pairs: word_pairs.into_boxed_slice(),
proximity,
})),
))
}
edges
})
.collect::<Vec<_>>();
new_edges.push((8 + (right_ngram_length - 1) as u8, EdgeCondition::Unconditional));
},
)),
))
}
edges
})
.collect::<Vec<_>>();
new_edges.push((
8 + (right_ngram_length - 1) as u8,
EdgeCondition::Conditional(
conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned }),
),
));
Ok(new_edges)
}

View File

@ -1,16 +1,39 @@
use roaring::RoaringBitmap;
use super::{ProximityEdge, WordPair};
use super::{ProximityCondition, WordPair};
use crate::search::new::SearchContext;
use crate::{CboRoaringBitmapCodec, Result};
pub fn compute_docids<'ctx>(
ctx: &mut SearchContext<'ctx>,
edge: &ProximityEdge,
edge: &ProximityCondition,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
let SearchContext { index, txn, db_cache, word_interner, .. } = ctx;
let ProximityEdge { pairs, proximity } = edge;
let SearchContext {
index,
txn,
db_cache,
word_interner,
term_docids,
phrase_interner,
term_interner,
} = ctx;
let (pairs, proximity) = match edge {
ProximityCondition::Term { term } => {
return term_docids
.get_query_term_docids(
index,
txn,
db_cache,
word_interner,
term_interner,
phrase_interner,
*term,
)
.cloned()
}
ProximityCondition::Pairs { pairs, proximity } => (pairs, proximity),
};
let mut pair_docids = RoaringBitmap::new();
for pair in pairs.iter() {
let pair = match pair {

View File

@ -4,15 +4,15 @@ pub mod compute_docids;
use roaring::RoaringBitmap;
use super::empty_paths_cache::DeadEndPathCache;
use super::{EdgeCondition, RankingRuleGraphTrait};
use super::{EdgeCondition, RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
use crate::search::new::logger::SearchLogger;
use crate::search::new::query_term::Phrase;
use crate::search::new::query_term::{Phrase, QueryTerm};
use crate::search::new::small_bitmap::SmallBitmap;
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
#[derive(Clone, PartialEq, Eq, Hash)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum WordPair {
Words {
phrases: Vec<Interned<Phrase>>,
@ -31,27 +31,33 @@ pub enum WordPair {
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub struct ProximityEdge {
pairs: Box<[WordPair]>,
proximity: u8,
/// The condition attached to a conditional edge of the proximity ranking
/// rule graph (it is the `EdgeCondition` type of `ProximityGraph`).
///
/// A condition is resolved to a set of document ids by
/// `compute_docids::compute_docids`.
pub enum ProximityCondition {
/// Depends on a single query term only: resolving it returns the document
/// ids of `term`. `build_edges` uses it with the destination node's term
/// for edges leaving the start node, for two terms that are not adjacent
/// in the query, and for the extra highest-cost edge added after the
/// word-pair edges.
Term { term: Interned<QueryTerm> },
/// Depends on a group of word pairs that `build_edges` collected under the
/// same `proximity` value.
/// NOTE(review): presumably resolves to the union of the documents matching
/// any of `pairs` at that proximity; the resolving loop is only partially
/// visible here, confirm against `compute_docids`.
Pairs { pairs: Box<[WordPair]>, proximity: u8 },
}
pub enum ProximityGraph {}
impl RankingRuleGraphTrait for ProximityGraph {
type EdgeCondition = ProximityEdge;
type EdgeCondition = ProximityCondition;
fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String {
let ProximityEdge { pairs, proximity } = edge;
format!(", prox {proximity}, {} pairs", pairs.len())
match edge {
ProximityCondition::Term { term } => {
format!("term {term}")
}
ProximityCondition::Pairs { pairs, proximity } => {
format!("prox {proximity}, {} pairs", pairs.len())
}
}
}
fn resolve_edge_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
edge: &Self::EdgeCondition,
condition: &Self::EdgeCondition,
universe: &RoaringBitmap,
) -> Result<roaring::RoaringBitmap> {
compute_docids::compute_docids(ctx, edge, universe)
compute_docids::compute_docids(ctx, condition, universe)
}
fn build_edges<'ctx>(
@ -64,11 +70,11 @@ impl RankingRuleGraphTrait for ProximityGraph {
}
fn log_state(
graph: &super::RankingRuleGraph<Self>,
graph: &RankingRuleGraph<Self>,
paths: &[Vec<u16>],
empty_paths_cache: &DeadEndPathCache<Self>,
universe: &RoaringBitmap,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityEdge>)>, QueryNode>,
distances: &MappedInterner<Vec<(u16, SmallBitmap<ProximityCondition>)>, QueryNode>,
cost: u16,
logger: &mut dyn SearchLogger<QueryGraph>,
) {