Compute edges of proximity graph lazily

Loïc Lecrenier
2023-03-21 10:44:40 +01:00
parent 272cd7ebbd
commit 83e5b4ed0d
12 changed files with 345 additions and 841 deletions

View File

@@ -1,19 +1,28 @@
use std::marker::PhantomData;
use fxhash::FxHashMap;
use fxhash::{FxHashMap, FxHashSet};
use roaring::RoaringBitmap;
use super::{RankingRuleGraph, RankingRuleGraphTrait};
use crate::search::new::interner::Interned;
use crate::search::new::query_term::Phrase;
use crate::search::new::SearchContext;
use crate::Result;
// TODO: give a generation to each universe, then be able to get the exact
// delta of docids between two universes of different generations!
#[derive(Default)]
pub struct ComputedCondition {
docids: RoaringBitmap,
universe_len: u64,
used_words: FxHashSet<Interned<String>>,
used_phrases: FxHashSet<Interned<Phrase>>,
}
/// A cache storing the document ids associated with each ranking rule edge
pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
pub cache: FxHashMap<Interned<G::Condition>, (u64, RoaringBitmap)>,
pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
_phantom: PhantomData<G>,
}
impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
@@ -22,6 +31,14 @@ impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
}
}
impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
pub fn get_condition_used_words_and_phrases(
&mut self,
interned_condition: Interned<G::Condition>,
) -> (&FxHashSet<Interned<String>>, &FxHashSet<Interned<Phrase>>) {
let ComputedCondition { used_words, used_phrases, .. } = &self.cache[&interned_condition];
(used_words, used_phrases)
}
/// Retrieve the document ids for the given edge condition.
///
/// If the cache does not yet contain these docids, they are computed
@@ -30,14 +47,14 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
&'s mut self,
ctx: &mut SearchContext<'ctx>,
interned_condition: Interned<G::Condition>,
graph: &RankingRuleGraph<G>,
// TODO: maybe universe doesn't belong here
graph: &mut RankingRuleGraph<G>,
universe: &RoaringBitmap,
) -> Result<&'s RoaringBitmap> {
if self.cache.contains_key(&interned_condition) {
// TODO: compare the length of the universe to the one stored in self;
// if it is smaller, then update the value
let (universe_len, docids) = self.cache.entry(interned_condition).or_default();
let ComputedCondition { docids, universe_len, .. } =
self.cache.entry(interned_condition).or_default();
if *universe_len == universe.len() {
return Ok(docids);
} else {
@@ -46,12 +63,13 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
return Ok(docids);
}
}
// TODO: maybe universe doesn't belong here
let condition = graph.conditions_interner.get(interned_condition);
// TODO: faster way to do this?
let docids = G::resolve_condition(ctx, condition, universe)?;
let _ = self.cache.insert(interned_condition, (universe.len(), docids));
let (_, docids) = &self.cache[&interned_condition];
let condition = graph.conditions_interner.get_mut(interned_condition);
let (docids, used_words, used_phrases) = G::resolve_condition(ctx, condition, universe)?;
let _ = self.cache.insert(
interned_condition,
ComputedCondition { docids, universe_len: universe.len(), used_words, used_phrases },
);
let ComputedCondition { docids, .. } = &self.cache[&interned_condition];
Ok(docids)
}
}
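For illustration, the caching scheme above can be reduced to a standalone sketch: each cache entry remembers the length of the universe its docids were computed against, and a lookup with a universe of a different size treats the entry as stale. All names below are illustrative (a plain `HashSet` stands in for `RoaringBitmap`, a closure for `G::resolve_condition`), and where the real code refines a stale entry (elided in the hunk above), this sketch simply recomputes it.

```rust
use std::collections::{HashMap, HashSet};

// Illustrative stand-ins for milli's types: a condition is just a key,
// and a "bitmap" of docids is a plain HashSet<u32>.
type ConditionId = usize;
type DocIds = HashSet<u32>;

struct ComputedCondition {
    docids: DocIds,
    // Length of the universe these docids were computed against.
    universe_len: u64,
}

#[derive(Default)]
struct ConditionCache {
    cache: HashMap<ConditionId, ComputedCondition>,
}

impl ConditionCache {
    // Return cached docids, recomputing only when the universe changed size.
    fn get_docids(
        &mut self,
        condition: ConditionId,
        universe: &DocIds,
        resolve: impl Fn(&DocIds) -> DocIds, // stand-in for G::resolve_condition
    ) -> &DocIds {
        let stale = match self.cache.get(&condition) {
            Some(c) => c.universe_len != universe.len() as u64,
            None => true,
        };
        if stale {
            let docids = resolve(universe);
            self.cache.insert(
                condition,
                ComputedCondition { docids, universe_len: universe.len() as u64 },
            );
        }
        &self.cache[&condition].docids
    }
}
```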

View File

@@ -15,11 +15,11 @@ mod proximity;
/// Implementation of the `typo` ranking rule
mod typo;
use std::collections::HashSet;
use std::hash::Hash;
pub use condition_docids_cache::ConditionDocIdsCache;
pub use dead_ends_cache::DeadEndsCache;
use fxhash::FxHashSet;
pub use proximity::{ProximityCondition, ProximityGraph};
use roaring::RoaringBitmap;
pub use typo::{TypoCondition, TypoGraph};
@@ -80,23 +80,13 @@ pub trait RankingRuleGraphTrait: Sized {
condition: &Self::Condition,
) -> Result<String>;
fn words_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<String>>>;
fn phrases_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<Phrase>>>;
/// Compute the document ids associated with the given edge condition,
/// restricted to the given universe.
fn resolve_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap>;
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>;
/// Return the costs and conditions of the edges going from the source node to the destination node
fn build_edges<'ctx>(

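The trait change above folds the two side-channel methods (`words_used_by_condition`, `phrases_used_by_condition`) into the return value of `resolve_condition`, so the words and phrases an edge actually uses are collected in the same pass that computes its docids. A toy model of that shape, with simplified types in place of milli's interners:

```rust
use std::collections::HashSet;

type DocIds = HashSet<u32>;

// Toy condition: "documents containing this word".
struct WordCondition {
    word: String,
}

// One pass returns the restricted docids *and* the words actually used,
// replacing the separate words/phrases walks of the old trait methods.
fn resolve_condition(
    cond: &WordCondition,
    universe: &DocIds,
    postings: impl Fn(&str) -> DocIds, // stand-in for the word -> docids lookup
) -> (DocIds, HashSet<String>) {
    let docids: DocIds = postings(&cond.word).intersection(universe).copied().collect();
    let mut used_words = HashSet::new();
    if !docids.is_empty() {
        used_words.insert(cond.word.clone());
    }
    (docids, used_words)
}
```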
View File

@@ -1,56 +1,18 @@
#![allow(clippy::too_many_arguments)]
use std::collections::BTreeMap;
use heed::RoTxn;
use super::ProximityCondition;
use crate::search::new::db_cache::DatabaseCache;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm};
use crate::search::new::ranking_rule_graph::proximity::WordPair;
use crate::search::new::query_term::LocatedQueryTerm;
use crate::search::new::{QueryNode, SearchContext};
use crate::Result;
fn last_word_of_term_iter<'t>(
t: &'t QueryTerm,
phrase_interner: &'t DedupInterner<Phrase>,
) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
move |p| {
let phrase = phrase_interner.get(p);
phrase.words.last().unwrap().map(|last| (Some(p), last))
},
))
}
fn first_word_of_term_iter<'t>(
t: &'t QueryTerm,
phrase_interner: &'t DedupInterner<Phrase>,
) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
move |p| {
let phrase = phrase_interner.get(p);
phrase.words.first().unwrap().map(|first| (first, Some(p)))
},
))
}
pub fn build_edges<'ctx>(
ctx: &mut SearchContext<'ctx>,
_ctx: &mut SearchContext<'ctx>,
conditions_interner: &mut DedupInterner<ProximityCondition>,
from_node: &QueryNode,
to_node: &QueryNode,
) -> Result<Vec<(u8, Option<Interned<ProximityCondition>>)>> {
let SearchContext {
index,
txn,
db_cache,
word_interner,
phrase_interner,
term_interner,
term_docids: _,
} = ctx;
let right_term = match &to_node.data {
QueryNodeData::End => return Ok(vec![(0, None)]),
QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
@@ -59,13 +21,11 @@ pub fn build_edges<'ctx>(
let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
let (right_term, right_start_position, right_ngram_length) =
(term_interner.get(*right_term_interned), *right_positions.start(), right_positions.len());
let (right_start_position, right_ngram_length) =
(*right_positions.start(), right_positions.len());
let (left_term, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
(term_interner.get(*value), *positions.end())
}
let (left_term_interned, left_end_position) = match &from_node.data {
QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()),
QueryNodeData::Deleted => return Ok(vec![]),
QueryNodeData::Start => {
return Ok(vec![(
@@ -94,175 +54,24 @@ pub fn build_edges<'ctx>(
)]);
}
let mut cost_word_pairs = BTreeMap::<u8, Vec<WordPair>>::new();
if let Some(right_prefix) = right_term.use_prefix_db {
for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
add_prefix_edges(
index,
txn,
db_cache,
word_interner,
right_ngram_length,
left_word,
right_prefix,
&mut cost_word_pairs,
left_phrase,
)?;
}
}
// TODO: add safeguard in case the cartesian product is too large!
// even if we restrict the word derivations to a maximum of 100, the size of the
// cartesian product could reach a maximum of 10_000 derivations, which is way too much.
// Maybe prioritise the product of zero-typo derivations, then the product of zero-typo/one-typo
// + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
// reached
for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
add_non_prefix_edges(
index,
txn,
db_cache,
word_interner,
right_ngram_length,
left_word,
right_word,
&mut cost_word_pairs,
&[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
)?;
}
}
let mut new_edges = cost_word_pairs
.into_iter()
.map(|(cost, word_pairs)| {
(
let mut conditions = vec![];
for cost in right_ngram_length..(7 + right_ngram_length) {
let cost = cost as u8;
conditions.push((
cost,
Some(conditions_interner.insert(ProximityCondition::Uninit {
left_term: left_term_interned,
right_term: *right_term_interned,
right_term_ngram_len: right_ngram_length as u8,
cost,
Some(
conditions_interner
.insert(ProximityCondition::Pairs { pairs: word_pairs.into_boxed_slice() }),
),
)
})
.collect::<Vec<_>>();
new_edges.push((
8 + (right_ngram_length - 1) as u8,
})),
))
}
conditions.push((
(7 + right_ngram_length) as u8,
Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })),
));
Ok(new_edges)
}
fn add_prefix_edges<'ctx>(
index: &mut &crate::Index,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &mut DedupInterner<String>,
right_ngram_length: usize,
left_word: Interned<String>,
right_prefix: Interned<String>,
cost_proximity_word_pairs: &mut BTreeMap<u8, Vec<WordPair>>,
left_phrase: Option<Interned<Phrase>>,
) -> Result<()> {
for proximity in 1..=(8 - right_ngram_length) {
let cost = (proximity + right_ngram_length - 1) as u8;
// TODO: if we had access to the universe here, we could already check whether
// the bitmap corresponding to this word pair is disjoint with the universe or not
if db_cache
.get_word_prefix_pair_proximity_docids(
index,
txn,
word_interner,
left_word,
right_prefix,
proximity as u8,
)?
.is_some()
{
cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefix {
phrases: left_phrase.into_iter().collect(),
left: left_word,
right_prefix,
proximity: proximity as u8,
});
}
// No swapping when computing the proximity between a phrase and a word
if left_phrase.is_none()
&& db_cache
.get_prefix_word_pair_proximity_docids(
index,
txn,
word_interner,
right_prefix,
left_word,
proximity as u8 - 1,
)?
.is_some()
{
cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefixSwapped {
left_prefix: right_prefix,
right: left_word,
proximity: proximity as u8 - 1,
});
}
}
Ok(())
}
fn add_non_prefix_edges<'ctx>(
index: &mut &crate::Index,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &mut DedupInterner<String>,
right_ngram_length: usize,
word1: Interned<String>,
word2: Interned<String>,
cost_proximity_word_pairs: &mut BTreeMap<u8, Vec<WordPair>>,
phrases: &[Interned<Phrase>],
) -> Result<()> {
for proximity in 1..=(8 - right_ngram_length) {
let cost = (proximity + right_ngram_length - 1) as u8;
if db_cache
.get_word_pair_proximity_docids(
index,
txn,
word_interner,
word1,
word2,
proximity as u8,
)?
.is_some()
{
cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words {
phrases: phrases.to_vec(),
left: word1,
right: word2,
proximity: proximity as u8,
});
}
if proximity > 1
// no swapping when either term is a phrase
&& phrases.is_empty()
&& db_cache
.get_word_pair_proximity_docids(
index,
txn,
word_interner,
word2,
word1,
proximity as u8 - 1,
)?
.is_some()
{
cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words {
phrases: vec![],
left: word2,
right: word1,
proximity: proximity as u8 - 1,
});
}
}
Ok(())
Ok(conditions)
}
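With word pairs no longer resolved at graph-build time, `build_edges` reduces to enumerating one placeholder condition per possible cost, plus a final existence-only edge. A sketch of that enumeration, with `u32` standing in for `Interned<QueryTerm>`:

```rust
enum Condition {
    // Placeholder: the docids for this (left, right, cost) triple
    // are only computed when the edge is actually traversed.
    Uninit { left_term: u32, right_term: u32, right_term_ngram_len: u8, cost: u8 },
    // Fallback edge: the right term merely exists in the document.
    Term { term: u32 },
}

fn build_edges(
    left_term: u32,
    right_term: u32,
    right_ngram_length: usize,
) -> Vec<(u8, Condition)> {
    let mut conditions = Vec::new();
    // One cheap placeholder edge per cost; nothing touches the database here.
    for cost in right_ngram_length..(7 + right_ngram_length) {
        let cost = cost as u8;
        conditions.push((
            cost,
            Condition::Uninit {
                left_term,
                right_term,
                right_term_ngram_len: right_ngram_length as u8,
                cost,
            },
        ));
    }
    // Highest cost: no proximity constraint, just "the right term exists".
    conditions.push(((7 + right_ngram_length) as u8, Condition::Term { term: right_term }));
    conditions
}
```

No database access happens here anymore; the docids behind each `Uninit` condition are only fetched when the edge is traversed, which is the point of the commit.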

View File

@@ -1,6 +1,15 @@
#![allow(clippy::too_many_arguments)]
use std::iter::FromIterator;
use fxhash::FxHashSet;
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::{ProximityCondition, WordPair};
use super::ProximityCondition;
use crate::search::new::db_cache::DatabaseCache;
use crate::search::new::interner::{DedupInterner, Interned};
use crate::search::new::query_term::{Phrase, QueryTerm};
use crate::search::new::SearchContext;
use crate::{CboRoaringBitmapCodec, Result};
@@ -8,7 +17,7 @@ pub fn compute_docids<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &ProximityCondition,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
let SearchContext {
index,
txn,
@@ -18,96 +27,238 @@ pub fn compute_docids<'ctx>(
phrase_interner,
term_interner,
} = ctx;
let pairs = match condition {
ProximityCondition::Term { term } => {
return term_docids
.get_query_term_docids(
index,
txn,
db_cache,
word_interner,
term_interner,
phrase_interner,
*term,
)
.cloned()
let (left_term, right_term, right_term_ngram_len, cost) = match condition {
ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
(*left_term, *right_term, *right_term_ngram_len, *cost)
}
ProximityCondition::Term { term } => {
let term_v = term_interner.get(*term);
return Ok((
term_docids
.get_query_term_docids(
index,
txn,
db_cache,
word_interner,
term_interner,
phrase_interner,
*term,
)?
.clone(),
FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()),
FxHashSet::from_iter(term_v.all_phrases()),
));
}
ProximityCondition::Pairs { pairs } => pairs,
};
let mut pair_docids = RoaringBitmap::new();
for pair in pairs.iter() {
let pair = match pair {
WordPair::Words { phrases, left, right, proximity } => {
let mut docids = db_cache
.get_word_pair_proximity_docids(
index,
txn,
word_interner,
*left,
*right,
*proximity,
)?
.map(CboRoaringBitmapCodec::deserialize_from)
.transpose()?
.unwrap_or_default();
if !docids.is_empty() {
for phrase in phrases {
docids &= ctx.term_docids.get_phrase_docids(
index,
txn,
db_cache,
word_interner,
&ctx.phrase_interner,
*phrase,
)?;
}
}
docids
}
WordPair::WordPrefix { phrases, left, right_prefix, proximity } => {
let mut docids = db_cache
.get_word_prefix_pair_proximity_docids(
index,
txn,
word_interner,
*left,
*right_prefix,
*proximity,
)?
.map(CboRoaringBitmapCodec::deserialize_from)
.transpose()?
.unwrap_or_default();
if !docids.is_empty() {
for phrase in phrases {
docids &= ctx.term_docids.get_phrase_docids(
index,
txn,
db_cache,
word_interner,
&ctx.phrase_interner,
*phrase,
)?;
}
}
docids
}
WordPair::WordPrefixSwapped { left_prefix, right, proximity } => db_cache
.get_prefix_word_pair_proximity_docids(
index,
txn,
word_interner,
*left_prefix,
*right,
*proximity,
)?
.map(CboRoaringBitmapCodec::deserialize_from)
.transpose()?
.unwrap_or_default(),
};
// TODO: deserialize bitmap within a universe
let bitmap = universe & pair;
pair_docids |= bitmap;
let left_term = term_interner.get(left_term);
let right_term = term_interner.get(right_term);
// e.g. for the simple words `sun .. flower`
// the cost is 5
// the forward proximity is 5
// the backward proximity is 4
//
// for the 2gram `the sunflower`
// the cost is 5
// the forward proximity is 4
// the backward proximity is 3
let forward_proximity = 1 + cost - right_term_ngram_len;
let backward_proximity = cost - right_term_ngram_len;
let mut used_words = FxHashSet::default();
let mut used_phrases = FxHashSet::default();
let mut docids = RoaringBitmap::new();
if let Some(right_prefix) = right_term.use_prefix_db {
for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
compute_prefix_edges(
index,
txn,
db_cache,
word_interner,
left_word,
right_prefix,
left_phrase,
forward_proximity,
backward_proximity,
&mut docids,
universe,
&mut used_words,
&mut used_phrases,
)?;
}
}
Ok(pair_docids)
// TODO: add safeguard in case the cartesian product is too large!
// even if we restrict the word derivations to a maximum of 100, the size of the
// cartesian product could reach a maximum of 10_000 derivations, which is way too much.
// Maybe prioritise the product of zero-typo derivations, then the product of zero-typo/one-typo
// + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
// reached
for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
compute_non_prefix_edges(
index,
txn,
db_cache,
word_interner,
left_word,
right_word,
&[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
forward_proximity,
backward_proximity,
&mut docids,
universe,
&mut used_words,
&mut used_phrases,
)?;
}
}
Ok((docids, used_words, used_phrases))
}
fn compute_prefix_edges<'ctx>(
index: &mut &crate::Index,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &mut DedupInterner<String>,
left_word: Interned<String>,
right_prefix: Interned<String>,
left_phrase: Option<Interned<Phrase>>,
forward_proximity: u8,
backward_proximity: u8,
docids: &mut RoaringBitmap,
universe: &RoaringBitmap,
used_words: &mut FxHashSet<Interned<String>>,
used_phrases: &mut FxHashSet<Interned<Phrase>>,
) -> Result<()> {
if let Some(phrase) = left_phrase {
// TODO: compute the phrase, take the intersection between
// the phrase and the docids
used_phrases.insert(phrase); // This is not fully correct
}
if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids(
index,
txn,
word_interner,
left_word,
right_prefix,
forward_proximity,
)? {
let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(left_word);
used_words.insert(right_prefix);
*docids |= new_docids;
}
}
// No swapping when computing the proximity between a phrase and a word
if left_phrase.is_none() {
if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids(
index,
txn,
word_interner,
right_prefix,
left_word,
backward_proximity,
)? {
let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(left_word);
used_words.insert(right_prefix);
*docids |= new_docids;
}
}
}
Ok(())
}
fn compute_non_prefix_edges<'ctx>(
index: &mut &crate::Index,
txn: &'ctx RoTxn,
db_cache: &mut DatabaseCache<'ctx>,
word_interner: &mut DedupInterner<String>,
word1: Interned<String>,
word2: Interned<String>,
phrases: &[Interned<Phrase>],
forward_proximity: u8,
backward_proximity: u8,
docids: &mut RoaringBitmap,
universe: &RoaringBitmap,
used_words: &mut FxHashSet<Interned<String>>,
used_phrases: &mut FxHashSet<Interned<Phrase>>,
) -> Result<()> {
if !phrases.is_empty() {
// TODO: compute the docids associated with these phrases
// take their intersection with the new docids
used_phrases.extend(phrases); // This is not fully correct
}
if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
index,
txn,
word_interner,
word1,
word2,
forward_proximity,
)? {
let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(word1);
used_words.insert(word2);
*docids |= new_docids;
}
}
if backward_proximity >= 1
// no swapping when either term is a phrase
&& phrases.is_empty()
{
if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
index,
txn,
word_interner,
word2,
word1,
backward_proximity,
)? {
let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
if !new_docids.is_empty() {
used_words.insert(word1);
used_words.insert(word2);
*docids |= new_docids;
}
}
}
Ok(())
}
fn last_word_of_term_iter<'t>(
t: &'t QueryTerm,
phrase_interner: &'t DedupInterner<Phrase>,
) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
move |p| {
let phrase = phrase_interner.get(p);
phrase.words.last().unwrap().map(|last| (Some(p), last))
},
))
}
fn first_word_of_term_iter<'t>(
t: &'t QueryTerm,
phrase_interner: &'t DedupInterner<Phrase>,
) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
move |p| {
let phrase = phrase_interner.get(p);
phrase.words.first().unwrap().map(|first| (first, Some(p)))
},
))
}
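The proximity arithmetic in the comment above is easy to verify: for an edge of cost `cost` whose right term is an ngram of length `right_term_ngram_len`, the forward proximity is `1 + cost - right_term_ngram_len` and the backward proximity is `cost - right_term_ngram_len`. Checking both examples from the comment:

```rust
// Forward and backward proximity, as defined in compute_docids above.
fn proximities(cost: u8, right_term_ngram_len: u8) -> (u8, u8) {
    (1 + cost - right_term_ngram_len, cost - right_term_ngram_len)
}

fn main() {
    // `sun .. flower`: two plain words (ngram length 1), cost 5
    assert_eq!(proximities(5, 1), (5, 4));
    // `the sunflower`: a 2gram, cost 5
    assert_eq!(proximities(5, 2), (4, 3));
}
```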

View File

@@ -1,9 +1,7 @@
pub mod build;
pub mod compute_docids;
use std::collections::HashSet;
use std::iter::FromIterator;
use fxhash::FxHashSet;
use roaring::RoaringBitmap;
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
@@ -13,31 +11,17 @@ use crate::search::new::query_term::{Phrase, QueryTerm};
use crate::search::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum WordPair {
Words {
phrases: Vec<Interned<Phrase>>,
left: Interned<String>,
right: Interned<String>,
proximity: u8,
},
WordPrefix {
phrases: Vec<Interned<Phrase>>,
left: Interned<String>,
right_prefix: Interned<String>,
proximity: u8,
},
WordPrefixSwapped {
left_prefix: Interned<String>,
right: Interned<String>,
proximity: u8,
},
}
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum ProximityCondition {
Term { term: Interned<QueryTerm> },
Pairs { pairs: Box<[WordPair]> },
Uninit {
left_term: Interned<QueryTerm>,
right_term: Interned<QueryTerm>,
right_term_ngram_len: u8,
cost: u8,
},
Term {
term: Interned<QueryTerm>,
},
}
pub enum ProximityGraph {}
@@ -49,7 +33,8 @@ impl RankingRuleGraphTrait for ProximityGraph {
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<roaring::RoaringBitmap> {
) -> Result<(roaring::RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>
{
compute_docids::compute_docids(ctx, condition, universe)
}
@@ -79,107 +64,14 @@ impl RankingRuleGraphTrait for ProximityGraph {
condition: &Self::Condition,
) -> Result<String> {
match condition {
ProximityCondition::Uninit { cost, .. } => {
// TODO
Ok(format!("{cost}: cost"))
}
ProximityCondition::Term { term } => {
let term = ctx.term_interner.get(*term);
Ok(format!("{} : exists", ctx.word_interner.get(term.original)))
}
ProximityCondition::Pairs { pairs } => {
let mut s = String::new();
for pair in pairs.iter() {
match pair {
WordPair::Words { phrases, left, right, proximity } => {
let left = ctx.word_interner.get(*left);
let right = ctx.word_interner.get(*right);
if !phrases.is_empty() {
s.push_str(&format!("{} phrases + ", phrases.len()));
}
s.push_str(&format!("\"{left} {right}\": {proximity}\n"));
}
WordPair::WordPrefix { phrases, left, right_prefix, proximity } => {
let left = ctx.word_interner.get(*left);
let right = ctx.word_interner.get(*right_prefix);
if !phrases.is_empty() {
s.push_str(&format!("{} phrases + ", phrases.len()));
}
s.push_str(&format!("\"{left} {right}...\" : {proximity}\n"));
}
WordPair::WordPrefixSwapped { left_prefix, right, proximity } => {
let left = ctx.word_interner.get(*left_prefix);
let right = ctx.word_interner.get(*right);
s.push_str(&format!("\"{left}... {right}\" : {proximity}\n"));
}
}
}
Ok(s)
}
}
}
fn words_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<String>>> {
match condition {
ProximityCondition::Term { term } => {
let term = ctx.term_interner.get(*term);
Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
}
ProximityCondition::Pairs { pairs } => {
let mut set = HashSet::new();
for pair in pairs.iter() {
match pair {
WordPair::Words { phrases: _, left, right, proximity: _ } => {
set.insert(*left);
set.insert(*right);
}
WordPair::WordPrefix { phrases: _, left, right_prefix, proximity: _ } => {
set.insert(*left);
// TODO: this is not correct, there should be another trait method for collecting the prefixes
// to be used with the prefix DBs
set.insert(*right_prefix);
}
WordPair::WordPrefixSwapped { left_prefix, right, proximity: _ } => {
// TODO: this is not correct, there should be another trait method for collecting the prefixes
// to be used with the prefix DBs
set.insert(*left_prefix);
set.insert(*right);
}
}
}
Ok(set)
}
}
}
fn phrases_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<Phrase>>> {
match condition {
ProximityCondition::Term { term } => {
let term = ctx.term_interner.get(*term);
Ok(HashSet::from_iter(term.all_phrases()))
}
ProximityCondition::Pairs { pairs } => {
let mut set = HashSet::new();
for pair in pairs.iter() {
match pair {
WordPair::Words { phrases, left: _, right: _, proximity: _ } => {
set.extend(phrases.iter().copied());
}
WordPair::WordPrefix {
phrases,
left: _,
right_prefix: _,
proximity: _,
} => {
set.extend(phrases.iter().copied());
}
WordPair::WordPrefixSwapped { left_prefix: _, right: _, proximity: _ } => {}
}
}
Ok(set)
}
}
}
}
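The reworked enum encodes the laziness in the type itself: an edge starts as `Uninit`, carrying only interned term ids and a cost, and the expensive pair-proximity lookups happen when `resolve_condition` is first called on it; `Term` is the highest-cost fallback that only requires the right term to exist. A dispatch sketch with simplified types (`pair_docids` and `term_docids` are hypothetical stand-ins for the database lookups; the real `compute_docids` also handles prefixes, phrases, and the backward direction):

```rust
use std::collections::HashSet;

type DocIds = HashSet<u32>;

enum ProximityCondition {
    Uninit { left_term: u32, right_term: u32, right_term_ngram_len: u8, cost: u8 },
    Term { term: u32 },
}

fn resolve(
    cond: &ProximityCondition,
    universe: &DocIds,
    pair_docids: impl Fn(u32, u32, u8) -> DocIds, // hypothetical pair-proximity lookup
    term_docids: impl Fn(u32) -> DocIds,          // hypothetical term-existence lookup
) -> DocIds {
    match cond {
        // First traversal of this edge: only now are pair-proximity docids fetched.
        ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
            let forward_proximity = 1 + cost - right_term_ngram_len;
            pair_docids(*left_term, *right_term, forward_proximity)
                .intersection(universe)
                .copied()
                .collect()
        }
        // Fallback edge: the right term merely occurs in the document.
        ProximityCondition::Term { term } => {
            term_docids(*term).intersection(universe).copied().collect()
        }
    }
}
```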

View File

@@ -1,7 +1,8 @@
use std::collections::HashSet;
// use std::collections::HashSet;
use std::fmt::Write;
use std::iter::FromIterator;
use fxhash::FxHashSet;
use roaring::RoaringBitmap;
use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
@@ -26,7 +27,7 @@ impl RankingRuleGraphTrait for TypoGraph {
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
let SearchContext {
index,
txn,
@@ -48,7 +49,12 @@ impl RankingRuleGraphTrait for TypoGraph {
condition.term,
)?;
Ok(docids)
let term = term_interner.get(condition.term);
Ok((
docids,
FxHashSet::from_iter(term.all_single_words_except_prefix_db()),
FxHashSet::from_iter(term.all_phrases()),
))
}
fn build_edges<'ctx>(
@@ -202,21 +208,21 @@ impl RankingRuleGraphTrait for TypoGraph {
Ok(s)
}
fn words_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<String>>> {
let TypoCondition { term, .. } = condition;
let term = ctx.term_interner.get(*term);
Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
}
// fn words_used_by_condition<'ctx>(
// ctx: &mut SearchContext<'ctx>,
// condition: &Self::Condition,
// ) -> Result<HashSet<Interned<String>>> {
// let TypoCondition { term, .. } = condition;
// let term = ctx.term_interner.get(*term);
// Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
// }
fn phrases_used_by_condition<'ctx>(
ctx: &mut SearchContext<'ctx>,
condition: &Self::Condition,
) -> Result<HashSet<Interned<Phrase>>> {
let TypoCondition { term, .. } = condition;
let term = ctx.term_interner.get(*term);
Ok(HashSet::from_iter(term.all_phrases()))
}
// fn phrases_used_by_condition<'ctx>(
// ctx: &mut SearchContext<'ctx>,
// condition: &Self::Condition,
// ) -> Result<HashSet<Interned<Phrase>>> {
// let TypoCondition { term, .. } = condition;
// let term = ctx.term_interner.get(*term);
// Ok(HashSet::from_iter(term.all_phrases()))
// }
}