Compute edges of proximity graph lazily
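This commit makes the proximity ranking-rule graph compute its edges lazily. Previously, `build_edges` hit the proximity databases for every pair of adjacent query terms while the graph was being built, materializing a `ProximityCondition::Pairs { pairs: Box<[WordPair]> }` per cost. It now records only a cheap `ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost }`; the database work is deferred to `compute_docids`, which resolves a condition against the current universe of documents and also returns the words and phrases it actually used. The hunks below touch three files of the proximity graph. Their file headers were lost in mirroring, but judging from `pub mod build;` and `pub mod compute_docids;` in the last file, they are, in order, most likely `build.rs`, `compute_docids.rs`, and `mod.rs` under the crate's `search/new/ranking_rule_graph/proximity/` module.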
@@ -1,56 +1,18 @@
 #![allow(clippy::too_many_arguments)]
-use std::collections::BTreeMap;
-
-use heed::RoTxn;
 
 use super::ProximityCondition;
-use crate::search::new::db_cache::DatabaseCache;
 use crate::search::new::interner::{DedupInterner, Interned};
 use crate::search::new::query_graph::QueryNodeData;
-use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm};
-use crate::search::new::ranking_rule_graph::proximity::WordPair;
+use crate::search::new::query_term::LocatedQueryTerm;
 use crate::search::new::{QueryNode, SearchContext};
 use crate::Result;
 
-fn last_word_of_term_iter<'t>(
-    t: &'t QueryTerm,
-    phrase_interner: &'t DedupInterner<Phrase>,
-) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
-    t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
-        move |p| {
-            let phrase = phrase_interner.get(p);
-            phrase.words.last().unwrap().map(|last| (Some(p), last))
-        },
-    ))
-}
-fn first_word_of_term_iter<'t>(
-    t: &'t QueryTerm,
-    phrase_interner: &'t DedupInterner<Phrase>,
-) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
-    t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
-        move |p| {
-            let phrase = phrase_interner.get(p);
-            phrase.words.first().unwrap().map(|first| (first, Some(p)))
-        },
-    ))
-}
-
 pub fn build_edges<'ctx>(
-    ctx: &mut SearchContext<'ctx>,
+    _ctx: &mut SearchContext<'ctx>,
     conditions_interner: &mut DedupInterner<ProximityCondition>,
     from_node: &QueryNode,
     to_node: &QueryNode,
 ) -> Result<Vec<(u8, Option<Interned<ProximityCondition>>)>> {
-    let SearchContext {
-        index,
-        txn,
-        db_cache,
-        word_interner,
-        phrase_interner,
-        term_interner,
-        term_docids: _,
-    } = ctx;
-
     let right_term = match &to_node.data {
         QueryNodeData::End => return Ok(vec![(0, None)]),
         QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
@@ -59,13 +21,11 @@ pub fn build_edges<'ctx>(
 
     let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
 
-    let (right_term, right_start_position, right_ngram_length) =
-        (term_interner.get(*right_term_interned), *right_positions.start(), right_positions.len());
+    let (right_start_position, right_ngram_length) =
+        (*right_positions.start(), right_positions.len());
 
-    let (left_term, left_end_position) = match &from_node.data {
-        QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
-            (term_interner.get(*value), *positions.end())
-        }
+    let (left_term_interned, left_end_position) = match &from_node.data {
+        QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()),
         QueryNodeData::Deleted => return Ok(vec![]),
         QueryNodeData::Start => {
             return Ok(vec![(
@@ -94,175 +54,24 @@ pub fn build_edges<'ctx>(
         )]);
     }
 
-    let mut cost_word_pairs = BTreeMap::<u8, Vec<WordPair>>::new();
-
-    if let Some(right_prefix) = right_term.use_prefix_db {
-        for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
-            add_prefix_edges(
-                index,
-                txn,
-                db_cache,
-                word_interner,
-                right_ngram_length,
-                left_word,
-                right_prefix,
-                &mut cost_word_pairs,
-                left_phrase,
-            )?;
-        }
-    }
-
-    // TODO: add safeguard in case the cartesian product is too large!
-    // even if we restrict the word derivations to a maximum of 100, the size of the
-    // cartesian product could reach a maximum of 10_000 derivations, which is way too much.
-    // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
-    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
-    // reached
-
-    for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
-        for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
-            add_non_prefix_edges(
-                index,
-                txn,
-                db_cache,
-                word_interner,
-                right_ngram_length,
-                left_word,
-                right_word,
-                &mut cost_word_pairs,
-                &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
-            )?;
-        }
-    }
-
-    let mut new_edges = cost_word_pairs
-        .into_iter()
-        .map(|(cost, word_pairs)| {
-            (
-                cost,
-                Some(
-                    conditions_interner
-                        .insert(ProximityCondition::Pairs { pairs: word_pairs.into_boxed_slice() }),
-                ),
-            )
-        })
-        .collect::<Vec<_>>();
-    new_edges.push((
-        8 + (right_ngram_length - 1) as u8,
-        Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })),
-    ));
-    Ok(new_edges)
-}
-
-fn add_prefix_edges<'ctx>(
-    index: &mut &crate::Index,
-    txn: &'ctx RoTxn,
-    db_cache: &mut DatabaseCache<'ctx>,
-    word_interner: &mut DedupInterner<String>,
-    right_ngram_length: usize,
-    left_word: Interned<String>,
-    right_prefix: Interned<String>,
-    cost_proximity_word_pairs: &mut BTreeMap<u8, Vec<WordPair>>,
-    left_phrase: Option<Interned<Phrase>>,
-) -> Result<()> {
-    for proximity in 1..=(8 - right_ngram_length) {
-        let cost = (proximity + right_ngram_length - 1) as u8;
-        // TODO: if we had access to the universe here, we could already check whether
-        // the bitmap corresponding to this word pair is disjoint with the universe or not
-        if db_cache
-            .get_word_prefix_pair_proximity_docids(
-                index,
-                txn,
-                word_interner,
-                left_word,
-                right_prefix,
-                proximity as u8,
-            )?
-            .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefix {
-                phrases: left_phrase.into_iter().collect(),
-                left: left_word,
-                right_prefix,
-                proximity: proximity as u8,
-            });
-        }
-
-        // No swapping when computing the proximity between a phrase and a word
-        if left_phrase.is_none()
-            && db_cache
-                .get_prefix_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    right_prefix,
-                    left_word,
-                    proximity as u8 - 1,
-                )?
-                .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefixSwapped {
-                left_prefix: right_prefix,
-                right: left_word,
-                proximity: proximity as u8 - 1,
-            });
-        }
-    }
-    Ok(())
-}
-
-fn add_non_prefix_edges<'ctx>(
-    index: &mut &crate::Index,
-    txn: &'ctx RoTxn,
-    db_cache: &mut DatabaseCache<'ctx>,
-    word_interner: &mut DedupInterner<String>,
-    right_ngram_length: usize,
-    word1: Interned<String>,
-    word2: Interned<String>,
-    cost_proximity_word_pairs: &mut BTreeMap<u8, Vec<WordPair>>,
-    phrases: &[Interned<Phrase>],
-) -> Result<()> {
-    for proximity in 1..=(8 - right_ngram_length) {
-        let cost = (proximity + right_ngram_length - 1) as u8;
-        if db_cache
-            .get_word_pair_proximity_docids(
-                index,
-                txn,
-                word_interner,
-                word1,
-                word2,
-                proximity as u8,
-            )?
-            .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words {
-                phrases: phrases.to_vec(),
-                left: word1,
-                right: word2,
-                proximity: proximity as u8,
-            });
-        }
-        if proximity > 1
-            // no swapping when either term is a phrase
-            && phrases.is_empty()
-            && db_cache
-                .get_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    word2,
-                    word1,
-                    proximity as u8 - 1,
-                )?
-                .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words {
-                phrases: vec![],
-                left: word2,
-                right: word1,
-                proximity: proximity as u8 - 1,
-            });
-        }
-    }
-    Ok(())
-}
+    let mut conditions = vec![];
+    for cost in right_ngram_length..(7 + right_ngram_length) {
+        let cost = cost as u8;
+        conditions.push((
+            cost,
+            Some(conditions_interner.insert(ProximityCondition::Uninit {
+                left_term: left_term_interned,
+                right_term: *right_term_interned,
+                right_term_ngram_len: right_ngram_length as u8,
+                cost,
+            })),
+        ))
+    }
+
+    conditions.push((
+        (7 + right_ngram_length) as u8,
+        Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })),
+    ));
+
+    Ok(conditions)
+}
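That is the end of the first file, `build.rs`. The new `build_edges` needs no database access at all, which is the point of the commit: for a right-hand term of ngram length n, it emits one lazily-resolved `Uninit` condition for each cost in n..n+7, plus a final `Term` condition at cost n+7 for the case where the two terms are not close at all and only the right term is required to exist. A small runnable sketch of that cost schedule (`edge_costs` is a name invented for this illustration; the string tags stand in for the interned conditions):

```rust
/// Mirrors the cost schedule the new `build_edges` produces for one
/// term-to-term edge; "Uninit"/"Term" stand in for the interned conditions.
fn edge_costs(right_ngram_length: u8) -> Vec<(u8, &'static str)> {
    let n = right_ngram_length;
    // one lazily-resolved `Uninit` condition per cost in n..n+7 (= n..=n+6)
    let mut edges: Vec<(u8, &'static str)> =
        (n..n + 7).map(|cost| (cost, "Uninit")).collect();
    // plus one `Term` edge: the left term is ignored, the right term must exist
    edges.push((n + 7, "Term"));
    edges
}

fn main() {
    // a plain word (ngram length 1): `Uninit` at costs 1..=7, `Term` at 8,
    // matching the old `8 + (right_ngram_length - 1)` formula
    assert_eq!(edge_costs(1).first(), Some(&(1, "Uninit")));
    assert_eq!(edge_costs(1).last(), Some(&(8, "Term")));
    // a 2gram shifts the whole schedule up by one
    assert_eq!(edge_costs(2).first(), Some(&(2, "Uninit")));
}
```

The next hunks are the second file, `compute_docids.rs`, where the deferred work now happens: the whole old eager resolution is removed and the new lazy, universe-aware resolution is added.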
@@ -1,6 +1,15 @@
+#![allow(clippy::too_many_arguments)]
+
+use std::iter::FromIterator;
+
+use fxhash::FxHashSet;
+use heed::RoTxn;
 use roaring::RoaringBitmap;
 
-use super::{ProximityCondition, WordPair};
+use super::ProximityCondition;
+use crate::search::new::db_cache::DatabaseCache;
+use crate::search::new::interner::{DedupInterner, Interned};
+use crate::search::new::query_term::{Phrase, QueryTerm};
 use crate::search::new::SearchContext;
 use crate::{CboRoaringBitmapCodec, Result};
 
@@ -8,7 +17,7 @@ pub fn compute_docids<'ctx>(
     ctx: &mut SearchContext<'ctx>,
     condition: &ProximityCondition,
     universe: &RoaringBitmap,
-) -> Result<RoaringBitmap> {
+) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
     let SearchContext {
         index,
         txn,
@@ -18,96 +27,238 @@ pub fn compute_docids<'ctx>(
         phrase_interner,
         term_interner,
     } = ctx;
-    let pairs = match condition {
-        ProximityCondition::Term { term } => {
-            return term_docids
-                .get_query_term_docids(
-                    index,
-                    txn,
-                    db_cache,
-                    word_interner,
-                    term_interner,
-                    phrase_interner,
-                    *term,
-                )
-                .cloned()
-        }
-        ProximityCondition::Pairs { pairs } => pairs,
-    };
-    let mut pair_docids = RoaringBitmap::new();
-    for pair in pairs.iter() {
-        let pair = match pair {
-            WordPair::Words { phrases, left, right, proximity } => {
-                let mut docids = db_cache
-                    .get_word_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        *left,
-                        *right,
-                        *proximity,
-                    )?
-                    .map(CboRoaringBitmapCodec::deserialize_from)
-                    .transpose()?
-                    .unwrap_or_default();
-                if !docids.is_empty() {
-                    for phrase in phrases {
-                        docids &= ctx.term_docids.get_phrase_docids(
-                            index,
-                            txn,
-                            db_cache,
-                            word_interner,
-                            &ctx.phrase_interner,
-                            *phrase,
-                        )?;
-                    }
-                }
-                docids
-            }
-            WordPair::WordPrefix { phrases, left, right_prefix, proximity } => {
-                let mut docids = db_cache
-                    .get_word_prefix_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        *left,
-                        *right_prefix,
-                        *proximity,
-                    )?
-                    .map(CboRoaringBitmapCodec::deserialize_from)
-                    .transpose()?
-                    .unwrap_or_default();
-                if !docids.is_empty() {
-                    for phrase in phrases {
-                        docids &= ctx.term_docids.get_phrase_docids(
-                            index,
-                            txn,
-                            db_cache,
-                            word_interner,
-                            &ctx.phrase_interner,
-                            *phrase,
-                        )?;
-                    }
-                }
-                docids
-            }
-            WordPair::WordPrefixSwapped { left_prefix, right, proximity } => db_cache
-                .get_prefix_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    *left_prefix,
-                    *right,
-                    *proximity,
-                )?
-                .map(CboRoaringBitmapCodec::deserialize_from)
-                .transpose()?
-                .unwrap_or_default(),
-        };
-        // TODO: deserialize bitmap within a universe
-        let bitmap = universe & pair;
-        pair_docids |= bitmap;
-    }
-
-    Ok(pair_docids)
-}
+
+    let (left_term, right_term, right_term_ngram_len, cost) = match condition {
+        ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
+            (*left_term, *right_term, *right_term_ngram_len, *cost)
+        }
+        ProximityCondition::Term { term } => {
+            let term_v = term_interner.get(*term);
+            return Ok((
+                term_docids
+                    .get_query_term_docids(
+                        index,
+                        txn,
+                        db_cache,
+                        word_interner,
+                        term_interner,
+                        phrase_interner,
+                        *term,
+                    )?
+                    .clone(),
+                FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()),
+                FxHashSet::from_iter(term_v.all_phrases()),
+            ));
+        }
+    };
+
+    let left_term = term_interner.get(left_term);
+    let right_term = term_interner.get(right_term);
+
+    // e.g. for the simple words `sun .. flower`
+    // the cost is 5
+    // the forward proximity is 5
+    // the backward proximity is 4
+    //
+    // for the 2gram `the sunflower`
+    // the cost is 5
+    // the forward proximity is 4
+    // the backward proximity is 3
+    let forward_proximity = 1 + cost - right_term_ngram_len;
+    let backward_proximity = cost - right_term_ngram_len;
+
+    let mut used_words = FxHashSet::default();
+    let mut used_phrases = FxHashSet::default();
+
+    let mut docids = RoaringBitmap::new();
+
+    if let Some(right_prefix) = right_term.use_prefix_db {
+        for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
+            compute_prefix_edges(
+                index,
+                txn,
+                db_cache,
+                word_interner,
+                left_word,
+                right_prefix,
+                left_phrase,
+                forward_proximity,
+                backward_proximity,
+                &mut docids,
+                universe,
+                &mut used_words,
+                &mut used_phrases,
+            )?;
+        }
+    }
+
+    // TODO: add safeguard in case the cartesian product is too large!
+    // even if we restrict the word derivations to a maximum of 100, the size of the
+    // cartesian product could reach a maximum of 10_000 derivations, which is way too much.
+    // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
+    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
+    // reached
+
+    for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
+        for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
+            compute_non_prefix_edges(
+                index,
+                txn,
+                db_cache,
+                word_interner,
+                left_word,
+                right_word,
+                &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
+                forward_proximity,
+                backward_proximity,
+                &mut docids,
+                universe,
+                &mut used_words,
+                &mut used_phrases,
+            )?;
+        }
+    }
+
+    Ok((docids, used_words, used_phrases))
+}
+
+fn compute_prefix_edges<'ctx>(
+    index: &mut &crate::Index,
+    txn: &'ctx RoTxn,
+    db_cache: &mut DatabaseCache<'ctx>,
+    word_interner: &mut DedupInterner<String>,
+    left_word: Interned<String>,
+    right_prefix: Interned<String>,
+    left_phrase: Option<Interned<Phrase>>,
+    forward_proximity: u8,
+    backward_proximity: u8,
+    docids: &mut RoaringBitmap,
+    universe: &RoaringBitmap,
+    used_words: &mut FxHashSet<Interned<String>>,
+    used_phrases: &mut FxHashSet<Interned<Phrase>>,
+) -> Result<()> {
+    if let Some(phrase) = left_phrase {
+        // TODO: compute the phrase, take the intersection between
+        // the phrase and the docids
+        used_phrases.insert(phrase); // This is not fully correct
+    }
+
+    if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids(
+        index,
+        txn,
+        word_interner,
+        left_word,
+        right_prefix,
+        forward_proximity,
+    )? {
+        let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+        if !new_docids.is_empty() {
+            used_words.insert(left_word);
+            used_words.insert(right_prefix);
+            *docids |= new_docids;
+        }
+    }
+
+    // No swapping when computing the proximity between a phrase and a word
+    if left_phrase.is_none() {
+        if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids(
+            index,
+            txn,
+            word_interner,
+            right_prefix,
+            left_word,
+            backward_proximity,
+        )? {
+            let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+            if !new_docids.is_empty() {
+                used_words.insert(left_word);
+                used_words.insert(right_prefix);
+                *docids |= new_docids;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+fn compute_non_prefix_edges<'ctx>(
+    index: &mut &crate::Index,
+    txn: &'ctx RoTxn,
+    db_cache: &mut DatabaseCache<'ctx>,
+    word_interner: &mut DedupInterner<String>,
+    word1: Interned<String>,
+    word2: Interned<String>,
+    phrases: &[Interned<Phrase>],
+    forward_proximity: u8,
+    backward_proximity: u8,
+    docids: &mut RoaringBitmap,
+    universe: &RoaringBitmap,
+    used_words: &mut FxHashSet<Interned<String>>,
+    used_phrases: &mut FxHashSet<Interned<Phrase>>,
+) -> Result<()> {
+    if !phrases.is_empty() {
+        // TODO: compute the docids associated with these phrases
+        // take their intersection with the new docids
+        used_phrases.extend(phrases); // This is not fully correct
+    }
+    if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
+        index,
+        txn,
+        word_interner,
+        word1,
+        word2,
+        forward_proximity,
+    )? {
+        let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+        if !new_docids.is_empty() {
+            used_words.insert(word1);
+            used_words.insert(word2);
+            *docids |= new_docids;
+        }
+    }
+    if backward_proximity >= 1
+        // no swapping when either term is a phrase
+        && phrases.is_empty()
+    {
+        if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
+            index,
+            txn,
+            word_interner,
+            word2,
+            word1,
+            backward_proximity,
+        )? {
+            let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+            if !new_docids.is_empty() {
+                used_words.insert(word1);
+                used_words.insert(word2);
+                *docids |= new_docids;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+fn last_word_of_term_iter<'t>(
+    t: &'t QueryTerm,
+    phrase_interner: &'t DedupInterner<Phrase>,
+) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
+    t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
+        move |p| {
+            let phrase = phrase_interner.get(p);
+            phrase.words.last().unwrap().map(|last| (Some(p), last))
+        },
+    ))
+}
+fn first_word_of_term_iter<'t>(
+    t: &'t QueryTerm,
+    phrase_interner: &'t DedupInterner<Phrase>,
+) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
+    t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
+        move |p| {
+            let phrase = phrase_interner.get(p);
+            phrase.words.first().unwrap().map(|first| (first, Some(p)))
+        },
+    ))
+}
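That closes the second file. The `sun .. flower` comment in the new `compute_docids` carries the key arithmetic: an edge cost is turned back into the two proximities used for the database lookups, the forward one for (left, right) pairs and the backward one (one less) for the swapped (right, left) pairs. Spelled out as a runnable sketch (same formulas as in the diff; `proximities` is a name invented for this illustration):

```rust
/// The cost-to-proximity arithmetic from the new `compute_docids`.
fn proximities(cost: u8, right_term_ngram_len: u8) -> (u8, u8) {
    let forward_proximity = 1 + cost - right_term_ngram_len;
    let backward_proximity = cost - right_term_ngram_len;
    (forward_proximity, backward_proximity)
}

fn main() {
    // simple words `sun .. flower`: cost 5 -> forward 5, backward 4
    assert_eq!(proximities(5, 1), (5, 4));
    // the 2gram `the sunflower`: cost 5 -> forward 4, backward 3
    assert_eq!(proximities(5, 2), (4, 3));
}
```

The last file, `mod.rs`, removes the now-unused `WordPair` enum, replaces `ProximityCondition::Pairs` with the `Uninit` variant, and drops the `words_used_by_condition`/`phrases_used_by_condition` trait methods, whose job is now done by the extended return type of the condition-resolving method (its name falls outside the hunks):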
@@ -1,9 +1,7 @@
 pub mod build;
 pub mod compute_docids;
 
-use std::collections::HashSet;
-use std::iter::FromIterator;
-
+use fxhash::FxHashSet;
 use roaring::RoaringBitmap;
 
 use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
@@ -13,31 +11,17 @@ use crate::search::new::query_term::{Phrase, QueryTerm};
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;
 
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum WordPair {
-    Words {
-        phrases: Vec<Interned<Phrase>>,
-        left: Interned<String>,
-        right: Interned<String>,
-        proximity: u8,
-    },
-    WordPrefix {
-        phrases: Vec<Interned<Phrase>>,
-        left: Interned<String>,
-        right_prefix: Interned<String>,
-        proximity: u8,
-    },
-    WordPrefixSwapped {
-        left_prefix: Interned<String>,
-        right: Interned<String>,
-        proximity: u8,
-    },
-}
-
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum ProximityCondition {
-    Term { term: Interned<QueryTerm> },
-    Pairs { pairs: Box<[WordPair]> },
+    Uninit {
+        left_term: Interned<QueryTerm>,
+        right_term: Interned<QueryTerm>,
+        right_term_ngram_len: u8,
+        cost: u8,
+    },
+    Term {
+        term: Interned<QueryTerm>,
+    },
 }
 
 pub enum ProximityGraph {}
@@ -49,7 +33,8 @@ impl RankingRuleGraphTrait for ProximityGraph {
         ctx: &mut SearchContext<'ctx>,
         condition: &Self::Condition,
         universe: &RoaringBitmap,
-    ) -> Result<roaring::RoaringBitmap> {
+    ) -> Result<(roaring::RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>
+    {
         compute_docids::compute_docids(ctx, condition, universe)
     }
 
@@ -79,107 +64,14 @@ impl RankingRuleGraphTrait for ProximityGraph {
         condition: &Self::Condition,
     ) -> Result<String> {
         match condition {
+            ProximityCondition::Uninit { cost, .. } => {
+                // TODO
+                Ok(format!("{cost}: cost"))
+            }
             ProximityCondition::Term { term } => {
                 let term = ctx.term_interner.get(*term);
                 Ok(format!("{} : exists", ctx.word_interner.get(term.original)))
             }
-            ProximityCondition::Pairs { pairs } => {
-                let mut s = String::new();
-                for pair in pairs.iter() {
-                    match pair {
-                        WordPair::Words { phrases, left, right, proximity } => {
-                            let left = ctx.word_interner.get(*left);
-                            let right = ctx.word_interner.get(*right);
-                            if !phrases.is_empty() {
-                                s.push_str(&format!("{} phrases + ", phrases.len()));
-                            }
-                            s.push_str(&format!("\"{left} {right}\": {proximity}\n"));
-                        }
-                        WordPair::WordPrefix { phrases, left, right_prefix, proximity } => {
-                            let left = ctx.word_interner.get(*left);
-                            let right = ctx.word_interner.get(*right_prefix);
-                            if !phrases.is_empty() {
-                                s.push_str(&format!("{} phrases + ", phrases.len()));
-                            }
-                            s.push_str(&format!("\"{left} {right}...\" : {proximity}\n"));
-                        }
-                        WordPair::WordPrefixSwapped { left_prefix, right, proximity } => {
-                            let left = ctx.word_interner.get(*left_prefix);
-                            let right = ctx.word_interner.get(*right);
-                            s.push_str(&format!("\"{left}... {right}\" : {proximity}\n"));
-                        }
-                    }
-                }
-                Ok(s)
-            }
         }
     }
-
-    fn words_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<String>>> {
-        match condition {
-            ProximityCondition::Term { term } => {
-                let term = ctx.term_interner.get(*term);
-                Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
-            }
-            ProximityCondition::Pairs { pairs } => {
-                let mut set = HashSet::new();
-                for pair in pairs.iter() {
-                    match pair {
-                        WordPair::Words { phrases: _, left, right, proximity: _ } => {
-                            set.insert(*left);
-                            set.insert(*right);
-                        }
-                        WordPair::WordPrefix { phrases: _, left, right_prefix, proximity: _ } => {
-                            set.insert(*left);
-                            // TODO: this is not correct, there should be another trait method for collecting the prefixes
-                            // to be used with the prefix DBs
-                            set.insert(*right_prefix);
-                        }
-                        WordPair::WordPrefixSwapped { left_prefix, right, proximity: _ } => {
-                            // TODO: this is not correct, there should be another trait method for collecting the prefixes
-                            // to be used with the prefix DBs
-                            set.insert(*left_prefix);
-                            set.insert(*right);
-                        }
-                    }
-                }
-                Ok(set)
-            }
-        }
-    }
-
-    fn phrases_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<Phrase>>> {
-        match condition {
-            ProximityCondition::Term { term } => {
-                let term = ctx.term_interner.get(*term);
-                Ok(HashSet::from_iter(term.all_phrases()))
-            }
-            ProximityCondition::Pairs { pairs } => {
-                let mut set = HashSet::new();
-                for pair in pairs.iter() {
-                    match pair {
-                        WordPair::Words { phrases, left: _, right: _, proximity: _ } => {
-                            set.extend(phrases.iter().copied());
-                        }
-                        WordPair::WordPrefix {
-                            phrases,
-                            left: _,
-                            right_prefix: _,
-                            proximity: _,
-                        } => {
-                            set.extend(phrases.iter().copied());
-                        }
-                        WordPair::WordPrefixSwapped { left_prefix: _, right: _, proximity: _ } => {}
-                    }
-                }
-                Ok(set)
-            }
-        }
-    }
 }
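Both new helpers in `compute_docids.rs` follow one pattern: fetch the pair's bitmap from the cache, intersect it with the universe first, and only then record the pair's words as used. That is what lets the condition-resolving method return `(docids, used_words, used_phrases)` in a single pass, replacing the two removed `*_used_by_condition` methods, whose own TODO comments admitted they were not correct for prefixes. A simplified, self-contained sketch of the pattern (`accumulate_pair` is invented for this illustration, and `u32` stands in for the `Interned<_>` ids):

```rust
use std::collections::HashSet;

use roaring::RoaringBitmap;

/// A candidate word pair contributes only if its bitmap intersects the
/// universe; its words are recorded as "used" only in that case.
fn accumulate_pair(
    pair_docids: Option<RoaringBitmap>, // what a db_cache lookup returned
    universe: &RoaringBitmap,
    (left, right): (u32, u32), // stand-ins for Interned<String>
    docids: &mut RoaringBitmap,
    used_words: &mut HashSet<u32>,
) {
    if let Some(new_docids) = pair_docids {
        // intersect with the universe before anything else, as the new code does
        let new_docids = universe & new_docids;
        if !new_docids.is_empty() {
            used_words.insert(left);
            used_words.insert(right);
            *docids |= new_docids;
        }
    }
}

fn main() {
    let universe: RoaringBitmap = (1u32..=3).collect();
    let pair: RoaringBitmap = [2u32, 9].into_iter().collect();
    let (mut docids, mut used) = (RoaringBitmap::new(), HashSet::new());
    accumulate_pair(Some(pair), &universe, (7, 8), &mut docids, &mut used);
    assert!(docids.contains(2) && !docids.contains(9)); // 9 is outside the universe
    assert!(used.contains(&7) && used.contains(&8));
}
```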