Introduce a special word_derivations function for Proximity

This commit is contained in:
Kerollmops
2021-03-09 17:48:05 +01:00
parent facfb4b615
commit d301859bbd
2 changed files with 48 additions and 39 deletions

View File

@ -1,14 +1,13 @@
use std::borrow::Cow;
use std::collections::btree_map::{self, BTreeMap};
use std::collections::hash_map::{HashMap, Entry};
use std::collections::hash_map::HashMap;
use std::mem::take;
use roaring::RoaringBitmap;
use log::debug;
use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}};
use crate::{DocumentId, Position, search::{query_tree::QueryKind}};
use crate::search::query_tree::{maximum_proximity, Operation, Query};
use crate::search::WordDerivationsCache;
use crate::search::{build_dfa, WordDerivationsCache};
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree};
pub struct Proximity<'t> {
@ -358,7 +357,7 @@ fn resolve_plane_sweep_candidates(
docid: DocumentId,
consecutive: bool,
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
dwpcache: &mut HashMap<String, Option<RoaringBitmap>>,
words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>>
{
@ -400,7 +399,7 @@ fn resolve_plane_sweep_candidates(
let mut groups_positions = Vec::with_capacity(groups_len);
for operation in operations {
let positions = resolve_operation(ctx, operation, docid, rocache, dwpcache, wdcache)?;
let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?;
groups_positions.push(positions.into_iter());
}
@ -476,7 +475,7 @@ fn resolve_plane_sweep_candidates(
query_tree: &'a Operation,
docid: DocumentId,
rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
dwpcache: &mut HashMap<String, Option<RoaringBitmap>>,
words_positions: &HashMap<String, RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Vec<(Position, u8, Position)>>
{
@ -487,44 +486,34 @@ fn resolve_plane_sweep_candidates(
}
let result = match query_tree {
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, dwpcache, wdcache)?,
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, dwpcache, wdcache)?,
And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?,
Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?,
Or(_, ops) => {
let mut result = Vec::new();
for op in ops {
result.extend(resolve_operation(ctx, op, docid, rocache, dwpcache, wdcache)?)
result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?)
}
result.sort_unstable();
result
},
Operation::Query(Query {prefix, kind}) => {
let fst = ctx.words_fst();
let words = match kind {
Operation::Query(Query { prefix, kind }) => {
let mut result = Vec::new();
match kind {
QueryKind::Exact { word, .. } => {
if *prefix {
Cow::Borrowed(word_derivations(word, true, 0, fst, wdcache)?)
let iter = word_derivations(word, true, 0, &words_positions)
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
result.extend(iter);
} else {
Cow::Owned(vec![(word.to_string(), 0)])
if let Some(positions) = words_positions.get(word) {
result.extend(positions.iter().map(|p| (p, 0, p)));
}
}
},
QueryKind::Tolerant { typo, word } => {
Cow::Borrowed(word_derivations(word, *prefix, *typo, fst, wdcache)?)
}
};
let mut result = Vec::new();
for (word, _) in words.as_ref() {
let positions = match dwpcache.entry(word.to_string()) {
Entry::Occupied(entry) => entry.into_mut(),
Entry::Vacant(entry) => {
let positions = ctx.docid_word_positions(docid, word)?;
entry.insert(positions)
}
};
if let Some(positions) = positions {
let iter = positions.iter().map(|p| (p, 0, p));
let iter = word_derivations(word, *prefix, *typo, &words_positions)
.flat_map(|positions| positions.iter().map(|p| (p, 0, p)));
result.extend(iter);
}
}
@ -538,18 +527,34 @@ fn resolve_plane_sweep_candidates(
Ok(result)
}
let mut word_positions_cache = HashMap::new();
/// Returns the positions bitmaps of the document words that `word` derives to.
///
/// A single DFA is built from `word`, `max_typo` and `is_prefix` via `build_dfa`,
/// then every key of `words_positions` is evaluated against it. Only the entries
/// whose evaluation yields `Distance::Exact` are kept; `Distance::AtLeast` results
/// are discarded.
///
/// The returned iterator is lazy and borrows from `words_positions`. Because it
/// walks a `HashMap`, the order in which the bitmaps are yielded is unspecified.
// NOTE(review): `Distance::Exact` presumably means "within `max_typo` edits (or a
// prefix match when `is_prefix` is set)" — confirm against `build_dfa`.
fn word_derivations<'a>(
    word: &str,
    is_prefix: bool,
    max_typo: u8,
    words_positions: &'a HashMap<String, RoaringBitmap>,
) -> impl Iterator<Item = &'a RoaringBitmap>
{
    use levenshtein_automata::Distance;

    // Built once up front and moved into the filter closure so it is reused
    // for every word of the document.
    let automaton = build_dfa(word, max_typo, is_prefix);

    words_positions
        .iter()
        .filter(move |(candidate, _)| {
            matches!(automaton.eval(candidate.as_str()), Distance::Exact(_))
        })
        .map(|(_, bitmap)| bitmap)
}
let mut resolve_operation_cache = HashMap::new();
let mut candidates = BTreeMap::new();
for docid in allowed_candidates {
word_positions_cache.clear();
let words_positions = ctx.docid_words_positions(docid)?;
resolve_operation_cache.clear();
let positions = resolve_operation(
ctx,
query_tree,
docid,
&mut resolve_operation_cache,
&mut word_positions_cache,
&words_positions,
wdcache,
)?;
let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity);