Intern all strings and phrases in the search logic

This commit is contained in:
Loïc Lecrenier
2023-03-06 19:21:55 +01:00
parent 3f1729a17f
commit e8c76cf7bf
19 changed files with 635 additions and 654 deletions

View File

@ -1,30 +1,30 @@
use std::collections::BTreeMap;
use heed::RoTxn;
use itertools::Itertools;
use super::ProximityEdge;
use crate::new::db_cache::DatabaseCache;
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::new::ranking_rule_graph::proximity::WordPair;
use crate::new::ranking_rule_graph::EdgeDetails;
use crate::new::QueryNode;
use crate::{Index, Result};
use crate::new::{QueryNode, SearchContext};
use crate::Result;
use itertools::Itertools;
use std::collections::BTreeMap;
pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations, i8)>> {
pub fn visit_from_node(
ctx: &mut SearchContext,
from_node: &QueryNode,
) -> Result<Option<(WordDerivations, i8)>> {
Ok(Some(match from_node {
QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => match value1 {
QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()),
QueryTerm::Phrase { phrase: phrase1 } => {
if let Some(original) = phrase1.words.last().unwrap().as_ref() {
let phrase1 = ctx.phrase_interner.get(*phrase1);
if let Some(original) = *phrase1.words.last().unwrap() {
(
WordDerivations {
original: original.clone(),
zero_typo: vec![original.to_owned()],
one_typo: vec![],
two_typos: vec![],
original,
zero_typo: Box::new([original]),
one_typo: Box::new([]),
two_typos: Box::new([]),
use_prefix_db: false,
synonyms: vec![],
synonyms: Box::new([]),
split_words: None,
},
*pos1.end(),
@ -37,12 +37,12 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
},
QueryNode::Start => (
WordDerivations {
original: String::new(),
zero_typo: vec![],
one_typo: vec![],
two_typos: vec![],
original: ctx.word_interner.insert(String::new()),
zero_typo: Box::new([]),
one_typo: Box::new([]),
two_typos: Box::new([]),
use_prefix_db: false,
synonyms: vec![],
synonyms: Box::new([]),
split_words: None,
},
-100,
@ -51,10 +51,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
}))
}
pub fn visit_to_node<'transaction, 'from_data>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
pub fn visit_to_node<'search, 'from_data>(
ctx: &mut SearchContext<'search>,
to_node: &QueryNode,
from_node_data: &'from_data (WordDerivations, i8),
) -> Result<Vec<(u8, EdgeDetails<ProximityEdge>)>> {
@ -69,15 +67,16 @@ pub fn visit_to_node<'transaction, 'from_data>(
let (derivations2, pos2, ngram_len2) = match value2 {
QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()),
QueryTerm::Phrase { phrase: phrase2 } => {
if let Some(original) = phrase2.words.first().unwrap().as_ref() {
let phrase2 = ctx.phrase_interner.get(*phrase2);
if let Some(original) = *phrase2.words.first().unwrap() {
(
WordDerivations {
original: original.clone(),
zero_typo: vec![original.to_owned()],
one_typo: vec![],
two_typos: vec![],
original,
zero_typo: Box::new([original]),
one_typo: Box::new([]),
two_typos: Box::new([]),
use_prefix_db: false,
synonyms: vec![],
synonyms: Box::new([]),
split_words: None,
},
*pos2.start(),
@ -106,19 +105,16 @@ pub fn visit_to_node<'transaction, 'from_data>(
let derivations1 = derivations1.all_derivations_except_prefix_db();
// TODO: eventually, we want to get rid of the uses from `orginal`
let original_word_2 = derivations2.original.clone();
let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();
if updb2 {
for word1 in derivations1.clone() {
for proximity in 1..=(8 - ngram_len2) {
let cost = (proximity + ngram_len2 - 1) as u8;
if db_cache
if ctx
.get_word_prefix_pair_proximity_docids(
index,
txn,
word1,
original_word_2.as_str(),
derivations2.original,
proximity as u8,
)?
.is_some()
@ -129,16 +125,14 @@ pub fn visit_to_node<'transaction, 'from_data>(
.entry(proximity as u8)
.or_default()
.push(WordPair::WordPrefix {
left: word1.to_owned(),
right_prefix: original_word_2.to_owned(),
left: word1,
right_prefix: derivations2.original,
});
}
if db_cache
if ctx
.get_prefix_word_pair_proximity_docids(
index,
txn,
original_word_2.as_str(),
word1.as_str(),
derivations2.original,
word1,
proximity as u8 - 1,
)?
.is_some()
@ -149,8 +143,8 @@ pub fn visit_to_node<'transaction, 'from_data>(
.entry(proximity as u8)
.or_default()
.push(WordPair::WordPrefixSwapped {
left_prefix: original_word_2.to_owned(),
right: word1.to_owned(),
left_prefix: derivations2.original,
right: word1,
});
}
}
@ -164,28 +158,23 @@ pub fn visit_to_node<'transaction, 'from_data>(
for (word1, word2) in product_derivations {
for proximity in 1..=(8 - ngram_len2) {
let cost = (proximity + ngram_len2 - 1) as u8;
if db_cache
.get_word_pair_proximity_docids(index, txn, word1, word2, proximity as u8)?
.is_some()
{
if ctx.get_word_pair_proximity_docids(word1, word2, proximity as u8)?.is_some() {
cost_proximity_word_pairs
.entry(cost)
.or_default()
.entry(proximity as u8)
.or_default()
.push(WordPair::Words { left: word1.to_owned(), right: word2.to_owned() });
.push(WordPair::Words { left: word1, right: word2 });
}
if proximity > 1
&& db_cache
.get_word_pair_proximity_docids(index, txn, word2, word1, proximity as u8 - 1)?
.is_some()
&& ctx.get_word_pair_proximity_docids(word2, word1, proximity as u8 - 1)?.is_some()
{
cost_proximity_word_pairs
.entry(cost)
.or_default()
.entry(proximity as u8 - 1)
.or_default()
.push(WordPair::Words { left: word2.to_owned(), right: word1.to_owned() });
.push(WordPair::Words { left: word2, right: word1 });
}
}
}

View File

@ -1,14 +1,10 @@
use heed::RoTxn;
use super::{ProximityEdge, WordPair};
use crate::new::SearchContext;
use crate::{CboRoaringBitmapCodec, Result};
use roaring::{MultiOps, RoaringBitmap};
use super::{ProximityEdge, WordPair};
use crate::new::db_cache::DatabaseCache;
use crate::{CboRoaringBitmapCodec, Result};
pub fn compute_docids<'transaction>(
index: &crate::Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
pub fn compute_docids<'search>(
ctx: &mut SearchContext<'search>,
edge: &ProximityEdge,
) -> Result<RoaringBitmap> {
let ProximityEdge { pairs, proximity } = edge;
@ -16,12 +12,14 @@ pub fn compute_docids<'transaction>(
for pair in pairs.iter() {
let bytes = match pair {
WordPair::Words { left, right } => {
db_cache.get_word_pair_proximity_docids(index, txn, left, right, *proximity)
ctx.get_word_pair_proximity_docids(*left, *right, *proximity)
}
WordPair::WordPrefix { left, right_prefix } => {
ctx.get_word_prefix_pair_proximity_docids(*left, *right_prefix, *proximity)
}
WordPair::WordPrefixSwapped { left_prefix, right } => {
ctx.get_prefix_word_pair_proximity_docids(*left_prefix, *right, *proximity)
}
WordPair::WordPrefix { left, right_prefix } => db_cache
.get_word_prefix_pair_proximity_docids(index, txn, left, right_prefix, *proximity),
WordPair::WordPrefixSwapped { left_prefix, right } => db_cache
.get_prefix_word_pair_proximity_docids(index, txn, left_prefix, right, *proximity),
}?;
let bitmap =
bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default();

View File

@ -1,25 +1,22 @@
pub mod build;
pub mod compute_docids;
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::empty_paths_cache::EmptyPathsCache;
use super::{EdgeDetails, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
use crate::new::interner::Interned;
use crate::new::logger::SearchLogger;
use crate::new::query_term::WordDerivations;
use crate::new::{QueryGraph, QueryNode};
use crate::{Index, Result};
use crate::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
use roaring::RoaringBitmap;
// TODO: intern the strings, refer to them by their pointer?
#[derive(Debug, Clone)]
#[derive(Clone)]
pub enum WordPair {
Words { left: String, right: String },
WordPrefix { left: String, right_prefix: String },
WordPrefixSwapped { left_prefix: String, right: String },
Words { left: Interned<String>, right: Interned<String> },
WordPrefix { left: Interned<String>, right_prefix: Interned<String> },
WordPrefixSwapped { left_prefix: Interned<String>, right: Interned<String> },
}
#[derive(Clone)]
@ -40,32 +37,26 @@ impl RankingRuleGraphTrait for ProximityGraph {
format!(", prox {proximity}, {} pairs", pairs.len())
}
fn compute_docids<'db_cache, 'transaction>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
fn compute_docids<'search>(
ctx: &mut SearchContext<'search>,
edge: &Self::EdgeDetails,
) -> Result<roaring::RoaringBitmap> {
compute_docids::compute_docids(index, txn, db_cache, edge)
compute_docids::compute_docids(ctx, edge)
}
fn build_visit_from_node<'transaction>(
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
fn build_visit_from_node<'search>(
ctx: &mut SearchContext<'search>,
from_node: &QueryNode,
) -> Result<Option<Self::BuildVisitedFromNode>> {
build::visit_from_node(from_node)
build::visit_from_node(ctx, from_node)
}
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
fn build_visit_to_node<'from_data, 'search: 'from_data>(
ctx: &mut SearchContext<'search>,
to_node: &QueryNode,
from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
build::visit_to_node(index, txn, db_cache, to_node, from_node_data)
build::visit_to_node(ctx, to_node, from_node_data)
}
fn log_state(