Intern all strings and phrases in the search logic

This commit is contained in:
Loïc Lecrenier
2023-03-06 19:21:55 +01:00
parent 3f1729a17f
commit e8c76cf7bf
19 changed files with 635 additions and 654 deletions

View File

@@ -1,51 +1,48 @@
use std::collections::hash_map::Entry;
use super::{interner::Interned, SearchContext};
use crate::Result;
use fxhash::FxHashMap;
use heed::types::ByteSlice;
use heed::RoTxn;
use crate::{Index, Result};
use std::collections::hash_map::Entry;
#[derive(Default)]
pub struct DatabaseCache<'transaction> {
pub word_pair_proximity_docids: FxHashMap<(u8, String, String), Option<&'transaction [u8]>>,
pub struct DatabaseCache<'search> {
// TODO: interner for all database cache keys
pub word_pair_proximity_docids:
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
pub word_prefix_pair_proximity_docids:
FxHashMap<(u8, String, String), Option<&'transaction [u8]>>,
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
pub prefix_word_pair_proximity_docids:
FxHashMap<(u8, String, String), Option<&'transaction [u8]>>,
pub word_docids: FxHashMap<String, Option<&'transaction [u8]>>,
pub exact_word_docids: FxHashMap<String, Option<&'transaction [u8]>>,
pub word_prefix_docids: FxHashMap<String, Option<&'transaction [u8]>>,
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
pub word_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
pub exact_word_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
}
impl<'transaction> DatabaseCache<'transaction> {
pub fn get_word_docids(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
word: &str,
) -> Result<Option<&'transaction [u8]>> {
let bitmap_ptr = match self.word_docids.entry(word.to_owned()) {
impl<'search> SearchContext<'search> {
pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
let bitmap_ptr = match self.db_cache.word_docids.entry(word) {
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
Entry::Vacant(entry) => {
let bitmap_ptr = index.word_docids.remap_data_type::<ByteSlice>().get(txn, word)?;
let bitmap_ptr = self
.index
.word_docids
.remap_data_type::<ByteSlice>()
.get(self.txn, self.word_interner.get(word))?;
entry.insert(bitmap_ptr);
bitmap_ptr
}
};
Ok(bitmap_ptr)
}
pub fn get_prefix_docids(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
prefix: &str,
) -> Result<Option<&'transaction [u8]>> {
pub fn get_prefix_docids(&mut self, prefix: Interned<String>) -> Result<Option<&'search [u8]>> {
// In the future, this will be a frozen roaring bitmap
let bitmap_ptr = match self.word_prefix_docids.entry(prefix.to_owned()) {
let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) {
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
Entry::Vacant(entry) => {
let bitmap_ptr =
index.word_prefix_docids.remap_data_type::<ByteSlice>().get(txn, prefix)?;
let bitmap_ptr = self
.index
.word_prefix_docids
.remap_data_type::<ByteSlice>()
.get(self.txn, self.word_interner.get(prefix))?;
entry.insert(bitmap_ptr);
bitmap_ptr
}
@@ -55,14 +52,12 @@ impl<'transaction> DatabaseCache<'transaction> {
pub fn get_word_pair_proximity_docids(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
word1: &str,
word2: &str,
word1: Interned<String>,
word2: Interned<String>,
proximity: u8,
) -> Result<Option<&'transaction [u8]>> {
let key = (proximity, word1.to_owned(), word2.to_owned());
match self.word_pair_proximity_docids.entry(key.clone()) {
) -> Result<Option<&'search [u8]>> {
let key = (proximity, word1, word2);
match self.db_cache.word_pair_proximity_docids.entry(key) {
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
Entry::Vacant(entry) => {
// We shouldn't greedily access this DB at all
@@ -86,10 +81,11 @@ impl<'transaction> DatabaseCache<'transaction> {
// output.push(word1, word2, proximities);
// }
// }
let bitmap_ptr = index
.word_pair_proximity_docids
.remap_data_type::<ByteSlice>()
.get(txn, &(key.0, key.1.as_str(), key.2.as_str()))?;
let bitmap_ptr =
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().get(
self.txn,
&(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
)?;
entry.insert(bitmap_ptr);
Ok(bitmap_ptr)
}
@@ -98,20 +94,22 @@ impl<'transaction> DatabaseCache<'transaction> {
pub fn get_word_prefix_pair_proximity_docids(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
word1: &str,
prefix2: &str,
word1: Interned<String>,
prefix2: Interned<String>,
proximity: u8,
) -> Result<Option<&'transaction [u8]>> {
let key = (proximity, word1.to_owned(), prefix2.to_owned());
match self.word_prefix_pair_proximity_docids.entry(key.clone()) {
) -> Result<Option<&'search [u8]>> {
let key = (proximity, word1, prefix2);
match self.db_cache.word_prefix_pair_proximity_docids.entry(key) {
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
Entry::Vacant(entry) => {
let bitmap_ptr = index
let bitmap_ptr = self
.index
.word_prefix_pair_proximity_docids
.remap_data_type::<ByteSlice>()
.get(txn, &(key.0, key.1.as_str(), key.2.as_str()))?;
.get(
self.txn,
&(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
)?;
entry.insert(bitmap_ptr);
Ok(bitmap_ptr)
}
@@ -119,20 +117,26 @@ impl<'transaction> DatabaseCache<'transaction> {
}
pub fn get_prefix_word_pair_proximity_docids(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
left_prefix: &str,
right: &str,
left_prefix: Interned<String>,
right: Interned<String>,
proximity: u8,
) -> Result<Option<&'transaction [u8]>> {
let key = (proximity, left_prefix.to_owned(), right.to_owned());
match self.prefix_word_pair_proximity_docids.entry(key) {
) -> Result<Option<&'search [u8]>> {
let key = (proximity, left_prefix, right);
match self.db_cache.prefix_word_pair_proximity_docids.entry(key) {
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
Entry::Vacant(entry) => {
let bitmap_ptr = index
let bitmap_ptr = self
.index
.prefix_word_pair_proximity_docids
.remap_data_type::<ByteSlice>()
.get(txn, &(proximity, left_prefix, right))?;
.get(
self.txn,
&(
proximity,
self.word_interner.get(left_prefix),
self.word_interner.get(right),
),
)?;
entry.insert(bitmap_ptr);
Ok(bitmap_ptr)
}

View File

@@ -1,15 +1,11 @@
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::db_cache::DatabaseCache;
use super::logger::SearchLogger;
use super::ranking_rule_graph::EdgeDocidsCache;
use super::ranking_rule_graph::EmptyPathsCache;
use super::ranking_rule_graph::{RankingRuleGraph, RankingRuleGraphTrait};
use super::SearchContext;
use super::{BitmapOrAllRef, QueryGraph, RankingRule, RankingRuleOutput};
use crate::{Index, Result};
use crate::Result;
use roaring::RoaringBitmap;
pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
id: String,
@@ -29,12 +25,10 @@ pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
cur_distance_idx: usize,
}
fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>(
fn remove_empty_edges<'search, G: RankingRuleGraphTrait>(
ctx: &mut SearchContext<'search>,
graph: &mut RankingRuleGraph<G>,
edge_docids_cache: &mut EdgeDocidsCache<G>,
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
universe: &RoaringBitmap,
empty_paths_cache: &mut EmptyPathsCache,
) -> Result<()> {
@@ -42,8 +36,7 @@ fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>(
if graph.all_edges[edge_index as usize].is_none() {
continue;
}
let docids = edge_docids_cache
.get_edge_docids(index, txn, db_cache, edge_index, &*graph, universe)?;
let docids = edge_docids_cache.get_edge_docids(ctx, edge_index, &*graph, universe)?;
match docids {
BitmapOrAllRef::Bitmap(bitmap) => {
if bitmap.is_disjoint(universe) {
@@ -59,7 +52,7 @@ fn remove_empty_edges<'transaction, G: RankingRuleGraphTrait>(
Ok(())
}
impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGraph>
impl<'search, G: RankingRuleGraphTrait> RankingRule<'search, QueryGraph>
for GraphBasedRankingRule<G>
{
fn id(&self) -> String {
@@ -67,24 +60,20 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
}
fn start_iteration(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
ctx: &mut SearchContext<'search>,
_logger: &mut dyn SearchLogger<QueryGraph>,
universe: &RoaringBitmap,
query_graph: &QueryGraph,
) -> Result<()> {
// TODO: update old state instead of starting from scratch
let mut graph = RankingRuleGraph::build(index, txn, db_cache, query_graph.clone())?;
let mut graph = RankingRuleGraph::build(ctx, query_graph.clone())?;
let mut edge_docids_cache = EdgeDocidsCache::default();
let mut empty_paths_cache = EmptyPathsCache::new(graph.all_edges.len());
remove_empty_edges(
ctx,
&mut graph,
&mut edge_docids_cache,
index,
txn,
db_cache,
universe,
&mut empty_paths_cache,
)?;
@@ -105,20 +94,16 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
fn next_bucket(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
ctx: &mut SearchContext<'search>,
logger: &mut dyn SearchLogger<QueryGraph>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
assert!(universe.len() > 1);
let mut state = self.state.take().unwrap();
remove_empty_edges(
ctx,
&mut state.graph,
&mut state.edge_docids_cache,
index,
txn,
db_cache,
universe,
&mut state.empty_paths_cache,
)?;
@@ -151,9 +136,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
);
let bucket = state.graph.resolve_paths(
index,
txn,
db_cache,
ctx,
&mut state.edge_docids_cache,
&mut state.empty_paths_cache,
universe,
@@ -169,9 +152,7 @@ impl<'transaction, G: RankingRuleGraphTrait> RankingRule<'transaction, QueryGrap
fn end_iteration(
&mut self,
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
_ctx: &mut SearchContext<'search>,
_logger: &mut dyn SearchLogger<QueryGraph>,
) {
self.state = None;

View File

@@ -0,0 +1,78 @@
use fxhash::FxHashMap;
use std::hash::Hash;
use std::marker::PhantomData;
/// A compact, typed handle to a value of type `T` kept inside an [`Interner`].
///
/// Only a `u32` index is stored; `T` appears purely as a marker so that handles
/// for different value types cannot be mixed up.
pub struct Interned<T> {
    /// Index of the value in the interner's backing store.
    idx: u32,
    /// Ties the handle to `T` without holding a `T`.
    _phantom: PhantomData<T>,
}

impl<T> Interned<T> {
    /// Wraps a raw store index into a typed handle.
    ///
    /// Private to this module: handles are only meant to be created by the interner.
    fn new(idx: u32) -> Self {
        Interned { idx, _phantom: PhantomData }
    }
}
/// A deduplicating store that hands out compact [`Interned`] handles.
///
/// Each distinct value is kept twice: once in `stable_store` (for handle -> value
/// lookups) and once as a key of `lookup` (for value -> handle lookups).
pub struct Interner<T> {
    /// Every distinct value inserted so far, in insertion order. A handle's
    /// index is the position of its value in this vector.
    stable_store: Vec<T>,
    /// Maps a value to the handle that was issued for it.
    lookup: FxHashMap<T, Interned<T>>,
}

impl<T> Default for Interner<T> {
    /// Creates an interner that holds no values.
    fn default() -> Self {
        Self { stable_store: Default::default(), lookup: Default::default() }
    }
}

impl<T> Interner<T>
where
    T: Clone + Eq + Hash,
{
    /// Interns `s` and returns its handle.
    ///
    /// If an equal value was inserted before, the handle issued back then is
    /// returned and nothing is stored. Otherwise the value is appended to the
    /// store and a fresh handle is created for it.
    ///
    /// # Panics
    ///
    /// Panics if the store already holds more values than a `u32` index can
    /// address. The previous `len() as u32 - 1` computation would silently wrap
    /// in that situation and return a handle aliasing an unrelated value.
    pub fn insert(&mut self, s: T) -> Interned<T> {
        if let Some(interned) = self.lookup.get(&s) {
            return *interned;
        }

        // The new value will live at the current length of the store. The
        // conversion is checked, and done before pushing, so an overflow can
        // never produce a bogus handle. The fully qualified path keeps this
        // independent of whether `TryFrom` is in the prelude.
        let idx = <u32 as std::convert::TryFrom<usize>>::try_from(self.stable_store.len())
            .expect("interner is full: index does not fit in a u32");
        let interned = Interned::new(idx);

        self.stable_store.push(s.clone());
        self.lookup.insert(s, interned);
        interned
    }

    /// Returns the value that `interned` refers to.
    ///
    /// # Panics
    ///
    /// Panics if the handle's index is out of bounds for this store.
    // NOTE(review): presumably handles are only ever used with the interner that
    // issued them; nothing in this module enforces that.
    pub fn get(&self, interned: Interned<T>) -> &T {
        &self.stable_store[interned.idx as usize]
    }
}
// Interned<T> boilerplate implementations
impl<T> Hash for Interned<T> {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.idx.hash(state);
}
}
impl<T: Ord> Ord for Interned<T> {
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.idx.cmp(&other.idx)
}
}
impl<T> PartialOrd for Interned<T> {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
self.idx.partial_cmp(&other.idx)
}
}
impl<T> Eq for Interned<T> {}
impl<T> PartialEq for Interned<T> {
fn eq(&self, other: &Self) -> bool {
self.idx == other.idx
}
}
impl<T> Clone for Interned<T> {
fn clone(&self) -> Self {
Self { idx: self.idx, _phantom: PhantomData }
}
}
impl<T> Copy for Interned<T> {}

View File

@@ -6,7 +6,7 @@ use std::time::Instant;
use std::{io::Write, path::PathBuf};
use crate::new::ranking_rule_graph::TypoGraph;
use crate::new::{QueryNode, QueryGraph};
use crate::new::{QueryNode, QueryGraph, SearchContext};
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::new::ranking_rule_graph::EmptyPathsCache;
use crate::new::ranking_rule_graph::{Edge, EdgeDetails, RankingRuleGraphTrait};
@@ -176,7 +176,7 @@ impl SearchLogger<QueryGraph> for DetailedSearchLogger {
}
impl DetailedSearchLogger {
pub fn write_d2_description(&self) {
pub fn write_d2_description(&self,ctx: &mut SearchContext,) {
let mut prev_time = self.initial_query_time.unwrap();
let mut timestamp = vec![];
fn activated_id(timestamp: &[usize]) -> String {
@@ -193,12 +193,12 @@ impl DetailedSearchLogger {
writeln!(&mut file, "direction: right").unwrap();
writeln!(&mut file, "Initial Query Graph: {{").unwrap();
let initial_query_graph = self.initial_query.as_ref().unwrap();
Self::query_graph_d2_description(initial_query_graph, &mut file);
Self::query_graph_d2_description(ctx, initial_query_graph, &mut file);
writeln!(&mut file, "}}").unwrap();
writeln!(&mut file, "Query Graph Used To Compute Universe: {{").unwrap();
let query_graph_for_universe = self.query_for_universe.as_ref().unwrap();
Self::query_graph_d2_description(query_graph_for_universe, &mut file);
Self::query_graph_d2_description(ctx, query_graph_for_universe, &mut file);
writeln!(&mut file, "}}").unwrap();
let initial_universe = self.initial_universe.as_ref().unwrap();
@@ -308,7 +308,7 @@ results.{random} {{
let id = format!("{cur_ranking_rule}.{cur_activated_id}");
let new_file_path = self.folder_path.join(format!("{id}.d2"));
let mut new_file = std::fs::File::create(new_file_path).unwrap();
Self::query_graph_d2_description(query_graph, &mut new_file);
Self::query_graph_d2_description(ctx, query_graph, &mut new_file);
writeln!(
&mut file,
"{id} {{
@@ -323,7 +323,7 @@ results.{random} {{
let id = format!("{cur_ranking_rule}.{cur_activated_id}");
let new_file_path = self.folder_path.join(format!("{id}.d2"));
let mut new_file = std::fs::File::create(new_file_path).unwrap();
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
Self::ranking_rule_graph_d2_description(ctx, graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
writeln!(
&mut file,
"{id} {{
@@ -339,7 +339,7 @@ results.{random} {{
let id = format!("{cur_ranking_rule}.{cur_activated_id}");
let new_file_path = self.folder_path.join(format!("{id}.d2"));
let mut new_file = std::fs::File::create(new_file_path).unwrap();
Self::ranking_rule_graph_d2_description(graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
Self::ranking_rule_graph_d2_description(ctx,graph, paths, empty_paths_cache, distances.clone(), &mut new_file);
writeln!(
&mut file,
"{id} {{
@@ -352,31 +352,40 @@ results.{random} {{
writeln!(&mut file, "}}").unwrap();
}
fn query_node_d2_desc(node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) {
fn query_node_d2_desc(ctx: &mut SearchContext, node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) {
match &node {
QueryNode::Term(LocatedQueryTerm { value, .. }) => {
match value {
QueryTerm::Phrase { phrase } => {
let phrase_str = phrase.description();
let phrase = ctx.phrase_interner.get(*phrase);
let phrase_str = phrase.description(&ctx.word_interner);
writeln!(file,"{node_idx} : \"{phrase_str}\"").unwrap();
},
QueryTerm::Word { derivations: WordDerivations { original, zero_typo, one_typo, two_typos, use_prefix_db, synonyms, split_words } } => {
let original = ctx.word_interner.get(*original);
writeln!(file,"{node_idx} : \"{original}\" {{
shape: class").unwrap();
for w in zero_typo {
for w in zero_typo.iter().copied() {
let w = ctx.word_interner.get(w);
writeln!(file, "\"{w}\" : 0").unwrap();
}
for w in one_typo {
for w in one_typo.iter().copied() {
let w = ctx.word_interner.get(w);
writeln!(file, "\"{w}\" : 1").unwrap();
}
for w in two_typos {
for w in two_typos.iter().copied() {
let w = ctx.word_interner.get(w);
writeln!(file, "\"{w}\" : 2").unwrap();
}
if let Some((left, right)) = split_words {
writeln!(file, "\"{left} {right}\" : split_words").unwrap();
if let Some(split_words) = split_words {
let phrase = ctx.phrase_interner.get(*split_words);
let phrase_str = phrase.description(&ctx.word_interner);
writeln!(file, "\"{phrase_str}\" : split_words").unwrap();
}
for synonym in synonyms {
writeln!(file, "\"{}\" : synonym", synonym.description()).unwrap();
for synonym in synonyms.iter().copied() {
let phrase = ctx.phrase_interner.get(synonym);
let phrase_str = phrase.description(&ctx.word_interner);
writeln!(file, "\"{phrase_str}\" : synonym").unwrap();
}
if *use_prefix_db {
writeln!(file, "use prefix DB : true").unwrap();
@@ -398,20 +407,20 @@ shape: class").unwrap();
},
}
}
fn query_graph_d2_description(query_graph: &QueryGraph, file: &mut File) {
fn query_graph_d2_description(ctx: &mut SearchContext, query_graph: &QueryGraph, file: &mut File) {
writeln!(file,"direction: right").unwrap();
for node in 0..query_graph.nodes.len() {
if matches!(query_graph.nodes[node], QueryNode::Deleted) {
continue;
}
Self::query_node_d2_desc(node, &query_graph.nodes[node], &[], file);
Self::query_node_d2_desc(ctx, node, &query_graph.nodes[node], &[], file);
for edge in query_graph.edges[node].successors.iter() {
writeln!(file, "{node} -> {edge};\n").unwrap();
}
}
}
fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], _empty_paths_cache: &EmptyPathsCache, distances: Vec<Vec<u64>>, file: &mut File) {
fn ranking_rule_graph_d2_description<R: RankingRuleGraphTrait>(ctx: &mut SearchContext, graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], _empty_paths_cache: &EmptyPathsCache, distances: Vec<Vec<u64>>, file: &mut File) {
writeln!(file,"direction: right").unwrap();
writeln!(file, "Proximity Graph {{").unwrap();
@@ -420,7 +429,7 @@ shape: class").unwrap();
continue;
}
let distances = &distances[node_idx];
Self::query_node_d2_desc(node_idx, node, distances.as_slice(), file);
Self::query_node_d2_desc(ctx, node_idx, node, distances.as_slice(), file);
}
for edge in graph.all_edges.iter().flatten() {
let Edge { from_node, to_node, details, .. } = edge;
@@ -449,7 +458,7 @@ shape: class").unwrap();
writeln!(file, "Shortest Paths {{").unwrap();
Self::paths_d2_description(graph, paths, file);
Self::paths_d2_description(ctx, graph, paths, file);
writeln!(file, "}}").unwrap();
// writeln!(file, "Empty Edge Couples {{").unwrap();
@@ -468,15 +477,18 @@ shape: class").unwrap();
// }
// writeln!(file, "}}").unwrap();
}
fn edge_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, edge_idx: u32, file: &mut File) {
fn edge_d2_description<R: RankingRuleGraphTrait>(ctx: &mut SearchContext,graph: &RankingRuleGraph<R>, edge_idx: u32, file: &mut File) {
let Edge { from_node, to_node, cost, .. } = graph.all_edges[edge_idx as usize].as_ref().unwrap() ;
let from_node = &graph.query_graph.nodes[*from_node as usize];
let from_node_desc = match from_node {
QueryNode::Term(term) => match &term.value {
QueryTerm::Phrase { phrase } => {
phrase.description()
let phrase = ctx.phrase_interner.get(*phrase);
phrase.description(&ctx.word_interner)
},
QueryTerm::Word { derivations } => {
ctx.word_interner.get(derivations.original).to_owned()
},
QueryTerm::Word { derivations } => derivations.original.clone(),
},
QueryNode::Deleted => panic!(),
QueryNode::Start => "START".to_owned(),
@@ -485,8 +497,11 @@ shape: class").unwrap();
let to_node = &graph.query_graph.nodes[*to_node as usize];
let to_node_desc = match to_node {
QueryNode::Term(term) => match &term.value {
QueryTerm::Phrase { phrase } => phrase.description(),
QueryTerm::Word { derivations } => derivations.original.clone(),
QueryTerm::Phrase { phrase } => {
let phrase = ctx.phrase_interner.get(*phrase);
phrase.description(&ctx.word_interner)
},
QueryTerm::Word { derivations } => ctx.word_interner.get(derivations.original).to_owned(),
},
QueryNode::Deleted => panic!(),
QueryNode::Start => "START".to_owned(),
@@ -496,11 +511,11 @@ shape: class").unwrap();
shape: class
}}").unwrap();
}
fn paths_d2_description<R: RankingRuleGraphTrait>(graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], file: &mut File) {
fn paths_d2_description<R: RankingRuleGraphTrait>(ctx: &mut SearchContext, graph: &RankingRuleGraph<R>, paths: &[Vec<u32>], file: &mut File) {
for (path_idx, edge_indexes) in paths.iter().enumerate() {
writeln!(file, "{path_idx} {{").unwrap();
for edge_idx in edge_indexes.iter() {
Self::edge_d2_description(graph, *edge_idx, file);
Self::edge_d2_description(ctx, graph, *edge_idx, file);
}
for couple_edges in edge_indexes.windows(2) {
let [src_edge_idx, dest_edge_idx] = couple_edges else { panic!() };

View File

@@ -1,5 +1,6 @@
mod db_cache;
mod graph_based_ranking_rule;
mod interner;
mod logger;
mod query_graph;
mod query_term;
@@ -26,7 +27,9 @@ use query_graph::{QueryGraph, QueryNode};
use roaring::RoaringBitmap;
use self::{
interner::Interner,
logger::SearchLogger,
query_term::Phrase,
resolve_query_graph::{resolve_query_graph, NodeDocIdsCache},
};
@@ -35,14 +38,32 @@ pub enum BitmapOrAllRef<'s> {
All,
}
pub struct SearchContext<'search> {
pub index: &'search Index,
pub txn: &'search RoTxn<'search>,
pub db_cache: DatabaseCache<'search>,
pub word_interner: Interner<String>,
pub phrase_interner: Interner<Phrase>,
pub node_docids_cache: NodeDocIdsCache,
}
impl<'search> SearchContext<'search> {
pub fn new(index: &'search Index, txn: &'search RoTxn<'search>) -> Self {
Self {
index,
txn,
db_cache: <_>::default(),
word_interner: <_>::default(),
phrase_interner: <_>::default(),
node_docids_cache: <_>::default(),
}
}
}
#[allow(clippy::too_many_arguments)]
pub fn resolve_maximally_reduced_query_graph<'transaction>(
index: &Index,
txn: &'transaction heed::RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
pub fn resolve_maximally_reduced_query_graph<'search>(
ctx: &mut SearchContext<'search>,
universe: &RoaringBitmap,
query_graph: &QueryGraph,
node_docids_cache: &mut NodeDocIdsCache,
matching_strategy: TermsMatchingStrategy,
logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<RoaringBitmap> {
@@ -73,16 +94,14 @@ pub fn resolve_maximally_reduced_query_graph<'transaction>(
}
}
logger.query_for_universe(&graph);
let docids = resolve_query_graph(index, txn, db_cache, node_docids_cache, &graph, universe)?;
let docids = resolve_query_graph(ctx, &graph, universe)?;
Ok(docids)
}
#[allow(clippy::too_many_arguments)]
pub fn execute_search<'transaction>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
pub fn execute_search<'search>(
ctx: &mut SearchContext<'search>,
query: &str,
filters: Option<Filter>,
from: usize,
@@ -90,26 +109,21 @@ pub fn execute_search<'transaction>(
logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<Vec<u32>> {
assert!(!query.is_empty());
let query_terms = located_query_terms_from_string(index, txn, query.tokenize(), None).unwrap();
let graph = QueryGraph::from_query(index, txn, db_cache, query_terms)?;
let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None).unwrap();
let graph = QueryGraph::from_query(ctx, query_terms)?;
logger.initial_query(&graph);
let universe = if let Some(filters) = filters {
filters.evaluate(txn, index)?
filters.evaluate(ctx.txn, ctx.index)?
} else {
index.documents_ids(txn)?
ctx.index.documents_ids(ctx.txn)?
};
let mut node_docids_cache = NodeDocIdsCache::default();
let universe = resolve_maximally_reduced_query_graph(
index,
txn,
db_cache,
ctx,
&universe,
&graph,
&mut node_docids_cache,
TermsMatchingStrategy::Last,
logger,
)?;
@@ -117,5 +131,5 @@ pub fn execute_search<'transaction>(
logger.initial_universe(&universe);
apply_ranking_rules(index, txn, db_cache, &graph, &universe, from, length, logger)
apply_ranking_rules(ctx, &graph, &universe, from, length, logger)
}

View File

@@ -1,13 +1,10 @@
use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations};
use super::SearchContext;
use crate::Result;
use roaring::RoaringBitmap;
use std::fmt::Debug;
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::db_cache::DatabaseCache;
use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::{Index, Result};
#[derive(Debug, Clone)]
#[derive(Clone)]
pub enum QueryNode {
Term(LocatedQueryTerm),
Deleted,
@@ -22,7 +19,7 @@ pub struct Edges {
pub successors: RoaringBitmap,
}
#[derive(Debug, Clone)]
#[derive(Clone)]
pub struct QueryGraph {
pub root_node: u32,
pub end_node: u32,
@@ -31,8 +28,8 @@ pub struct QueryGraph {
}
fn _assert_sizes() {
// TODO: QueryNodes are too big now, 184B is an unreasonable size
let _: [u8; 184] = [0; std::mem::size_of::<QueryNode>()];
// TODO: QueryNodes are too big now, 88B is a bit too big
let _: [u8; 88] = [0; std::mem::size_of::<QueryNode>()];
let _: [u8; 48] = [0; std::mem::size_of::<Edges>()];
}
@@ -72,19 +69,14 @@ impl QueryGraph {
impl QueryGraph {
// TODO: return the list of all matching words here as well
pub fn from_query<'transaction>(
index: &Index,
txn: &RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
terms: Vec<LocatedQueryTerm>,
) -> Result<QueryGraph> {
pub fn from_query(ctx: &mut SearchContext, terms: Vec<LocatedQueryTerm>) -> Result<QueryGraph> {
// TODO: maybe empty nodes should not be removed here, to compute
// the score of the `words` ranking rule correctly
// it is very easy to traverse the graph and remove afterwards anyway
// Still, I'm keeping this here as a demo
let mut empty_nodes = vec![];
let word_set = index.words_fst(txn)?;
let word_set = ctx.index.words_fst(ctx.txn)?;
let mut graph = QueryGraph::default();
let (mut prev2, mut prev1, mut prev0): (Vec<u32>, Vec<u32>, Vec<u32>) =
@@ -105,20 +97,20 @@ impl QueryGraph {
if !prev1.is_empty() {
if let Some((ngram2_str, ngram2_pos)) =
query_term::ngram2(&query[length - 2], &query[length - 1])
query_term::ngram2(ctx, &query[length - 2], &query[length - 1])
{
if word_set.contains(ngram2_str.as_bytes()) {
if word_set.contains(ctx.word_interner.get(ngram2_str)) {
let ngram2 = LocatedQueryTerm {
value: QueryTerm::Word {
derivations: WordDerivations {
original: ngram2_str.clone(),
original: ngram2_str,
// TODO: could add a typo if it's an ngram?
zero_typo: vec![ngram2_str],
one_typo: vec![],
two_typos: vec![],
zero_typo: Box::new([ngram2_str]),
one_typo: Box::new([]),
two_typos: Box::new([]),
use_prefix_db: false,
synonyms: vec![], // TODO: ngram synonyms
split_words: None, // TODO: maybe ngram split words?
synonyms: Box::new([]), // TODO: ngram synonyms
split_words: None, // TODO: maybe ngram split words?
},
},
positions: ngram2_pos,
@@ -129,22 +121,25 @@ impl QueryGraph {
}
}
if !prev2.is_empty() {
if let Some((ngram3_str, ngram3_pos)) =
query_term::ngram3(&query[length - 3], &query[length - 2], &query[length - 1])
{
if word_set.contains(ngram3_str.as_bytes()) {
if let Some((ngram3_str, ngram3_pos)) = query_term::ngram3(
ctx,
&query[length - 3],
&query[length - 2],
&query[length - 1],
) {
if word_set.contains(ctx.word_interner.get(ngram3_str)) {
let ngram3 = LocatedQueryTerm {
value: QueryTerm::Word {
derivations: WordDerivations {
original: ngram3_str.clone(),
original: ngram3_str,
// TODO: could add a typo if it's an ngram?
zero_typo: vec![ngram3_str],
one_typo: vec![],
two_typos: vec![],
zero_typo: Box::new([ngram3_str]),
one_typo: Box::new([]),
two_typos: Box::new([]),
use_prefix_db: false,
synonyms: vec![], // TODO: ngram synonyms
split_words: None, // TODO: maybe ngram split words?
// would be nice for typos like su nflower
synonyms: Box::new([]), // TODO: ngram synonyms
split_words: None, // TODO: maybe ngram split words?
// would be nice for typos like su nflower
},
},
positions: ngram3_pos,

View File

@@ -16,30 +16,35 @@ use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
use crate::search::{build_dfa, get_first};
use crate::{CboRoaringBitmapLenCodec, Index, Result};
#[derive(Debug, Default, Clone)]
use super::interner::{Interned, Interner};
use super::SearchContext;
#[derive(Default, Clone, PartialEq, Eq, Hash)]
pub struct Phrase {
pub words: Vec<Option<String>>,
pub words: Vec<Option<Interned<String>>>,
}
impl Phrase {
pub fn description(&self) -> String {
self.words.iter().flatten().join(" ")
pub fn description(&self, interner: &Interner<String>) -> String {
self.words.iter().flatten().map(|w| interner.get(*w)).join(" ")
}
}
#[derive(Debug, Clone)]
#[derive(Clone)]
pub struct WordDerivations {
pub original: String,
pub original: Interned<String>,
// TODO: pub prefix_of: Vec<String>,
pub synonyms: Vec<Phrase>,
pub split_words: Option<(String, String)>,
pub zero_typo: Vec<String>,
pub one_typo: Vec<String>,
pub two_typos: Vec<String>,
pub synonyms: Box<[Interned<Phrase>]>,
pub split_words: Option<Interned<Phrase>>,
pub zero_typo: Box<[Interned<String>]>,
pub one_typo: Box<[Interned<String>]>,
pub two_typos: Box<[Interned<String>]>,
pub use_prefix_db: bool,
}
impl WordDerivations {
pub fn all_derivations_except_prefix_db(&self) -> impl Iterator<Item = &String> + Clone {
self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter())
pub fn all_derivations_except_prefix_db(
&'_ self,
) -> impl Iterator<Item = Interned<String>> + Clone + '_ {
self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter()).copied()
}
fn is_empty(&self) -> bool {
self.zero_typo.is_empty()
@@ -50,15 +55,21 @@ impl WordDerivations {
}
pub fn word_derivations(
index: &Index,
txn: &RoTxn,
ctx: &mut SearchContext,
word: &str,
max_typo: u8,
is_prefix: bool,
fst: &fst::Set<Cow<[u8]>>,
) -> Result<WordDerivations> {
let word_interned = ctx.word_interner.insert(word.to_owned());
let use_prefix_db = is_prefix
&& index.word_prefix_docids.remap_data_type::<DecodeIgnore>().get(txn, word)?.is_some();
&& ctx
.index
.word_prefix_docids
.remap_data_type::<DecodeIgnore>()
.get(ctx.txn, word)?
.is_some();
let mut zero_typo = vec![];
let mut one_typo = vec![];
@@ -70,11 +81,12 @@ pub fn word_derivations(
let mut stream = fst.search(prefix).into_stream();
while let Some(word) = stream.next() {
let word = std::str::from_utf8(word)?;
zero_typo.push(word.to_string());
let word = std::str::from_utf8(word)?.to_owned();
let word_interned = ctx.word_interner.insert(word);
zero_typo.push(word_interned);
}
} else if fst.contains(word) {
zero_typo.push(word.to_string());
zero_typo.push(word_interned);
}
} else if max_typo == 1 {
let dfa = build_dfa(word, 1, is_prefix);
@@ -83,13 +95,14 @@ pub fn word_derivations(
while let Some((word, state)) = stream.next() {
let word = std::str::from_utf8(word)?;
let word_interned = ctx.word_interner.insert(word.to_owned());
let d = dfa.distance(state.1);
match d.to_u8() {
0 => {
zero_typo.push(word.to_string());
zero_typo.push(word_interned);
}
1 => {
one_typo.push(word.to_string());
one_typo.push(word_interned);
}
_ => panic!(),
}
@@ -105,47 +118,56 @@ pub fn word_derivations(
while let Some((found_word, state)) = stream.next() {
let found_word = std::str::from_utf8(found_word)?;
let found_word_interned = ctx.word_interner.insert(found_word.to_owned());
// in the case the typo is on the first letter, we know the number of typo
// is two
if get_first(found_word) != get_first(word) {
two_typos.push(found_word.to_string());
two_typos.push(found_word_interned);
} else {
// Else, we know that it is the second dfa that matched and compute the
// correct distance
let d = second_dfa.distance((state.1).0);
match d.to_u8() {
0 => {
zero_typo.push(found_word.to_string());
zero_typo.push(found_word_interned);
}
1 => {
one_typo.push(found_word.to_string());
one_typo.push(found_word_interned);
}
2 => {
two_typos.push(found_word.to_string());
two_typos.push(found_word_interned);
}
_ => panic!(),
}
}
}
}
let split_words = split_best_frequency(index, txn, word)?;
let split_words = split_best_frequency(ctx.index, ctx.txn, word)?.map(|(l, r)| {
ctx.phrase_interner.insert(Phrase {
words: vec![Some(ctx.word_interner.insert(l)), Some(ctx.word_interner.insert(r))],
})
});
let synonyms = ctx.index.synonyms(ctx.txn)?;
let synonyms = index.synonyms(txn)?;
let synonyms = synonyms
.get(&vec![word.to_owned()])
.cloned()
.unwrap_or_default()
.into_iter()
.map(|words| Phrase { words: words.into_iter().map(Some).collect() })
.map(|words| {
let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
ctx.phrase_interner.insert(Phrase { words })
})
.collect();
Ok(WordDerivations {
original: word.to_owned(),
original: ctx.word_interner.insert(word.to_owned()),
synonyms,
split_words,
zero_typo,
one_typo,
two_typos,
zero_typo: zero_typo.into_boxed_slice(),
one_typo: one_typo.into_boxed_slice(),
two_typos: two_typos.into_boxed_slice(),
use_prefix_db,
})
}
@@ -176,33 +198,36 @@ fn split_best_frequency(
Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned())))
}
#[derive(Debug, Clone)]
#[derive(Clone)]
pub enum QueryTerm {
// TODO: should there be SplitWord, NGram2, and NGram3 variants?
// NGram2 can have 1 typo and synonyms
// NGram3 cannot have typos but can have synonyms
// SplitWords are a phrase
// Can NGrams be prefixes?
Phrase { phrase: Phrase },
Phrase { phrase: Interned<Phrase> },
Word { derivations: WordDerivations },
}
impl QueryTerm {
pub fn original_single_word(&self) -> Option<&str> {
pub fn original_single_word<'interner>(
&self,
word_interner: &'interner Interner<String>,
) -> Option<&'interner str> {
match self {
QueryTerm::Phrase { phrase: _ } => None,
QueryTerm::Word { derivations } => {
if derivations.is_empty() {
None
} else {
Some(derivations.original.as_str())
Some(word_interner.get(derivations.original))
}
}
}
}
}
#[derive(Debug, Clone)]
#[derive(Clone)]
pub struct LocatedQueryTerm {
pub value: QueryTerm,
pub positions: RangeInclusive<i8>,
@@ -217,18 +242,17 @@ impl LocatedQueryTerm {
}
}
pub fn located_query_terms_from_string<'transaction>(
index: &Index,
txn: &'transaction RoTxn,
pub fn located_query_terms_from_string<'search>(
ctx: &mut SearchContext<'search>,
query: NormalizedTokenIter<Vec<u8>>,
words_limit: Option<usize>,
) -> Result<Vec<LocatedQueryTerm>> {
let authorize_typos = index.authorize_typos(txn)?;
let min_len_one_typo = index.min_word_len_one_typo(txn)?;
let min_len_two_typos = index.min_word_len_two_typos(txn)?;
let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
let exact_words = index.exact_words(txn)?;
let fst = index.words_fst(txn)?;
let exact_words = ctx.index.exact_words(ctx.txn)?;
let fst = ctx.index.words_fst(ctx.txn)?;
let nbr_typos = |word: &str| {
if !authorize_typos
@@ -243,10 +267,6 @@ pub fn located_query_terms_from_string<'transaction>(
}
};
let derivations = |word: &str, is_prefix: bool| {
word_derivations(index, txn, word, nbr_typos(word), is_prefix, &fst)
};
let mut primitive_query = Vec::new();
let mut phrase = Vec::new();
@@ -279,14 +299,17 @@ pub fn located_query_terms_from_string<'transaction>(
if let TokenKind::StopWord = token.kind {
phrase.push(None);
} else {
let word = ctx.word_interner.insert(token.lemma().to_string());
// TODO: in a phrase, check that every word exists
// otherwise return WordDerivations::Empty
phrase.push(Some(token.lemma().to_string()));
phrase.push(Some(word));
}
} else if peekable.peek().is_some() {
match token.kind {
TokenKind::Word => {
let derivations = derivations(token.lemma(), false)?;
let word = token.lemma();
let derivations =
word_derivations(ctx, word, nbr_typos(word), false, &fst)?;
let located_term = LocatedQueryTerm {
value: QueryTerm::Word { derivations },
positions: position..=position,
@@ -296,7 +319,8 @@ pub fn located_query_terms_from_string<'transaction>(
TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
}
} else {
let derivations = derivations(token.lemma(), true)?;
let word = token.lemma();
let derivations = word_derivations(ctx, word, nbr_typos(word), true, &fst)?;
let located_term = LocatedQueryTerm {
value: QueryTerm::Word { derivations },
positions: position..=position,
@@ -323,7 +347,9 @@ pub fn located_query_terms_from_string<'transaction>(
{
let located_query_term = LocatedQueryTerm {
value: QueryTerm::Phrase {
phrase: Phrase { words: mem::take(&mut phrase) },
phrase: ctx
.phrase_interner
.insert(Phrase { words: mem::take(&mut phrase) }),
},
positions: phrase_start..=phrase_end,
};
@@ -337,7 +363,9 @@ pub fn located_query_terms_from_string<'transaction>(
// If a quote is never closed, we consider all of the end of the query as a phrase.
if !phrase.is_empty() {
let located_query_term = LocatedQueryTerm {
value: QueryTerm::Phrase { phrase: Phrase { words: mem::take(&mut phrase) } },
value: QueryTerm::Phrase {
phrase: ctx.phrase_interner.insert(Phrase { words: mem::take(&mut phrase) }),
},
positions: phrase_start..=phrase_end,
};
primitive_query.push(located_query_term);
@@ -347,35 +375,49 @@ pub fn located_query_terms_from_string<'transaction>(
}
// TODO: return a word derivations instead?
pub fn ngram2(x: &LocatedQueryTerm, y: &LocatedQueryTerm) -> Option<(String, RangeInclusive<i8>)> {
pub fn ngram2(
ctx: &mut SearchContext,
x: &LocatedQueryTerm,
y: &LocatedQueryTerm,
) -> Option<(Interned<String>, RangeInclusive<i8>)> {
if *x.positions.end() != y.positions.start() - 1 {
return None;
}
match (&x.value.original_single_word(), &y.value.original_single_word()) {
match (
&x.value.original_single_word(&ctx.word_interner),
&y.value.original_single_word(&ctx.word_interner),
) {
(Some(w1), Some(w2)) => {
let term = (format!("{w1}{w2}"), *x.positions.start()..=*y.positions.end());
let term = (
ctx.word_interner.insert(format!("{w1}{w2}")),
*x.positions.start()..=*y.positions.end(),
);
Some(term)
}
_ => None,
}
}
pub fn ngram3(
ctx: &mut SearchContext,
x: &LocatedQueryTerm,
y: &LocatedQueryTerm,
z: &LocatedQueryTerm,
) -> Option<(String, RangeInclusive<i8>)> {
) -> Option<(Interned<String>, RangeInclusive<i8>)> {
if *x.positions.end() != y.positions.start() - 1
|| *y.positions.end() != z.positions.start() - 1
{
return None;
}
match (
&x.value.original_single_word(),
&y.value.original_single_word(),
&z.value.original_single_word(),
&x.value.original_single_word(&ctx.word_interner),
&y.value.original_single_word(&ctx.word_interner),
&z.value.original_single_word(&ctx.word_interner),
) {
(Some(w1), Some(w2), Some(w3)) => {
let term = (format!("{w1}{w2}{w3}"), *x.positions.start()..=*z.positions.end());
let term = (
ctx.word_interner.insert(format!("{w1}{w2}{w3}")),
*x.positions.start()..=*z.positions.end(),
);
Some(term)
}
_ => None,

View File

@@ -1,18 +1,10 @@
use heed::RoTxn;
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::{QueryGraph, SearchContext};
use crate::Result;
use roaring::RoaringBitmap;
use super::{Edge, RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
use crate::new::QueryGraph;
use crate::{Index, Result};
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
pub fn build<'db_cache, 'transaction: 'db_cache>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
query_graph: QueryGraph,
) -> Result<Self> {
pub fn build(ctx: &mut SearchContext, query_graph: QueryGraph) -> Result<Self> {
let mut ranking_rule_graph =
Self { query_graph, all_edges: vec![], node_edges: vec![], successors: vec![] };
@@ -22,12 +14,11 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
let new_edges = ranking_rule_graph.node_edges.last_mut().unwrap();
let new_successors = ranking_rule_graph.successors.last_mut().unwrap();
let Some(from_node_data) = G::build_visit_from_node(index, txn, db_cache, node)? else { continue };
let Some(from_node_data) = G::build_visit_from_node(ctx, node)? else { continue };
for successor_idx in ranking_rule_graph.query_graph.edges[node_idx].successors.iter() {
let to_node = &ranking_rule_graph.query_graph.nodes[successor_idx as usize];
let mut edges =
G::build_visit_to_node(index, txn, db_cache, to_node, &from_node_data)?;
let mut edges = G::build_visit_to_node(ctx, to_node, &from_node_data)?;
if edges.is_empty() {
continue;
}

View File

@@ -1,13 +1,10 @@
use std::marker::PhantomData;
use fxhash::FxHashMap;
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
use crate::new::BitmapOrAllRef;
use crate::{Index, Result};
use crate::new::{BitmapOrAllRef, SearchContext};
use crate::Result;
use fxhash::FxHashMap;
use roaring::RoaringBitmap;
// TODO: the cache should have a G::EdgeDetails as key
// but then it means that we should have a quick way of
@@ -25,11 +22,9 @@ impl<G: RankingRuleGraphTrait> Default for EdgeDocidsCache<G> {
}
}
impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
pub fn get_edge_docids<'s, 'transaction>(
pub fn get_edge_docids<'s, 'search>(
&'s mut self,
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
ctx: &mut SearchContext<'search>,
edge_index: u32,
graph: &RankingRuleGraph<G>,
// TODO: maybe universe doesn't belong here
@@ -46,7 +41,7 @@ impl<G: RankingRuleGraphTrait> EdgeDocidsCache<G> {
return Ok(BitmapOrAllRef::Bitmap(&self.cache[&edge_index]));
}
// TODO: maybe universe doesn't belong here
let docids = universe & G::compute_docids(index, txn, db_cache, details)?;
let docids = universe & G::compute_docids(ctx, details)?;
let _ = self.cache.insert(edge_index, docids);
let docids = &self.cache[&edge_index];
Ok(BitmapOrAllRef::Bitmap(docids))

View File

@@ -7,20 +7,15 @@ mod proximity;
mod resolve_paths;
mod typo;
use super::logger::SearchLogger;
use super::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
pub use edge_docids_cache::EdgeDocidsCache;
pub use empty_paths_cache::EmptyPathsCache;
pub use proximity::ProximityGraph;
pub use typo::TypoGraph;
use std::ops::ControlFlow;
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::db_cache::DatabaseCache;
use super::logger::SearchLogger;
use super::{QueryGraph, QueryNode};
use crate::{Index, Result};
use std::ops::ControlFlow;
pub use typo::TypoGraph;
#[derive(Debug, Clone)]
pub enum EdgeDetails<E> {
@@ -42,6 +37,48 @@ pub struct EdgePointer<'graph, E> {
pub edge: &'graph Edge<E>,
}
// pub struct SubWordDerivations {
// words: FxHashSet<Interned<String>>,
// synonyms: FxHashSet<Interned<Phrase>>, // NO! they're phrases, not strings
// split_words: bool,
// use_prefix_db: bool,
// }
// pub struct EdgeWordDerivations {
// // TODO: not Option, instead: Any | All | Subset(SubWordDerivations)
// from_words: Option<SubWordDerivations>, // ???
// to_words: Option<SubWordDerivations>, // + use prefix db?
// }
// fn aggregate_edge_word_derivations(
// graph: (),
// edges: Vec<usize>,
// ) -> BTreeMap<usize, SubWordDerivations> {
// todo!()
// }
// fn reduce_word_term_to_sub_word_derivations(
// term: &mut WordDerivations,
// derivations: &SubWordDerivations,
// ) {
// let mut new_one_typo = vec![];
// for w in term.one_typo {
// if derivations.words.contains(w) {
// new_one_typo.push(w);
// }
// }
// if term.use_prefix_db && !derivations.use_prefix_db {
// term.use_prefix_db = false;
// }
// // etc.
// }
// fn word_derivations_used_by_edge<G: RankingRuleGraphTrait>(
// edge: G::EdgeDetails,
// ) -> SubWordDerivations {
// todo!()
// }
pub trait RankingRuleGraphTrait: Sized {
/// The details of an edge connecting two query nodes. These details
/// should be sufficient to compute the edge's cost and associated document ids
@@ -55,10 +92,8 @@ pub trait RankingRuleGraphTrait: Sized {
fn graphviz_edge_details_label(edge: &Self::EdgeDetails) -> String;
/// Compute the document ids associated with the given edge.
fn compute_docids<'transaction>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
fn compute_docids<'search>(
ctx: &mut SearchContext<'search>,
edge_details: &Self::EdgeDetails,
) -> Result<RoaringBitmap>;
@@ -66,19 +101,15 @@ pub trait RankingRuleGraphTrait: Sized {
///
/// This call is followed by zero, one or more calls to [`build_visit_to_node`](RankingRuleGraphTrait::build_visit_to_node),
/// which builds the actual edges.
fn build_visit_from_node<'transaction>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
fn build_visit_from_node<'search>(
ctx: &mut SearchContext<'search>,
from_node: &QueryNode,
) -> Result<Option<Self::BuildVisitedFromNode>>;
/// Return the cost and details of the edges going from the previously visited node
/// (with [`build_visit_from_node`](RankingRuleGraphTrait::build_visit_from_node)) to `to_node`.
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
fn build_visit_to_node<'from_data, 'search: 'from_data>(
ctx: &mut SearchContext<'search>,
to_node: &QueryNode,
from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>>;

View File

@@ -1,30 +1,30 @@
use std::collections::BTreeMap;
use heed::RoTxn;
use itertools::Itertools;
use super::ProximityEdge;
use crate::new::db_cache::DatabaseCache;
use crate::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
use crate::new::ranking_rule_graph::proximity::WordPair;
use crate::new::ranking_rule_graph::EdgeDetails;
use crate::new::QueryNode;
use crate::{Index, Result};
use crate::new::{QueryNode, SearchContext};
use crate::Result;
use itertools::Itertools;
use std::collections::BTreeMap;
pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations, i8)>> {
pub fn visit_from_node(
ctx: &mut SearchContext,
from_node: &QueryNode,
) -> Result<Option<(WordDerivations, i8)>> {
Ok(Some(match from_node {
QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => match value1 {
QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()),
QueryTerm::Phrase { phrase: phrase1 } => {
if let Some(original) = phrase1.words.last().unwrap().as_ref() {
let phrase1 = ctx.phrase_interner.get(*phrase1);
if let Some(original) = *phrase1.words.last().unwrap() {
(
WordDerivations {
original: original.clone(),
zero_typo: vec![original.to_owned()],
one_typo: vec![],
two_typos: vec![],
original,
zero_typo: Box::new([original]),
one_typo: Box::new([]),
two_typos: Box::new([]),
use_prefix_db: false,
synonyms: vec![],
synonyms: Box::new([]),
split_words: None,
},
*pos1.end(),
@@ -37,12 +37,12 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
},
QueryNode::Start => (
WordDerivations {
original: String::new(),
zero_typo: vec![],
one_typo: vec![],
two_typos: vec![],
original: ctx.word_interner.insert(String::new()),
zero_typo: Box::new([]),
one_typo: Box::new([]),
two_typos: Box::new([]),
use_prefix_db: false,
synonyms: vec![],
synonyms: Box::new([]),
split_words: None,
},
-100,
@@ -51,10 +51,8 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
}))
}
pub fn visit_to_node<'transaction, 'from_data>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
pub fn visit_to_node<'search, 'from_data>(
ctx: &mut SearchContext<'search>,
to_node: &QueryNode,
from_node_data: &'from_data (WordDerivations, i8),
) -> Result<Vec<(u8, EdgeDetails<ProximityEdge>)>> {
@@ -69,15 +67,16 @@ pub fn visit_to_node<'transaction, 'from_data>(
let (derivations2, pos2, ngram_len2) = match value2 {
QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()),
QueryTerm::Phrase { phrase: phrase2 } => {
if let Some(original) = phrase2.words.first().unwrap().as_ref() {
let phrase2 = ctx.phrase_interner.get(*phrase2);
if let Some(original) = *phrase2.words.first().unwrap() {
(
WordDerivations {
original: original.clone(),
zero_typo: vec![original.to_owned()],
one_typo: vec![],
two_typos: vec![],
original,
zero_typo: Box::new([original]),
one_typo: Box::new([]),
two_typos: Box::new([]),
use_prefix_db: false,
synonyms: vec![],
synonyms: Box::new([]),
split_words: None,
},
*pos2.start(),
@@ -106,19 +105,16 @@ pub fn visit_to_node<'transaction, 'from_data>(
let derivations1 = derivations1.all_derivations_except_prefix_db();
// TODO: eventually, we want to get rid of the uses of `original`
let original_word_2 = derivations2.original.clone();
let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();
if updb2 {
for word1 in derivations1.clone() {
for proximity in 1..=(8 - ngram_len2) {
let cost = (proximity + ngram_len2 - 1) as u8;
if db_cache
if ctx
.get_word_prefix_pair_proximity_docids(
index,
txn,
word1,
original_word_2.as_str(),
derivations2.original,
proximity as u8,
)?
.is_some()
@@ -129,16 +125,14 @@ pub fn visit_to_node<'transaction, 'from_data>(
.entry(proximity as u8)
.or_default()
.push(WordPair::WordPrefix {
left: word1.to_owned(),
right_prefix: original_word_2.to_owned(),
left: word1,
right_prefix: derivations2.original,
});
}
if db_cache
if ctx
.get_prefix_word_pair_proximity_docids(
index,
txn,
original_word_2.as_str(),
word1.as_str(),
derivations2.original,
word1,
proximity as u8 - 1,
)?
.is_some()
@@ -149,8 +143,8 @@ pub fn visit_to_node<'transaction, 'from_data>(
.entry(proximity as u8)
.or_default()
.push(WordPair::WordPrefixSwapped {
left_prefix: original_word_2.to_owned(),
right: word1.to_owned(),
left_prefix: derivations2.original,
right: word1,
});
}
}
@@ -164,28 +158,23 @@ pub fn visit_to_node<'transaction, 'from_data>(
for (word1, word2) in product_derivations {
for proximity in 1..=(8 - ngram_len2) {
let cost = (proximity + ngram_len2 - 1) as u8;
if db_cache
.get_word_pair_proximity_docids(index, txn, word1, word2, proximity as u8)?
.is_some()
{
if ctx.get_word_pair_proximity_docids(word1, word2, proximity as u8)?.is_some() {
cost_proximity_word_pairs
.entry(cost)
.or_default()
.entry(proximity as u8)
.or_default()
.push(WordPair::Words { left: word1.to_owned(), right: word2.to_owned() });
.push(WordPair::Words { left: word1, right: word2 });
}
if proximity > 1
&& db_cache
.get_word_pair_proximity_docids(index, txn, word2, word1, proximity as u8 - 1)?
.is_some()
&& ctx.get_word_pair_proximity_docids(word2, word1, proximity as u8 - 1)?.is_some()
{
cost_proximity_word_pairs
.entry(cost)
.or_default()
.entry(proximity as u8 - 1)
.or_default()
.push(WordPair::Words { left: word2.to_owned(), right: word1.to_owned() });
.push(WordPair::Words { left: word2, right: word1 });
}
}
}

View File

@@ -1,14 +1,10 @@
use heed::RoTxn;
use super::{ProximityEdge, WordPair};
use crate::new::SearchContext;
use crate::{CboRoaringBitmapCodec, Result};
use roaring::{MultiOps, RoaringBitmap};
use super::{ProximityEdge, WordPair};
use crate::new::db_cache::DatabaseCache;
use crate::{CboRoaringBitmapCodec, Result};
pub fn compute_docids<'transaction>(
index: &crate::Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
pub fn compute_docids<'search>(
ctx: &mut SearchContext<'search>,
edge: &ProximityEdge,
) -> Result<RoaringBitmap> {
let ProximityEdge { pairs, proximity } = edge;
@@ -16,12 +12,14 @@ pub fn compute_docids<'transaction>(
for pair in pairs.iter() {
let bytes = match pair {
WordPair::Words { left, right } => {
db_cache.get_word_pair_proximity_docids(index, txn, left, right, *proximity)
ctx.get_word_pair_proximity_docids(*left, *right, *proximity)
}
WordPair::WordPrefix { left, right_prefix } => {
ctx.get_word_prefix_pair_proximity_docids(*left, *right_prefix, *proximity)
}
WordPair::WordPrefixSwapped { left_prefix, right } => {
ctx.get_prefix_word_pair_proximity_docids(*left_prefix, *right, *proximity)
}
WordPair::WordPrefix { left, right_prefix } => db_cache
.get_word_prefix_pair_proximity_docids(index, txn, left, right_prefix, *proximity),
WordPair::WordPrefixSwapped { left_prefix, right } => db_cache
.get_prefix_word_pair_proximity_docids(index, txn, left_prefix, right, *proximity),
}?;
let bitmap =
bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default();

View File

@@ -1,25 +1,22 @@
pub mod build;
pub mod compute_docids;
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::empty_paths_cache::EmptyPathsCache;
use super::{EdgeDetails, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
use crate::new::interner::Interned;
use crate::new::logger::SearchLogger;
use crate::new::query_term::WordDerivations;
use crate::new::{QueryGraph, QueryNode};
use crate::{Index, Result};
use crate::new::{QueryGraph, QueryNode, SearchContext};
use crate::Result;
use roaring::RoaringBitmap;
// TODO: intern the strings, refer to them by their pointer?
#[derive(Debug, Clone)]
#[derive(Clone)]
pub enum WordPair {
Words { left: String, right: String },
WordPrefix { left: String, right_prefix: String },
WordPrefixSwapped { left_prefix: String, right: String },
Words { left: Interned<String>, right: Interned<String> },
WordPrefix { left: Interned<String>, right_prefix: Interned<String> },
WordPrefixSwapped { left_prefix: Interned<String>, right: Interned<String> },
}
#[derive(Clone)]
@@ -40,32 +37,26 @@ impl RankingRuleGraphTrait for ProximityGraph {
format!(", prox {proximity}, {} pairs", pairs.len())
}
fn compute_docids<'db_cache, 'transaction>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
fn compute_docids<'search>(
ctx: &mut SearchContext<'search>,
edge: &Self::EdgeDetails,
) -> Result<roaring::RoaringBitmap> {
compute_docids::compute_docids(index, txn, db_cache, edge)
compute_docids::compute_docids(ctx, edge)
}
fn build_visit_from_node<'transaction>(
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
fn build_visit_from_node<'search>(
ctx: &mut SearchContext<'search>,
from_node: &QueryNode,
) -> Result<Option<Self::BuildVisitedFromNode>> {
build::visit_from_node(from_node)
build::visit_from_node(ctx, from_node)
}
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
fn build_visit_to_node<'from_data, 'search: 'from_data>(
ctx: &mut SearchContext<'search>,
to_node: &QueryNode,
from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
build::visit_to_node(index, txn, db_cache, to_node, from_node_data)
build::visit_to_node(ctx, to_node, from_node_data)
}
fn log_state(

View File

@@ -1,23 +1,18 @@
#![allow(clippy::too_many_arguments)]
use heed::RoTxn;
use roaring::{MultiOps, RoaringBitmap};
use super::edge_docids_cache::EdgeDocidsCache;
use super::empty_paths_cache::EmptyPathsCache;
use super::{RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
use crate::new::BitmapOrAllRef;
use crate::{Index, Result};
use crate::new::{BitmapOrAllRef, SearchContext};
use crate::Result;
use roaring::{MultiOps, RoaringBitmap};
impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
pub fn resolve_paths<'transaction>(
// TODO: reduce the universe after computing each path
// TODO: deserialize roaring bitmap within a universe
pub fn resolve_paths<'search>(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
ctx: &mut SearchContext<'search>,
edge_docids_cache: &mut EdgeDocidsCache<G>,
empty_paths_cache: &mut EmptyPathsCache,
universe: &RoaringBitmap,
@@ -52,8 +47,8 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
let mut cached_edge_docids = vec![];
'edge_loop: for edge_index in edge_indexes {
visited_edges.push(edge_index);
let edge_docids = edge_docids_cache
.get_edge_docids(index, txn, db_cache, edge_index, self, universe)?;
let edge_docids =
edge_docids_cache.get_edge_docids(ctx, edge_index, self, universe)?;
match edge_docids {
BitmapOrAllRef::Bitmap(edge_docids) => {
cached_edge_docids.push((edge_index, edge_docids.clone()));

View File

@@ -1,19 +1,17 @@
use heed::{BytesDecode, RoTxn};
use roaring::RoaringBitmap;
use super::empty_paths_cache::EmptyPathsCache;
use super::{EdgeDetails, RankingRuleGraph, RankingRuleGraphTrait};
use crate::new::db_cache::DatabaseCache;
use crate::new::interner::Interned;
use crate::new::logger::SearchLogger;
use crate::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
use crate::new::resolve_query_graph::resolve_phrase;
use crate::new::{QueryGraph, QueryNode};
use crate::{Index, Result, RoaringBitmapCodec};
use crate::new::{QueryGraph, QueryNode, SearchContext};
use crate::{Result, RoaringBitmapCodec};
use heed::BytesDecode;
use roaring::RoaringBitmap;
#[derive(Clone)]
pub enum TypoEdge {
Phrase { phrase: Phrase },
Phrase { phrase: Interned<Phrase> },
Word { derivations: WordDerivations, nbr_typos: u8 },
}
@@ -30,14 +28,12 @@ impl RankingRuleGraphTrait for TypoGraph {
}
}
fn compute_docids<'db_cache, 'transaction>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
fn compute_docids<'db_cache, 'search>(
ctx: &mut SearchContext<'search>,
edge: &Self::EdgeDetails,
) -> Result<RoaringBitmap> {
match edge {
TypoEdge::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase),
TypoEdge::Phrase { phrase } => resolve_phrase(ctx, *phrase),
TypoEdge::Word { derivations, nbr_typos } => {
let words = match nbr_typos {
0 => &derivations.zero_typo,
@@ -46,16 +42,14 @@ impl RankingRuleGraphTrait for TypoGraph {
_ => panic!(),
};
let mut docids = RoaringBitmap::new();
for word in words.iter() {
let Some(bytes) = db_cache.get_word_docids(index, txn, word)? else { continue };
for word in words.iter().copied() {
let Some(bytes) = ctx.get_word_docids(word)? else { continue };
let bitmap =
RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
docids |= bitmap;
}
if *nbr_typos == 0 {
if let Some(bytes) =
db_cache.get_prefix_docids(index, txn, &derivations.original)?
{
if let Some(bytes) = ctx.get_prefix_docids(derivations.original)? {
let bitmap =
RoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding)?;
docids |= bitmap;
@@ -66,26 +60,22 @@ impl RankingRuleGraphTrait for TypoGraph {
}
}
fn build_visit_from_node<'transaction>(
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
fn build_visit_from_node<'search>(
_ctx: &mut SearchContext<'search>,
_from_node: &QueryNode,
) -> Result<Option<Self::BuildVisitedFromNode>> {
Ok(Some(()))
}
fn build_visit_to_node<'from_data, 'transaction: 'from_data>(
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
fn build_visit_to_node<'from_data, 'search: 'from_data>(
_ctx: &mut SearchContext<'search>,
to_node: &QueryNode,
_from_node_data: &'from_data Self::BuildVisitedFromNode,
) -> Result<Vec<(u8, EdgeDetails<Self::EdgeDetails>)>> {
match to_node {
QueryNode::Term(LocatedQueryTerm { value, .. }) => match value {
QueryTerm::Phrase { phrase } => {
Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase: phrase.clone() }))])
&QueryTerm::Phrase { phrase } => {
Ok(vec![(0, EdgeDetails::Data(TypoEdge::Phrase { phrase }))])
}
QueryTerm::Word { derivations } => {
let mut edges = vec![];

View File

@@ -1,33 +1,28 @@
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::db_cache::DatabaseCache;
use super::logger::SearchLogger;
use super::QueryGraph;
use super::SearchContext;
use crate::new::graph_based_ranking_rule::GraphBasedRankingRule;
use crate::new::ranking_rule_graph::ProximityGraph;
use crate::new::ranking_rule_graph::TypoGraph;
use crate::new::words::Words;
use roaring::RoaringBitmap;
// use crate::search::new::sort::Sort;
use crate::{Index, Result, TermsMatchingStrategy};
use crate::{Result, TermsMatchingStrategy};
pub trait RankingRuleOutputIter<'transaction, Query> {
pub trait RankingRuleOutputIter<'search, Query> {
fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>>;
}
pub struct RankingRuleOutputIterWrapper<'transaction, Query> {
iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
pub struct RankingRuleOutputIterWrapper<'search, Query> {
iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'search>,
}
impl<'transaction, Query> RankingRuleOutputIterWrapper<'transaction, Query> {
pub fn new(
iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'transaction>,
) -> Self {
impl<'search, Query> RankingRuleOutputIterWrapper<'search, Query> {
pub fn new(iter: Box<dyn Iterator<Item = Result<RankingRuleOutput<Query>>> + 'search>) -> Self {
Self { iter }
}
}
impl<'transaction, Query> RankingRuleOutputIter<'transaction, Query>
for RankingRuleOutputIterWrapper<'transaction, Query>
impl<'search, Query> RankingRuleOutputIter<'search, Query>
for RankingRuleOutputIterWrapper<'search, Query>
{
fn next_bucket(&mut self) -> Result<Option<RankingRuleOutput<Query>>> {
match self.iter.next() {
@@ -44,7 +39,7 @@ pub struct PlaceholderQuery;
impl RankingRuleQueryTrait for PlaceholderQuery {}
impl RankingRuleQueryTrait for QueryGraph {}
pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
pub trait RankingRule<'search, Query: RankingRuleQueryTrait> {
fn id(&self) -> String;
/// Prepare the ranking rule such that it can start iterating over its
@@ -53,9 +48,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
/// The given universe is the universe that will be given to [`next_bucket`](RankingRule::next_bucket).
fn start_iteration(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
ctx: &mut SearchContext<'search>,
logger: &mut dyn SearchLogger<Query>,
universe: &RoaringBitmap,
query: &Query,
@@ -70,9 +63,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
/// - the universe given to [`start_iteration`](RankingRule::start_iteration)
fn next_bucket(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
ctx: &mut SearchContext<'search>,
logger: &mut dyn SearchLogger<Query>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<Query>>>;
@@ -81,9 +72,7 @@ pub trait RankingRule<'transaction, Query: RankingRuleQueryTrait> {
/// The next call to this ranking rule, if any, will be [`start_iteration`](RankingRule::start_iteration).
fn end_iteration(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
ctx: &mut SearchContext<'search>,
logger: &mut dyn SearchLogger<Query>,
);
}
@@ -98,11 +87,9 @@ pub struct RankingRuleOutput<Q> {
// TODO: can make it generic over the query type (either query graph or placeholder) fairly easily
#[allow(clippy::too_many_arguments)]
pub fn apply_ranking_rules<'transaction>(
index: &Index,
txn: &'transaction heed::RoTxn,
pub fn apply_ranking_rules<'search>(
ctx: &mut SearchContext<'search>,
// TODO: ranking rules parameter
db_cache: &mut DatabaseCache<'transaction>,
query_graph: &QueryGraph,
universe: &RoaringBitmap,
from: usize,
@@ -115,7 +102,7 @@ pub fn apply_ranking_rules<'transaction>(
let proximity = &mut GraphBasedRankingRule::<ProximityGraph>::new("proximity".to_owned());
let typo = &mut GraphBasedRankingRule::<TypoGraph>::new("typo".to_owned());
// TODO: ranking rules given as argument
let mut ranking_rules: Vec<&mut dyn RankingRule<'transaction, QueryGraph>> =
let mut ranking_rules: Vec<&mut dyn RankingRule<'search, QueryGraph>> =
vec![words, typo, proximity /*sort*/];
logger.ranking_rules(&ranking_rules);
@@ -126,7 +113,7 @@ pub fn apply_ranking_rules<'transaction>(
let ranking_rules_len = ranking_rules.len();
logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe);
ranking_rules[0].start_iteration(index, txn, db_cache, logger, universe, query_graph)?;
ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?;
let mut candidates = vec![RoaringBitmap::default(); ranking_rules_len];
candidates[0] = universe.clone();
@@ -142,7 +129,7 @@ pub fn apply_ranking_rules<'transaction>(
&candidates[cur_ranking_rule_index],
);
candidates[cur_ranking_rule_index].clear();
ranking_rules[cur_ranking_rule_index].end_iteration(index, txn, db_cache, logger);
ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
if cur_ranking_rule_index == 0 {
break;
} else {
@@ -206,7 +193,7 @@ pub fn apply_ranking_rules<'transaction>(
continue;
}
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(index, txn, db_cache, logger, &candidates[cur_ranking_rule_index])? else {
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &candidates[cur_ranking_rule_index])? else {
// TODO: add remaining candidates automatically here?
back!();
continue;
@@ -239,9 +226,7 @@ pub fn apply_ranking_rules<'transaction>(
&candidates[cur_ranking_rule_index],
);
ranking_rules[cur_ranking_rule_index].start_iteration(
index,
txn,
db_cache,
ctx,
logger,
&next_bucket.candidates,
&next_bucket.query,
@@ -255,9 +240,7 @@ pub fn apply_ranking_rules<'transaction>(
mod tests {
// use crate::allocator::ALLOC;
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use crate::index::tests::TempIndex;
use crate::new::db_cache::DatabaseCache;
use crate::new::execute_search;
use crate::new::{execute_search, SearchContext};
use big_s::S;
use heed::EnvOpenOptions;
use maplit::hashset;
@@ -269,55 +252,6 @@ mod tests {
use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};
// Smoke test for `execute_search` against a small temporary index.
// It contains no assertions: it only checks that indexing and searching
// complete without an error or panic, and prints the results for inspection.
#[test]
fn execute_new_search() {
let index = TempIndex::new();
// Six short documents built from variations of the same sentence.
index
.add_documents(documents!([
{
"id": 7,
"text": "the super quick super brown fox jumps over",
},
{
"id": 8,
"text": "the super quick brown fox jumps over",
},
{
"id": 9,
"text": "the quick super brown fox jumps over",
},
{
"id": 10,
"text": "the quick brown fox jumps over",
},
{
"id": 11,
"text": "the quick brown fox jumps over the lazy dog",
},
{
"id": 12,
"text": "the quick brown cat jumps over the lazy dog",
},
]))
.unwrap();
let txn = index.read_txn().unwrap();
// Fresh, empty database cache for this single search.
let mut db_cache = DatabaseCache::default();
// NOTE(review): the `None, 0, 50` arguments are presumably an optional filter,
// an offset and a result limit; confirm against `execute_search`'s signature.
let results = execute_search(
&index,
&txn,
&mut db_cache,
"releases from poison by the government",
None,
0,
50,
&mut DefaultSearchLogger,
)
.unwrap();
// Printed only; visible when the test harness is run with `--nocapture`.
println!("{results:?}")
}
#[test]
fn search_wiki_new() {
let mut options = EnvOpenOptions::new();
@@ -331,24 +265,20 @@ mod tests {
// loop {
let start = Instant::now();
let mut db_cache = DatabaseCache::default();
let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
// let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
let results = execute_search(
&index,
&txn,
&mut db_cache,
&mut SearchContext::new(&index, &txn),
"releases from poison by the government",
None,
0,
20,
// &mut DefaultSearchLogger,
&mut logger,
&mut DefaultSearchLogger,
// &mut logger,
)
.unwrap();
logger.write_d2_description();
// logger.write_d2_description();
let elapsed = start.elapsed();
@@ -425,19 +355,15 @@ mod tests {
let index = Index::new(options, "data_movies").unwrap();
let txn = index.read_txn().unwrap();
let primary_key = index.primary_key(&txn).unwrap().unwrap();
let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
// let primary_key = index.primary_key(&txn).unwrap().unwrap();
// let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
// loop {
let start = Instant::now();
let mut db_cache = DatabaseCache::default();
let mut logger = crate::new::logger::detailed::DetailedSearchLogger::new("log");
let mut ctx = SearchContext::new(&index, &txn);
let results = execute_search(
&index,
&txn,
&mut db_cache,
&mut ctx,
"releases from poison by the government",
None,
0,
@@ -447,24 +373,24 @@ mod tests {
)
.unwrap();
logger.write_d2_description();
logger.write_d2_description(&mut ctx);
let elapsed = start.elapsed();
let ids = index
.documents(&txn, results.iter().copied())
.unwrap()
.into_iter()
.map(|x| {
let obkv = &x.1;
let id = obkv.get(primary_key).unwrap();
let id: serde_json::Value = serde_json::from_slice(id).unwrap();
id.as_str().unwrap().to_owned()
})
.collect::<Vec<_>>();
// let ids = index
// .documents(&txn, results.iter().copied())
// .unwrap()
// .into_iter()
// .map(|x| {
// let obkv = &x.1;
// let id = obkv.get(primary_key).unwrap();
// let id: serde_json::Value = serde_json::from_slice(id).unwrap();
// id.as_str().unwrap().to_owned()
// })
// .collect::<Vec<_>>();
println!("{}us: {results:?}", elapsed.as_micros());
println!("external ids: {ids:?}");
// println!("external ids: {ids:?}");
// }
}

View File

@@ -1,34 +1,28 @@
use std::collections::VecDeque;
use fxhash::FxHashMap;
use heed::{BytesDecode, RoTxn};
use roaring::{MultiOps, RoaringBitmap};
use super::db_cache::DatabaseCache;
use super::interner::Interned;
use super::query_term::{Phrase, QueryTerm, WordDerivations};
use super::{QueryGraph, QueryNode};
use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
use super::{QueryGraph, QueryNode, SearchContext};
use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
use fxhash::FxHashMap;
use heed::BytesDecode;
use roaring::{MultiOps, RoaringBitmap};
use std::collections::VecDeque;
// TODO: manual performance metrics: access to DB, bitmap deserializations/operations, etc.
/// Cache of document-id bitmaps keyed by a `u32` query-graph node index.
///
/// The node-docids lookup in this file fills it lazily: the bitmap resolved
/// for a node's term is inserted under that node's index and reused on later
/// lookups of the same index.
#[derive(Default)]
pub struct NodeDocIdsCache {
/// Maps a node index to the bitmap of document ids resolved for that node.
pub cache: FxHashMap<u32, RoaringBitmap>,
}
impl NodeDocIdsCache {
fn get_docids<'cache, 'transaction>(
impl<'search> SearchContext<'search> {
fn get_node_docids<'cache>(
&'cache mut self,
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
term: &QueryTerm,
node_idx: u32,
) -> Result<&'cache RoaringBitmap> {
if self.cache.contains_key(&node_idx) {
return Ok(&self.cache[&node_idx]);
if self.node_docids_cache.cache.contains_key(&node_idx) {
return Ok(&self.node_docids_cache.cache[&node_idx]);
};
let docids = match term {
QueryTerm::Phrase { phrase } => resolve_phrase(index, txn, db_cache, phrase)?,
QueryTerm::Phrase { phrase } => resolve_phrase(self, *phrase)?,
QueryTerm::Word {
derivations:
WordDerivations {
@@ -42,15 +36,14 @@ impl NodeDocIdsCache {
},
} => {
let mut or_docids = vec![];
for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()) {
if let Some(word_docids) = db_cache.get_word_docids(index, txn, word)? {
for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()).copied()
{
if let Some(word_docids) = self.get_word_docids(word)? {
or_docids.push(word_docids);
}
}
if *use_prefix_db {
if let Some(prefix_docids) =
db_cache.get_prefix_docids(index, txn, original.as_str())?
{
if let Some(prefix_docids) = self.get_prefix_docids(*original)? {
or_docids.push(prefix_docids);
}
}
@@ -58,32 +51,25 @@ impl NodeDocIdsCache {
.into_iter()
.map(|slice| RoaringBitmapCodec::bytes_decode(slice).unwrap())
.collect::<Vec<_>>();
for synonym in synonyms {
for synonym in synonyms.iter().copied() {
// TODO: cache resolve_phrase?
docids.push(resolve_phrase(index, txn, db_cache, synonym)?);
docids.push(resolve_phrase(self, synonym)?);
}
if let Some((left, right)) = split_words {
if let Some(split_word_docids) =
db_cache.get_word_pair_proximity_docids(index, txn, left, right, 1)?
{
docids.push(CboRoaringBitmapCodec::deserialize_from(split_word_docids)?);
}
if let Some(split_words) = split_words {
docids.push(resolve_phrase(self, *split_words)?);
}
MultiOps::union(docids)
}
};
let _ = self.cache.insert(node_idx, docids);
let docids = &self.cache[&node_idx];
let _ = self.node_docids_cache.cache.insert(node_idx, docids);
let docids = &self.node_docids_cache.cache[&node_idx];
Ok(docids)
}
}
pub fn resolve_query_graph<'transaction>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
node_docids_cache: &mut NodeDocIdsCache,
pub fn resolve_query_graph<'search>(
ctx: &mut SearchContext<'search>,
q: &QueryGraph,
universe: &RoaringBitmap,
) -> Result<RoaringBitmap> {
@@ -111,8 +97,7 @@ pub fn resolve_query_graph<'transaction>(
let node_docids = match n {
QueryNode::Term(located_term) => {
let term = &located_term.value;
let derivations_docids =
node_docids_cache.get_docids(index, txn, db_cache, term, node)?;
let derivations_docids = ctx.get_node_docids(term, node)?;
predecessors_docids & derivations_docids
}
QueryNode::Deleted => {
@@ -143,13 +128,8 @@ pub fn resolve_query_graph<'transaction>(
panic!()
}
pub fn resolve_phrase<'transaction>(
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
phrase: &Phrase,
) -> Result<RoaringBitmap> {
let Phrase { words } = phrase;
pub fn resolve_phrase(ctx: &mut SearchContext, phrase: Interned<Phrase>) -> Result<RoaringBitmap> {
let Phrase { words } = ctx.phrase_interner.get(phrase).clone();
let mut candidates = RoaringBitmap::new();
let mut first_iter = true;
let winsize = words.len().min(3);
@@ -161,19 +141,19 @@ pub fn resolve_phrase<'transaction>(
for win in words.windows(winsize) {
// Get all the documents with the matching distance for each word pairs.
let mut bitmaps = Vec::with_capacity(winsize.pow(2));
for (offset, s1) in win
for (offset, &s1) in win
.iter()
.enumerate()
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
{
for (dist, s2) in win
for (dist, &s2) in win
.iter()
.skip(offset + 1)
.enumerate()
.filter_map(|(index, word)| word.as_ref().map(|word| (index, word)))
{
if dist == 0 {
match db_cache.get_word_pair_proximity_docids(index, txn, s1, s2, 1)? {
match ctx.get_word_pair_proximity_docids(s1, s2, 1)? {
Some(m) => bitmaps.push(CboRoaringBitmapCodec::deserialize_from(m)?),
// If there are no documents for this pair, there will be no
// results for the phrase query.
@@ -182,13 +162,9 @@ pub fn resolve_phrase<'transaction>(
} else {
let mut bitmap = RoaringBitmap::new();
for dist in 0..=dist {
if let Some(m) = db_cache.get_word_pair_proximity_docids(
index,
txn,
s1,
s2,
dist as u8 + 1,
)? {
if let Some(m) =
ctx.get_word_pair_proximity_docids(s1, s2, dist as u8 + 1)?
{
bitmap |= CboRoaringBitmapCodec::deserialize_from(m)?;
}
}

View File

@@ -1,11 +1,7 @@
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::db_cache::DatabaseCache;
use super::logger::SearchLogger;
use super::{
RankingRule, RankingRuleOutput, RankingRuleOutputIter, RankingRuleOutputIterWrapper,
RankingRuleQueryTrait,
RankingRuleQueryTrait, SearchContext,
};
use crate::{
// facet::FacetType,
@@ -15,18 +11,19 @@ use crate::{
Index,
Result,
};
use roaring::RoaringBitmap;
pub struct Sort<'transaction, Query> {
pub struct Sort<'search, Query> {
field_name: String,
field_id: Option<FieldId>,
is_ascending: bool,
original_query: Option<Query>,
iter: Option<RankingRuleOutputIterWrapper<'transaction, Query>>,
iter: Option<RankingRuleOutputIterWrapper<'search, Query>>,
}
impl<'transaction, Query> Sort<'transaction, Query> {
pub fn new(
impl<'search, Query> Sort<'search, Query> {
pub fn _new(
index: &Index,
rtxn: &'transaction heed::RoTxn,
rtxn: &'search heed::RoTxn,
field_name: String,
is_ascending: bool,
) -> Result<Self> {
@@ -37,18 +34,14 @@ impl<'transaction, Query> Sort<'transaction, Query> {
}
}
impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query>
for Sort<'transaction, Query>
{
impl<'search, Query: RankingRuleQueryTrait> RankingRule<'search, Query> for Sort<'search, Query> {
/// Returns the identifier of this sort ranking rule, formatted as
/// `<field_name>:asc` or `<field_name>:desc` depending on the sort direction.
fn id(&self) -> String {
    let Self { field_name, is_ascending, .. } = self;
    // Neither direction label carries padding; the descending label previously
    // had a stray trailing space, producing ids such as "price:desc ".
    let direction = if *is_ascending { "asc" } else { "desc" };
    format!("{field_name}:{direction}")
}
fn start_iteration(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
ctx: &mut SearchContext<'search>,
_logger: &mut dyn SearchLogger<Query>,
parent_candidates: &RoaringBitmap,
parent_query_graph: &Query,
@@ -59,8 +52,8 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
if self.is_ascending { ascending_facet_sort } else { descending_facet_sort };
let number_iter = make_iter(
txn,
index
ctx.txn,
ctx.index
.facet_id_f64_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
field_id,
@@ -68,8 +61,8 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
)?;
let string_iter = make_iter(
txn,
index
ctx.txn,
ctx.index
.facet_id_string_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
field_id,
@@ -91,9 +84,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
fn next_bucket(
&mut self,
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
_ctx: &mut SearchContext<'search>,
_logger: &mut dyn SearchLogger<Query>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<Query>>> {
@@ -110,9 +101,7 @@ impl<'transaction, Query: RankingRuleQueryTrait> RankingRule<'transaction, Query
fn end_iteration(
&mut self,
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
_ctx: &mut SearchContext<'search>,
_logger: &mut dyn SearchLogger<Query>,
) {
self.original_query = None;

View File

@@ -1,13 +1,9 @@
use std::collections::BTreeSet;
use heed::RoTxn;
use roaring::RoaringBitmap;
use super::db_cache::DatabaseCache;
use super::logger::SearchLogger;
use super::resolve_query_graph::{resolve_query_graph, NodeDocIdsCache};
use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput};
use crate::{Index, Result, TermsMatchingStrategy};
use super::resolve_query_graph::resolve_query_graph;
use super::{QueryGraph, QueryNode, RankingRule, RankingRuleOutput, SearchContext};
use crate::{Result, TermsMatchingStrategy};
use roaring::RoaringBitmap;
use std::collections::BTreeSet;
pub struct Words {
exhausted: bool,
@@ -15,7 +11,6 @@ pub struct Words {
iterating: bool,
positions_to_remove: Vec<i8>,
terms_matching_strategy: TermsMatchingStrategy,
node_docids_cache: NodeDocIdsCache,
}
impl Words {
pub fn new(terms_matching_strategy: TermsMatchingStrategy) -> Self {
@@ -25,20 +20,17 @@ impl Words {
iterating: false,
positions_to_remove: vec![],
terms_matching_strategy,
node_docids_cache: <_>::default(),
}
}
}
impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
impl<'search> RankingRule<'search, QueryGraph> for Words {
/// Identifier of this ranking rule: always the literal string `words`.
fn id(&self) -> String {
    String::from("words")
}
fn start_iteration(
&mut self,
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
_ctx: &mut SearchContext<'search>,
_logger: &mut dyn SearchLogger<QueryGraph>,
_parent_candidates: &RoaringBitmap,
parent_query_graph: &QueryGraph,
@@ -71,9 +63,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
fn next_bucket(
&mut self,
index: &Index,
txn: &'transaction RoTxn,
db_cache: &mut DatabaseCache<'transaction>,
ctx: &mut SearchContext<'search>,
logger: &mut dyn SearchLogger<QueryGraph>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
@@ -87,14 +77,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
logger.log_words_state(query_graph);
let this_bucket = resolve_query_graph(
index,
txn,
db_cache,
&mut self.node_docids_cache,
query_graph,
universe,
)?;
let this_bucket = resolve_query_graph(ctx, query_graph, universe)?;
let child_query_graph = query_graph.clone();
loop {
@@ -115,9 +98,7 @@ impl<'transaction> RankingRule<'transaction, QueryGraph> for Words {
fn end_iteration(
&mut self,
_index: &Index,
_txn: &'transaction RoTxn,
_db_cache: &mut DatabaseCache<'transaction>,
_ctx: &mut SearchContext<'search>,
_logger: &mut dyn SearchLogger<QueryGraph>,
) {
self.iterating = false;