Add documentation

This commit is contained in:
Loïc Lecrenier
2023-03-08 13:26:29 +01:00
parent 4e266211bf
commit c232cdabf5
10 changed files with 358 additions and 548 deletions

View File

@ -1,15 +1,21 @@
use std::collections::hash_map::Entry;
use std::hash::Hash;
use fxhash::FxHashMap;
use heed::types::ByteSlice;
use heed::{BytesEncode, Database, RoTxn};
use super::interner::Interned;
use super::SearchContext;
use crate::Result;
/// A cache storing pointers to values in the LMDB databases.
///
/// Used for performance reasons only. By using this cache, we avoid performing a
/// database lookup and instead get a direct reference to the value using a fast
/// local HashMap lookup.
#[derive(Default)]
pub struct DatabaseCache<'search> {
// TODO: interner for all database cache keys?
pub word_pair_proximity_docids:
FxHashMap<(u8, Interned<String>, Interned<String>), Option<&'search [u8]>>,
pub word_prefix_pair_proximity_docids:
@ -21,36 +27,50 @@ pub struct DatabaseCache<'search> {
pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'search [u8]>>,
}
impl<'search> SearchContext<'search> {
pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
let bitmap_ptr = match self.db_cache.word_docids.entry(word) {
fn get_value<'v, K1, KC>(
txn: &'search RoTxn,
cache_key: K1,
db_key: &'v KC::EItem,
cache: &mut FxHashMap<K1, Option<&'search [u8]>>,
db: Database<KC, ByteSlice>,
) -> Result<Option<&'search [u8]>>
where
K1: Copy + Eq + Hash,
KC: BytesEncode<'v>,
{
let bitmap_ptr = match cache.entry(cache_key) {
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
Entry::Vacant(entry) => {
let bitmap_ptr = self
.index
.word_docids
.remap_data_type::<ByteSlice>()
.get(self.txn, self.word_interner.get(word))?;
let bitmap_ptr = db.get(txn, db_key)?;
entry.insert(bitmap_ptr);
bitmap_ptr
}
};
Ok(bitmap_ptr)
}
pub fn get_prefix_docids(&mut self, prefix: Interned<String>) -> Result<Option<&'search [u8]>> {
// In the future, this will be a frozen roaring bitmap
let bitmap_ptr = match self.db_cache.word_prefix_docids.entry(prefix) {
Entry::Occupied(bitmap_ptr) => *bitmap_ptr.get(),
Entry::Vacant(entry) => {
let bitmap_ptr = self
.index
.word_prefix_docids
.remap_data_type::<ByteSlice>()
.get(self.txn, self.word_interner.get(prefix))?;
entry.insert(bitmap_ptr);
bitmap_ptr
}
};
Ok(bitmap_ptr)
/// Retrieve or insert the given value in the `word_docids` database.
pub fn get_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'search [u8]>> {
Self::get_value(
self.txn,
word,
self.word_interner.get(word).as_str(),
&mut self.db_cache.word_docids,
self.index.word_docids.remap_data_type::<ByteSlice>(),
)
}
/// Retrieve or insert the given value in the `word_prefix_docids` database.
pub fn get_word_prefix_docids(
&mut self,
prefix: Interned<String>,
) -> Result<Option<&'search [u8]>> {
Self::get_value(
self.txn,
prefix,
self.word_interner.get(prefix).as_str(),
&mut self.db_cache.word_prefix_docids,
self.index.word_prefix_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn get_word_pair_proximity_docids(
@ -59,40 +79,17 @@ impl<'search> SearchContext<'search> {
word2: Interned<String>,
proximity: u8,
) -> Result<Option<&'search [u8]>> {
let key = (proximity, word1, word2);
match self.db_cache.word_pair_proximity_docids.entry(key) {
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
Entry::Vacant(entry) => {
// We shouldn't greedily access this DB at all
// a DB (w1, w2) -> [proximities] would be much better
// We could even have a DB that is (w1) -> set of words such that (w1, w2) are in proximity
// And if we worked with words encoded as integers, the set of words could be a roaring bitmap
// Then, to find all the proximities between two list of words, we'd do:
// inputs:
// - words1 (roaring bitmap)
// - words2 (roaring bitmap)
// output:
// - [(word1, word2, [proximities])]
// algo:
// let mut ouput = vec![];
// for word1 in words1 {
// let all_words_in_proximity_of_w1 = pair_words_db.get(word1);
// let words_in_proximity_of_w1 = all_words_in_proximity_of_w1 & words2;
// for word2 in words_in_proximity_of_w1 {
// let proximties = prox_db.get(word1, word2);
// output.push(word1, word2, proximities);
// }
// }
let bitmap_ptr =
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>().get(
self.txn,
&(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
)?;
entry.insert(bitmap_ptr);
Ok(bitmap_ptr)
}
}
Self::get_value(
self.txn,
(proximity, word1, word2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(word2).as_str(),
),
&mut self.db_cache.word_pair_proximity_docids,
self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn get_word_prefix_pair_proximity_docids(
@ -101,22 +98,17 @@ impl<'search> SearchContext<'search> {
prefix2: Interned<String>,
proximity: u8,
) -> Result<Option<&'search [u8]>> {
let key = (proximity, word1, prefix2);
match self.db_cache.word_prefix_pair_proximity_docids.entry(key) {
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
Entry::Vacant(entry) => {
let bitmap_ptr = self
.index
.word_prefix_pair_proximity_docids
.remap_data_type::<ByteSlice>()
.get(
self.txn,
&(key.0, self.word_interner.get(key.1), self.word_interner.get(key.2)),
)?;
entry.insert(bitmap_ptr);
Ok(bitmap_ptr)
}
}
Self::get_value(
self.txn,
(proximity, word1, prefix2),
&(
proximity,
self.word_interner.get(word1).as_str(),
self.word_interner.get(prefix2).as_str(),
),
&mut self.db_cache.word_prefix_pair_proximity_docids,
self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
)
}
pub fn get_prefix_word_pair_proximity_docids(
&mut self,
@ -124,25 +116,16 @@ impl<'search> SearchContext<'search> {
right: Interned<String>,
proximity: u8,
) -> Result<Option<&'search [u8]>> {
let key = (proximity, left_prefix, right);
match self.db_cache.prefix_word_pair_proximity_docids.entry(key) {
Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
Entry::Vacant(entry) => {
let bitmap_ptr = self
.index
.prefix_word_pair_proximity_docids
.remap_data_type::<ByteSlice>()
.get(
self.txn,
&(
proximity,
self.word_interner.get(left_prefix),
self.word_interner.get(right),
),
)?;
entry.insert(bitmap_ptr);
Ok(bitmap_ptr)
}
}
Self::get_value(
self.txn,
(proximity, left_prefix, right),
&(
proximity,
self.word_interner.get(left_prefix).as_str(),
self.word_interner.get(right).as_str(),
),
&mut self.db_cache.prefix_word_pair_proximity_docids,
self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
)
}
}