Add the raw document IDs to the postings lists

This commit is contained in:
Clément Renault
2020-01-08 15:30:43 +01:00
parent 9420edadf4
commit 81c573ec92
7 changed files with 54 additions and 59 deletions

View File

@@ -30,6 +30,7 @@ use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult}; use crate::{store, Document, DocumentId, MResult};
use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult}; use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult};
use crate::query_tree::Context as QTContext; use crate::query_tree::Context as QTContext;
use crate::store::Postings;
pub fn bucket_sort<'c, FI>( pub fn bucket_sort<'c, FI>(
reader: &heed::RoTxn<MainT>, reader: &heed::RoTxn<MainT>,
@@ -569,12 +570,12 @@ fn fetch_matches<'txn, 'tag>(
number_of_words += 1; number_of_words += 1;
let before_postings_lists_fetching = Instant::now(); let before_postings_lists_fetching = Instant::now();
if let Some(postings_list) = pplc_store.prefix_postings_list(reader, prefix)? { if let Some(postings) = pplc_store.prefix_postings_list(reader, prefix)? {
debug!("Found cached postings list for {:?}", query); debug!("Found cached postings list for {:?}", query);
postings_lists_original_length += postings_list.len(); postings_lists_original_length += postings.matches.len();
let input = Rc::from(&prefix[..]); let input = Rc::from(&prefix[..]);
let postings_list = Rc::new(postings_list); let postings_list = Rc::new(postings.matches);
let postings_list_view = PostingsListView::original(input, postings_list); let postings_list_view = PostingsListView::original(input, postings_list);
let mut offset = 0; let mut offset = 0;
@@ -751,11 +752,11 @@ fn split_best_frequency<'a>(
let left_freq = postings_lists_store let left_freq = postings_lists_store
.postings_list(reader, left.as_ref())? .postings_list(reader, left.as_ref())?
.map_or(0, |i| i.len()); .map_or(0, |p| p.docids.len());
let right_freq = postings_lists_store let right_freq = postings_lists_store
.postings_list(reader, right.as_ref())? .postings_list(reader, right.as_ref())?
.map_or(0, |i| i.len()); .map_or(0, |p| p.docids.len());
let min_freq = cmp::min(left_freq, right_freq); let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {

View File

@@ -107,8 +107,14 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'
for (i, _) in chars { for (i, _) in chars {
let (left, right) = word.split_at(i); let (left, right) = word.split_at(i);
let left_freq = ctx.postings_lists.postings_list(reader, left.as_bytes())?.map(|pl| pl.len()).unwrap_or(0); let left_freq = ctx.postings_lists
let right_freq = ctx.postings_lists.postings_list(reader, right.as_bytes())?.map(|pl| pl.len()).unwrap_or(0); .postings_list(reader, left.as_bytes())?
.map(|p| p.docids.len())
.unwrap_or(0);
let right_freq = ctx.postings_lists
.postings_list(reader, right.as_bytes())?
.map(|p| p.docids.len())
.unwrap_or(0);
let min_freq = cmp::min(left_freq, right_freq); let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
@@ -208,12 +214,12 @@ pub fn create_query_tree(reader: &heed::RoTxn<MainT>, ctx: &Context, query: &str
} }
pub struct QueryResult<'o, 'txn> { pub struct QueryResult<'o, 'txn> {
pub docids: SetBuf<DocumentId>, pub docids: Cow<'txn, Set<DocumentId>>,
pub queries: HashMap<&'o Query, Cow<'txn, Set<DocIndex>>>, pub queries: HashMap<&'o Query, Cow<'txn, Set<DocIndex>>>,
} }
pub type Postings<'o, 'txn> = HashMap<&'o Query, Cow<'txn, Set<DocIndex>>>; pub type Postings<'o, 'txn> = HashMap<&'o Query, Cow<'txn, Set<DocIndex>>>;
pub type Cache<'o, 'c> = HashMap<&'o Operation, SetBuf<DocumentId>>; pub type Cache<'o, 'txn> = HashMap<&'o Operation, Cow<'txn, Set<DocumentId>>>;
pub fn traverse_query_tree<'o, 'txn>( pub fn traverse_query_tree<'o, 'txn>(
reader: &'txn heed::RoTxn<MainT>, reader: &'txn heed::RoTxn<MainT>,
@@ -228,7 +234,7 @@ pub fn traverse_query_tree<'o, 'txn>(
postings: &mut Postings<'o, 'txn>, postings: &mut Postings<'o, 'txn>,
depth: usize, depth: usize,
operations: &'o [Operation], operations: &'o [Operation],
) -> MResult<SetBuf<DocumentId>> ) -> MResult<Cow<'txn, Set<DocumentId>>>
{ {
println!("{:1$}AND", "", depth * 2); println!("{:1$}AND", "", depth * 2);
@@ -257,7 +263,7 @@ pub fn traverse_query_tree<'o, 'txn>(
println!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2); println!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
Ok(docids) Ok(Cow::Owned(docids))
} }
fn execute_or<'o, 'txn>( fn execute_or<'o, 'txn>(
@@ -267,7 +273,7 @@ pub fn traverse_query_tree<'o, 'txn>(
postings: &mut Postings<'o, 'txn>, postings: &mut Postings<'o, 'txn>,
depth: usize, depth: usize,
operations: &'o [Operation], operations: &'o [Operation],
) -> MResult<SetBuf<DocumentId>> ) -> MResult<Cow<'txn, Set<DocumentId>>>
{ {
println!("{:1$}OR", "", depth * 2); println!("{:1$}OR", "", depth * 2);
@@ -294,7 +300,7 @@ pub fn traverse_query_tree<'o, 'txn>(
println!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2); println!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
Ok(docids) Ok(Cow::Owned(docids))
} }
fn execute_query<'o, 'txn>( fn execute_query<'o, 'txn>(
@@ -303,7 +309,7 @@ pub fn traverse_query_tree<'o, 'txn>(
postings: &mut Postings<'o, 'txn>, postings: &mut Postings<'o, 'txn>,
depth: usize, depth: usize,
query: &'o Query, query: &'o Query,
) -> MResult<SetBuf<DocumentId>> ) -> MResult<Cow<'txn, Set<DocumentId>>>
{ {
let before = Instant::now(); let before = Instant::now();
@@ -313,14 +319,7 @@ pub fn traverse_query_tree<'o, 'txn>(
if *prefix && word.len() == 1 { if *prefix && word.len() == 1 {
let prefix = [word.as_bytes()[0], 0, 0, 0]; let prefix = [word.as_bytes()[0], 0, 0, 0];
let matches = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default(); let matches = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default();
matches.docids
let before = Instant::now();
let mut docids: Vec<_> = matches.into_iter().map(|m| m.document_id).collect();
docids.dedup();
let docids = SetBuf::new(docids).unwrap();
println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
docids
} else { } else {
let dfa = if *prefix { build_prefix_dfa(word) } else { build_dfa(word) }; let dfa = if *prefix { build_prefix_dfa(word) } else { build_dfa(word) };
@@ -333,8 +332,8 @@ pub fn traverse_query_tree<'o, 'txn>(
let mut docids = Vec::new(); let mut docids = Vec::new();
while let Some(input) = stream.next() { while let Some(input) = stream.next() {
if let Some(matches) = ctx.postings_lists.postings_list(reader, input)? { if let Some(postings) = ctx.postings_lists.postings_list(reader, input)? {
docids.extend(matches.iter().map(|d| d.document_id)) docids.extend_from_slice(&postings.docids);
} }
} }
@@ -342,7 +341,7 @@ pub fn traverse_query_tree<'o, 'txn>(
let docids = SetBuf::from_dirty(docids); let docids = SetBuf::from_dirty(docids);
println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2); println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
docids Cow::Owned(docids)
} }
}, },
QueryKind::Exact(word) => { QueryKind::Exact(word) => {
@@ -358,16 +357,12 @@ pub fn traverse_query_tree<'o, 'txn>(
let mut docids = Vec::new(); let mut docids = Vec::new();
while let Some(input) = stream.next() { while let Some(input) = stream.next() {
if let Some(matches) = ctx.postings_lists.postings_list(reader, input)? { if let Some(postings) = ctx.postings_lists.postings_list(reader, input)? {
docids.extend(matches.iter().map(|d| d.document_id)) docids.extend_from_slice(&postings.docids);
} }
} }
let before = Instant::now(); Cow::Owned(SetBuf::from_dirty(docids))
let docids = SetBuf::from_dirty(docids);
println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
docids
}, },
QueryKind::Phrase(words) => { QueryKind::Phrase(words) => {
// TODO support prefix and non-prefix exact DFA // TODO support prefix and non-prefix exact DFA
@@ -375,7 +370,7 @@ pub fn traverse_query_tree<'o, 'txn>(
let first = ctx.postings_lists.postings_list(reader, first.as_bytes())?.unwrap_or_default(); let first = ctx.postings_lists.postings_list(reader, first.as_bytes())?.unwrap_or_default();
let second = ctx.postings_lists.postings_list(reader, second.as_bytes())?.unwrap_or_default(); let second = ctx.postings_lists.postings_list(reader, second.as_bytes())?.unwrap_or_default();
let iter = merge_join_by(first.as_slice(), second.as_slice(), |a, b| { let iter = merge_join_by(first.matches.as_slice(), second.matches.as_slice(), |a, b| {
let x = (a.document_id, a.attribute, (a.word_index as u32) + 1); let x = (a.document_id, a.attribute, (a.word_index as u32) + 1);
let y = (b.document_id, b.attribute, b.word_index as u32); let y = (b.document_id, b.attribute, b.word_index as u32);
x.cmp(&y) x.cmp(&y)
@@ -394,10 +389,10 @@ pub fn traverse_query_tree<'o, 'txn>(
println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2); println!("{:2$}docids construction took {:.02?}", "", before.elapsed(), depth * 2);
println!("{:2$}matches {:?}", "", matches, depth * 2); println!("{:2$}matches {:?}", "", matches, depth * 2);
docids Cow::Owned(docids)
} else { } else {
println!("{:2$}{:?} skipped", "", words, depth * 2); println!("{:2$}{:?} skipped", "", words, depth * 2);
SetBuf::default() Cow::default()
} }
}, },
}; };

View File

@@ -59,13 +59,13 @@ impl DocumentAttrKey {
} }
} }
#[derive(Debug)] #[derive(Default, Debug)]
pub struct Postings<'a> { pub struct Postings<'a> {
pub docids: Cow<'a, Set<DocumentId>>, pub docids: Cow<'a, Set<DocumentId>>,
pub matches: Cow<'a, Set<DocIndex>>, pub matches: Cow<'a, Set<DocIndex>>,
} }
struct PostingsCodec; pub struct PostingsCodec;
impl<'a> BytesEncode<'a> for PostingsCodec { impl<'a> BytesEncode<'a> for PostingsCodec {
type EItem = Postings<'a>; type EItem = Postings<'a>;
@@ -125,7 +125,6 @@ impl<'a> BytesDecode<'a> for PostingsCodec {
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let u64_size = mem::size_of::<u64>(); let u64_size = mem::size_of::<u64>();
let docid_size = mem::size_of::<DocumentId>(); let docid_size = mem::size_of::<DocumentId>();
let docindex_size = mem::size_of::<DocIndex>();
let (len_bytes, bytes) = bytes.split_at(u64_size); let (len_bytes, bytes) = bytes.split_at(u64_size);
let docids_len = len_bytes.try_into().ok().map(u64::from_be_bytes)? as usize; let docids_len = len_bytes.try_into().ok().map(u64::from_be_bytes)? as usize;

View File

@@ -1,14 +1,12 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::convert::TryInto;
use std::{mem, ptr};
use heed::Result as ZResult; use heed::Result as ZResult;
use heed::types::{ByteSlice, CowSlice}; use heed::types::ByteSlice;
use sdset::{Set, SetBuf}; use sdset::{Set, SetBuf};
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::database::MainT; use crate::database::MainT;
use crate::{DocIndex, DocumentId}; use crate::DocIndex;
use crate::store::{Postings, PostingsCodec}; use crate::store::{Postings, PostingsCodec};
#[derive(Copy, Clone)] #[derive(Copy, Clone)]

View File

@@ -1,15 +1,17 @@
use std::borrow::Cow; use std::borrow::Cow;
use heed::Result as ZResult; use heed::Result as ZResult;
use heed::types::{OwnedType, CowSlice}; use heed::types::OwnedType;
use sdset::{Set, SetBuf}; use sdset::{Set, SetBuf};
use slice_group_by::GroupBy;
use crate::DocIndex;
use crate::database::MainT; use crate::database::MainT;
use crate::DocIndex;
use crate::store::{PostingsCodec, Postings};
#[derive(Copy, Clone)] #[derive(Copy, Clone)]
pub struct PrefixPostingsListsCache { pub struct PrefixPostingsListsCache {
pub(crate) prefix_postings_lists_cache: heed::Database<OwnedType<[u8; 4]>, CowSlice<DocIndex>>, pub(crate) prefix_postings_lists_cache: heed::Database<OwnedType<[u8; 4]>, PostingsCodec>,
} }
impl PrefixPostingsListsCache { impl PrefixPostingsListsCache {
@@ -17,10 +19,15 @@ impl PrefixPostingsListsCache {
self, self,
writer: &mut heed::RwTxn<MainT>, writer: &mut heed::RwTxn<MainT>,
prefix: [u8; 4], prefix: [u8; 4],
postings_list: &Set<DocIndex>, matches: &Set<DocIndex>,
) -> ZResult<()> ) -> ZResult<()>
{ {
self.prefix_postings_lists_cache.put(writer, &prefix, postings_list) let docids = matches.linear_group_by_key(|m| m.document_id).map(|g| g[0].document_id).collect();
let docids = Cow::Owned(SetBuf::new_unchecked(docids));
let matches = Cow::Borrowed(matches);
let postings = Postings { docids, matches };
self.prefix_postings_lists_cache.put(writer, &prefix, &postings)
} }
pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> { pub fn clear(self, writer: &mut heed::RwTxn<MainT>) -> ZResult<()> {
@@ -31,12 +38,8 @@ impl PrefixPostingsListsCache {
self, self,
reader: &'txn heed::RoTxn<MainT>, reader: &'txn heed::RoTxn<MainT>,
prefix: [u8; 4], prefix: [u8; 4],
) -> ZResult<Option<Cow<'txn, Set<DocIndex>>>> ) -> ZResult<Option<Postings<'txn>>>
{ {
match self.prefix_postings_lists_cache.get(reader, &prefix)? { self.prefix_postings_lists_cache.get(reader, &prefix)
Some(Cow::Owned(vec)) => Ok(Some(Cow::Owned(SetBuf::new_unchecked(vec)))),
Some(Cow::Borrowed(slice)) => Ok(Some(Cow::Borrowed(Set::new_unchecked(slice)))),
None => Ok(None),
}
} }
} }

View File

@@ -1,8 +1,7 @@
use std::collections::HashMap; use std::collections::HashMap;
use std::borrow::Cow;
use fst::{set::OpBuilder, SetBuilder, IntoStreamer, Streamer}; use fst::{set::OpBuilder, SetBuilder, IntoStreamer, Streamer};
use sdset::{duo::Union, SetOperation, Set, SetBuf}; use sdset::{duo::Union, SetOperation, Set};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use log::debug; use log::debug;
@@ -201,7 +200,7 @@ pub fn apply_documents_addition<'a, 'b>(
// compute prefixes and store those in the PrefixPostingsListsCache. // compute prefixes and store those in the PrefixPostingsListsCache.
let mut stream = words_fst.into_stream(); let mut stream = words_fst.into_stream();
while let Some(input) = stream.next() { while let Some(input) = stream.next() {
if let Some(postings_list) = postings_lists_store.postings_list(writer, input)?.map(Cow::into_owned) { if let Some(postings_list) = postings_lists_store.postings_list(writer, input)?.map(|p| p.matches.into_owned()) {
let prefix = &input[..1]; let prefix = &input[..1];
let mut arr = [0; 4]; let mut arr = [0; 4];
@@ -453,7 +452,7 @@ pub fn write_documents_addition_index(
delta_words_builder.insert(&word).unwrap(); delta_words_builder.insert(&word).unwrap();
let set = match postings_lists_store.postings_list(writer, &word)? { let set = match postings_lists_store.postings_list(writer, &word)? {
Some(set) => Union::new(&set, &delta_set).into_set_buf(), Some(postings) => Union::new(&postings.matches, &delta_set).into_set_buf(),
None => delta_set, None => delta_set,
}; };

View File

@@ -142,8 +142,8 @@ pub fn apply_documents_deletion(
for (word, document_ids) in words_document_ids { for (word, document_ids) in words_document_ids {
let document_ids = SetBuf::from_dirty(document_ids); let document_ids = SetBuf::from_dirty(document_ids);
if let Some(doc_indexes) = postings_lists_store.postings_list(writer, &word)? { if let Some(postings) = postings_lists_store.postings_list(writer, &word)? {
let op = DifferenceByKey::new(&doc_indexes, &document_ids, |d| d.document_id, |id| *id); let op = DifferenceByKey::new(&postings.matches, &document_ids, |d| d.document_id, |id| *id);
let doc_indexes = op.into_set_buf(); let doc_indexes = op.into_set_buf();
if !doc_indexes.is_empty() { if !doc_indexes.is_empty() {