introduce exact_word_docids db

This commit is contained in:
ad hoc
2022-03-24 15:22:57 +01:00
parent 5f9f82757d
commit 0a77be4ec0
10 changed files with 133 additions and 47 deletions

View File

@ -10,17 +10,21 @@ use super::helpers::{
};
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::index_documents::MergeFn;
use crate::Result;
/// Extracts the word and the documents ids where this word appears.
///
/// Returns a grenad reader with the list of extracted words and
/// documents ids from the given chunk of docid word positions.
///
/// The first returned reader is the one for normal word_docids, and the second one is for
/// exact_word_docids.
#[logging_timer::time]
pub fn extract_word_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
let max_memory = indexer.max_memory_by_thread();
let mut word_docids_sorter = create_sorter(
@ -43,5 +47,9 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
word_docids_sorter.insert(word_bytes, &value_buffer)?;
}
sorter_into_reader(word_docids_sorter, indexer)
let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn);
Ok((
sorter_into_reader(word_docids_sorter, indexer)?,
sorter_into_reader(empty_sorter, indexer)?,
))
}

View File

@ -86,13 +86,16 @@ pub(crate) fn data_from_obkv_documents(
"field-id-wordcount-docids",
);
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
docid_word_positions_chunks.clone(),
indexer.clone(),
lmdb_writer_sx.clone(),
extract_word_docids,
merge_roaring_bitmaps,
TypedChunk::WordDocids,
|(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
word_docids_reader,
exact_word_docids_reader,
},
"word-docids",
);

View File

@ -277,3 +277,8 @@ pub fn sorter_into_lmdb_database(
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
Ok(())
}
/// Merge function for the cases where readers must be merged but only the keys matter.
///
/// Both the key and the conflicting values are ignored; the merged value is always an
/// empty, owned byte buffer.
pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    // The values are deliberately discarded: callers only care about the merged keys.
    let discarded: Vec<u8> = Vec::new();
    Ok(Cow::Owned(discarded))
}

View File

@ -8,7 +8,7 @@ use std::convert::{TryFrom, TryInto};
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
use fst::{IntoStreamer, Streamer};
pub use grenad_helpers::{
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing,
sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
GrenadParameters, MergeableReader,
};

View File

@ -20,7 +20,7 @@ pub use self::helpers::{
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn,
};
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters};
pub use self::transform::{Transform, TransformOutput};
use crate::documents::DocumentBatchReader;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
@ -282,6 +282,7 @@ where
let mut word_pair_proximity_docids = None;
let mut word_position_docids = None;
let mut word_docids = None;
let mut _exact_word_docids = None;
let mut databases_seen = 0;
(self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
@ -291,10 +292,13 @@ where
for result in lmdb_writer_rx {
let typed_chunk = match result? {
TypedChunk::WordDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
word_docids = Some(cloneable_chunk);
TypedChunk::WordDocids(chunk)
let cloneable_chunk =
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
_exact_word_docids = Some(cloneable_chunk);
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
}
TypedChunk::WordPairProximityDocids(chunk) => {
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
@ -425,6 +429,10 @@ where
});
if let Some(word_docids) = word_docids {
let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn);
word_docids_builder.push(word_docids.into_cursor()?);
// TODO: push exact_word_docids
let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?;
// Run the word prefix docids update operation.
let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
@ -432,7 +440,7 @@ where
builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
builder.max_memory = self.indexer_config.max_memory;
builder.execute(
word_docids,
word_docids_iter,
&new_prefix_fst_words,
&common_prefix_fst_words,
&del_prefix_fst_words,

View File

@ -3,14 +3,16 @@ use std::convert::TryInto;
use std::fs::File;
use std::io;
use grenad::MergerBuilder;
use heed::types::ByteSlice;
use heed::{BytesDecode, RwTxn};
use roaring::RoaringBitmap;
use super::helpers::{
self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
CursorClonableMmap,
};
use super::{ClonableMmap, MergeFn};
use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
use crate::update::index_documents::helpers::as_cloneable_grenad;
use crate::{
@ -25,7 +27,10 @@ pub(crate) enum TypedChunk {
Documents(grenad::Reader<CursorClonableMmap>),
FieldIdWordcountDocids(grenad::Reader<File>),
NewDocumentsIds(RoaringBitmap),
WordDocids(grenad::Reader<File>),
WordDocids {
word_docids_reader: grenad::Reader<File>,
exact_word_docids_reader: grenad::Reader<File>,
},
WordPositionDocids(grenad::Reader<File>),
WordPairProximityDocids(grenad::Reader<File>),
FieldIdFacetStringDocids(grenad::Reader<File>),
@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index(
TypedChunk::NewDocumentsIds(documents_ids) => {
return Ok((documents_ids, is_merged_database))
}
TypedChunk::WordDocids(word_docids_iter) => {
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?;
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
append_entries_into_database(
word_docids_iter.clone(),
&index.word_docids,
@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index(
merge_roaring_bitmaps,
)?;
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
append_entries_into_database(
exact_word_docids_iter.clone(),
&index.exact_word_docids,
wtxn,
index_is_empty,
|value, _buffer| Ok(value),
merge_roaring_bitmaps,
)?;
// create fst from word docids
let mut builder = fst::SetBuilder::memory();
let mut cursor = word_docids_iter.into_cursor()?;
while let Some((word, _value)) = cursor.move_on_next()? {
// This is a lexicographically ordered word position
// we use the key to construct the words fst.
builder.insert(word)?;
}
let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?;
let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
let db_fst = index.words_fst(wtxn)?;
// merge new fst with database fst
@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index(
Ok((RoaringBitmap::new(), is_merged_database))
}
/// Builds an in-memory FST set from the keys of the word docids reader and the exact word
/// docids reader.
///
/// Both readers are pushed into a single merger whose merge function is `merge_nothing`,
/// because only the keys are read here and the values are never used. Every key yielded by
/// the merged stream is inserted into the set builder, in the order the merger yields it.
///
/// Returns the resulting `fst::Set`, or the first error raised while opening a cursor,
/// iterating over the merged stream, or inserting a key into the builder.
fn merge_word_docids_reader_into_fst(
    word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
    exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
) -> Result<fst::Set<Vec<u8>>> {
    // Register the two sources: regular words first, then exact words.
    let mut sources = MergerBuilder::new(merge_nothing as MergeFn);
    let words_cursor = word_docids_iter.into_cursor()?;
    sources.push(words_cursor);
    let exact_words_cursor = exact_word_docids_iter.into_cursor()?;
    sources.push(exact_words_cursor);

    let merger = sources.build();
    let mut merged_entries = merger.into_stream_merger_iter()?;

    // Feed every merged key into the set builder; the associated values are ignored.
    let mut set_builder = fst::SetBuilder::memory();
    loop {
        match merged_entries.next()? {
            Some((word, _value)) => set_builder.insert(word)?,
            None => break,
        }
    }

    Ok(set_builder.into_set())
}
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
let new_value = RoaringBitmap::deserialize_from(new_value)?;
let db_value = RoaringBitmap::deserialize_from(db_value)?;