Plug new indexer

This commit is contained in:
many
2021-08-16 13:36:30 +02:00
parent 3aaf1d62f3
commit 1d314328f0
36 changed files with 1920 additions and 1826 deletions

View File

@@ -0,0 +1,22 @@
use std::sync::Arc;
use memmap::Mmap;
/// A memory map held behind an `Arc` so that it can be cloned.
///
/// `Clone` is derived, so cloning only clones the inner `Arc` handle;
/// every clone refers to the same underlying `Mmap`.
#[derive(Debug, Clone)]
pub struct ClonableMmap {
/// The shared memory map.
inner: Arc<Mmap>,
}
impl AsRef<[u8]> for ClonableMmap {
    /// Exposes the bytes of the shared memory map.
    fn as_ref(&self) -> &[u8] {
        // Indexing auto-derefs through the `Arc` and the `Mmap` down to the byte slice.
        &self.inner[..]
    }
}
impl From<Mmap> for ClonableMmap {
fn from(inner: Mmap) -> ClonableMmap {
ClonableMmap { inner: Arc::new(inner) }
}
}
/// An `io::Cursor` positioned over the bytes of a [`ClonableMmap`].
pub type CursorClonableMmap = std::io::Cursor<ClonableMmap>;

View File

@@ -0,0 +1,276 @@
use std::borrow::Cow;
use std::fs::File;
use std::io::{self, Seek, SeekFrom};
use std::time::Instant;
use byte_unit::Byte;
use grenad::{CompressionType, MergerIter, Reader, Sorter};
use heed::types::ByteSlice;
use log::debug;
use super::{ClonableMmap, MergeFn};
use crate::error::InternalError;
use crate::update::index_documents::WriteMethod;
use crate::Result;
/// An `io::Cursor` positioned over the bytes of a [`ClonableMmap`].
///
/// NOTE(review): the `clonable_mmap` module appears to declare an alias with the
/// same name and target; consider importing that one instead of redefining it here.
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
/// Builds a grenad writer that outputs into `file`.
///
/// The writer is configured with the `typ` compression type. The compression
/// level is only set on the builder when `level` is `Some`.
///
/// # Errors
///
/// Returns the `io::Error` reported by the builder's `build` call.
pub fn create_writer<R: io::Write>(
    typ: grenad::CompressionType,
    level: Option<u32>,
    file: R,
) -> io::Result<grenad::Writer<R>> {
    let mut writer_builder = grenad::Writer::builder();
    writer_builder.compression_type(typ);
    if let Some(compression_level) = level {
        writer_builder.compression_level(compression_level);
    }
    writer_builder.build(file)
}
/// Builds a grenad sorter that merges conflicting entries with `merge`.
///
/// - `chunk_compression_type` is always applied to the builder.
/// - `chunk_compression_level` and `max_nb_chunks` are applied only when `Some`.
/// - When `max_memory` is `Some`, it is used as the sorter's dump threshold and
///   reallocation is disabled on the builder.
pub fn create_sorter(
    merge: MergeFn,
    chunk_compression_type: grenad::CompressionType,
    chunk_compression_level: Option<u32>,
    max_nb_chunks: Option<usize>,
    max_memory: Option<usize>,
) -> grenad::Sorter<MergeFn> {
    let mut sorter_builder = grenad::Sorter::builder(merge);
    sorter_builder.chunk_compression_type(chunk_compression_type);

    if let Some(compression_level) = chunk_compression_level {
        sorter_builder.chunk_compression_level(compression_level);
    }

    if let Some(chunk_limit) = max_nb_chunks {
        sorter_builder.max_nb_chunks(chunk_limit);
    }

    if let Some(threshold) = max_memory {
        sorter_builder.dump_threshold(threshold);
        sorter_builder.allow_realloc(false);
    }

    sorter_builder.build()
}
pub fn sorter_into_reader(
sorter: grenad::Sorter<MergeFn>,
indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
let mut writer = tempfile::tempfile().and_then(|file| {
create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file)
})?;
sorter.write_into(&mut writer)?;
Ok(writer_into_reader(writer)?)
}
/// Finishes `writer` and reopens its file as a grenad reader.
///
/// The file recovered from the writer is rewound to its start before being
/// handed to `grenad::Reader::new`.
///
/// # Errors
///
/// Fails if the writer cannot be finalized, if the seek fails, or if the
/// reader cannot be created from the file.
pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
    let mut file = writer.into_inner()?;
    file.seek(SeekFrom::Start(0))?;
    let reader = grenad::Reader::new(file)?;
    Ok(reader)
}
/// Converts a file-backed grenad reader into one backed by a clonable memory map.
///
/// The file is taken out of `reader`, memory-mapped, wrapped into a
/// [`ClonableMmap`] and read again through an `io::Cursor`.
///
/// # Safety
///
/// This function calls `memmap::Mmap::map`, which is itself `unsafe`.
/// NOTE(review): presumably the caller must guarantee that the underlying file
/// is not modified or truncated while the map is alive; confirm against the
/// `memmap` documentation.
///
/// # Errors
///
/// Fails if the file cannot be mapped or if the mapped bytes cannot be opened
/// as a grenad reader.
pub unsafe fn into_clonable_grenad(
    reader: grenad::Reader<File>,
) -> Result<grenad::Reader<CursorClonableMmap>> {
    let file = reader.into_inner();
    let clonable_mmap = ClonableMmap::from(memmap::Mmap::map(&file)?);
    grenad::Reader::new(io::Cursor::new(clonable_mmap)).map_err(Into::into)
}
/// Merges several grenad readers into a single reader backed by a temporary file.
///
/// Entries are combined by a `grenad::MergerBuilder` configured with `merge_fn`.
/// The merged output is written with the chunk compression settings of `indexer`.
///
/// # Errors
///
/// Fails if the temporary file or the writer cannot be created, if writing the
/// merged entries fails, or if turning the writer into a reader fails.
pub fn merge_readers<R: io::Read>(
    readers: Vec<grenad::Reader<R>>,
    merge_fn: MergeFn,
    indexer: GrenadParameters,
) -> Result<grenad::Reader<File>> {
    let merger = {
        let mut builder = grenad::MergerBuilder::new(merge_fn);
        builder.extend(readers);
        builder.build()
    };

    let file = tempfile::tempfile()?;
    let mut writer =
        create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file)?;
    merger.write_into(&mut writer)?;

    writer_into_reader(writer)
}
/// Settings shared by the grenad helpers of this module.
///
/// The type is `Copy`, so it can be passed by value without cloning.
#[derive(Debug, Clone, Copy)]
pub struct GrenadParameters {
/// Compression type handed to `create_writer` when writing chunks.
pub chunk_compression_type: CompressionType,
/// Optional compression level handed to `create_writer`.
pub chunk_compression_level: Option<u32>,
/// Optional global memory budget, split per thread by `max_memory_by_thread`.
/// NOTE(review): presumably expressed in bytes; confirm against callers.
pub max_memory: Option<usize>,
/// Optional maximum number of chunks.
/// NOTE(review): presumably forwarded to `create_sorter`; confirm against callers.
pub max_nb_chunks: Option<usize>,
}
impl Default for GrenadParameters {
fn default() -> Self {
Self {
chunk_compression_type: CompressionType::None,
chunk_compression_level: None,
max_memory: None,
max_nb_chunks: None,
}
}
}
impl GrenadParameters {
    /// Returns `max_memory` divided by the number of threads reported by
    /// `rayon::current_num_threads()`, or `None` when no limit is set.
    pub fn max_memory_by_thread(&self) -> Option<usize> {
        let max_memory = self.max_memory?;
        Some(max_memory / rayon::current_num_threads())
    }
}
/// Returns an iterator that outputs grenad readers of obkv documents
/// with a maximum size of approximately `documents_chunk_size`.
///
/// The grenad obkv entries are composed of an incremental document id big-endian
/// encoded as the key and an obkv object with an `u8` for the field as the key
/// and a simple UTF-8 encoded string as the value.
///
/// Each chunk is written into its own temporary file using the chunk compression
/// settings of `indexer`. A chunk is closed as soon as the cumulated size of its
/// keys and values reaches `documents_chunk_size`. Once `reader` is exhausted, the
/// chunk being built is emitted as the last one, even if it contains no entry
/// (an empty `reader` therefore yields a single empty chunk).
///
/// When `log_frequency` is `Some(n)` with `n > 0`, a debug message is logged every
/// `n` documents. `None` and `Some(0)` disable this progress logging.
///
/// # Errors
///
/// The returned iterator yields an error if a temporary file or writer cannot be
/// created, if reading from `reader` fails, or if writing a chunk fails.
pub fn grenad_obkv_into_chunks<R: io::Read>(
    mut reader: grenad::Reader<R>,
    indexer: GrenadParameters,
    log_frequency: Option<usize>,
    documents_chunk_size: Byte,
) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
    let mut document_count = 0;
    let mut continue_reading = true;

    // `GrenadParameters` is `Copy`: the `move` closure below captures its own copy.
    let mut transposer = move || {
        if !continue_reading {
            return Ok(None);
        }

        let mut current_chunk_size = 0u64;
        let mut obkv_documents = tempfile::tempfile().and_then(|file| {
            create_writer(indexer.chunk_compression_type, indexer.chunk_compression_level, file)
        })?;

        while let Some((document_id, obkv)) = reader.next()? {
            obkv_documents.insert(document_id, obkv)?;
            current_chunk_size += document_id.len() as u64 + obkv.len() as u64;

            document_count += 1;
            // A zero frequency must not reach the modulo: `x % 0` panics.
            let must_log = log_frequency
                .map_or(false, |frequency| frequency != 0 && document_count % frequency == 0);
            if must_log {
                debug!("reached {} chunked documents", document_count);
            }

            if current_chunk_size >= documents_chunk_size.get_bytes() {
                return writer_into_reader(obkv_documents).map(Some);
            }
        }

        // The reader is exhausted: emit what has been accumulated and stop afterwards.
        continue_reading = false;
        writer_into_reader(obkv_documents).map(Some)
    };

    Ok(std::iter::from_fn(move || {
        let result = transposer().transpose();
        if result.as_ref().map_or(false, |r| r.is_ok()) {
            debug!(
                "A new chunk of approximately {} has been generated",
                documents_chunk_size.get_appropriate_unit(true),
            );
        }
        result
    }))
}
/// Writes every entry of `reader` into the LMDB `database` through `wtxn`.
///
/// - With `WriteMethod::Append`, entries are appended one after the other through
///   a mutable iterator over the database.
/// - With `WriteMethod::GetMergePut`, each key is looked up first: when the
///   database already holds that exact key, the stored value and the new value
///   are combined with `merge` and written back in place; otherwise the new
///   entry is simply put.
///
/// The elapsed time is reported with a debug log.
///
/// # Errors
///
/// Fails if reading from `reader`, any database operation, or `merge` fails.
pub fn write_into_lmdb_database(
    wtxn: &mut heed::RwTxn,
    database: heed::PolyDatabase,
    mut reader: Reader<File>,
    merge: MergeFn,
    method: WriteMethod,
) -> Result<()> {
    debug!("Writing MTBL stores...");
    let before = Instant::now();

    match method {
        WriteMethod::Append => {
            let mut appender = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
            while let Some((key, value)) = reader.next()? {
                // safety: we don't keep references from inside the LMDB database.
                unsafe { appender.append(key, value)? };
            }
        }
        WriteMethod::GetMergePut => {
            while let Some((key, value)) = reader.next()? {
                let mut cursor =
                    database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, key)?;
                match cursor.next().transpose()? {
                    Some((stored_key, stored_value)) if stored_key == key => {
                        let candidates = [Cow::Borrowed(stored_value), Cow::Borrowed(value)];
                        let merged = merge(key, &candidates)?;
                        // safety: we don't keep references from inside the LMDB database.
                        unsafe { cursor.put_current(key, &merged)? };
                    }
                    _ => {
                        // Release the iterator's borrow of the transaction before the put.
                        drop(cursor);
                        database.put::<_, ByteSlice, ByteSlice>(wtxn, key, value)?;
                    }
                }
            }
        }
    }

    debug!("MTBL stores merged in {:.02?}!", before.elapsed());
    Ok(())
}
/// Writes every entry of `sorter` into the LMDB `database` through `wtxn`.
///
/// The sorter is turned into a merger iterator which is then written according
/// to `method`; see `merger_iter_into_lmdb_database` for the two write methods.
/// The elapsed time is reported with a debug log.
///
/// # Errors
///
/// Fails if the merger iterator cannot be created or if writing it fails.
pub fn sorter_into_lmdb_database(
    wtxn: &mut heed::RwTxn,
    database: heed::PolyDatabase,
    sorter: Sorter<MergeFn>,
    merge: MergeFn,
    method: WriteMethod,
) -> Result<()> {
    debug!("Writing MTBL sorter...");
    let before = Instant::now();

    let merger_iter = sorter.into_merger_iter()?;
    merger_iter_into_lmdb_database(wtxn, database, merger_iter, merge, method)?;

    debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
    Ok(())
}
/// Writes every entry produced by the merger iterator into the LMDB `database`.
///
/// - With `WriteMethod::Append`, entries are appended one after the other through
///   a mutable iterator over the database.
/// - With `WriteMethod::GetMergePut`, each key is looked up first: when the
///   database already holds that exact key, the stored value and the new value
///   are combined with `merge` and written back in place; otherwise the new
///   entry is simply put.
///
/// # Errors
///
/// Fails if reading from the merger iterator or any database operation fails.
/// A failure of `merge` is reported as `InternalError::IndexingMergingKeys`.
fn merger_iter_into_lmdb_database<R: io::Read>(
    wtxn: &mut heed::RwTxn,
    database: heed::PolyDatabase,
    mut sorter: MergerIter<R, MergeFn>,
    merge: MergeFn,
    method: WriteMethod,
) -> Result<()> {
    match method {
        WriteMethod::Append => {
            let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
            while let Some((k, v)) = sorter.next()? {
                // safety: we don't keep references from inside the LMDB database.
                unsafe { out_iter.append(k, v)? };
            }
        }
        WriteMethod::GetMergePut => {
            while let Some((k, v)) = sorter.next()? {
                let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
                match iter.next().transpose()? {
                    Some((key, old_val)) if key == k => {
                        // A stack array is enough here: no heap allocation per merged key.
                        let vals = [Cow::Borrowed(old_val), Cow::Borrowed(v)];
                        let val = merge(k, &vals).map_err(|_| {
                            // TODO just wrap this error? The original cause is discarded.
                            InternalError::IndexingMergingKeys { process: "get-put-merge" }
                        })?;
                        // safety: we don't keep references from inside the LMDB database.
                        // NOTE(review): `val` may borrow from `old_val` when the merge
                        // function returns one of its inputs; confirm this still satisfies
                        // the contract of `put_current`.
                        unsafe { iter.put_current(k, &val)? };
                    }
                    _ => {
                        // Release the iterator's borrow of the transaction before the put.
                        drop(iter);
                        database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
                    }
                }
            }
        }
    }

    Ok(())
}

View File

@@ -0,0 +1,171 @@
use std::borrow::Cow;
use std::io;
use std::result::Result as StdResult;
use roaring::RoaringBitmap;
use super::read_u32_ne_bytes;
use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::Result;
/// Signature of the functions used to merge the values associated with a same key.
///
/// A merge function receives the key and the slice of conflicting values, and
/// returns the merged value (or an error).
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
/// Merges values by concatenating their bytes in the order they are given.
///
/// A single value is returned as is (cloned); otherwise a new buffer holding
/// every value back to back is allocated.
pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    match values {
        [single] => Ok(single.clone()),
        _ => {
            let total_len: usize = values.iter().map(|value| value.len()).sum();
            let mut concatenated = Vec::with_capacity(total_len);
            for value in values {
                concatenated.extend_from_slice(value);
            }
            Ok(Cow::Owned(concatenated))
        }
    }
}
pub fn roaring_bitmap_from_u32s_array(slice: &[u8]) -> RoaringBitmap {
read_u32_ne_bytes(slice).collect()
}
/// Serializes `bitmap` into `buffer`, replacing the buffer's previous content.
///
/// The buffer is cleared, then grown by the bitmap's serialized size before
/// the bitmap is written into it.
///
/// # Errors
///
/// Returns the `io::Error` reported by the bitmap serialization.
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
    let serialized_size = bitmap.serialized_size();
    buffer.clear();
    buffer.reserve(serialized_size);
    bitmap.serialize_into(buffer)
}
/// Merges serialized `RoaringBitmap`s by computing their union.
///
/// A single value is returned as is (cloned) without being decoded. Otherwise
/// every value is deserialized, the bitmaps are unioned and the result is
/// serialized into a new buffer.
///
/// # Errors
///
/// Returns an error if one of the values cannot be deserialized as a
/// `RoaringBitmap` or if the merged bitmap cannot be serialized.
///
/// # Panics
///
/// Panics if `values` is empty.
pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    if values.len() == 1 {
        Ok(values[0].clone())
    } else {
        let mut bitmaps = values.iter().map(|value| RoaringBitmap::deserialize_from(&value[..]));

        // Corrupted values are reported to the caller instead of aborting the process.
        let mut merged = bitmaps.next().expect("at least one value to merge")?;
        for bitmap in bitmaps {
            merged |= bitmap?;
        }

        let mut buffer = Vec::new();
        serialize_roaring_bitmap(&merged, &mut buffer)?;
        Ok(Cow::Owned(buffer))
    }
}
pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>(
_key: &[u8],
values: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
if values.len() == 1 {
Ok(values[0].clone())
} else {
let original = decode_prefix_string(&values[0]).unwrap().0;
let merged_bitmaps = values
.iter()
.map(AsRef::as_ref)
.map(decode_prefix_string)
.map(Option::unwrap)
.map(|(_, bitmap_bytes)| bitmap_bytes)
.map(RoaringBitmap::deserialize_from)
.map(StdResult::unwrap)
.reduce(|a, b| a | b)
.unwrap();
let cap = std::mem::size_of::<u16>() + original.len() + merged_bitmaps.serialized_size();
let mut buffer = Vec::with_capacity(cap);
encode_prefix_string(original, &mut buffer)?;
merged_bitmaps.serialize_into(&mut buffer)?;
Ok(Cow::Owned(buffer))
}
}
/// Only the first value associated with a key is kept.
///
/// # Panics
///
/// Panics if `values` is empty.
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    let first = &values[0];
    Ok(first.clone())
}
/// Keeps only the last of the given values and discards the others.
///
/// # Panics
///
/// Panics if `obkvs` is empty.
pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    let latest = obkvs.last().unwrap();
    Ok(latest.clone())
}
/// Merges all the obkvs in the order they are given.
///
/// The first obkv is used as the base and every following one is applied on top
/// of the running result with `merge_two_obkvs`.
///
/// # Panics
///
/// Panics if `obkvs` is empty.
pub fn merge_obkvs<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
    let (first, rest) = obkvs.split_first().unwrap();

    let mut merged = first.clone();
    for update in rest {
        let mut buffer = Vec::new();
        let base_reader = obkv::KvReader::new(&merged);
        let update_reader = obkv::KvReader::new(update);
        merge_two_obkvs(base_reader, update_reader, &mut buffer);
        merged = Cow::from(buffer);
    }

    Ok(merged)
}
/// Merges two obkvs into `buffer`, whose previous content is discarded.
///
/// Entries of both readers are joined by key: a key present on one side only is
/// copied as is, and when a key is present on both sides the value coming from
/// `update` wins.
///
/// # Panics
///
/// Panics if inserting into or finishing the obkv writer fails.
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
    use itertools::merge_join_by;
    use itertools::EitherOrBoth::{Both, Left, Right};

    buffer.clear();
    let mut writer = obkv::KvWriter::new(buffer);

    let joined = merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u));
    for entry in joined {
        let (field_id, value) = match entry {
            // Present on both sides: the updated value replaces the base one.
            Both(_, updated) => updated,
            Left(untouched) | Right(untouched) => untouched,
        };
        writer.insert(field_id, value).unwrap();
    }

    writer.finish().unwrap();
}
/// Merges values encoded with `CboRoaringBitmapCodec` by computing their union.
///
/// A single value is returned as is (cloned) without being decoded. Otherwise
/// every value is deserialized, the bitmaps are unioned and the result is
/// serialized into a new buffer.
///
/// # Errors
///
/// Returns an error if one of the values cannot be deserialized.
///
/// # Panics
///
/// Panics if `values` is empty.
pub fn merge_cbo_roaring_bitmaps<'a>(
    _key: &[u8],
    values: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
    let (first, others) = values.split_first().unwrap();
    if others.is_empty() {
        return Ok(first.clone());
    }

    let mut merged = CboRoaringBitmapCodec::deserialize_from(&first[..])?;
    for other in others {
        merged |= CboRoaringBitmapCodec::deserialize_from(&other[..])?;
    }

    let mut serialized = Vec::new();
    CboRoaringBitmapCodec::serialize_into(&merged, &mut serialized);
    Ok(Cow::from(serialized))
}
// /// Uses the FacetStringLevelZeroValueCodec to merge the values.
// pub fn tuple_string_cbo_roaring_bitmap_merge<'a>(
// _key: &[u8],
// values: &[Cow<[u8]>],
// ) -> Result<Cow<'a, [u8]>> {
// let (head, tail) = values.split_first().unwrap();
// let (head_string, mut head_rb) = FacetStringLevelZeroValueCodec::bytes_decode(&head[..])
// .ok_or(SerializationError::Decoding { db_name: None })?;
// for value in tail {
// let (_string, rb) = FacetStringLevelZeroValueCodec::bytes_decode(&value[..])
// .ok_or(SerializationError::Decoding { db_name: None })?;
// head_rb |= rb;
// }
// FacetStringLevelZeroValueCodec::bytes_encode(&(head_string, head_rb))
// .map(|cow| cow.into_owned())
// .ok_or(SerializationError::Encoding { db_name: None })
// .map_err(Into::into)
// }
// pub fn cbo_roaring_bitmap_merge<'a>(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Cow<'a, [u8]>> {
// let (head, tail) = values.split_first().unwrap();
// let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?;
// for value in tail {
// head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?;
// }
// let mut vec = Vec::new();
// CboRoaringBitmapCodec::serialize_into(&head, &mut vec);
// Ok(vec)
// }

View File

@@ -0,0 +1,49 @@
mod clonable_mmap;
mod grenad_helpers;
mod merge_functions;
use std::convert::{TryFrom, TryInto};
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
pub use grenad_helpers::{
create_sorter, create_writer, grenad_obkv_into_chunks, into_clonable_grenad, merge_readers,
sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
GrenadParameters,
};
pub use merge_functions::{
concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv,
merge_cbo_roaring_bitmaps, merge_obkvs, merge_roaring_bitmaps, merge_two_obkvs,
roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn,
};
/// Returns `true` if `key` can be stored as a key in an LMDB database.
///
/// A valid key is non-empty and at most 511 bytes long, which is LMDB's default
/// maximum key size. LMDB rejects zero-length keys, so the empty key is reported
/// as invalid too.
pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
    /// Default maximum key size, in bytes, accepted by LMDB.
    const MAX_LMDB_KEY_LENGTH: usize = 511;

    let key = key.as_ref();
    !key.is_empty() && key.len() <= MAX_LMDB_KEY_LENGTH
}
/// Splits `slice` in two at index `mid`, like `slice::split_at`, but returns
/// `None` instead of panicking when `mid` is greater than the slice length.
pub fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
    let in_bounds = mid <= slice.len();
    in_bounds.then(|| slice.split_at(mid))
}
/// Splits `slice` into an array made of its first `N` elements and the
/// remaining tail.
///
/// Returns `None` if the slice holds fewer than `N` elements or if the head
/// cannot be converted into an array.
pub fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
where
    [T; N]: for<'a> TryFrom<&'a [T]>,
{
    if N > slice.len() {
        return None;
    }

    let (head, tail) = slice.split_at(N);
    head.try_into().ok().map(|head| (head, tail))
}
// pub fn pretty_thousands<A: Borrow<T>, T: fmt::Display>(number: A) -> String {
// thousands::Separable::separate_with_spaces(number.borrow())
// }
/// Iterates over the native-endian `u32`s stored in `bytes`.
///
/// The bytes are read four at a time; trailing bytes that do not form a
/// complete group of four are ignored.
pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
    bytes.chunks_exact(4).map(|chunk| {
        // `chunks_exact(4)` only yields 4-byte chunks, so the copy cannot fail.
        let mut word = [0u8; 4];
        word.copy_from_slice(chunk);
        u32::from_ne_bytes(word)
    })
}