Introduce the compressed obkv readers and writers

This commit is contained in:
Clément Renault
2024-07-01 16:38:52 +02:00
parent 2099b4f0dd
commit 2f0567fad1
4 changed files with 72 additions and 0 deletions

View File

@@ -38,6 +38,7 @@ heed = { version = "0.20.3", default-features = false, features = [
indexmap = { version = "2.2.6", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
lz4_flex = "0.11.3"
memmap2 = "0.9.4"
obkv = "0.2.2"
once_cell = "1.19.0"

View File

@@ -0,0 +1,50 @@
use std::borrow::Cow;
use heed::BoxedError;
use obkv::KvReaderU16;
/// Heed codec for obkv documents stored in compressed form.
///
/// Decoding is lazy: the stored bytes are only wrapped, never decompressed,
/// so reading an entry costs nothing until the caller asks for the content.
pub struct ObkvCompressedCodec;

impl<'a> heed::BytesDecode<'a> for ObkvCompressedCodec {
    type DItem = CompressedKvReaderU16<'a>;

    /// Wraps the raw stored bytes into a [`CompressedKvReaderU16`].
    ///
    /// This never fails: no validation or decompression happens here.
    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
        let reader = CompressedKvReaderU16(bytes);
        Ok(reader)
    }
}
impl heed::BytesEncode<'_> for ObkvCompressedCodec {
    type EItem = CompressedKvWriterU16;

    /// Exposes the bytes held by the writer without copying them.
    ///
    /// The item already contains the compressed payload, so encoding is a
    /// plain borrow and cannot fail.
    fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
        let compressed: &[u8] = item.0.as_slice();
        Ok(Cow::Borrowed(compressed))
    }
}
/// A borrowed obkv payload that is still compressed.
///
/// Call [`CompressedKvReaderU16::decompress_with`] to get a usable
/// `KvReaderU16` out of it.
pub struct CompressedKvReaderU16<'a>(&'a [u8]);

impl<'a> CompressedKvReaderU16<'a> {
    /// Decompresses the payload into `buffer` and returns a reader over the
    /// decompressed bytes.
    ///
    /// `buffer` is used as scratch space: it is resized as needed and its
    /// previous content is overwritten. `dictionnary` must be the dictionary
    /// the payload was compressed with.
    ///
    /// # Errors
    ///
    /// Returns the `lz4_flex` error when the payload cannot be decoded.
    pub fn decompress_with<'b>(
        &self,
        buffer: &'b mut Vec<u8>,
        dictionnary: &[u8],
    ) -> Result<KvReaderU16<'b>, lz4_flex::block::DecompressError> {
        use lz4_flex::block::DecompressError;

        // The stored block does not carry its decompressed length, and
        // `get_maximum_output_size` only bounds the *compressed* size of an
        // input of that length. It is therefore just a first guess: any
        // payload that compressed well needs a larger buffer, so we grow and
        // retry instead of failing with `OutputTooSmall`.
        let mut capacity = lz4_flex::block::get_maximum_output_size(self.0.len());

        let size = loop {
            buffer.resize(capacity, 0);
            match lz4_flex::block::decompress_into_with_dict(
                self.0,
                &mut buffer[..],
                dictionnary,
            ) {
                Ok(size) => break size,
                Err(error @ DecompressError::OutputTooSmall { .. }) => {
                    // Doubling keeps the number of retries small; give up
                    // with the original error if the size cannot grow anymore.
                    match capacity.checked_mul(2) {
                        Some(doubled) => capacity = doubled,
                        None => return Err(error),
                    }
                }
                Err(error) => return Err(error),
            }
        };

        Ok(KvReaderU16::new(&buffer[..size]))
    }
}
/// An owned, already-compressed payload.
pub struct CompressedKvWriterU16(Vec<u8>);

impl CompressedKvWriterU16 {
    /// Compresses `writer` with the given `dictionnary` and stores the
    /// resulting bytes.
    // TODO ask for a KvReaderU16 here
    pub fn new_with_dictionnary(writer: &[u8], dictionnary: &[u8]) -> Self {
        let compressed = lz4_flex::block::compress_with_dict(writer, dictionnary);
        Self(compressed)
    }
}

View File

@@ -1,6 +1,7 @@
mod beu16_str_codec;
mod beu32_str_codec;
mod byte_slice_ref;
mod compressed_obkv_codec;
pub mod facet;
mod field_id_word_count_codec;
mod fst_set_codec;