mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-11-23 21:26:02 +00:00
Compare commits
3 Commits
default-ex
...
try-roarin
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f747964f65 | ||
|
|
47f6ab1279 | ||
|
|
67b08a180f |
3
Cargo.lock
generated
3
Cargo.lock
generated
@@ -5168,8 +5168,7 @@ checksum = "3582f63211428f83597b51b2ddb88e2a91a9d52d12831f9d08f5e624e8977422"
|
|||||||
[[package]]
|
[[package]]
|
||||||
name = "roaring"
|
name = "roaring"
|
||||||
version = "0.10.12"
|
version = "0.10.12"
|
||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "git+https://github.com/RoaringBitmap/roaring-rs.git#6535a822358fce546eb021da8b02d52e4906fe7a"
|
||||||
checksum = "19e8d2cfa184d94d0726d650a9f4a1be7f9b76ac9fdb954219878dc00c1c1e7b"
|
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"bytemuck",
|
"bytemuck",
|
||||||
"byteorder",
|
"byteorder",
|
||||||
|
|||||||
@@ -49,3 +49,6 @@ opt-level = 3
|
|||||||
opt-level = 3
|
opt-level = 3
|
||||||
[profile.dev.package.roaring]
|
[profile.dev.package.roaring]
|
||||||
opt-level = 3
|
opt-level = 3
|
||||||
|
|
||||||
|
[patch.crates-io]
|
||||||
|
roaring = { git = "https://github.com/RoaringBitmap/roaring-rs.git" }
|
||||||
|
|||||||
@@ -43,8 +43,10 @@ impl heed::BytesEncode<'_> for BoRoaringBitmapCodec {
|
|||||||
type EItem = RoaringBitmap;
|
type EItem = RoaringBitmap;
|
||||||
|
|
||||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
||||||
|
let mut item = item.clone();
|
||||||
|
item.optimize();
|
||||||
let mut out = Vec::new();
|
let mut out = Vec::new();
|
||||||
BoRoaringBitmapCodec::serialize_into(item, &mut out);
|
BoRoaringBitmapCodec::serialize_into(&item, &mut out);
|
||||||
Ok(Cow::Owned(out))
|
Ok(Cow::Owned(out))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,30 +1,19 @@
|
|||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::io::{self, Cursor};
|
use std::io::{self, Cursor};
|
||||||
use std::mem::size_of;
|
|
||||||
|
|
||||||
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
|
||||||
use heed::BoxedError;
|
use heed::BoxedError;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::heed_codec::BytesDecodeOwned;
|
use crate::heed_codec::BytesDecodeOwned;
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||||
|
|
||||||
/// This is the limit where using a byteorder became less size efficient
|
|
||||||
/// than using a direct roaring encoding, it is also the point where we are able
|
|
||||||
/// to determine the encoding used only by using the array of bytes length.
|
|
||||||
pub const THRESHOLD: usize = 7;
|
|
||||||
|
|
||||||
/// A conditionnal codec that either use the RoaringBitmap
|
/// A conditionnal codec that either use the RoaringBitmap
|
||||||
/// or a lighter ByteOrder en/decoding method.
|
/// or a lighter ByteOrder en/decoding method.
|
||||||
pub struct CboRoaringBitmapCodec;
|
pub struct CboRoaringBitmapCodec;
|
||||||
|
|
||||||
impl CboRoaringBitmapCodec {
|
impl CboRoaringBitmapCodec {
|
||||||
pub fn serialized_size(roaring: &RoaringBitmap) -> usize {
|
pub fn serialized_size(roaring: &RoaringBitmap) -> usize {
|
||||||
if roaring.len() <= THRESHOLD as u64 {
|
roaring.serialized_size()
|
||||||
roaring.len() as usize * size_of::<u32>()
|
|
||||||
} else {
|
|
||||||
roaring.serialized_size()
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn serialize_into_vec(roaring: &RoaringBitmap, vec: &mut Vec<u8>) {
|
pub fn serialize_into_vec(roaring: &RoaringBitmap, vec: &mut Vec<u8>) {
|
||||||
@@ -33,55 +22,23 @@ impl CboRoaringBitmapCodec {
|
|||||||
|
|
||||||
pub fn serialize_into_writer<W: io::Write>(
|
pub fn serialize_into_writer<W: io::Write>(
|
||||||
roaring: &RoaringBitmap,
|
roaring: &RoaringBitmap,
|
||||||
mut writer: W,
|
writer: W,
|
||||||
) -> io::Result<()> {
|
) -> io::Result<()> {
|
||||||
if roaring.len() <= THRESHOLD as u64 {
|
// Otherwise, we use the classic RoaringBitmapCodec that writes a header.
|
||||||
// If the number of items (u32s) to encode is less than or equal to the threshold
|
roaring.serialize_into(writer)
|
||||||
// it means that it would weigh the same or less than the RoaringBitmap
|
|
||||||
// header, so we directly encode them using ByteOrder instead.
|
|
||||||
for integer in roaring {
|
|
||||||
writer.write_u32::<NativeEndian>(integer)?;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// Otherwise, we use the classic RoaringBitmapCodec that writes a header.
|
|
||||||
roaring.serialize_into(writer)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn deserialize_from(mut bytes: &[u8]) -> io::Result<RoaringBitmap> {
|
pub fn deserialize_from(bytes: &[u8]) -> io::Result<RoaringBitmap> {
|
||||||
if bytes.len() <= THRESHOLD * size_of::<u32>() {
|
// Otherwise, it means we used the classic RoaringBitmapCodec and
|
||||||
// If there is threshold or less than threshold integers that can fit into this array
|
// that the header takes threshold integers.
|
||||||
// of bytes it means that we used the ByteOrder codec serializer.
|
RoaringBitmap::deserialize_unchecked_from(bytes)
|
||||||
let mut bitmap = RoaringBitmap::new();
|
|
||||||
while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
|
|
||||||
bitmap.insert(integer);
|
|
||||||
}
|
|
||||||
Ok(bitmap)
|
|
||||||
} else {
|
|
||||||
// Otherwise, it means we used the classic RoaringBitmapCodec and
|
|
||||||
// that the header takes threshold integers.
|
|
||||||
RoaringBitmap::deserialize_unchecked_from(bytes)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn intersection_with_serialized(
|
pub fn intersection_with_serialized(
|
||||||
mut bytes: &[u8],
|
bytes: &[u8],
|
||||||
other: &RoaringBitmap,
|
other: &RoaringBitmap,
|
||||||
) -> io::Result<RoaringBitmap> {
|
) -> io::Result<RoaringBitmap> {
|
||||||
// See above `deserialize_from` method for implementation details.
|
other.intersection_with_serialized_unchecked(Cursor::new(bytes))
|
||||||
if bytes.len() <= THRESHOLD * size_of::<u32>() {
|
|
||||||
let mut bitmap = RoaringBitmap::new();
|
|
||||||
while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
|
|
||||||
if other.contains(integer) {
|
|
||||||
bitmap.insert(integer);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(bitmap)
|
|
||||||
} else {
|
|
||||||
other.intersection_with_serialized_unchecked(Cursor::new(bytes))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Merge serialized CboRoaringBitmaps in a buffer.
|
/// Merge serialized CboRoaringBitmaps in a buffer.
|
||||||
@@ -98,29 +55,16 @@ impl CboRoaringBitmapCodec {
|
|||||||
let mut vec = Vec::new();
|
let mut vec = Vec::new();
|
||||||
|
|
||||||
for bytes in slices {
|
for bytes in slices {
|
||||||
if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
|
roaring |= RoaringBitmap::deserialize_unchecked_from(bytes.as_ref())?;
|
||||||
let mut reader = bytes.as_ref();
|
|
||||||
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
|
|
||||||
vec.push(integer);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
roaring |= RoaringBitmap::deserialize_unchecked_from(bytes.as_ref())?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if roaring.is_empty() {
|
if roaring.is_empty() {
|
||||||
vec.sort_unstable();
|
vec.sort_unstable();
|
||||||
vec.dedup();
|
vec.dedup();
|
||||||
|
|
||||||
if vec.len() <= THRESHOLD {
|
// We can unwrap safely because the vector is sorted upper.
|
||||||
for integer in vec {
|
let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
|
||||||
buffer.extend_from_slice(&integer.to_ne_bytes());
|
roaring.serialize_into(buffer)?;
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// We can unwrap safely because the vector is sorted upper.
|
|
||||||
let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
|
|
||||||
roaring.serialize_into(buffer)?;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
roaring.extend(vec);
|
roaring.extend(vec);
|
||||||
roaring.serialize_into(buffer)?;
|
roaring.serialize_into(buffer)?;
|
||||||
@@ -177,47 +121,20 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
|
|||||||
type EItem = RoaringBitmap;
|
type EItem = RoaringBitmap;
|
||||||
|
|
||||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
||||||
let mut vec = Vec::with_capacity(Self::serialized_size(item));
|
let mut item = item.clone();
|
||||||
Self::serialize_into_vec(item, &mut vec);
|
item.optimize();
|
||||||
|
let mut vec = Vec::with_capacity(Self::serialized_size(&item));
|
||||||
|
Self::serialize_into_vec(&item, &mut vec);
|
||||||
Ok(Cow::Owned(vec))
|
Ok(Cow::Owned(vec))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use std::iter::FromIterator;
|
use heed::BytesEncode;
|
||||||
|
|
||||||
use heed::{BytesDecode, BytesEncode};
|
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn verify_encoding_decoding() {
|
|
||||||
let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);
|
|
||||||
let bytes = CboRoaringBitmapCodec::bytes_encode(&input).unwrap();
|
|
||||||
let output = CboRoaringBitmapCodec::bytes_decode(&bytes).unwrap();
|
|
||||||
assert_eq!(input, output);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn verify_threshold() {
|
|
||||||
let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);
|
|
||||||
|
|
||||||
// use roaring bitmap
|
|
||||||
let mut bytes = Vec::new();
|
|
||||||
input.serialize_into(&mut bytes).unwrap();
|
|
||||||
let roaring_size = bytes.len();
|
|
||||||
|
|
||||||
// use byteorder directly
|
|
||||||
let mut bytes = Vec::new();
|
|
||||||
for integer in input {
|
|
||||||
bytes.write_u32::<NativeEndian>(integer).unwrap();
|
|
||||||
}
|
|
||||||
let bo_size = bytes.len();
|
|
||||||
|
|
||||||
assert!(roaring_size > bo_size);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn merge_cbo_roaring_bitmaps() {
|
fn merge_cbo_roaring_bitmaps() {
|
||||||
let mut buffer = Vec::new();
|
let mut buffer = Vec::new();
|
||||||
|
|||||||
@@ -27,6 +27,8 @@ impl heed::BytesEncode<'_> for RoaringBitmapCodec {
|
|||||||
type EItem = RoaringBitmap;
|
type EItem = RoaringBitmap;
|
||||||
|
|
||||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
||||||
|
let mut item = item.clone();
|
||||||
|
item.optimize();
|
||||||
let mut bytes = Vec::with_capacity(item.serialized_size());
|
let mut bytes = Vec::with_capacity(item.serialized_size());
|
||||||
item.serialize_into(&mut bytes)?;
|
item.serialize_into(&mut bytes)?;
|
||||||
Ok(Cow::Owned(bytes))
|
Ok(Cow::Owned(bytes))
|
||||||
|
|||||||
@@ -1,9 +1,6 @@
|
|||||||
use std::mem;
|
|
||||||
|
|
||||||
use heed::{BoxedError, BytesDecode};
|
use heed::{BoxedError, BytesDecode};
|
||||||
|
|
||||||
use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
|
use super::RoaringBitmapLenCodec;
|
||||||
use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
|
|
||||||
use crate::heed_codec::BytesDecodeOwned;
|
use crate::heed_codec::BytesDecodeOwned;
|
||||||
|
|
||||||
pub struct CboRoaringBitmapLenCodec;
|
pub struct CboRoaringBitmapLenCodec;
|
||||||
@@ -12,15 +9,9 @@ impl BytesDecode<'_> for CboRoaringBitmapLenCodec {
|
|||||||
type DItem = u64;
|
type DItem = u64;
|
||||||
|
|
||||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||||
if bytes.len() <= THRESHOLD * mem::size_of::<u32>() {
|
// Otherwise, it means we used the classic RoaringBitmapCodec and
|
||||||
// If there is threshold or less than threshold integers that can fit into this array
|
// that the header takes threshold integers.
|
||||||
// of bytes it means that we used the ByteOrder codec serializer.
|
RoaringBitmapLenCodec::bytes_decode(bytes)
|
||||||
BoRoaringBitmapLenCodec::bytes_decode(bytes)
|
|
||||||
} else {
|
|
||||||
// Otherwise, it means we used the classic RoaringBitmapCodec and
|
|
||||||
// that the header takes threshold integers.
|
|
||||||
RoaringBitmapLenCodec::bytes_decode(bytes)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user