Compare commits

...

10 Commits

Author SHA1 Message Date
Kerollmops
1c9b714a00 Use the DeCboRoaringBitmapCodec everywhere 2025-12-12 14:37:24 +01:00
Kerollmops
8f30397c16 WIP hide the CboRoaringBitmapCodec to replace it more conveniently 2025-12-12 11:56:29 +01:00
Kerollmops
46a0899979 Expose a CLI parameter to enable or disable delta-encoding of bitmaps 2025-12-12 11:56:28 +01:00
Kerollmops
7728d4bcfa Use unsafe blocks to set env variables 2025-12-12 11:56:28 +01:00
Kerollmops
c805094243 Support conditionally serializing with new format 2025-12-12 11:56:28 +01:00
Kerollmops
dd78d65236 Simplify the computation of the raw u32s bytes size 2025-12-12 11:56:28 +01:00
Clément Renault
fb1a24cef9 WIP missing DeCboRoaringBitmapCodec::bytes_encode implem 2025-12-12 11:56:28 +01:00
Kerollmops
2cdebc0d4b Remove the unwraps, asserts and return actual io errors 2025-12-12 11:56:28 +01:00
Kerollmops
40c90147ad Move to a two bytes header 2025-12-12 11:56:28 +01:00
Kerollmops
0cac701520 Introduce a first working version of the DeBitmapCodec 2025-12-12 11:56:28 +01:00
38 changed files with 1077 additions and 419 deletions

567
Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -4,7 +4,7 @@ use std::fmt::Write;
use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchStats};
use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str};
use meilisearch_types::heed::{Database, RoTxn};
use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
use meilisearch_types::milli::{DeCboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
use meilisearch_types::tasks::{Details, Kind, Status, Task};
use meilisearch_types::versioning::{self, VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use roaring::RoaringBitmap;
@@ -187,7 +187,7 @@ pub fn snapshot_all_batches(rtxn: &RoTxn, db: Database<BEU32, SerdeJson<Batch>>)
pub fn snapshot_batches_to_tasks_mappings(
rtxn: &RoTxn,
db: Database<BEU32, CboRoaringBitmapCodec>,
db: Database<BEU32, DeCboRoaringBitmapCodec>,
) -> String {
let mut snap = String::new();
let iter = db.iter(rtxn).unwrap();
@@ -198,7 +198,7 @@ pub fn snapshot_batches_to_tasks_mappings(
snap
}
pub fn snapshot_date_db(rtxn: &RoTxn, db: Database<BEI128, CboRoaringBitmapCodec>) -> String {
pub fn snapshot_date_db(rtxn: &RoTxn, db: Database<BEI128, DeCboRoaringBitmapCodec>) -> String {
let mut snap = String::new();
let iter = db.iter(rtxn).unwrap();
for next in iter {

View File

@@ -4,7 +4,7 @@ use std::ops::{Bound, RangeBounds};
use meilisearch_types::batches::{Batch, BatchId};
use meilisearch_types::heed::types::{DecodeIgnore, SerdeBincode, SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls};
use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
use meilisearch_types::milli::{DeCboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
use meilisearch_types::tasks::{Kind, Status};
use roaring::{MultiOps, RoaringBitmap};
use time::OffsetDateTime;
@@ -42,11 +42,11 @@ pub struct BatchQueue {
/// Store the batches associated to an index.
pub(crate) index_tasks: Database<Str, RoaringBitmapCodec>,
/// Store the batches containing tasks which were enqueued at a specific date
pub(crate) enqueued_at: Database<BEI128, CboRoaringBitmapCodec>,
pub(crate) enqueued_at: Database<BEI128, DeCboRoaringBitmapCodec>,
/// Store the batches containing finished tasks started at a specific date
pub(crate) started_at: Database<BEI128, CboRoaringBitmapCodec>,
pub(crate) started_at: Database<BEI128, DeCboRoaringBitmapCodec>,
/// Store the batches containing tasks finished at a specific date
pub(crate) finished_at: Database<BEI128, CboRoaringBitmapCodec>,
pub(crate) finished_at: Database<BEI128, DeCboRoaringBitmapCodec>,
}
impl BatchQueue {

View File

@@ -14,7 +14,7 @@ use std::time::Duration;
use file_store::FileStore;
use meilisearch_types::batches::BatchId;
use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls};
use meilisearch_types::milli::{CboRoaringBitmapCodec, BEU32};
use meilisearch_types::milli::{DeCboRoaringBitmapCodec, BEU32};
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
use roaring::RoaringBitmap;
use time::format_description::well_known::Rfc3339;
@@ -130,7 +130,7 @@ pub struct Queue {
pub(crate) batches: batches::BatchQueue,
/// Matches a batch id with the associated task ids.
pub(crate) batch_to_tasks_mapping: Database<BEU32, CboRoaringBitmapCodec>,
pub(crate) batch_to_tasks_mapping: Database<BEU32, DeCboRoaringBitmapCodec>,
/// The list of files referenced by the tasks.
pub(crate) file_store: FileStore,

View File

@@ -2,7 +2,7 @@ use std::ops::{Bound, RangeBounds};
use meilisearch_types::heed::types::{DecodeIgnore, SerdeBincode, SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls};
use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
use meilisearch_types::milli::{DeCboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
use meilisearch_types::tasks::{Kind, Status, Task};
use roaring::{MultiOps, RoaringBitmap};
use time::OffsetDateTime;
@@ -43,11 +43,11 @@ pub struct TaskQueue {
/// Store the tasks that were canceled by a task uid
pub(crate) canceled_by: Database<BEU32, RoaringBitmapCodec>,
/// Store the task ids of tasks which were enqueued at a specific date
pub(crate) enqueued_at: Database<BEI128, CboRoaringBitmapCodec>,
pub(crate) enqueued_at: Database<BEI128, DeCboRoaringBitmapCodec>,
/// Store the task ids of finished tasks which started being processed at a specific date
pub(crate) started_at: Database<BEI128, CboRoaringBitmapCodec>,
pub(crate) started_at: Database<BEI128, DeCboRoaringBitmapCodec>,
/// Store the task ids of tasks which finished at a specific date
pub(crate) finished_at: Database<BEI128, CboRoaringBitmapCodec>,
pub(crate) finished_at: Database<BEI128, DeCboRoaringBitmapCodec>,
}
impl TaskQueue {

View File

@@ -6,7 +6,7 @@ use std::sync::Arc;
use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchId, BatchStats};
use meilisearch_types::heed::{Database, RoTxn, RwTxn};
use meilisearch_types::milli::CboRoaringBitmapCodec;
use meilisearch_types::milli::DeCboRoaringBitmapCodec;
use meilisearch_types::task_view::DetailsView;
use meilisearch_types::tasks::{
BatchStopReason, Details, IndexSwap, Kind, KindWithContent, Status,
@@ -161,7 +161,7 @@ impl ProcessingBatch {
pub(crate) fn insert_task_datetime(
wtxn: &mut RwTxn,
database: Database<BEI128, CboRoaringBitmapCodec>,
database: Database<BEI128, DeCboRoaringBitmapCodec>,
time: OffsetDateTime,
task_id: TaskId,
) -> Result<()> {
@@ -174,7 +174,7 @@ pub(crate) fn insert_task_datetime(
pub(crate) fn remove_task_datetime(
wtxn: &mut RwTxn,
database: Database<BEI128, CboRoaringBitmapCodec>,
database: Database<BEI128, DeCboRoaringBitmapCodec>,
time: OffsetDateTime,
task_id: TaskId,
) -> Result<()> {
@@ -193,7 +193,7 @@ pub(crate) fn remove_task_datetime(
pub(crate) fn remove_n_tasks_datetime_earlier_than(
wtxn: &mut RwTxn,
database: Database<BEI128, CboRoaringBitmapCodec>,
database: Database<BEI128, DeCboRoaringBitmapCodec>,
earlier_than: OffsetDateTime,
mut count: usize,
task_id: TaskId,
@@ -221,7 +221,7 @@ pub(crate) fn remove_n_tasks_datetime_earlier_than(
pub(crate) fn keep_ids_within_datetimes(
rtxn: &RoTxn,
ids: &mut RoaringBitmap,
database: Database<BEI128, CboRoaringBitmapCodec>,
database: Database<BEI128, DeCboRoaringBitmapCodec>,
after: Option<OffsetDateTime>,
before: Option<OffsetDateTime>,
) -> Result<()> {

View File

@@ -300,6 +300,7 @@ impl Infos {
max_indexing_memory,
max_indexing_threads,
skip_index_budget: _,
experimental_disable_delta_encoding: _,
experimental_no_edition_2024_for_settings,
experimental_no_edition_2024_for_dumps,
experimental_no_edition_2024_for_prefix_post_processing,

View File

@@ -21,6 +21,7 @@ use meilisearch::{
LogStderrType, Opt, ServicesData, SubscriberForSecondLayer,
};
use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE};
use meilisearch_types::milli::heed_codec::DELTA_ENCODING_STATUS;
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use tracing::level_filters::LevelFilter;
use tracing_subscriber::layer::SubscriberExt as _;
@@ -95,6 +96,14 @@ async fn main() -> anyhow::Result<()> {
async fn try_main(runtime: tokio::runtime::Handle) -> anyhow::Result<()> {
let (opt, config_read_from) = Opt::try_build()?;
// Disables the delta encoding of bitmaps as soon as possible
if opt.indexer_options.experimental_disable_delta_encoding {
DELTA_ENCODING_STATUS.set_to_disabled()
} else {
DELTA_ENCODING_STATUS.set_to_enabled()
}
.expect("the delta-encoding status to be set only once");
std::panic::set_hook(Box::new(on_panic));
anyhow::ensure!(

View File

@@ -60,6 +60,7 @@ const MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_FACET_POST_PROCESSING: &str =
"MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_FACET_POST_PROCESSING";
const MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_PREFIX_POST_PROCESSING: &str =
"MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_PREFIX_POST_PROCESSING";
const MEILI_EXPERIMENTAL_DISABLE_DELTA_ENCODING: &str = "MEILI_EXPERIMENTAL_DISABLE_DELTA_ENCODING";
const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
const MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE: &str = "MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE";
const MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER: &str = "MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER";
@@ -845,6 +846,14 @@ pub struct IndexerOpts {
#[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_FACET_POST_PROCESSING)]
#[serde(default)]
pub experimental_no_edition_2024_for_facet_post_processing: bool,
/// Experimental disable delta-encoding for bitmaps. For more information,
/// see: <https://github.com/orgs/meilisearch/discussions/875>
///
/// Enables the experimental disable delta-encoding for bitmaps feature.
#[clap(long, env = MEILI_EXPERIMENTAL_DISABLE_DELTA_ENCODING)]
#[serde(default)]
pub experimental_disable_delta_encoding: bool,
}
impl IndexerOpts {
@@ -858,6 +867,7 @@ impl IndexerOpts {
experimental_no_edition_2024_for_dumps,
experimental_no_edition_2024_for_prefix_post_processing,
experimental_no_edition_2024_for_facet_post_processing,
experimental_disable_delta_encoding,
} = self;
if let Some(max_indexing_memory) = max_indexing_memory.0 {
export_to_env_if_not_present(
@@ -895,6 +905,12 @@ impl IndexerOpts {
experimental_no_edition_2024_for_facet_post_processing.to_string(),
);
}
if experimental_disable_delta_encoding {
export_to_env_if_not_present(
MEILI_EXPERIMENTAL_DISABLE_DELTA_ENCODING,
experimental_disable_delta_encoding.to_string(),
);
}
}
}
@@ -910,6 +926,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
experimental_no_edition_2024_for_dumps,
experimental_no_edition_2024_for_prefix_post_processing,
experimental_no_edition_2024_for_facet_post_processing,
experimental_disable_delta_encoding: _, // managed in try_main
} = other;
let thread_pool = ThreadPoolNoAbortBuilder::new_for_indexing()
@@ -1245,7 +1262,7 @@ where
T: AsRef<OsStr>,
{
if let Err(VarError::NotPresent) = std::env::var(key) {
std::env::set_var(key, value);
unsafe { std::env::set_var(key, value) }
}
}

View File

@@ -43,9 +43,9 @@ impl Server<Owned> {
let dir = TempDir::new().unwrap();
if cfg!(windows) {
std::env::set_var("TMP", TEST_TEMP_DIR.path());
unsafe { std::env::set_var("TMP", TEST_TEMP_DIR.path()) }
} else {
std::env::set_var("TMPDIR", TEST_TEMP_DIR.path());
unsafe { std::env::set_var("TMPDIR", TEST_TEMP_DIR.path()) }
}
let options = default_settings(dir.path());
@@ -58,9 +58,9 @@ impl Server<Owned> {
pub async fn new_auth_with_options(mut options: Opt, dir: TempDir) -> Self {
if cfg!(windows) {
std::env::set_var("TMP", TEST_TEMP_DIR.path());
unsafe { std::env::set_var("TMP", TEST_TEMP_DIR.path()) }
} else {
std::env::set_var("TMPDIR", TEST_TEMP_DIR.path());
unsafe { std::env::set_var("TMPDIR", TEST_TEMP_DIR.path()) }
}
options.master_key = Some("MASTER_KEY".to_string());
@@ -215,9 +215,9 @@ impl Server<Shared> {
let dir = TempDir::new().unwrap();
if cfg!(windows) {
std::env::set_var("TMP", TEST_TEMP_DIR.path());
unsafe { std::env::set_var("TMP", TEST_TEMP_DIR.path()) }
} else {
std::env::set_var("TMPDIR", TEST_TEMP_DIR.path());
unsafe { std::env::set_var("TMPDIR", TEST_TEMP_DIR.path()) }
}
let options = default_settings(dir.path());
@@ -508,6 +508,8 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
experimental_no_edition_2024_for_dumps: false,
experimental_no_edition_2024_for_prefix_post_processing: false,
experimental_no_edition_2024_for_facet_post_processing: false,
// It has no effect to set the delta encoding here as the toggle is done in try_main
experimental_disable_delta_encoding: false,
},
experimental_enable_metrics: false,
..Parser::parse_from(None as Option<&str>)

View File

@@ -120,14 +120,16 @@ twox-hash = { version = "2.1.2", default-features = false, features = [
] }
geo-types = "0.7.17"
zerometry = "0.3.0"
bitpacking = "0.9.2"
[dev-dependencies]
mimalloc = { version = "0.1.48", default-features = false }
# fixed version due to format breakages in v1.40
insta = "=1.39.0"
mimalloc = { version = "0.1.48", default-features = false }
maplit = "1.0.2"
md5 = "0.8.0"
meili-snap = { path = "../meili-snap" }
quickcheck = "1.0.3"
rand = { version = "0.8.5", features = ["small_rng"] }
[features]

View File

@@ -12,7 +12,7 @@ use roaring::RoaringBitmap;
pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec;
pub use self::ordered_f64_codec::OrderedF64Codec;
use super::StrRefCodec;
use crate::{CboRoaringBitmapCodec, BEU16};
use crate::{DeCboRoaringBitmapCodec, BEU16};
pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>;
pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>;
@@ -97,7 +97,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
let mut v = vec![value.size];
CboRoaringBitmapCodec::serialize_into_vec(&value.bitmap, &mut v);
DeCboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v);
Ok(Cow::Owned(v))
}
}
@@ -107,7 +107,7 @@ impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
let size = bytes[0];
let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?;
let bitmap = DeCboRoaringBitmapCodec::deserialize_from(&bytes[1..])?;
Ok(FacetGroupValue { size, bitmap })
}
}

View File

@@ -22,7 +22,9 @@ pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
pub use self::fst_set_codec::FstSetCodec;
pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
pub use self::roaring_bitmap::{
BoRoaringBitmapCodec, DeCboRoaringBitmapCodec, RoaringBitmapCodec, DELTA_ENCODING_STATUS,
};
pub use self::roaring_bitmap_length::{
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
};

View File

@@ -19,8 +19,19 @@ pub const THRESHOLD: usize = 7;
pub struct CboRoaringBitmapCodec;
impl CboRoaringBitmapCodec {
/// If the number of items (u32s) to encode is less than or equal to the threshold
/// it means that it would weigh the same or less than the RoaringBitmap
/// header, so we directly encode them using ByteOrder instead.
pub fn bitmap_serialize_as_raw_u32s(roaring: &RoaringBitmap) -> bool {
roaring.len() <= THRESHOLD as u64
}
pub fn bytes_deserialize_as_raw_u32s(bytes: &[u8]) -> bool {
bytes.len() <= THRESHOLD * size_of::<u32>()
}
pub fn serialized_size(roaring: &RoaringBitmap) -> usize {
if roaring.len() <= THRESHOLD as u64 {
if Self::bitmap_serialize_as_raw_u32s(roaring) {
roaring.len() as usize * size_of::<u32>()
} else {
roaring.serialized_size()
@@ -35,10 +46,7 @@ impl CboRoaringBitmapCodec {
roaring: &RoaringBitmap,
mut writer: W,
) -> io::Result<()> {
if roaring.len() <= THRESHOLD as u64 {
// If the number of items (u32s) to encode is less than or equal to the threshold
// it means that it would weigh the same or less than the RoaringBitmap
// header, so we directly encode them using ByteOrder instead.
if Self::bitmap_serialize_as_raw_u32s(roaring) {
for integer in roaring {
writer.write_u32::<NativeEndian>(integer)?;
}
@@ -51,7 +59,7 @@ impl CboRoaringBitmapCodec {
}
pub fn deserialize_from(mut bytes: &[u8]) -> io::Result<RoaringBitmap> {
if bytes.len() <= THRESHOLD * size_of::<u32>() {
if Self::bytes_deserialize_as_raw_u32s(bytes) {
// If there is threshold or less than threshold integers that can fit into this array
// of bytes it means that we used the ByteOrder codec serializer.
let mut bitmap = RoaringBitmap::new();
@@ -71,7 +79,7 @@ impl CboRoaringBitmapCodec {
other: &RoaringBitmap,
) -> io::Result<RoaringBitmap> {
// See above `deserialize_from` method for implementation details.
if bytes.len() <= THRESHOLD * size_of::<u32>() {
if Self::bytes_deserialize_as_raw_u32s(bytes) {
let mut bitmap = RoaringBitmap::new();
while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
if other.contains(integer) {
@@ -98,7 +106,7 @@ impl CboRoaringBitmapCodec {
let mut vec = Vec::new();
for bytes in slices {
if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
if Self::bytes_deserialize_as_raw_u32s(bytes.as_ref()) {
let mut reader = bytes.as_ref();
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
vec.push(integer);
@@ -112,6 +120,8 @@ impl CboRoaringBitmapCodec {
vec.sort_unstable();
vec.dedup();
// Be careful when modifying this condition,
// the rule must be the same everywhere
if vec.len() <= THRESHOLD {
for integer in vec {
buffer.extend_from_slice(&integer.to_ne_bytes());

View File

@@ -0,0 +1,177 @@
use std::borrow::Cow;
use std::io::{self, ErrorKind};
use std::sync::OnceLock;
use heed::BoxedError;
use roaring::RoaringBitmap;
use super::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
use super::de_roaring_bitmap_codec::DeRoaringBitmapCodec;
use crate::heed_codec::BytesDecodeOwned;
use crate::update::del_add::KvReaderDelAdd;
/// Defines the status of the delta encoding on whether we have enabled it or not.
pub static DELTA_ENCODING_STATUS: DeltaEncodingStatusLock = DeltaEncodingStatusLock::new();
pub struct DeCboRoaringBitmapCodec;
impl DeCboRoaringBitmapCodec {
pub fn serialized_size_with_tmp_buffer(
bitmap: &RoaringBitmap,
tmp_buffer: &mut Vec<u32>,
) -> usize {
// We are stuck with this format because the CboRoaringBitmapCodec decides to write
// raw and unencoded u32s, without a header when there is at most THRESHOLD elements.
if CboRoaringBitmapCodec::bitmap_serialize_as_raw_u32s(bitmap)
&& DELTA_ENCODING_STATUS.is_disabled()
{
CboRoaringBitmapCodec::serialized_size(bitmap)
} else {
DeRoaringBitmapCodec::serialized_size_with_tmp_buffer(bitmap, tmp_buffer)
}
}
/// Writes the delta-encoded compressed version of
/// the given roaring bitmap into the provided writer.
pub fn serialize_into<W: io::Write>(bitmap: &RoaringBitmap, writer: &mut W) -> io::Result<()> {
let mut tmp_buffer = Vec::new();
Self::serialize_into_with_tmp_buffer(bitmap, writer, &mut tmp_buffer)
}
/// Same as [Self::serialize_into] but accepts a buffer to avoid allocating one.
///
/// Note that we always serialize the bitmap with the delta-encoded compressed version.
pub fn serialize_into_with_tmp_buffer<W: io::Write>(
bitmap: &RoaringBitmap,
writer: &mut W,
tmp_buffer: &mut Vec<u32>,
) -> io::Result<()> {
// We are stuck with this format because the CboRoaringBitmapCodec decides to write
// raw and unencoded u32s, without a header when there is at most THRESHOLD elements.
if CboRoaringBitmapCodec::bitmap_serialize_as_raw_u32s(bitmap)
&& DELTA_ENCODING_STATUS.is_disabled()
{
CboRoaringBitmapCodec::serialize_into_writer(bitmap, writer)
} else {
DeRoaringBitmapCodec::serialize_into_with_tmp_buffer(bitmap, writer, tmp_buffer)
}
}
/// Returns the delta-decoded roaring bitmap from the compressed bytes.
pub fn deserialize_from(compressed: &[u8]) -> io::Result<RoaringBitmap> {
let mut tmp_buffer = Vec::new();
Self::deserialize_from_with_tmp_buffer(compressed, &mut tmp_buffer)
}
/// Same as [Self::deserialize_from] but accepts a buffer to avoid allocating one.
///
/// It tries to decode the input by using the delta-decoded version and
/// if it fails, falls back to the CboRoaringBitmap version.
pub fn deserialize_from_with_tmp_buffer(
input: &[u8],
tmp_buffer: &mut Vec<u32>,
) -> io::Result<RoaringBitmap> {
// The input is too short to be a valid delta-decoded bitmap.
// We fall back to the CboRoaringBitmap version with raw u32s.
if CboRoaringBitmapCodec::bytes_deserialize_as_raw_u32s(input) {
return CboRoaringBitmapCodec::deserialize_from(input);
}
match DeRoaringBitmapCodec::deserialize_from_with_tmp_buffer(input, tmp_buffer) {
Ok(bitmap) => Ok(bitmap),
// If the error kind is Other it means that the delta-decoder found
// an invalid magic header. We fall back to the CboRoaringBitmap version.
Err(e) if e.kind() == ErrorKind::Other => {
CboRoaringBitmapCodec::deserialize_from(input)
}
Err(e) => Err(e),
}
}
pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
where
I: IntoIterator<Item = A>,
A: AsRef<[u8]>,
{
todo!()
}
pub fn intersection_with_serialized(
mut bytes: &[u8],
other: &RoaringBitmap,
) -> io::Result<RoaringBitmap> {
todo!()
}
pub fn merge_deladd_into<'a>(
deladd: &KvReaderDelAdd,
previous: &[u8],
buffer: &'a mut Vec<u8>,
) -> io::Result<Option<&'a [u8]>> {
todo!()
}
}
impl heed::BytesDecode<'_> for DeCboRoaringBitmapCodec {
type DItem = RoaringBitmap;
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
Self::deserialize_from(bytes).map_err(Into::into)
}
}
impl BytesDecodeOwned for DeCboRoaringBitmapCodec {
type DItem = RoaringBitmap;
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
Self::deserialize_from(bytes).map_err(Into::into)
}
}
impl heed::BytesEncode<'_> for DeCboRoaringBitmapCodec {
type EItem = RoaringBitmap;
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
let mut tmp_buffer = Vec::new();
let capacity = Self::serialized_size_with_tmp_buffer(&item, &mut tmp_buffer);
let mut output = Vec::with_capacity(capacity);
Self::serialize_into_with_tmp_buffer(item, &mut output, &mut tmp_buffer)?;
Ok(Cow::Owned(output))
}
}
/// Manages the global status of the delta encoding.
///
/// Whether we must use delta encoding or not when encoding roaring bitmaps.
pub struct DeltaEncodingStatusLock(OnceLock<DeltaEncodingStatus>);
impl DeltaEncodingStatusLock {
pub const fn new() -> Self {
Self(OnceLock::new())
}
}
#[derive(Default)]
enum DeltaEncodingStatus {
Enabled,
#[default]
Disabled,
}
impl DeltaEncodingStatusLock {
pub fn set_to_enabled(&self) -> Result<(), ()> {
self.0.set(DeltaEncodingStatus::Enabled).map_err(drop)
}
pub fn set_to_disabled(&self) -> Result<(), ()> {
self.0.set(DeltaEncodingStatus::Disabled).map_err(drop)
}
pub fn is_enabled(&self) -> bool {
matches!(self.0.get(), Some(DeltaEncodingStatus::Enabled))
}
pub fn is_disabled(&self) -> bool {
!self.is_enabled()
}
}

View File

@@ -0,0 +1,377 @@
use std::io::{self, ErrorKind};
use std::mem::{self, size_of, size_of_val};
use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
use roaring::RoaringBitmap;
/// The magic header for our custom encoding format
const MAGIC_HEADER: u16 = 36869;
pub struct DeRoaringBitmapCodec;
// TODO reintroduce:
// - serialized_size?
// - serialize_into_vec
// - intersection_with_serialized
// - merge_into
// - merge_deladd_into
impl DeRoaringBitmapCodec {
/// Returns the serialized size of the given roaring bitmap with the delta encoding format.
pub fn serialized_size_with_tmp_buffer(
bitmap: &RoaringBitmap,
tmp_buffer: &mut Vec<u32>,
) -> usize {
let mut size = 2; // u16 magic header
let bitpacker8x = BitPacker8x::new();
let bitpacker4x = BitPacker4x::new();
let bitpacker1x = BitPacker1x::new();
// This temporary buffer is used to store each chunk of decompressed u32s.
tmp_buffer.resize(BitPacker8x::BLOCK_LEN, 0u32);
let decompressed = &mut tmp_buffer[..];
let mut buffer_index = 0;
let mut initial = None;
// We initially collect all the integers into a flat buffer of the size
// of the largest bitpacker. We encode them with it until we don't have
// enough of them...
for n in bitmap {
decompressed[buffer_index] = n;
buffer_index += 1;
if buffer_index == BitPacker8x::BLOCK_LEN {
let num_bits = bitpacker8x.num_bits_strictly_sorted(initial, decompressed);
let compressed_len = BitPacker8x::compressed_block_size(num_bits);
size += 1; // u8 chunk header
size += compressed_len; // compressed data length
initial = Some(n);
buffer_index = 0;
}
}
// ...We then switch to a smaller bitpacker to encode the remaining chunks...
let decompressed = &decompressed[..buffer_index];
let mut chunks = decompressed.chunks_exact(BitPacker4x::BLOCK_LEN);
for decompressed in chunks.by_ref() {
let num_bits = bitpacker4x.num_bits_strictly_sorted(initial, decompressed);
let compressed_len = BitPacker4x::compressed_block_size(num_bits);
size += 1; // u8 chunk header
size += compressed_len; // compressed data length
initial = decompressed.iter().last().copied();
}
// ...And so on...
let decompressed = chunks.remainder();
let mut chunks = decompressed.chunks_exact(BitPacker1x::BLOCK_LEN);
for decompressed in chunks.by_ref() {
let num_bits = bitpacker1x.num_bits_strictly_sorted(initial, decompressed);
let compressed_len = BitPacker1x::compressed_block_size(num_bits);
size += 1; // u8 chunk header
size += compressed_len; // compressed data length
initial = decompressed.iter().last().copied();
}
// ...Until we don't have any small enough bitpacker. We put them raw
// at the end of out buffer with a header indicating the matter.
let decompressed = chunks.remainder();
if !decompressed.is_empty() {
size += 1; // u8 chunk header
size += mem::size_of_val(decompressed); // remaining uncompressed u32s
}
size
}
/// Writes the delta-encoded compressed version of
/// the given roaring bitmap into the provided writer.
pub fn serialize_into<W: io::Write>(bitmap: &RoaringBitmap, writer: W) -> io::Result<()> {
let mut tmp_buffer = Vec::new();
Self::serialize_into_with_tmp_buffer(bitmap, writer, &mut tmp_buffer)
}
/// Same as [Self::serialize_into] but accepts a buffer to avoid allocating one.
pub fn serialize_into_with_tmp_buffer<W: io::Write>(
bitmap: &RoaringBitmap,
mut writer: W,
tmp_buffer: &mut Vec<u32>,
) -> io::Result<()> {
// Insert the magic header
writer.write_all(&MAGIC_HEADER.to_ne_bytes())?;
let bitpacker8x = BitPacker8x::new();
let bitpacker4x = BitPacker4x::new();
let bitpacker1x = BitPacker1x::new();
// This temporary buffer is used to store each chunk of decompressed and
// compressed and delta-encoded u32s. We need room for the decompressed
// u32s coming from the roaring bitmap, the compressed output that can
// be as large as the decompressed u32s, and the chunk header.
tmp_buffer.resize((BitPacker8x::BLOCK_LEN * 2) + 1, 0u32);
let (decompressed, compressed) = tmp_buffer.split_at_mut(BitPacker8x::BLOCK_LEN);
let compressed = bytemuck::cast_slice_mut(compressed);
let mut buffer_index = 0;
let mut initial = None;
// We initially collect all the integers into a flat buffer of the size
// of the largest bitpacker. We encode them with it until we don't have
// enough of them...
for n in bitmap {
decompressed[buffer_index] = n;
buffer_index += 1;
if buffer_index == BitPacker8x::BLOCK_LEN {
let output = encode_with_packer(&bitpacker8x, decompressed, initial, compressed);
writer.write_all(output)?;
initial = Some(n);
buffer_index = 0;
}
}
// ...We then switch to a smaller bitpacker to encode the remaining chunks...
let decompressed = &decompressed[..buffer_index];
let mut chunks = decompressed.chunks_exact(BitPacker4x::BLOCK_LEN);
for decompressed in chunks.by_ref() {
let output = encode_with_packer(&bitpacker4x, decompressed, initial, compressed);
writer.write_all(output)?;
initial = decompressed.iter().last().copied();
}
// ...And so on...
let decompressed = chunks.remainder();
let mut chunks = decompressed.chunks_exact(BitPacker1x::BLOCK_LEN);
for decompressed in chunks.by_ref() {
let output = encode_with_packer(&bitpacker1x, decompressed, initial, compressed);
writer.write_all(output)?;
initial = decompressed.iter().last().copied();
}
// ...Until we don't have any small enough bitpacker. We put them raw
// at the end of out buffer with a header indicating the matter.
let decompressed = chunks.remainder();
if !decompressed.is_empty() {
let header = encode_chunk_header(BitPackerLevel::None, u32::BITS as u8);
// Note: Not convinced about the performance of writing a single
// byte followed by a larger write. However, we will use this
// codec with a BufWriter or directly with a Vec of bytes.
writer.write_all(&[header])?;
writer.write_all(bytemuck::cast_slice(decompressed))?;
}
Ok(())
}
/// Returns the delta-decoded roaring bitmap from the compressed bytes.
pub fn deserialize_from(compressed: &[u8]) -> io::Result<RoaringBitmap> {
let mut tmp_buffer = Vec::new();
Self::deserialize_from_with_tmp_buffer(compressed, &mut tmp_buffer)
}
/// Same as [Self::deserialize_from] but accepts a buffer to avoid allocating one.
pub fn deserialize_from_with_tmp_buffer(
input: &[u8],
tmp_buffer: &mut Vec<u32>,
) -> io::Result<RoaringBitmap> {
let Some((header, mut compressed)) = input.split_at_checked(size_of_val(&MAGIC_HEADER))
else {
return Err(io::Error::new(ErrorKind::UnexpectedEof, "expecting a two-bytes header"));
};
// Safety: This unwrap cannot happen as the header buffer is the right size
let header = u16::from_ne_bytes(header.try_into().unwrap());
if header != MAGIC_HEADER {
return Err(io::Error::other("invalid header value"));
}
let bitpacker8x = BitPacker8x::new();
let bitpacker4x = BitPacker4x::new();
let bitpacker1x = BitPacker1x::new();
let mut bitmap = RoaringBitmap::new();
tmp_buffer.resize(BitPacker8x::BLOCK_LEN, 0u32);
let decompressed = &mut tmp_buffer[..];
let mut initial = None;
while let Some((&chunk_header, encoded)) = compressed.split_first() {
let (level, num_bits) = decode_chunk_header(chunk_header);
let (bytes_read, decompressed) = match level {
BitPackerLevel::None => {
if num_bits != u32::BITS as u8 {
return Err(io::Error::new(
ErrorKind::InvalidData,
"invalid number of bits to encode non-compressed u32s",
));
}
let chunks = encoded.chunks_exact(size_of::<u32>());
if !chunks.remainder().is_empty() {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"expecting last chunk to be a multiple of the size of an u32",
));
}
let integers = chunks
// safety: This unwrap cannot happen as
// the size of u32 is set correctly.
.map(|b| b.try_into().unwrap())
.map(u32::from_ne_bytes);
bitmap
.append(integers)
.map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?;
// This is basically always the last chunk that exists in
// this delta-encoded format as the raw u32s are appended
// when there is not enough of them to fit in a bitpacker.
break;
}
BitPackerLevel::BitPacker1x => {
decode_with_packer(&bitpacker1x, decompressed, initial, encoded, num_bits)
}
BitPackerLevel::BitPacker4x => {
decode_with_packer(&bitpacker4x, decompressed, initial, encoded, num_bits)
}
BitPackerLevel::BitPacker8x => {
decode_with_packer(&bitpacker8x, decompressed, initial, encoded, num_bits)
}
};
initial = decompressed.iter().last().copied();
// TODO investigate perf
// Safety: Bitpackers cannot output unsorter integers when
// used with the compress_strictly_sorted function.
bitmap.append(decompressed.iter().copied()).unwrap();
// What the delta-decoding read plus the chunk header size
compressed = &compressed[bytes_read + 1..];
}
Ok(bitmap)
}
}
/// Takes a strickly sorted list of u32s and outputs delta-encoded
/// bytes with a chunk header. We expect the output buffer to be
/// at least BLOCK_LEN + 1.
fn encode_with_packer<'c, B: BitPackerExt>(
bitpacker: &B,
decompressed: &[u32],
initial: Option<u32>,
output: &'c mut [u8],
) -> &'c [u8] {
let num_bits = bitpacker.num_bits_strictly_sorted(initial, decompressed);
let compressed_len = B::compressed_block_size(num_bits);
let chunk_header = encode_chunk_header(B::level(), num_bits);
let buffer = &mut output[..compressed_len + 1];
// Safety: The buffer is at least one byte
let (header_in_buffer, encoded) = buffer.split_first_mut().unwrap();
*header_in_buffer = chunk_header;
bitpacker.compress_strictly_sorted(initial, decompressed, encoded, num_bits);
buffer
}
/// Returns the number of bytes read and the decoded unsigned integers.
fn decode_with_packer<'d, B: BitPacker>(
bitpacker: &B,
decompressed: &'d mut [u32],
initial: Option<u32>,
compressed: &[u8],
num_bits: u8,
) -> (usize, &'d [u32]) {
let decompressed = &mut decompressed[..B::BLOCK_LEN];
let read = bitpacker.decompress_strictly_sorted(initial, compressed, decompressed, num_bits);
(read, decompressed)
}
/// An identifier for the bitpacker to be able
/// to correctly decode the compressed integers.
#[derive(Debug, PartialEq, Eq)]
#[repr(u8)]
enum BitPackerLevel {
/// The remaining bytes are raw little endian encoded u32s.
None,
/// The remaining bits are encoded using a `BitPacker1x`.
BitPacker1x,
/// The remaining bits are encoded using a `BitPacker4x`.
BitPacker4x,
/// The remaining bits are encoded using a `BitPacker8x`.
BitPacker8x,
}
/// Returns the chunk header based on the bitpacker level
/// and the number of bits to encode the list of integers.
fn encode_chunk_header(level: BitPackerLevel, num_bits: u8) -> u8 {
debug_assert!(num_bits as u32 <= 2_u32.pow(6));
let level = level as u8;
debug_assert!(level <= 3);
num_bits | (level << 6)
}
/// Decodes the chunk header and output the bitpacker level
/// and the number of bits to decode the following bytes.
fn decode_chunk_header(data: u8) -> (BitPackerLevel, u8) {
let num_bits = data & 0b00111111;
let level = match data >> 6 {
0 => BitPackerLevel::None,
1 => BitPackerLevel::BitPacker1x,
2 => BitPackerLevel::BitPacker4x,
3 => BitPackerLevel::BitPacker8x,
invalid => panic!("Invalid bitpacker level: {invalid}"),
};
debug_assert!(num_bits as u32 <= 2_u32.pow(6));
(level, num_bits)
}
/// A simple helper trait to get the BitPackerLevel
/// and correctly generate the chunk header.
trait BitPackerExt: BitPacker {
/// Returns the level of the bitpacker: an identifier to be
/// able to decode the numbers with the right bitpacker.
fn level() -> BitPackerLevel;
}
impl BitPackerExt for BitPacker8x {
fn level() -> BitPackerLevel {
BitPackerLevel::BitPacker8x
}
}
impl BitPackerExt for BitPacker4x {
fn level() -> BitPackerLevel {
BitPackerLevel::BitPacker4x
}
}
impl BitPackerExt for BitPacker1x {
fn level() -> BitPackerLevel {
BitPackerLevel::BitPacker1x
}
}
#[cfg(test)]
mod tests {
use quickcheck::quickcheck;
use roaring::RoaringBitmap;
use super::DeRoaringBitmapCodec;
quickcheck! {
fn qc_random(xs: Vec<u32>) -> bool {
let bitmap = RoaringBitmap::from_iter(xs);
let mut compressed = Vec::new();
DeRoaringBitmapCodec::serialize_into(&bitmap, &mut compressed).unwrap();
let decompressed = DeRoaringBitmapCodec::deserialize_from(&compressed[..]).unwrap();
decompressed == bitmap
}
}
quickcheck! {
fn qc_random_check_serialized_size(xs: Vec<u32>) -> bool {
let bitmap = RoaringBitmap::from_iter(xs);
let mut compressed = Vec::new();
let mut tmp_buffer = Vec::new();
DeRoaringBitmapCodec::serialize_into(&bitmap, &mut compressed).unwrap();
let expected_len = DeRoaringBitmapCodec::serialized_size_with_tmp_buffer(&bitmap, &mut tmp_buffer);
compressed.len() == expected_len
}
}
}

View File

@@ -1,7 +1,10 @@
mod bo_roaring_bitmap_codec;
pub mod cbo_roaring_bitmap_codec;
pub mod de_cbo_roaring_bitmap_codec;
mod de_roaring_bitmap_codec;
mod roaring_bitmap_codec;
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
// pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
pub use self::de_cbo_roaring_bitmap_codec::{DeCboRoaringBitmapCodec, DELTA_ENCODING_STATUS};
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;

View File

@@ -34,7 +34,7 @@ use crate::update::new::StdResult;
use crate::vector::db::IndexEmbeddingConfigs;
use crate::vector::{Embedding, VectorStore, VectorStoreBackend, VectorStoreStats};
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
default_criteria, Criterion, DeCboRoaringBitmapCodec, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
FieldidsWeightsMap, FilterableAttributesRule, GeoPoint, LocalizedAttributesRule, ObkvCodec,
Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32,
@@ -133,38 +133,38 @@ pub struct Index {
pub external_documents_ids: Database<Str, BEU32>,
/// A word and all the documents ids containing the word.
pub word_docids: Database<Str, CboRoaringBitmapCodec>,
pub word_docids: Database<Str, DeCboRoaringBitmapCodec>,
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
pub exact_word_docids: Database<Str, DeCboRoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix.
pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
pub word_prefix_docids: Database<Str, DeCboRoaringBitmapCodec>,
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
pub exact_word_prefix_docids: Database<Str, DeCboRoaringBitmapCodec>,
/// Maps the proximity between a pair of words with all the docids where this relation appears.
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
pub word_pair_proximity_docids: Database<U8StrStrCodec, DeCboRoaringBitmapCodec>,
/// Maps the word and the position with the docids that corresponds to it.
pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
pub word_position_docids: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
/// Maps the word and the field id with the docids that corresponds to it.
pub word_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
pub word_fid_docids: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
/// Maps the field id and the word count with the docids that corresponds to it.
pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
pub field_id_word_count_docids: Database<FieldIdWordCountCodec, DeCboRoaringBitmapCodec>,
/// Maps the word prefix and a position with all the docids where the prefix appears at the position.
pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
pub word_prefix_position_docids: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
/// Maps the word prefix and a field id with all the docids where the prefix appears inside the field
pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
pub word_prefix_fid_docids: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
/// Maps the facet field id and the docids for which this field exists
pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
pub facet_id_exists_docids: Database<FieldIdCodec, DeCboRoaringBitmapCodec>,
/// Maps the facet field id and the docids for which this field is set as null
pub facet_id_is_null_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
pub facet_id_is_null_docids: Database<FieldIdCodec, DeCboRoaringBitmapCodec>,
/// Maps the facet field id and the docids for which this field is considered empty
pub facet_id_is_empty_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
pub facet_id_is_empty_docids: Database<FieldIdCodec, DeCboRoaringBitmapCodec>,
/// Maps the facet field id and ranges of numbers with the docids that corresponds to them.
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,

View File

@@ -73,7 +73,7 @@ pub use self::filterable_attributes_rules::{
};
pub use self::heed_codec::{
BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
CboRoaringBitmapLenCodec, DeCboRoaringBitmapCodec, FieldIdWordCountCodec, ObkvCodec,
RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
UncheckedU8StrStrCodec,
};

View File

@@ -10,7 +10,7 @@ use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, DocumentId};
use crate::{DeCboRoaringBitmapCodec, DocumentId};
/// Call the given closure on the facet distribution of the candidate documents.
///
@@ -88,7 +88,7 @@ where
if key.field_id != field_id {
break;
}
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
let intersection = DeCboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
@@ -120,7 +120,7 @@ where
if key.field_id != field_id {
break;
}
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
let intersection = DeCboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
@@ -173,7 +173,7 @@ where
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
let docids_in_common = DeCboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
@@ -210,7 +210,7 @@ where
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
let docids_in_common = DeCboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;

View File

@@ -8,7 +8,7 @@ use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::{CboRoaringBitmapCodec, Result};
use crate::{DeCboRoaringBitmapCodec, Result};
/// Find all the document ids for which the given field contains a value contained within
/// the two bounds.
@@ -114,11 +114,11 @@ impl<'t> FacetRangeSearch<'t, '_, '_> {
if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) {
*self.docids |= match self.universe {
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
Some(universe) => DeCboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?,
None => DeCboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?,
};
}
}
@@ -211,11 +211,11 @@ impl<'t> FacetRangeSearch<'t, '_, '_> {
};
if should_take_whole_group {
*self.docids |= match self.universe {
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
Some(universe) => DeCboRoaringBitmapCodec::intersection_with_serialized(
previous_value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
None => DeCboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
};
previous_key = next_key;
previous_value = next_value;
@@ -313,11 +313,11 @@ impl<'t> FacetRangeSearch<'t, '_, '_> {
};
if should_take_whole_group {
*self.docids |= match self.universe {
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
Some(universe) => DeCboRoaringBitmapCodec::intersection_with_serialized(
previous_value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
None => DeCboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
};
} else {
let level = level - 1;

View File

@@ -14,7 +14,7 @@ use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
use crate::proximity::ProximityPrecision;
use crate::update::MergeCboRoaringBitmaps;
use crate::{
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
CboRoaringBitmapLenCodec, DeCboRoaringBitmapCodec, Result, SearchContext, U8StrStrCodec,
};
/// A cache storing pointers to values in the LMDB databases.
@@ -72,11 +72,11 @@ impl<'ctx> DatabaseCache<'ctx> {
match (bitmap_bytes, universe) {
(bytes, Some(universe)) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
DeCboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
.map(Some)
.map_err(Into::into)
}
(bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
(bytes, None) => DeCboRoaringBitmapCodec::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
@@ -157,11 +157,11 @@ impl<'ctx> DatabaseCache<'ctx> {
match (bitmap_bytes, universe) {
(bytes, Some(universe)) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
DeCboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
.map(Some)
.map_err(Into::into)
}
(bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
(bytes, None) => DeCboRoaringBitmapCodec::bytes_decode_owned(bytes)
.map(Some)
.map_err(heed::Error::Decoding)
.map_err(Into::into),
@@ -377,7 +377,7 @@ impl<'ctx> SearchContext<'ctx> {
{
docids
.as_ref()
.map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d))
.map(|d| DeCboRoaringBitmapCodec::bytes_decode_owned(d))
.transpose()
.map_err(heed::Error::Decoding)?
} else {
@@ -395,7 +395,7 @@ impl<'ctx> SearchContext<'ctx> {
docids |= word1_docids & word2_docids;
}
}
let encoded = CboRoaringBitmapCodec::bytes_encode(&docids)
let encoded = DeCboRoaringBitmapCodec::bytes_encode(&docids)
.map(Cow::into_owned)
.map(Cow::Owned)
.map(Some)

View File

@@ -6,7 +6,7 @@ use super::ranking_rules::{RankingRule, RankingRuleOutput};
use crate::score_details::{self, ScoreDetails};
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::ExactTerm;
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger, TimeBudget};
use crate::{DeCboRoaringBitmapCodec, Result, SearchContext, SearchLogger, TimeBudget};
/// A ranking rule that produces 3 disjoint buckets:
///
@@ -219,7 +219,7 @@ impl State {
match bitmap_bytes {
Some(bytes) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)?
DeCboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)?
}
None => RoaringBitmap::default(),
}

View File

@@ -15,7 +15,7 @@ use crate::heed_codec::BytesRefCodec;
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
use crate::update::MergeDeladdCboRoaringBitmaps;
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
use crate::{CboRoaringBitmapLenCodec, DeCboRoaringBitmapCodec, FieldId, Index, Result};
/// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases
/// by rebuilding the database "from scratch".
@@ -162,7 +162,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
Some(prev_value) => {
// prev_value is the group size for level 0, followed by the previous bitmap.
let old_bitmap = &prev_value[1..];
CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
DeCboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
}
None => {
// it is safe to ignore the del in that case.

View File

@@ -16,7 +16,7 @@ use crate::search::facet::get_highest_level;
use crate::update::del_add::DelAdd;
use crate::update::index_documents::valid_lmdb_key;
use crate::update::MergeDeladdCboRoaringBitmaps;
use crate::{CboRoaringBitmapCodec, Index, Result};
use crate::{DeCboRoaringBitmapCodec, Index, Result};
/// Enum used as a return value for the facet incremental indexing.
///
@@ -112,13 +112,13 @@ impl FacetsUpdateIncremental {
let value = KvReader::from_slice(value);
let docids_to_delete = value
.get(DelAdd::Deletion)
.map(CboRoaringBitmapCodec::bytes_decode)
.map(DeCboRoaringBitmapCodec::bytes_decode)
.map(|o| o.map_err(heed::Error::Encoding))
.transpose()?;
let docids_to_add = value
.get(DelAdd::Addition)
.map(CboRoaringBitmapCodec::bytes_decode)
.map(DeCboRoaringBitmapCodec::bytes_decode)
.map(|o| o.map_err(heed::Error::Encoding))
.transpose()?;

View File

@@ -366,7 +366,7 @@ pub(crate) mod test_helpers {
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::index_documents::MergeDeladdCboRoaringBitmaps;
use crate::update::FacetsUpdateIncrementalInner;
use crate::CboRoaringBitmapCodec;
use crate::DeCboRoaringBitmapCodec;
/// Utility function to generate a string whose position in a lexicographically
/// ordered list is `i`.
@@ -496,7 +496,7 @@ pub(crate) mod test_helpers {
FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key).unwrap();
let mut inner_writer = KvWriterDelAdd::memory();
let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
let value = DeCboRoaringBitmapCodec::bytes_encode(docids).unwrap();
inner_writer.insert(DelAdd::Addition, value).unwrap();
writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
}

View File

@@ -19,7 +19,7 @@ use crate::facet::value_encoding::f64_into_bytes;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::{create_writer, writer_into_reader};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, MAX_FACET_VALUE_LENGTH};
use crate::{DeCboRoaringBitmapCodec, DocumentId, FieldId, Result, MAX_FACET_VALUE_LENGTH};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
@@ -311,8 +311,8 @@ fn deladd_obkv_cbo_roaring_bitmaps(
) -> io::Result<()> {
buffer.clear();
let mut obkv = KvWriterDelAdd::new(buffer);
let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
let del_bitmap_bytes = DeCboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
let add_bitmap_bytes = DeCboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
obkv.finish()

View File

@@ -7,7 +7,7 @@ use either::Either;
use grenad::MergeFunction;
use roaring::RoaringBitmap;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::DeCboRoaringBitmapCodec;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::transform::Operation;
use crate::Result;
@@ -200,7 +200,7 @@ impl MergeFunction for MergeCboRoaringBitmaps {
Ok(values[0].clone())
} else {
let mut vec = Vec::new();
CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
DeCboRoaringBitmapCodec::merge_into(values, &mut vec)?;
Ok(Cow::from(vec))
}
}
@@ -232,10 +232,10 @@ impl MergeFunction for MergeDeladdCboRoaringBitmaps {
let mut output_deladd_obkv = KvWriterDelAdd::memory();
let mut buffer = Vec::new();
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
DeCboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
buffer.clear();
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
DeCboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
}
@@ -251,7 +251,7 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
previous: &[u8],
buffer: &'a mut Vec<u8>,
) -> Result<Option<&'a [u8]>> {
Ok(CboRoaringBitmapCodec::merge_deladd_into(
Ok(DeCboRoaringBitmapCodec::merge_deladd_into(
KvReaderDelAdd::from_slice(deladd_obkv),
previous,
buffer,

View File

@@ -40,7 +40,7 @@ use crate::update::{
};
use crate::vector::db::EmbedderInfo;
use crate::vector::{RuntimeEmbedders, VectorStore};
use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
use crate::{DeCboRoaringBitmapCodec, Index, Result, UserError};
static MERGED_DATABASE_COUNT: usize = 7;
static PREFIX_DATABASE_COUNT: usize = 4;
@@ -764,8 +764,8 @@ where
fn execute_word_prefix_docids(
txn: &mut heed::RwTxn<'_>,
merger: Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>,
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
word_docids_db: Database<Str, DeCboRoaringBitmapCodec>,
word_prefix_docids_db: Database<Str, DeCboRoaringBitmapCodec>,
indexer_config: &IndexerConfig,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],

View File

@@ -29,7 +29,7 @@ use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig};
use crate::vector::VectorStore;
use crate::{
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
lat_lng_to_xyz, DeCboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
Result, SerializationError, U8StrStrCodec, UserError,
};
@@ -866,7 +866,7 @@ where
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
fn write_proximity_entries_into_database_additional_searchables<R, MF>(
merger: Merger<R, MF>,
database: &heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
database: &heed::Database<U8StrStrCodec, DeCboRoaringBitmapCodec>,
wtxn: &mut RwTxn<'_>,
) -> Result<()>
where
@@ -881,7 +881,7 @@ where
U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?;
let data_to_insert = match KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
Some(value) => {
CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
DeCboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
}
None => continue,
};

View File

@@ -27,7 +27,7 @@ use crate::index::db_name;
use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY};
use crate::update::new::KvReaderFieldId;
use crate::vector::Embedding;
use crate::{CboRoaringBitmapCodec, DocumentId, Error, Index, InternalError};
use crate::{DeCboRoaringBitmapCodec, DocumentId, Error, Index, InternalError};
/// Note that the FrameProducer requires up to 9 bytes to
/// encode the length, the max grant has been computed accordingly.
@@ -971,7 +971,9 @@ pub struct WordDocidsSender<'a, 'b, D> {
impl<D: DatabaseType> WordDocidsSender<'_, '_, D> {
pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> {
let value_length = CboRoaringBitmapCodec::serialized_size(bitmap);
let mut tmp_buffer = Vec::new();
let value_length =
DeCboRoaringBitmapCodec::serialized_size_with_tmp_buffer(bitmap, &mut tmp_buffer);
let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| {
InternalError::StorePut {
database_name: D::DATABASE.database_name(),
@@ -986,7 +988,10 @@ impl<D: DatabaseType> WordDocidsSender<'_, '_, D> {
value_length,
|key_buffer, value_buffer| {
key_buffer.copy_from_slice(key);
CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_buffer)?;
DeCboRoaringBitmapCodec::serialize_into(
bitmap,
&mut io::Cursor::new(value_buffer),
)?;
Ok(())
},
)
@@ -1007,7 +1012,9 @@ impl FacetDocidsSender<'_, '_> {
let (facet_kind, key) = FacetKind::extract_from_key(key);
let database = Database::from(facet_kind);
let value_length = CboRoaringBitmapCodec::serialized_size(bitmap);
let mut tmp_buffer = Vec::new();
let value_length =
DeCboRoaringBitmapCodec::serialized_size_with_tmp_buffer(bitmap, &mut tmp_buffer);
let value_length = match facet_kind {
// We must take the facet group size into account
// when we serialize strings and numbers.
@@ -1041,7 +1048,7 @@ impl FacetDocidsSender<'_, '_> {
FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out,
};
CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?;
DeCboRoaringBitmapCodec::serialize_into(bitmap, &mut io::Cursor::new(value_out))?;
Ok(())
},

View File

@@ -82,7 +82,7 @@ use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::new::thread_local::MostlySend;
use crate::update::new::KvReaderDelAdd;
use crate::update::MergeDeladdCboRoaringBitmaps;
use crate::{CboRoaringBitmapCodec, Result};
use crate::{DeCboRoaringBitmapCodec, Result};
/// A cache that stores bytes keys associated to CboDelAddRoaringBitmaps.
///
@@ -323,6 +323,7 @@ struct SpillingCaches<'extractor> {
spilled_entries: Vec<grenad::Sorter<MergeDeladdCboRoaringBitmaps>>,
deladd_buffer: Vec<u8>,
cbo_buffer: Vec<u8>,
tmp_buffer: Vec<u32>,
}
impl<'extractor> SpillingCaches<'extractor> {
@@ -348,6 +349,7 @@ impl<'extractor> SpillingCaches<'extractor> {
caches,
deladd_buffer: Vec::new(),
cbo_buffer: Vec::new(),
tmp_buffer: Vec::new(),
}
}
@@ -370,6 +372,7 @@ impl<'extractor> SpillingCaches<'extractor> {
&mut self.spilled_entries[bucket],
&mut self.deladd_buffer,
&mut self.cbo_buffer,
&mut self.tmp_buffer,
key,
DelAddRoaringBitmap::new_del_u32(n),
),
@@ -395,6 +398,7 @@ impl<'extractor> SpillingCaches<'extractor> {
&mut self.spilled_entries[bucket],
&mut self.deladd_buffer,
&mut self.cbo_buffer,
&mut self.tmp_buffer,
key,
DelAddRoaringBitmap::new_add_u32(n),
),
@@ -411,6 +415,7 @@ fn spill_entry_to_sorter(
spilled_entries: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
deladd_buffer: &mut Vec<u8>,
cbo_buffer: &mut Vec<u8>,
tmp_buffer: &mut Vec<u32>,
key: &[u8],
deladd: DelAddRoaringBitmap,
) -> Result<()> {
@@ -420,21 +425,29 @@ fn spill_entry_to_sorter(
match deladd {
DelAddRoaringBitmap { del: Some(del), add: None } => {
cbo_buffer.clear();
CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer);
DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(
&del, cbo_buffer, tmp_buffer,
);
value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
}
DelAddRoaringBitmap { del: None, add: Some(add) } => {
cbo_buffer.clear();
CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer);
DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(
&add, cbo_buffer, tmp_buffer,
);
value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
}
DelAddRoaringBitmap { del: Some(del), add: Some(add) } => {
cbo_buffer.clear();
CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer);
DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(
&del, cbo_buffer, tmp_buffer,
);
value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
cbo_buffer.clear();
CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer);
DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(
&add, cbo_buffer, tmp_buffer,
);
value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
}
DelAddRoaringBitmap { del: None, add: None } => return Ok(()),
@@ -640,12 +653,12 @@ impl DelAddRoaringBitmap {
let reader = KvReaderDelAdd::from_slice(bytes);
let del = match reader.get(DelAdd::Deletion) {
Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
Some(bytes) => DeCboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
None => None,
};
let add = match reader.get(DelAdd::Addition) {
Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
Some(bytes) => DeCboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
None => None,
};

View File

@@ -14,7 +14,7 @@ use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValu
use crate::heed_codec::BytesRefCodec;
use crate::update::facet::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::update::{create_writer, writer_into_reader};
use crate::{CboRoaringBitmapCodec, FieldId, Index};
use crate::{DeCboRoaringBitmapCodec, FieldId, Index};
/// Generate the facet level based on the level 0.
///
@@ -123,7 +123,7 @@ fn compute_level(
ser_buffer.push(group_len);
let group_docids = mem::take(&mut group_docids);
let docids = group_docids.into_iter().union();
CboRoaringBitmapCodec::serialize_into_vec(&docids, &mut ser_buffer);
DeCboRoaringBitmapCodec::serialize_into(&docids, &mut ser_buffer);
writer.insert(left_bound, &ser_buffer)?;
}
left_bound = Some(key.left_bound);
@@ -142,7 +142,7 @@ fn compute_level(
let group_len: u8 = group_docids.len().try_into().unwrap();
ser_buffer.push(group_len);
let group_docids = group_docids.into_iter().union();
CboRoaringBitmapCodec::serialize_into_vec(&group_docids, &mut ser_buffer);
DeCboRoaringBitmapCodec::serialize_into(&group_docids, &mut ser_buffer);
writer.insert(left_bound, &ser_buffer)?;
}

View File

@@ -14,7 +14,7 @@ use super::extract::{
};
use crate::update::facet::new_incremental::FacetFieldIdChange;
use crate::update::new::extract::cellulite::GeoJsonExtractorData;
use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
use crate::{DeCboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
pub fn merge_and_send_rtree<'extractor, MSP>(
@@ -325,7 +325,7 @@ fn merge_cbo_bitmaps(
del: Option<RoaringBitmap>,
add: Option<RoaringBitmap>,
) -> Result<Operation> {
let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
let current = current.map(DeCboRoaringBitmapCodec::deserialize_from).transpose()?;
match (current, del, add) {
(None, None, None) => Ok(Operation::Ignore), // but it's strange
(None, None, Some(add)) => Ok(Operation::Write(add)),

View File

@@ -14,12 +14,12 @@ use thread_local::ThreadLocal;
use super::ref_cell_ext::RefCellExt as _;
use crate::heed_codec::StrBEU16Codec;
use crate::update::GrenadParameters;
use crate::{CboRoaringBitmapCodec, Index, Prefix, Result};
use crate::{DeCboRoaringBitmapCodec, Index, Prefix, Result};
struct WordPrefixDocids<'i> {
index: &'i Index,
database: Database<Bytes, CboRoaringBitmapCodec>,
prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
database: Database<Bytes, DeCboRoaringBitmapCodec>,
prefix_database: Database<Bytes, DeCboRoaringBitmapCodec>,
max_memory_by_thread: Option<usize>,
/// Do not use an experimental LMDB feature to read uncommitted data in parallel.
no_experimental_post_processing: bool,
@@ -28,8 +28,8 @@ struct WordPrefixDocids<'i> {
impl<'i> WordPrefixDocids<'i> {
fn new(
index: &'i Index,
database: Database<Bytes, CboRoaringBitmapCodec>,
prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
database: Database<Bytes, DeCboRoaringBitmapCodec>,
prefix_database: Database<Bytes, DeCboRoaringBitmapCodec>,
grenad_parameters: &GrenadParameters,
) -> WordPrefixDocids<'i> {
WordPrefixDocids {
@@ -87,12 +87,12 @@ impl<'i> WordPrefixDocids<'i> {
let output = self
.database
.prefix_iter(&rtxn, prefix.as_bytes())?
.remap_types::<Str, CboRoaringBitmapCodec>()
.remap_types::<Str, DeCboRoaringBitmapCodec>()
.map(|result| result.map(|(_word, bitmap)| bitmap))
.union()?;
buffer.clear();
CboRoaringBitmapCodec::serialize_into_vec(&output, &mut buffer);
DeCboRoaringBitmapCodec::serialize_into(&output, &mut buffer);
indexes.push(PrefixEntry { prefix, serialized_length: buffer.len() });
file.write_all(&buffer)?;
}
@@ -150,11 +150,11 @@ impl<'i> WordPrefixDocids<'i> {
.bitmaps(prefix)
.unwrap()
.iter()
.map(|bytes| CboRoaringBitmapCodec::deserialize_from(bytes))
.map(|bytes| DeCboRoaringBitmapCodec::deserialize_from(bytes))
.union()?;
buffer.clear();
CboRoaringBitmapCodec::serialize_into_vec(&output, buffer);
DeCboRoaringBitmapCodec::serialize_into(&output, buffer);
index.push(PrefixEntry { prefix, serialized_length: buffer.len() });
file.write_all(buffer)
})?;
@@ -203,7 +203,7 @@ struct FrozenPrefixBitmaps<'a, 'rtxn> {
impl<'a, 'rtxn> FrozenPrefixBitmaps<'a, 'rtxn> {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
pub fn from_prefixes(
database: Database<Bytes, CboRoaringBitmapCodec>,
database: Database<Bytes, DeCboRoaringBitmapCodec>,
rtxn: &'rtxn RoTxn,
prefixes: &'a BTreeSet<Prefix>,
) -> heed::Result<Self> {
@@ -231,8 +231,8 @@ unsafe impl Sync for FrozenPrefixBitmaps<'_, '_> {}
struct WordPrefixIntegerDocids<'i> {
index: &'i Index,
database: Database<Bytes, CboRoaringBitmapCodec>,
prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
database: Database<Bytes, DeCboRoaringBitmapCodec>,
prefix_database: Database<Bytes, DeCboRoaringBitmapCodec>,
max_memory_by_thread: Option<usize>,
/// Do not use an experimental LMDB feature to read uncommitted data in parallel.
no_experimental_post_processing: bool,
@@ -241,8 +241,8 @@ struct WordPrefixIntegerDocids<'i> {
impl<'i> WordPrefixIntegerDocids<'i> {
fn new(
index: &'i Index,
database: Database<Bytes, CboRoaringBitmapCodec>,
prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
database: Database<Bytes, DeCboRoaringBitmapCodec>,
prefix_database: Database<Bytes, DeCboRoaringBitmapCodec>,
grenad_parameters: &'_ GrenadParameters,
) -> WordPrefixIntegerDocids<'i> {
WordPrefixIntegerDocids {
@@ -338,10 +338,10 @@ impl<'i> WordPrefixIntegerDocids<'i> {
} else {
let output = bitmaps_bytes
.into_iter()
.map(CboRoaringBitmapCodec::deserialize_from)
.map(DeCboRoaringBitmapCodec::deserialize_from)
.union()?;
buffer.clear();
CboRoaringBitmapCodec::serialize_into_vec(&output, &mut buffer);
DeCboRoaringBitmapCodec::serialize_into(&output, &mut buffer);
indexes.push(PrefixIntegerEntry {
prefix,
pos,
@@ -419,10 +419,10 @@ impl<'i> WordPrefixIntegerDocids<'i> {
} else {
let output = bitmaps_bytes
.iter()
.map(|bytes| CboRoaringBitmapCodec::deserialize_from(bytes))
.map(|bytes| DeCboRoaringBitmapCodec::deserialize_from(bytes))
.union()?;
buffer.clear();
CboRoaringBitmapCodec::serialize_into_vec(&output, buffer);
DeCboRoaringBitmapCodec::serialize_into(&output, buffer);
index.push(PrefixIntegerEntry {
prefix,
pos,
@@ -486,7 +486,7 @@ struct FrozenPrefixIntegerBitmaps<'a, 'rtxn> {
impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
pub fn from_prefixes(
database: Database<Bytes, CboRoaringBitmapCodec>,
database: Database<Bytes, DeCboRoaringBitmapCodec>,
rtxn: &'rtxn RoTxn,
prefixes: &'a BTreeSet<Prefix>,
) -> heed::Result<Self> {
@@ -516,7 +516,7 @@ unsafe impl Sync for FrozenPrefixIntegerBitmaps<'_, '_> {}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
fn delete_prefixes(
wtxn: &mut RwTxn,
prefix_database: &Database<Bytes, CboRoaringBitmapCodec>,
prefix_database: &Database<Bytes, DeCboRoaringBitmapCodec>,
prefixes: &BTreeSet<Prefix>,
) -> Result<()> {
// We remove all the entries that are no more required in this word prefix docids database.

View File

@@ -9,12 +9,12 @@ use crate::update::index_documents::{
create_sorter, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
write_sorter_into_database, CursorClonableMmap, MergeDeladdCboRoaringBitmaps,
};
use crate::{CboRoaringBitmapCodec, Result};
use crate::{DeCboRoaringBitmapCodec, Result};
pub struct WordPrefixDocids<'t, 'i> {
wtxn: &'t mut heed::RwTxn<'i>,
word_docids: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
word_docids: Database<Str, DeCboRoaringBitmapCodec>,
word_prefix_docids: Database<Str, DeCboRoaringBitmapCodec>,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) max_nb_chunks: Option<usize>,
@@ -24,8 +24,8 @@ pub struct WordPrefixDocids<'t, 'i> {
impl<'t, 'i> WordPrefixDocids<'t, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i>,
word_docids: Database<Str, CboRoaringBitmapCodec>,
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
word_docids: Database<Str, DeCboRoaringBitmapCodec>,
word_prefix_docids: Database<Str, DeCboRoaringBitmapCodec>,
) -> WordPrefixDocids<'t, 'i> {
WordPrefixDocids {
wtxn,

View File

@@ -14,12 +14,12 @@ use crate::update::index_documents::{
create_sorter, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
write_sorter_into_database, CursorClonableMmap, MergeDeladdCboRoaringBitmaps,
};
use crate::{CboRoaringBitmapCodec, Result};
use crate::{DeCboRoaringBitmapCodec, Result};
pub struct WordPrefixIntegerDocids<'t, 'i> {
wtxn: &'t mut heed::RwTxn<'i>,
prefix_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
word_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
prefix_database: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
word_database: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
pub(crate) chunk_compression_type: CompressionType,
pub(crate) chunk_compression_level: Option<u32>,
pub(crate) max_nb_chunks: Option<usize>,
@@ -29,8 +29,8 @@ pub struct WordPrefixIntegerDocids<'t, 'i> {
impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i>,
prefix_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
word_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
prefix_database: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
word_database: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
) -> WordPrefixIntegerDocids<'t, 'i> {
WordPrefixIntegerDocids {
wtxn,

View File

@@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize};
use crate::vector::settings::RemoveFragments;
use crate::vector::EmbeddingConfig;
use crate::{CboRoaringBitmapCodec, DocumentId, UserError};
use crate::{DeCboRoaringBitmapCodec, DocumentId, UserError};
/// DB representation of an embedder configuration.
///
@@ -273,9 +273,9 @@ impl<'a> heed::BytesDecode<'a> for EmbedderInfoCodec {
}
let first_bitmap_size = bytes.read_u32::<BigEndian>()?;
let first_bitmap_bytes = &bytes[..first_bitmap_size as usize];
let user_provided = CboRoaringBitmapCodec::bytes_decode(first_bitmap_bytes)?;
let user_provided = DeCboRoaringBitmapCodec::bytes_decode(first_bitmap_bytes)?;
let skip_regenerate_different_from_user_provided =
CboRoaringBitmapCodec::bytes_decode(&bytes[first_bitmap_size as usize..])?;
DeCboRoaringBitmapCodec::bytes_decode(&bytes[first_bitmap_size as usize..])?;
Ok(EmbedderInfo {
embedder_id,
embedding_status: EmbeddingStatus {
@@ -290,20 +290,21 @@ impl<'a> heed::BytesEncode<'a> for EmbedderInfoCodec {
type EItem = EmbedderInfo;
fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, heed::BoxedError> {
let first_bitmap_size =
CboRoaringBitmapCodec::serialized_size(&item.embedding_status.user_provided);
let second_bitmap_size = CboRoaringBitmapCodec::serialized_size(
let mut tmp_buffer = Vec::new();
let first_bitmap_size = DeCboRoaringBitmapCodec::serialized_size_with_tmp_buffer(
&item.embedding_status.user_provided,
&mut tmp_buffer,
);
let second_bitmap_size = DeCboRoaringBitmapCodec::serialized_size_with_tmp_buffer(
&item.embedding_status.skip_regenerate_different_from_user_provided,
&mut tmp_buffer,
);
let mut bytes = Vec::with_capacity(1 + 4 + first_bitmap_size + second_bitmap_size);
bytes.write_u8(item.embedder_id)?;
bytes.write_u32::<BigEndian>(first_bitmap_size.try_into()?)?;
CboRoaringBitmapCodec::serialize_into_writer(
&item.embedding_status.user_provided,
&mut bytes,
)?;
CboRoaringBitmapCodec::serialize_into_writer(
DeCboRoaringBitmapCodec::serialize_into(&item.embedding_status.user_provided, &mut bytes)?;
DeCboRoaringBitmapCodec::serialize_into(
&item.embedding_status.skip_regenerate_different_from_user_provided,
&mut bytes,
)?;