Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-12-19 19:06:59 +00:00)

Compare commits: release-v1...delta-enco (27 commits)
| SHA1 |
|---|
| 432557697e |
| 9870007bf7 |
| 454cc9509f |
| 85e41fdfbb |
| 7f3e38ace1 |
| 7729dfbbfd |
| 2751e5b31c |
| 0ac8785718 |
| d62fa401fe |
| 4a97d60c60 |
| 0b6250cc43 |
| 8959a93b05 |
| 1c602a63a0 |
| a1b06cba72 |
| a672665310 |
| 297db181d3 |
| d2f40923d7 |
| e651766fa1 |
| 6b46a312f2 |
| ff9d77ce67 |
| db08ab56d5 |
| e9448dbb46 |
| 66cba71d10 |
| 135aa2579d |
| 6c59e055f2 |
| e123330464 |
| be9a2afb2d |
Cargo.lock (generated, 568 changes): file diff suppressed because it is too large.
```diff
@@ -4,7 +4,7 @@ use std::fmt::Write;
 use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchStats};
 use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str};
 use meilisearch_types::heed::{Database, RoTxn};
-use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
+use meilisearch_types::milli::{DeCboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
 use meilisearch_types::tasks::{Details, Kind, Status, Task};
 use meilisearch_types::versioning::{self, VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
 use roaring::RoaringBitmap;
@@ -188,7 +188,7 @@ pub fn snapshot_all_batches(rtxn: &RoTxn, db: Database<BEU32, SerdeJson<Batch>>)
 
 pub fn snapshot_batches_to_tasks_mappings(
     rtxn: &RoTxn,
-    db: Database<BEU32, CboRoaringBitmapCodec>,
+    db: Database<BEU32, DeCboRoaringBitmapCodec>,
 ) -> String {
     let mut snap = String::new();
     let iter = db.iter(rtxn).unwrap();
@@ -199,7 +199,7 @@ pub fn snapshot_batches_to_tasks_mappings(
     snap
 }
 
-pub fn snapshot_date_db(rtxn: &RoTxn, db: Database<BEI128, CboRoaringBitmapCodec>) -> String {
+pub fn snapshot_date_db(rtxn: &RoTxn, db: Database<BEI128, DeCboRoaringBitmapCodec>) -> String {
     let mut snap = String::new();
     let iter = db.iter(rtxn).unwrap();
     for next in iter {
```
```diff
@@ -4,7 +4,7 @@ use std::ops::{Bound, RangeBounds};
 use meilisearch_types::batches::{Batch, BatchId};
 use meilisearch_types::heed::types::{DecodeIgnore, SerdeBincode, SerdeJson, Str};
 use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls};
-use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
+use meilisearch_types::milli::{DeCboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
 use meilisearch_types::tasks::{Kind, Status};
 use roaring::{MultiOps, RoaringBitmap};
 use time::OffsetDateTime;
@@ -42,11 +42,11 @@ pub struct BatchQueue {
     /// Store the batches associated to an index.
     pub(crate) index_tasks: Database<Str, RoaringBitmapCodec>,
     /// Store the batches containing tasks which were enqueued at a specific date
-    pub(crate) enqueued_at: Database<BEI128, CboRoaringBitmapCodec>,
+    pub(crate) enqueued_at: Database<BEI128, DeCboRoaringBitmapCodec>,
     /// Store the batches containing finished tasks started at a specific date
-    pub(crate) started_at: Database<BEI128, CboRoaringBitmapCodec>,
+    pub(crate) started_at: Database<BEI128, DeCboRoaringBitmapCodec>,
     /// Store the batches containing tasks finished at a specific date
-    pub(crate) finished_at: Database<BEI128, CboRoaringBitmapCodec>,
+    pub(crate) finished_at: Database<BEI128, DeCboRoaringBitmapCodec>,
 }
 
 impl BatchQueue {
```
```diff
@@ -14,7 +14,7 @@ use std::time::Duration;
 use file_store::FileStore;
 use meilisearch_types::batches::BatchId;
 use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls};
-use meilisearch_types::milli::{CboRoaringBitmapCodec, BEU32};
+use meilisearch_types::milli::{DeCboRoaringBitmapCodec, BEU32};
 use meilisearch_types::tasks::network::DbTaskNetwork;
 use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
 use roaring::RoaringBitmap;
@@ -131,7 +131,7 @@ pub struct Queue {
     pub(crate) batches: batches::BatchQueue,
 
     /// Matches a batch id with the associated task ids.
-    pub(crate) batch_to_tasks_mapping: Database<BEU32, CboRoaringBitmapCodec>,
+    pub(crate) batch_to_tasks_mapping: Database<BEU32, DeCboRoaringBitmapCodec>,
 
     /// The list of files referenced by the tasks.
     pub(crate) file_store: FileStore,
```
```diff
@@ -2,7 +2,7 @@ use std::ops::{Bound, RangeBounds};
 
 use meilisearch_types::heed::types::{DecodeIgnore, SerdeBincode, SerdeJson, Str};
 use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn, WithoutTls};
-use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
+use meilisearch_types::milli::{DeCboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
 use meilisearch_types::tasks::network::DbTaskNetwork;
 use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
 use roaring::{MultiOps, RoaringBitmap};
@@ -44,11 +44,11 @@ pub struct TaskQueue {
     /// Store the tasks that were canceled by a task uid
     pub(crate) canceled_by: Database<BEU32, RoaringBitmapCodec>,
     /// Store the task ids of tasks which were enqueued at a specific date
-    pub(crate) enqueued_at: Database<BEI128, CboRoaringBitmapCodec>,
+    pub(crate) enqueued_at: Database<BEI128, DeCboRoaringBitmapCodec>,
    /// Store the task ids of finished tasks which started being processed at a specific date
-    pub(crate) started_at: Database<BEI128, CboRoaringBitmapCodec>,
+    pub(crate) started_at: Database<BEI128, DeCboRoaringBitmapCodec>,
     /// Store the task ids of tasks which finished at a specific date
-    pub(crate) finished_at: Database<BEI128, CboRoaringBitmapCodec>,
+    pub(crate) finished_at: Database<BEI128, DeCboRoaringBitmapCodec>,
 }
 
 impl TaskQueue {
```
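The three date-keyed databases above (`enqueued_at`, `started_at`, `finished_at`) all map a signed 128-bit timestamp key (`BEI128`) to a bitmap of task ids. As a rough mental model only — not Meilisearch code — the hypothetical `DateIndex` below mimics that layout with an in-memory `BTreeMap` standing in for the LMDB table:

```rust
// Illustrative stand-in for the date-keyed task databases: a timestamp in
// nanoseconds maps to the bitmap of task ids recorded at that instant.
use std::collections::BTreeMap;

use roaring::RoaringBitmap;

type TaskId = u32;

#[derive(Default)]
struct DateIndex {
    inner: BTreeMap<i128, RoaringBitmap>,
}

impl DateIndex {
    fn insert(&mut self, ts_nanos: i128, task_id: TaskId) {
        self.inner.entry(ts_nanos).or_default().insert(task_id);
    }

    // Collect every task id whose timestamp falls in [after, before].
    fn ids_within(&self, after: i128, before: i128) -> RoaringBitmap {
        let mut ids = RoaringBitmap::new();
        for (_ts, bitmap) in self.inner.range(after..=before) {
            ids |= bitmap;
        }
        ids
    }
}

fn main() {
    let mut index = DateIndex::default();
    index.insert(1_000, 1);
    index.insert(2_000, 2);
    index.insert(3_000, 3);
    assert_eq!(index.ids_within(1_500, 3_500).len(), 2);
}
```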
```diff
@@ -8,7 +8,7 @@ use convert_case::{Case, Casing as _};
 use meilisearch_types::batches::{Batch, BatchEnqueuedAt, BatchId, BatchStats};
 use meilisearch_types::heed::{Database, RoTxn, RwTxn};
 use meilisearch_types::milli::progress::Progress;
-use meilisearch_types::milli::{CboRoaringBitmapCodec, ChannelCongestion};
+use meilisearch_types::milli::{ChannelCongestion, DeCboRoaringBitmapCodec};
 use meilisearch_types::task_view::DetailsView;
 use meilisearch_types::tasks::{
     BatchStopReason, Details, IndexSwap, Kind, KindWithContent, Status,
@@ -211,7 +211,7 @@ impl ProcessingBatch {
 
 pub(crate) fn insert_task_datetime(
     wtxn: &mut RwTxn,
-    database: Database<BEI128, CboRoaringBitmapCodec>,
+    database: Database<BEI128, DeCboRoaringBitmapCodec>,
     time: OffsetDateTime,
     task_id: TaskId,
 ) -> Result<()> {
@@ -224,7 +224,7 @@ pub(crate) fn insert_task_datetime(
 
 pub(crate) fn remove_task_datetime(
     wtxn: &mut RwTxn,
-    database: Database<BEI128, CboRoaringBitmapCodec>,
+    database: Database<BEI128, DeCboRoaringBitmapCodec>,
     time: OffsetDateTime,
     task_id: TaskId,
 ) -> Result<()> {
@@ -243,7 +243,7 @@ pub(crate) fn remove_task_datetime(
 
 pub(crate) fn remove_n_tasks_datetime_earlier_than(
     wtxn: &mut RwTxn,
-    database: Database<BEI128, CboRoaringBitmapCodec>,
+    database: Database<BEI128, DeCboRoaringBitmapCodec>,
     earlier_than: OffsetDateTime,
     mut count: usize,
     task_id: TaskId,
@@ -271,7 +271,7 @@ pub(crate) fn remove_n_tasks_datetime_earlier_than(
 pub(crate) fn keep_ids_within_datetimes(
     rtxn: &RoTxn,
     ids: &mut RoaringBitmap,
-    database: Database<BEI128, CboRoaringBitmapCodec>,
+    database: Database<BEI128, DeCboRoaringBitmapCodec>,
     after: Option<OffsetDateTime>,
     before: Option<OffsetDateTime>,
 ) -> Result<()> {
```
```diff
@@ -2,7 +2,7 @@ use std::collections::BTreeMap;
 
 use base64::Engine as _;
 use itertools::{EitherOrBoth, Itertools as _};
-use milli::{CboRoaringBitmapCodec, DocumentId};
+use milli::{DeCboRoaringBitmapCodec, DocumentId};
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use utoipa::ToSchema;
@@ -461,7 +461,8 @@ impl Serialize for TaskKeys {
     {
         let TaskKeys(task_keys) = self;
         let mut bytes = Vec::new();
-        CboRoaringBitmapCodec::serialize_into_vec(task_keys, &mut bytes);
+        // TODO correctly handle this io::Error
+        DeCboRoaringBitmapCodec::serialize_into(task_keys, &mut bytes).unwrap();
         let encoded = base64::prelude::BASE64_STANDARD.encode(&bytes);
         serializer.serialize_str(&encoded)
     }
@@ -498,7 +499,7 @@ impl<'de> serde::de::Visitor<'de> for TaskKeysVisitor {
     where
         E: serde::de::Error,
     {
-        let task_keys = CboRoaringBitmapCodec::deserialize_from(decoded).map_err(|_err| {
+        let task_keys = DeCboRoaringBitmapCodec::deserialize_from(decoded).map_err(|_err| {
             E::invalid_value(serde::de::Unexpected::Bytes(decoded), &"a cbo roaring bitmap")
         })?;
        Ok(TaskKeys(task_keys))
```
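The `TaskKeys` change above keeps the same wire shape: the bitmap of key ids is serialized to bytes and then base64-encoded into a string. A minimal round-trip sketch, using roaring's own portable format as a stand-in for `DeCboRoaringBitmapCodec`:

```rust
// Round trip: bitmap -> bytes -> base64 string -> bytes -> bitmap.
use base64::Engine as _;
use roaring::RoaringBitmap;

fn main() -> std::io::Result<()> {
    let task_keys = RoaringBitmap::from_sorted_iter(0..10).unwrap();

    // Serialize the bitmap, then make it printable/JSON-safe with base64.
    let mut bytes = Vec::new();
    task_keys.serialize_into(&mut bytes)?;
    let encoded = base64::prelude::BASE64_STANDARD.encode(&bytes);

    // Decode back: base64 -> bytes -> bitmap.
    let decoded = base64::prelude::BASE64_STANDARD.decode(&encoded).unwrap();
    let roundtrip = RoaringBitmap::deserialize_from(&decoded[..])?;
    assert_eq!(task_keys, roundtrip);
    Ok(())
}
```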
```diff
@@ -300,6 +300,7 @@ impl Infos {
             max_indexing_memory,
             max_indexing_threads,
             skip_index_budget: _,
+            experimental_disable_delta_encoding: _,
             experimental_no_edition_2024_for_settings,
             experimental_no_edition_2024_for_dumps,
             experimental_no_edition_2024_for_prefix_post_processing,
```
```diff
@@ -21,6 +21,7 @@ use meilisearch::{
     LogStderrType, Opt, ServicesData, SubscriberForSecondLayer,
 };
 use meilisearch_auth::{generate_master_key, AuthController, MASTER_KEY_MIN_SIZE};
+use meilisearch_types::milli::heed_codec::DELTA_ENCODING_STATUS;
 use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
 use tracing::level_filters::LevelFilter;
 use tracing_subscriber::layer::SubscriberExt as _;
@@ -95,6 +96,14 @@ async fn main() -> anyhow::Result<()> {
 async fn try_main(runtime: tokio::runtime::Handle) -> anyhow::Result<()> {
     let (opt, config_read_from) = Opt::try_build()?;
 
+    // Disables the delta encoding of bitmaps as soon as possible
+    if opt.indexer_options.experimental_disable_delta_encoding {
+        DELTA_ENCODING_STATUS.set_to_disabled()
+    } else {
+        DELTA_ENCODING_STATUS.set_to_enabled()
+    }
+    .expect("the delta-encoding status to be set only once");
+
     std::panic::set_hook(Box::new(on_panic));
 
     anyhow::ensure!(
```
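`DELTA_ENCODING_STATUS` (defined in the new codec module further down) is a write-once switch: `try_main` decides it exactly once from the CLI flag, and any later attempt to set it fails. A simplified, stand-alone sketch of that behaviour built on `std::sync::OnceLock` — the names below are illustrative, not the real API:

```rust
use std::sync::OnceLock;

// `true` means delta encoding is enabled; unset counts as disabled,
// which mirrors the default of the real `DeltaEncodingStatus`.
static DELTA_ENCODING_ENABLED: OnceLock<bool> = OnceLock::new();

fn set_delta_encoding(enabled: bool) -> Result<(), ()> {
    // `OnceLock::set` fails if the status was already decided.
    DELTA_ENCODING_ENABLED.set(enabled).map_err(drop)
}

fn delta_encoding_is_enabled() -> bool {
    DELTA_ENCODING_ENABLED.get().copied().unwrap_or(false)
}

fn main() {
    // Mirrors the `try_main` logic: decide once from the parsed flag,
    // then expect a second decision to be rejected.
    let experimental_disable_delta_encoding = false; // stand-in for the parsed option
    set_delta_encoding(!experimental_disable_delta_encoding)
        .expect("the delta-encoding status to be set only once");
    assert!(delta_encoding_is_enabled());
    assert!(set_delta_encoding(false).is_err());
}
```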
```diff
@@ -60,6 +60,7 @@ const MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_FACET_POST_PROCESSING: &str =
     "MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_FACET_POST_PROCESSING";
 const MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_PREFIX_POST_PROCESSING: &str =
     "MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_PREFIX_POST_PROCESSING";
+const MEILI_EXPERIMENTAL_DISABLE_DELTA_ENCODING: &str = "MEILI_EXPERIMENTAL_DISABLE_DELTA_ENCODING";
 const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
 const MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE: &str = "MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE";
 const MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER: &str = "MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER";
@@ -845,6 +846,14 @@ pub struct IndexerOpts {
     #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_FACET_POST_PROCESSING)]
     #[serde(default)]
     pub experimental_no_edition_2024_for_facet_post_processing: bool,
+
+    /// Experimental disable delta-encoding for bitmaps. For more information,
+    /// see: <https://github.com/orgs/meilisearch/discussions/875>
+    ///
+    /// Enables the experimental disable delta-encoding for bitmaps feature.
+    #[clap(long, env = MEILI_EXPERIMENTAL_DISABLE_DELTA_ENCODING)]
+    #[serde(default)]
+    pub experimental_disable_delta_encoding: bool,
 }
 
 impl IndexerOpts {
@@ -858,6 +867,7 @@ impl IndexerOpts {
             experimental_no_edition_2024_for_dumps,
             experimental_no_edition_2024_for_prefix_post_processing,
             experimental_no_edition_2024_for_facet_post_processing,
+            experimental_disable_delta_encoding,
         } = self;
         if let Some(max_indexing_memory) = max_indexing_memory.0 {
             export_to_env_if_not_present(
@@ -895,6 +905,12 @@ impl IndexerOpts {
                 experimental_no_edition_2024_for_facet_post_processing.to_string(),
             );
         }
+        if experimental_disable_delta_encoding {
+            export_to_env_if_not_present(
+                MEILI_EXPERIMENTAL_DISABLE_DELTA_ENCODING,
+                experimental_disable_delta_encoding.to_string(),
+            );
+        }
     }
 }
 
@@ -910,6 +926,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig {
             experimental_no_edition_2024_for_dumps,
             experimental_no_edition_2024_for_prefix_post_processing,
             experimental_no_edition_2024_for_facet_post_processing,
+            experimental_disable_delta_encoding: _, // managed in try_main
         } = other;
 
         let thread_pool = ThreadPoolNoAbortBuilder::new_for_indexing()
@@ -1245,7 +1262,7 @@ where
     T: AsRef<OsStr>,
 {
     if let Err(VarError::NotPresent) = std::env::var(key) {
-        std::env::set_var(key, value);
+        unsafe { std::env::set_var(key, value) }
    }
 }
 
```
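`export_to_env_if_not_present` only writes the variable when it is absent, so anything already set in the environment wins over the parsed option; on the 2024 edition `std::env::set_var` is `unsafe`, hence the new `unsafe` blocks. A small self-contained sketch of that behaviour (the helper body matches the hunk above, the surrounding `main` is purely illustrative):

```rust
use std::env::{self, VarError};
use std::ffi::OsStr;

fn export_to_env_if_not_present<T: AsRef<OsStr>>(key: &str, value: T) {
    if let Err(VarError::NotPresent) = env::var(key) {
        // `set_var` is `unsafe` on the 2024 edition because it can race with
        // other threads reading the environment; here it runs before any
        // such thread is spawned.
        unsafe { env::set_var(key, value) }
    }
}

fn main() {
    let experimental_disable_delta_encoding = true; // stand-in for the parsed flag
    if experimental_disable_delta_encoding {
        export_to_env_if_not_present(
            "MEILI_EXPERIMENTAL_DISABLE_DELTA_ENCODING",
            experimental_disable_delta_encoding.to_string(),
        );
    }
    assert_eq!(
        env::var("MEILI_EXPERIMENTAL_DISABLE_DELTA_ENCODING").as_deref(),
        Ok("true")
    );
}
```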
```diff
@@ -43,9 +43,9 @@ impl Server<Owned> {
         let dir = TempDir::new().unwrap();
 
         if cfg!(windows) {
-            std::env::set_var("TMP", TEST_TEMP_DIR.path());
+            unsafe { std::env::set_var("TMP", TEST_TEMP_DIR.path()) }
         } else {
-            std::env::set_var("TMPDIR", TEST_TEMP_DIR.path());
+            unsafe { std::env::set_var("TMPDIR", TEST_TEMP_DIR.path()) }
         }
 
         let options = default_settings(dir.path());
@@ -58,9 +58,9 @@ impl Server<Owned> {
 
     pub async fn new_auth_with_options(mut options: Opt, dir: TempDir) -> Self {
         if cfg!(windows) {
-            std::env::set_var("TMP", TEST_TEMP_DIR.path());
+            unsafe { std::env::set_var("TMP", TEST_TEMP_DIR.path()) }
         } else {
-            std::env::set_var("TMPDIR", TEST_TEMP_DIR.path());
+            unsafe { std::env::set_var("TMPDIR", TEST_TEMP_DIR.path()) }
         }
 
         options.master_key = Some("MASTER_KEY".to_string());
@@ -215,9 +215,9 @@ impl Server<Shared> {
         let dir = TempDir::new().unwrap();
 
         if cfg!(windows) {
-            std::env::set_var("TMP", TEST_TEMP_DIR.path());
+            unsafe { std::env::set_var("TMP", TEST_TEMP_DIR.path()) }
         } else {
-            std::env::set_var("TMPDIR", TEST_TEMP_DIR.path());
+            unsafe { std::env::set_var("TMPDIR", TEST_TEMP_DIR.path()) }
         }
 
         let options = default_settings(dir.path());
@@ -508,6 +508,8 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
             experimental_no_edition_2024_for_dumps: false,
             experimental_no_edition_2024_for_prefix_post_processing: false,
             experimental_no_edition_2024_for_facet_post_processing: false,
+            // It has no effect to set the delta encoding here as the toggle is done in try_main
+            experimental_disable_delta_encoding: false,
         },
         experimental_enable_metrics: false,
         ..Parser::parse_from(None as Option<&str>)
```
```diff
@@ -10,6 +10,7 @@ license.workspace = true
 
 [dependencies]
 anyhow = "1.0.100"
+bstr = "1.12.1"
 clap = { version = "4.5.52", features = ["derive"] }
 dump = { path = "../dump" }
 file-store = { path = "../file-store" }
```
```diff
@@ -19,7 +19,7 @@ use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
 use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
 use meilisearch_types::milli::index::EmbeddingsWithMetadata;
 use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
-use meilisearch_types::milli::{obkv_to_json, BEU32};
+use meilisearch_types::milli::{obkv_to_json, DeCboRoaringBitmapCodec, BEU32};
 use meilisearch_types::tasks::{Status, Task};
 use meilisearch_types::versioning::{get_version, parse_version};
 use meilisearch_types::Index;
@@ -140,6 +140,14 @@ enum Command {
         #[arg(long, value_delimiter = ',')]
         index_part: Vec<IndexPart>,
     },
+
+    /// Outputs all entries of the index in a formatted way.
+    ///
+    /// This command is useful for debugging purposes.
+    OutputFormattedEntries {
+        #[arg(long)]
+        index_name: String,
+    },
 }
 
 #[derive(Clone, ValueEnum)]
@@ -169,9 +177,148 @@ fn main() -> anyhow::Result<()> {
         Command::HairDryer { index_name, index_part } => {
             hair_dryer(db_path, &index_name, &index_part)
         }
+        Command::OutputFormattedEntries { index_name } => {
+            output_formatted_entries(db_path, &index_name)
+        }
     }
 }
 
+fn output_formatted_entries(db_path: PathBuf, index_name: &str) -> anyhow::Result<()> {
+    let index_scheduler_path = db_path.join("tasks");
+    let env = unsafe {
+        EnvOpenOptions::new().read_txn_without_tls().max_dbs(100).open(&index_scheduler_path)
+    }
+    .with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
+
+    let index_mapper_rtxn = env.read_txn()?;
+    let index_mapping: Database<Str, UuidCodec> =
+        try_opening_database(&env, &index_mapper_rtxn, "index-mapping")?;
+
+    for result in index_mapping.iter(&index_mapper_rtxn)? {
+        let (uid, uuid) = result?;
+        if uid != index_name {
+            continue;
+        }
+
+        let index_path = db_path.join("indexes").join(uuid.to_string());
+        let index = Index::new(EnvOpenOptions::new().read_txn_without_tls(), &index_path, false)
+            .with_context(|| {
+                format!("While trying to open the index at path {:?}", index_path.display())
+            })?;
+        let rtxn = index.read_txn()?;
+
+        let Index {
+            word_docids,
+            exact_word_docids,
+            word_prefix_docids,
+            exact_word_prefix_docids,
+            word_pair_proximity_docids,
+            word_position_docids,
+            word_fid_docids,
+            field_id_word_count_docids,
+            word_prefix_position_docids,
+            word_prefix_fid_docids,
+            facet_id_exists_docids,
+            facet_id_is_null_docids,
+            facet_id_is_empty_docids,
+            ..
+        } = index;
+
+        struct DatabaseInfo {
+            name: &'static str,
+            database: Database<Bytes, DeCboRoaringBitmapCodec>,
+        }
+
+        impl DatabaseInfo {
+            fn new(name: &'static str, database: Database<Bytes, DeCboRoaringBitmapCodec>) -> Self {
+                DatabaseInfo { name, database }
+            }
+        }
+
+        let databases = [
+            DatabaseInfo::new("word_docids", word_docids.remap_key_type()),
+            DatabaseInfo::new("exact_word_docids", exact_word_docids.remap_key_type()),
+            DatabaseInfo::new("word_prefix_docids", word_prefix_docids.remap_key_type()),
+            DatabaseInfo::new(
+                "exact_word_prefix_docids",
+                exact_word_prefix_docids.remap_key_type(),
+            ),
+            DatabaseInfo::new(
+                "word_pair_proximity_docids",
+                word_pair_proximity_docids.remap_key_type(),
+            ),
+            DatabaseInfo::new("word_position_docids", word_position_docids.remap_key_type()),
+            DatabaseInfo::new("word_fid_docids", word_fid_docids.remap_key_type()),
+            DatabaseInfo::new(
+                "field_id_word_count_docids",
+                field_id_word_count_docids.remap_key_type(),
+            ),
+            DatabaseInfo::new(
+                "word_prefix_position_docids",
+                word_prefix_position_docids.remap_key_type(),
+            ),
+            DatabaseInfo::new("word_prefix_fid_docids", word_prefix_fid_docids.remap_key_type()),
+            DatabaseInfo::new("facet_id_exists_docids", facet_id_exists_docids.remap_key_type()),
+            DatabaseInfo::new("facet_id_is_null_docids", facet_id_is_null_docids.remap_key_type()),
+            DatabaseInfo::new(
+                "facet_id_is_empty_docids",
+                facet_id_is_empty_docids.remap_key_type(),
+            ),
+            // DatabaseInfo::new("facet_id_f64_docids", facet_id_f64_docids.remap_key_type()),
+            // DatabaseInfo::new(
+            //     "facet_id_string_docids",
+            //     facet_id_string_docids.remap_key_type(),
+            // ),
+            // DatabaseInfo::new(
+            //     "facet_id_normalized_string_strings",
+            //     facet_id_normalized_string_strings.remap_key_type(),
+            // ),
+            // DatabaseInfo::new("facet_id_string_fst", facet_id_string_fst.remap_key_type()),
+            // DatabaseInfo::new(
+            //     "field_id_docid_facet_f64s",
+            //     field_id_docid_facet_f64s.remap_key_type(),
+            // ),
+            // DatabaseInfo::new(
+            //     "field_id_docid_facet_strings",
+            //     field_id_docid_facet_strings.remap_key_type(),
+            // ),
+        ];
+
+        use bstr::ByteSlice as _;
+
+        let stdout = std::io::stdout();
+        let mut stdout_lock = BufWriter::new(stdout.lock());
+
+        for DatabaseInfo { name: db_name, database } in databases {
+            for result in database.iter(&rtxn)? {
+                let (key, bitmap) = result?;
+                let value: Vec<u32> = bitmap.iter().collect();
+                writeln!(&mut stdout_lock, "{db_name}: {} -> {:?}", key.as_bstr(), value)?;
+            }
+        }
+
+        {
+            let db_name = "main";
+
+            let fst = index.words_fst(&rtxn)?;
+            writeln!(&mut stdout_lock, "{db_name}: words-fst -> {fst:?}")?;
+
+            let prefix_fst = index.words_prefixes_fst(&rtxn)?;
+            writeln!(&mut stdout_lock, "{db_name}: words-prefixes-fst -> {prefix_fst:?}")?;
+
+            let documents_ids = index.documents_ids(&rtxn)?;
+            writeln!(&mut stdout_lock, "{db_name}: documents-ids -> {documents_ids:?}")?;
+
+            let exact_words = index.exact_words(&rtxn)?;
+            writeln!(&mut stdout_lock, "{db_name}: exact-words -> {exact_words:?}")?;
+        }
+
+        break;
+    }
+
+    Ok(())
+}
+
 /// Clears the task queue located at `db_path`.
 fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
     let path = db_path.join("tasks");
```
```diff
@@ -120,14 +120,16 @@ twox-hash = { version = "2.1.2", default-features = false, features = [
 ] }
 geo-types = "0.7.17"
 zerometry = "0.3.0"
+bitpacking = "0.9.2"
 
 [dev-dependencies]
-mimalloc = { version = "0.1.48", default-features = false }
 # fixed version due to format breakages in v1.40
 insta = "=1.39.0"
+mimalloc = { version = "0.1.48", default-features = false }
 maplit = "1.0.2"
 md5 = "0.8.0"
 meili-snap = { path = "../meili-snap" }
+quickcheck = "1.0.3"
 rand = { version = "0.8.5", features = ["small_rng"] }
 
 [features]
```
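The new `bitpacking` dependency backs the delta-encoded bitmap format: sorted document ids are turned into small gaps, which a bit-packer can then store with far fewer than 32 bits each. The sketch below illustrates only the delta step, in plain Rust, without using the crate's API:

```rust
// Why delta encoding helps: sorted ids become small gaps that need few bits.
fn delta_encode(sorted: &[u32]) -> Vec<u32> {
    let mut prev = 0;
    sorted
        .iter()
        .map(|&x| {
            let delta = x - prev;
            prev = x;
            delta
        })
        .collect()
}

fn delta_decode(deltas: &[u32]) -> Vec<u32> {
    let mut acc = 0;
    deltas
        .iter()
        .map(|&d| {
            acc += d;
            acc
        })
        .collect()
}

fn main() {
    let doc_ids = vec![1_000_000, 1_000_003, 1_000_007, 1_000_050];
    let deltas = delta_encode(&doc_ids);
    assert_eq!(deltas, vec![1_000_000, 3, 4, 43]);
    assert_eq!(delta_decode(&deltas), doc_ids);
    // A bit-packer then stores each block of deltas with just enough bits for
    // the largest one, instead of 32 bits per id.
}
```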
```diff
@@ -12,7 +12,7 @@ use roaring::RoaringBitmap;
 pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec;
 pub use self::ordered_f64_codec::OrderedF64Codec;
 use super::StrRefCodec;
-use crate::{CboRoaringBitmapCodec, BEU16};
+use crate::{DeCboRoaringBitmapCodec, BEU16};
 
 pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>;
 pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>;
@@ -97,7 +97,7 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
 
     fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
         let mut v = vec![value.size];
-        CboRoaringBitmapCodec::serialize_into_vec(&value.bitmap, &mut v);
+        DeCboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v)?;
         Ok(Cow::Owned(v))
     }
 }
@@ -107,7 +107,7 @@ impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
 
     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
         let size = bytes[0];
-        let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?;
+        let bitmap = DeCboRoaringBitmapCodec::deserialize_from(&bytes[1..])?;
         Ok(FacetGroupValue { size, bitmap })
     }
 }
```
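`FacetGroupValueCodec` keeps its one-byte `size` prefix in front of the serialized bitmap; only the bitmap codec behind it changes. A stand-alone sketch of that layout, with roaring's own format standing in for `DeCboRoaringBitmapCodec`:

```rust
// Layout: [size: u8][serialized bitmap bytes...]
use std::io;

use roaring::RoaringBitmap;

fn encode_group(size: u8, bitmap: &RoaringBitmap) -> io::Result<Vec<u8>> {
    let mut v = vec![size];
    bitmap.serialize_into(&mut v)?;
    Ok(v)
}

fn decode_group(bytes: &[u8]) -> io::Result<(u8, RoaringBitmap)> {
    let size = bytes[0];
    let bitmap = RoaringBitmap::deserialize_from(&bytes[1..])?;
    Ok((size, bitmap))
}

fn main() -> io::Result<()> {
    let bitmap = RoaringBitmap::from_sorted_iter(0..100).unwrap();
    let bytes = encode_group(4, &bitmap)?;
    let (size, decoded) = decode_group(&bytes)?;
    assert_eq!((size, decoded), (4, bitmap));
    Ok(())
}
```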
```diff
@@ -22,10 +22,10 @@ pub use self::beu32_str_codec::BEU32StrCodec;
 pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
 pub use self::fst_set_codec::FstSetCodec;
 pub use self::obkv_codec::ObkvCodec;
-pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
-pub use self::roaring_bitmap_length::{
-    BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
+pub use self::roaring_bitmap::{
+    DeCboRoaringBitmapCodec, RoaringBitmapCodec, DELTA_ENCODING_STATUS,
 };
+pub use self::roaring_bitmap_length::DeCboRoaringBitmapLenCodec;
 pub use self::str_beu32_codec::{StrBEU16Codec, StrBEU32Codec};
 pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
 
```
```diff
@@ -1,50 +0,0 @@
-use std::borrow::Cow;
-use std::convert::TryInto;
-use std::mem::size_of;
-
-use heed::{BoxedError, BytesDecode};
-use roaring::RoaringBitmap;
-
-use crate::heed_codec::BytesDecodeOwned;
-
-pub struct BoRoaringBitmapCodec;
-
-impl BoRoaringBitmapCodec {
-    pub fn serialize_into(bitmap: &RoaringBitmap, out: &mut Vec<u8>) {
-        out.reserve(bitmap.len() as usize * size_of::<u32>());
-        bitmap.iter().map(u32::to_ne_bytes).for_each(|bytes| out.extend_from_slice(&bytes));
-    }
-}
-
-impl BytesDecode<'_> for BoRoaringBitmapCodec {
-    type DItem = RoaringBitmap;
-
-    fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
-        let mut bitmap = RoaringBitmap::new();
-
-        for chunk in bytes.chunks(size_of::<u32>()) {
-            let bytes = chunk.try_into()?;
-            bitmap.push(u32::from_ne_bytes(bytes));
-        }
-
-        Ok(bitmap)
-    }
-}
-
-impl BytesDecodeOwned for BoRoaringBitmapCodec {
-    type DItem = RoaringBitmap;
-
-    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
-        Self::bytes_decode(bytes)
-    }
-}
-
-impl heed::BytesEncode<'_> for BoRoaringBitmapCodec {
-    type EItem = RoaringBitmap;
-
-    fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
-        let mut out = Vec::new();
-        BoRoaringBitmapCodec::serialize_into(item, &mut out);
-        Ok(Cow::Owned(out))
-    }
-}
```
```diff
@@ -7,7 +7,6 @@ use heed::BoxedError;
 use roaring::RoaringBitmap;
 
 use crate::heed_codec::BytesDecodeOwned;
-use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 
 /// This is the limit where using a byteorder became less size efficient
 /// than using a direct roaring encoding, it is also the point where we are able
@@ -19,8 +18,19 @@ pub const THRESHOLD: usize = 7;
 pub struct CboRoaringBitmapCodec;
 
 impl CboRoaringBitmapCodec {
+    /// If the number of items (u32s) to encode is less than or equal to the threshold
+    /// it means that it would weigh the same or less than the RoaringBitmap
+    /// header, so we directly encode them using ByteOrder instead.
+    pub fn bitmap_serialize_as_raw_u32s(roaring: &RoaringBitmap) -> bool {
+        roaring.len() <= THRESHOLD as u64
+    }
+
+    pub fn bytes_deserialize_as_raw_u32s(bytes: &[u8]) -> bool {
+        bytes.len() <= THRESHOLD * size_of::<u32>()
+    }
+
     pub fn serialized_size(roaring: &RoaringBitmap) -> usize {
-        if roaring.len() <= THRESHOLD as u64 {
+        if Self::bitmap_serialize_as_raw_u32s(roaring) {
             roaring.len() as usize * size_of::<u32>()
         } else {
             roaring.serialized_size()
@@ -35,10 +45,7 @@ impl CboRoaringBitmapCodec {
         roaring: &RoaringBitmap,
         mut writer: W,
     ) -> io::Result<()> {
-        if roaring.len() <= THRESHOLD as u64 {
-            // If the number of items (u32s) to encode is less than or equal to the threshold
-            // it means that it would weigh the same or less than the RoaringBitmap
-            // header, so we directly encode them using ByteOrder instead.
+        if Self::bitmap_serialize_as_raw_u32s(roaring) {
             for integer in roaring {
                 writer.write_u32::<NativeEndian>(integer)?;
             }
@@ -51,7 +58,7 @@ impl CboRoaringBitmapCodec {
     }
 
     pub fn deserialize_from(mut bytes: &[u8]) -> io::Result<RoaringBitmap> {
-        if bytes.len() <= THRESHOLD * size_of::<u32>() {
+        if Self::bytes_deserialize_as_raw_u32s(bytes) {
             // If there is threshold or less than threshold integers that can fit into this array
             // of bytes it means that we used the ByteOrder codec serializer.
             let mut bitmap = RoaringBitmap::new();
@@ -71,7 +78,7 @@ impl CboRoaringBitmapCodec {
         other: &RoaringBitmap,
     ) -> io::Result<RoaringBitmap> {
         // See above `deserialize_from` method for implementation details.
-        if bytes.len() <= THRESHOLD * size_of::<u32>() {
+        if Self::bytes_deserialize_as_raw_u32s(bytes) {
             let mut bitmap = RoaringBitmap::new();
             while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
                 if other.contains(integer) {
@@ -83,78 +90,6 @@ impl CboRoaringBitmapCodec {
             other.intersection_with_serialized_unchecked(Cursor::new(bytes))
         }
     }
-
-    /// Merge serialized CboRoaringBitmaps in a buffer.
-    ///
-    /// if the merged values length is under the threshold, values are directly
-    /// serialized in the buffer else a RoaringBitmap is created from the
-    /// values and is serialized in the buffer.
-    pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
-    where
-        I: IntoIterator<Item = A>,
-        A: AsRef<[u8]>,
-    {
-        let mut roaring = RoaringBitmap::new();
-        let mut vec = Vec::new();
-
-        for bytes in slices {
-            if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
-                let mut reader = bytes.as_ref();
-                while let Ok(integer) = reader.read_u32::<NativeEndian>() {
-                    vec.push(integer);
-                }
-            } else {
-                roaring |= RoaringBitmap::deserialize_unchecked_from(bytes.as_ref())?;
-            }
-        }
-
-        if roaring.is_empty() {
-            vec.sort_unstable();
-            vec.dedup();
-
-            if vec.len() <= THRESHOLD {
-                for integer in vec {
-                    buffer.extend_from_slice(&integer.to_ne_bytes());
-                }
-            } else {
-                // We can unwrap safely because the vector is sorted upper.
-                let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
-                roaring.serialize_into(buffer)?;
-            }
-        } else {
-            roaring.extend(vec);
-            roaring.serialize_into(buffer)?;
-        }
-
-        Ok(())
-    }
-
-    /// Merges a DelAdd delta into a CboRoaringBitmap.
-    pub fn merge_deladd_into<'a>(
-        deladd: &KvReaderDelAdd,
-        previous: &[u8],
-        buffer: &'a mut Vec<u8>,
-    ) -> io::Result<Option<&'a [u8]>> {
-        // Deserialize the bitmap that is already there
-        let mut previous = Self::deserialize_from(previous)?;
-
-        // Remove integers we no more want in the previous bitmap
-        if let Some(value) = deladd.get(DelAdd::Deletion) {
-            previous -= Self::deserialize_from(value)?;
-        }
-
-        // Insert the new integers we want in the previous bitmap
-        if let Some(value) = deladd.get(DelAdd::Addition) {
-            previous |= Self::deserialize_from(value)?;
-        }
-
-        if previous.is_empty() {
-            return Ok(None);
-        }
-
-        Self::serialize_into_vec(&previous, buffer);
-        Ok(Some(&buffer[..]))
-    }
 }
 
 impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
@@ -182,75 +117,3 @@ impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
         Ok(Cow::Owned(vec))
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use std::iter::FromIterator;
-
-    use heed::{BytesDecode, BytesEncode};
-
-    use super::*;
-
-    #[test]
-    fn verify_encoding_decoding() {
-        let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);
-        let bytes = CboRoaringBitmapCodec::bytes_encode(&input).unwrap();
-        let output = CboRoaringBitmapCodec::bytes_decode(&bytes).unwrap();
-        assert_eq!(input, output);
-    }
-
-    #[test]
-    fn verify_threshold() {
-        let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);
-
-        // use roaring bitmap
-        let mut bytes = Vec::new();
-        input.serialize_into(&mut bytes).unwrap();
-        let roaring_size = bytes.len();
-
-        // use byteorder directly
-        let mut bytes = Vec::new();
-        for integer in input {
-            bytes.write_u32::<NativeEndian>(integer).unwrap();
-        }
-        let bo_size = bytes.len();
-
-        assert!(roaring_size > bo_size);
-    }
-
-    #[test]
-    fn merge_cbo_roaring_bitmaps() {
-        let mut buffer = Vec::new();
-
-        let small_data = [
-            RoaringBitmap::from_sorted_iter(1..4).unwrap(),
-            RoaringBitmap::from_sorted_iter(2..5).unwrap(),
-            RoaringBitmap::from_sorted_iter(4..6).unwrap(),
-            RoaringBitmap::from_sorted_iter(1..3).unwrap(),
-        ];
-
-        let small_data: Vec<_> =
-            small_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
-        CboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap();
-        let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
-        let expected = RoaringBitmap::from_sorted_iter(1..6).unwrap();
-        assert_eq!(bitmap, expected);
-
-        let medium_data = [
-            RoaringBitmap::from_sorted_iter(1..4).unwrap(),
-            RoaringBitmap::from_sorted_iter(2..5).unwrap(),
-            RoaringBitmap::from_sorted_iter(4..8).unwrap(),
-            RoaringBitmap::from_sorted_iter(0..3).unwrap(),
-            RoaringBitmap::from_sorted_iter(7..23).unwrap(),
-        ];
-
-        let medium_data: Vec<_> =
-            medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
-        buffer.clear();
-        CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap();
-
-        let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
-        let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap();
-        assert_eq!(bitmap, expected);
-    }
-}
```
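The helpers factored out above (`bitmap_serialize_as_raw_u32s`, `bytes_deserialize_as_raw_u32s`) encode the long-standing `THRESHOLD` rule: up to 7 values are written as raw native-endian u32s, anything larger as a regular serialized `RoaringBitmap`, and the reader distinguishes the two purely by byte length. A stand-alone sketch of that rule (not the milli codec itself):

```rust
use std::io;
use std::mem::size_of;

use roaring::RoaringBitmap;

const THRESHOLD: usize = 7;

fn encode(bitmap: &RoaringBitmap) -> io::Result<Vec<u8>> {
    let mut out = Vec::new();
    if bitmap.len() <= THRESHOLD as u64 {
        // Raw u32s: at most 7 * 4 = 28 bytes, cheaper than a roaring header.
        for n in bitmap {
            out.extend_from_slice(&n.to_ne_bytes());
        }
    } else {
        bitmap.serialize_into(&mut out)?;
    }
    Ok(out)
}

fn decode(bytes: &[u8]) -> io::Result<RoaringBitmap> {
    if bytes.len() <= THRESHOLD * size_of::<u32>() {
        // Short payload: it can only be the raw-u32 form.
        let mut bitmap = RoaringBitmap::new();
        for chunk in bytes.chunks_exact(size_of::<u32>()) {
            bitmap.insert(u32::from_ne_bytes(chunk.try_into().unwrap()));
        }
        Ok(bitmap)
    } else {
        RoaringBitmap::deserialize_from(bytes)
    }
}

fn main() -> io::Result<()> {
    for bitmap in [
        RoaringBitmap::from_sorted_iter(0..5).unwrap(),     // raw-u32 branch
        RoaringBitmap::from_sorted_iter(0..1_000).unwrap(), // roaring branch
    ] {
        assert_eq!(decode(&encode(&bitmap)?)?, bitmap);
    }
    Ok(())
}
```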
|
|||||||
@@ -0,0 +1,374 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
|
use std::io::{self, Cursor, ErrorKind};
|
||||||
|
use std::sync::OnceLock;
|
||||||
|
|
||||||
|
use byteorder::{NativeEndian, ReadBytesExt as _};
|
||||||
|
use heed::BoxedError;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use super::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
|
||||||
|
use super::de_roaring_bitmap_codec::DeRoaringBitmapCodec;
|
||||||
|
use crate::heed_codec::roaring_bitmap::take_all_blocks;
|
||||||
|
use crate::heed_codec::BytesDecodeOwned;
|
||||||
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||||
|
|
||||||
|
/// Defines the status of the delta encoding on whether we have enabled it or not.
|
||||||
|
pub static DELTA_ENCODING_STATUS: DeltaEncodingStatusLock = DeltaEncodingStatusLock::new();
|
||||||
|
|
||||||
|
pub struct DeCboRoaringBitmapCodec;
|
||||||
|
|
||||||
|
impl DeCboRoaringBitmapCodec {
|
||||||
|
pub fn serialized_size_with_tmp_buffer(
|
||||||
|
bitmap: &RoaringBitmap,
|
||||||
|
tmp_buffer: &mut Vec<u32>,
|
||||||
|
) -> usize {
|
||||||
|
// We are stuck with this format because the CboRoaringBitmapCodec decides to write
|
||||||
|
// raw and unencoded u32s, without a header when there is at most THRESHOLD elements.
|
||||||
|
if CboRoaringBitmapCodec::bitmap_serialize_as_raw_u32s(bitmap)
|
||||||
|
|| DELTA_ENCODING_STATUS.is_disabled()
|
||||||
|
{
|
||||||
|
CboRoaringBitmapCodec::serialized_size(bitmap)
|
||||||
|
} else {
|
||||||
|
DeRoaringBitmapCodec::serialized_size_with_tmp_buffer(bitmap, tmp_buffer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Writes the delta-encoded compressed version of
|
||||||
|
/// the given roaring bitmap into the provided writer.
|
||||||
|
pub fn serialize_into<W: io::Write>(bitmap: &RoaringBitmap, writer: &mut W) -> io::Result<()> {
|
||||||
|
let mut tmp_buffer = Vec::new();
|
||||||
|
Self::serialize_into_with_tmp_buffer(bitmap, writer, &mut tmp_buffer)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same as [Self::serialize_into] but accepts a buffer to avoid allocating one.
|
||||||
|
///
|
||||||
|
/// Note that we always serialize the bitmap with the delta-encoded compressed version.
|
||||||
|
pub fn serialize_into_with_tmp_buffer<W: io::Write>(
|
||||||
|
bitmap: &RoaringBitmap,
|
||||||
|
writer: &mut W,
|
||||||
|
tmp_buffer: &mut Vec<u32>,
|
||||||
|
) -> io::Result<()> {
|
||||||
|
// We are stuck with this format because the CboRoaringBitmapCodec decides to write
|
||||||
|
// raw and unencoded u32s, without a header when there is at most THRESHOLD elements.
|
||||||
|
if CboRoaringBitmapCodec::bitmap_serialize_as_raw_u32s(bitmap)
|
||||||
|
|| DELTA_ENCODING_STATUS.is_disabled()
|
||||||
|
{
|
||||||
|
CboRoaringBitmapCodec::serialize_into_writer(bitmap, writer)
|
||||||
|
} else {
|
||||||
|
DeRoaringBitmapCodec::serialize_into_with_tmp_buffer(bitmap, writer, tmp_buffer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the delta-decoded roaring bitmap from the compressed bytes.
|
||||||
|
pub fn deserialize_from(compressed: &[u8]) -> io::Result<RoaringBitmap> {
|
||||||
|
let mut tmp_buffer = Vec::new();
|
||||||
|
Self::deserialize_from_with_tmp_buffer(compressed, &mut tmp_buffer)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same as [Self::deserialize_from] but accepts a buffer to avoid allocating one.
|
||||||
|
///
|
||||||
|
/// It tries to decode the input by using the delta-decoded version and
|
||||||
|
/// if it fails, falls back to the CboRoaringBitmap version.
|
||||||
|
pub fn deserialize_from_with_tmp_buffer(
|
||||||
|
input: &[u8],
|
||||||
|
tmp_buffer: &mut Vec<u32>,
|
||||||
|
) -> io::Result<RoaringBitmap> {
|
||||||
|
// The input is too short to be a valid delta-decoded bitmap.
|
||||||
|
// We fall back to the CboRoaringBitmap version with raw u32s.
|
||||||
|
if CboRoaringBitmapCodec::bytes_deserialize_as_raw_u32s(input) {
|
||||||
|
return CboRoaringBitmapCodec::deserialize_from(input);
|
||||||
|
}
|
||||||
|
|
||||||
|
match DeRoaringBitmapCodec::deserialize_from_with_tmp_buffer(
|
||||||
|
input,
|
||||||
|
take_all_blocks,
|
||||||
|
tmp_buffer,
|
||||||
|
) {
|
||||||
|
Ok(bitmap) => Ok(bitmap),
|
||||||
|
// If the error kind is Other it means that the delta-decoder found
|
||||||
|
// an invalid magic header. We fall back to the CboRoaringBitmap version.
|
||||||
|
Err(e) if e.kind() == ErrorKind::Other => {
|
||||||
|
CboRoaringBitmapCodec::deserialize_from(input)
|
||||||
|
}
|
||||||
|
Err(e) => Err(e),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Merge serialized DeCboRoaringBitmaps in a buffer.
|
||||||
|
///
|
||||||
|
/// If the merged values length is under the threshold, values are directly
|
||||||
|
/// serialized in the buffer else a delta-encoded list of integers is created
|
||||||
|
/// from the values and is serialized in the buffer.
|
||||||
|
pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
|
||||||
|
where
|
||||||
|
I: IntoIterator<Item = A>,
|
||||||
|
A: AsRef<[u8]>,
|
||||||
|
{
|
||||||
|
let mut roaring = RoaringBitmap::new();
|
||||||
|
let mut vec = Vec::new();
|
||||||
|
let mut tmp_buffer = Vec::new();
|
||||||
|
|
||||||
|
for bytes in slices {
|
||||||
|
if CboRoaringBitmapCodec::bytes_deserialize_as_raw_u32s(bytes.as_ref()) {
|
||||||
|
let mut reader = bytes.as_ref();
|
||||||
|
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
|
||||||
|
vec.push(integer);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
roaring |= DeCboRoaringBitmapCodec::deserialize_from_with_tmp_buffer(
|
||||||
|
bytes.as_ref(),
|
||||||
|
&mut tmp_buffer,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
roaring.extend(vec);
|
||||||
|
|
||||||
|
DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(&roaring, buffer, &mut tmp_buffer)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Do an intersection directly with a serialized delta-encoded bitmap.
|
||||||
|
///
|
||||||
|
/// When doing the intersection we only need to deserialize the necessary
|
||||||
|
/// bitmap containers and avoid a lot of unnecessary allocations. We do
|
||||||
|
/// that by skipping entire delta-encoded blocks when possible to avoid
|
||||||
|
/// storing them in the bitmap we use for the final intersection.
|
||||||
|
pub fn intersection_with_serialized(
|
||||||
|
bytes: &[u8],
|
||||||
|
other: &RoaringBitmap,
|
||||||
|
) -> io::Result<RoaringBitmap> {
|
||||||
|
if CboRoaringBitmapCodec::bytes_deserialize_as_raw_u32s(bytes) {
|
||||||
|
return CboRoaringBitmapCodec::intersection_with_serialized(bytes, other);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO move this tmp buffer outside
|
||||||
|
let mut tmp_buffer = Vec::new();
|
||||||
|
let filter_block = |first, last| other.range_cardinality(first..=last) == 0;
|
||||||
|
|
||||||
|
match DeRoaringBitmapCodec::deserialize_from_with_tmp_buffer(
|
||||||
|
bytes,
|
||||||
|
filter_block,
|
||||||
|
&mut tmp_buffer,
|
||||||
|
) {
|
||||||
|
Ok(bitmap) => Ok(bitmap & other),
|
||||||
|
// If the error kind is Other it means that the delta-decoder found
|
||||||
|
// an invalid magic header. We fall back to the CboRoaringBitmap version.
|
||||||
|
Err(e) if e.kind() == ErrorKind::Other => {
|
||||||
|
other.intersection_with_serialized_unchecked(Cursor::new(bytes))
|
||||||
|
}
|
||||||
|
Err(e) => Err(e),
|
||||||
|
}
|
||||||
|
}

    pub fn merge_deladd_into<'a>(
        deladd: &KvReaderDelAdd,
        previous: &[u8],
        buffer: &'a mut Vec<u8>,
        tmp_buffer: &mut Vec<u32>,
    ) -> io::Result<Option<&'a [u8]>> {
        // Deserialize the bitmap that is already there
        let mut previous = Self::deserialize_from_with_tmp_buffer(previous, tmp_buffer)?;

        // Remove the integers we no longer want in the previous bitmap
        if let Some(value) = deladd.get(DelAdd::Deletion) {
            previous -= Self::deserialize_from_with_tmp_buffer(value, tmp_buffer)?;
        }

        // Insert the new integers we want in the previous bitmap
        if let Some(value) = deladd.get(DelAdd::Addition) {
            previous |= Self::deserialize_from_with_tmp_buffer(value, tmp_buffer)?;
        }

        if previous.is_empty() {
            return Ok(None);
        }

        Self::serialize_into_with_tmp_buffer(&previous, buffer, tmp_buffer)?;

        Ok(Some(&buffer[..]))
    }
}

impl heed::BytesDecode<'_> for DeCboRoaringBitmapCodec {
    type DItem = RoaringBitmap;

    fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        Self::deserialize_from(bytes).map_err(Into::into)
    }
}

impl BytesDecodeOwned for DeCboRoaringBitmapCodec {
    type DItem = RoaringBitmap;

    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        Self::deserialize_from(bytes).map_err(Into::into)
    }
}

impl heed::BytesEncode<'_> for DeCboRoaringBitmapCodec {
    type EItem = RoaringBitmap;

    fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
        let mut tmp_buffer = Vec::new();
        let capacity = Self::serialized_size_with_tmp_buffer(item, &mut tmp_buffer);
        let mut output = Vec::with_capacity(capacity);
        Self::serialize_into_with_tmp_buffer(item, &mut output, &mut tmp_buffer)?;
        Ok(Cow::Owned(output))
    }
}

/// Manages the global status of the delta encoding.
///
/// Whether we must use delta encoding or not when encoding roaring bitmaps.
#[derive(Default)]
pub struct DeltaEncodingStatusLock(OnceLock<DeltaEncodingStatus>);

impl DeltaEncodingStatusLock {
    pub const fn new() -> Self {
        Self(OnceLock::new())
    }
}

#[derive(Default)]
enum DeltaEncodingStatus {
    Enabled,
    #[default]
    Disabled,
}

impl DeltaEncodingStatusLock {
    pub fn set_to_enabled(&self) -> Result<(), ()> {
        self.0.set(DeltaEncodingStatus::Enabled).map_err(drop)
    }

    pub fn set_to_disabled(&self) -> Result<(), ()> {
        self.0.set(DeltaEncodingStatus::Disabled).map_err(drop)
    }

    pub fn is_enabled(&self) -> bool {
        matches!(self.0.get(), Some(DeltaEncodingStatus::Enabled))
    }

    pub fn is_disabled(&self) -> bool {
        !self.is_enabled()
    }
}
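
// A minimal sketch of how this once-only switch could be driven (the exact call
// sites are not shown in this part of the diff; `DELTA_ENCODING_STATUS`,
// re-exported from the parent module below, is assumed to be such a lock):
//
//     // at startup, before the first bitmap is encoded
//     let _ = DELTA_ENCODING_STATUS.set_to_enabled();
//
//     // later, when choosing which format to write
//     if DELTA_ENCODING_STATUS.is_enabled() {
//         // write the delta-encoded format
//     } else {
//         // keep writing the legacy CboRoaringBitmap format
//     }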

#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use byteorder::WriteBytesExt as _;
    use heed::{BytesDecode, BytesEncode};
    use quickcheck::quickcheck;
    use roaring::RoaringBitmap;

    use super::super::super::roaring_bitmap_length::DeCboRoaringBitmapLenCodec;
    use super::super::THRESHOLD;
    use super::*;

    #[test]
    fn verify_encoding_decoding() {
        let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);
        let bytes = DeCboRoaringBitmapCodec::bytes_encode(&input).unwrap();
        let output = DeCboRoaringBitmapCodec::bytes_decode(&bytes).unwrap();
        assert_eq!(input, output);
    }

    #[test]
    fn verify_threshold() {
        let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);

        // use roaring bitmap
        let mut bytes = Vec::new();
        input.serialize_into(&mut bytes).unwrap();
        let roaring_size = bytes.len();

        // use byteorder directly
        let mut bytes = Vec::new();
        for integer in input {
            bytes.write_u32::<NativeEndian>(integer).unwrap();
        }
        let bo_size = bytes.len();

        assert!(roaring_size > bo_size);
    }

    #[test]
    fn merge_de_cbo_roaring_bitmaps() {
        let mut buffer = Vec::new();

        let small_data = [
            RoaringBitmap::from_sorted_iter(1..4).unwrap(),
            RoaringBitmap::from_sorted_iter(2..5).unwrap(),
            RoaringBitmap::from_sorted_iter(4..6).unwrap(),
            RoaringBitmap::from_sorted_iter(1..3).unwrap(),
        ];

        let small_data: Vec<_> =
            small_data.iter().map(|b| DeCboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
        DeCboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap();
        let bitmap = DeCboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
        let expected = RoaringBitmap::from_sorted_iter(1..6).unwrap();
        assert_eq!(bitmap, expected);

        let medium_data = [
            RoaringBitmap::from_sorted_iter(1..4).unwrap(),
            RoaringBitmap::from_sorted_iter(2..5).unwrap(),
            RoaringBitmap::from_sorted_iter(4..8).unwrap(),
            RoaringBitmap::from_sorted_iter(0..3).unwrap(),
            RoaringBitmap::from_sorted_iter(7..23).unwrap(),
        ];

        let medium_data: Vec<_> =
            medium_data.iter().map(|b| DeCboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
        buffer.clear();
        DeCboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap();

        let bitmap = DeCboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
        let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap();
        assert_eq!(bitmap, expected);
    }

    quickcheck! {
        fn qc_random(xs: Vec<u32>) -> bool {
            let bitmap = RoaringBitmap::from_iter(xs);
            let mut compressed = Vec::new();
            let mut tmp_buffer = Vec::new();
            DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(&bitmap, &mut compressed, &mut tmp_buffer).unwrap();
            let length = DeCboRoaringBitmapLenCodec::bytes_decode(&compressed[..]).unwrap();
            let decompressed = DeCboRoaringBitmapCodec::deserialize_from_with_tmp_buffer(&compressed[..], &mut tmp_buffer).unwrap();
            length == bitmap.len() && decompressed == bitmap
        }
    }

    quickcheck! {
        fn qc_random_check_serialized_size(xs: Vec<u32>) -> bool {
            let bitmap = RoaringBitmap::from_iter(xs);
            let mut compressed = Vec::new();
            let mut tmp_buffer = Vec::new();
            DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(&bitmap, &mut compressed, &mut tmp_buffer).unwrap();
            let length = DeCboRoaringBitmapLenCodec::bytes_decode(&compressed).unwrap();
            let expected_len = DeCboRoaringBitmapCodec::serialized_size_with_tmp_buffer(&bitmap, &mut tmp_buffer);
            length == bitmap.len() && compressed.len() == expected_len
        }
    }

    quickcheck! {
        fn qc_random_intersection_with_serialized(lhs: Vec<u32>, rhs: Vec<u32>) -> bool {
            let mut compressed = Vec::new();
            let mut tmp_buffer = Vec::new();

            let lhs = RoaringBitmap::from_iter(lhs);
            let rhs = RoaringBitmap::from_iter(rhs);
            DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(&lhs, &mut compressed, &mut tmp_buffer).unwrap();

            let intersection = DeCboRoaringBitmapCodec::intersection_with_serialized(&compressed, &rhs).unwrap();
            let expected_intersection = lhs & rhs;

            intersection == expected_intersection
        }
    }
}

@@ -0,0 +1,474 @@
use std::io::{self, ErrorKind};
use std::mem::{self, size_of, size_of_val};

use bitpacking::{BitPacker, BitPacker1x, BitPacker4x, BitPacker8x};
use roaring::RoaringBitmap;

/// The magic header for our custom encoding format
const MAGIC_HEADER: u16 = 36869;

pub struct DeRoaringBitmapCodec;

// TODO reintroduce:
// - serialized_size?
// - serialize_into_vec
// - intersection_with_serialized
// - merge_into
// - merge_deladd_into
impl DeRoaringBitmapCodec {
    /// Returns the serialized size of the given roaring bitmap with the delta encoding format.
    pub fn serialized_size_with_tmp_buffer(
        bitmap: &RoaringBitmap,
        tmp_buffer: &mut Vec<u32>,
    ) -> usize {
        let mut size = 2; // u16 magic header

        let bitpacker8x = BitPacker8x::new();
        let bitpacker4x = BitPacker4x::new();
        let bitpacker1x = BitPacker1x::new();

        // This temporary buffer is used to store each chunk of decompressed u32s.
        tmp_buffer.resize(BitPacker8x::BLOCK_LEN, 0u32);
        let decompressed = &mut tmp_buffer[..];

        let mut buffer_index = 0;
        let mut initial = None;
        // We initially collect all the integers into a flat buffer of the size
        // of the largest bitpacker. We encode them with it until we don't have
        // enough of them...
        for n in bitmap {
            decompressed[buffer_index] = n;
            buffer_index += 1;
            if buffer_index == BitPacker8x::BLOCK_LEN {
                let num_bits = bitpacker8x.num_bits_strictly_sorted(initial, decompressed);
                let compressed_len = BitPacker8x::compressed_block_size(num_bits);
                size += 1; // u8 chunk header
                size += compressed_len; // compressed data length
                initial = Some(n);
                buffer_index = 0;
            }
        }

        // ...We then switch to a smaller bitpacker to encode the remaining chunks...
        let decompressed = &decompressed[..buffer_index];
        let mut chunks = decompressed.chunks_exact(BitPacker4x::BLOCK_LEN);
        for decompressed in chunks.by_ref() {
            let num_bits = bitpacker4x.num_bits_strictly_sorted(initial, decompressed);
            let compressed_len = BitPacker4x::compressed_block_size(num_bits);
            size += 1; // u8 chunk header
            size += compressed_len; // compressed data length
            initial = decompressed.iter().last().copied();
        }

        // ...And so on...
        let decompressed = chunks.remainder();
        let mut chunks = decompressed.chunks_exact(BitPacker1x::BLOCK_LEN);
        for decompressed in chunks.by_ref() {
            let num_bits = bitpacker1x.num_bits_strictly_sorted(initial, decompressed);
            let compressed_len = BitPacker1x::compressed_block_size(num_bits);
            size += 1; // u8 chunk header
            size += compressed_len; // compressed data length
            initial = decompressed.iter().last().copied();
        }

        // ...until the remaining integers are too few for even the smallest bitpacker.
        // We then append them raw at the end of our buffer with a header saying so.
        let decompressed = chunks.remainder();
        if !decompressed.is_empty() {
            size += 1; // u8 chunk header
            size += mem::size_of_val(decompressed); // remaining uncompressed u32s
        }

        size
    }

    /// Writes the delta-encoded compressed version of the given roaring bitmap
    /// into the provided writer. Accepts a buffer to avoid allocating one.
    pub fn serialize_into_with_tmp_buffer<W: io::Write>(
        bitmap: &RoaringBitmap,
        mut writer: W,
        tmp_buffer: &mut Vec<u32>,
    ) -> io::Result<()> {
        // Insert the magic header
        writer.write_all(&MAGIC_HEADER.to_ne_bytes())?;

        let bitpacker8x = BitPacker8x::new();
        let bitpacker4x = BitPacker4x::new();
        let bitpacker1x = BitPacker1x::new();

        // This temporary buffer is used to store each chunk of decompressed and
        // compressed and delta-encoded u32s. We need room for the decompressed
        // u32s coming from the roaring bitmap, the compressed output that can
        // be as large as the decompressed u32s, and the chunk header.
        tmp_buffer.resize((BitPacker8x::BLOCK_LEN * 2) + 1, 0u32);
        let (decompressed, compressed) = tmp_buffer.split_at_mut(BitPacker8x::BLOCK_LEN);
        let compressed = bytemuck::cast_slice_mut(compressed);

        let mut buffer_index = 0;
        let mut initial = None;
        // We initially collect all the integers into a flat buffer of the size
        // of the largest bitpacker. We encode them with it until we don't have
        // enough of them...
        for n in bitmap {
            decompressed[buffer_index] = n;
            buffer_index += 1;
            if buffer_index == BitPacker8x::BLOCK_LEN {
                let output = encode_with_packer(&bitpacker8x, decompressed, initial, compressed);
                writer.write_all(output)?;
                initial = Some(n);
                buffer_index = 0;
            }
        }

        // ...We then switch to a smaller bitpacker to encode the remaining chunks...
        let decompressed = &decompressed[..buffer_index];
        let mut chunks = decompressed.chunks_exact(BitPacker4x::BLOCK_LEN);
        for decompressed in chunks.by_ref() {
            let output = encode_with_packer(&bitpacker4x, decompressed, initial, compressed);
            writer.write_all(output)?;
            initial = decompressed.iter().last().copied();
        }

        // ...And so on...
        let decompressed = chunks.remainder();
        let mut chunks = decompressed.chunks_exact(BitPacker1x::BLOCK_LEN);
        for decompressed in chunks.by_ref() {
            let output = encode_with_packer(&bitpacker1x, decompressed, initial, compressed);
            writer.write_all(output)?;
            initial = decompressed.iter().last().copied();
        }

        // ...until the remaining integers are too few for even the smallest bitpacker.
        // We then append them raw at the end of our buffer with a header saying so.
        let decompressed = chunks.remainder();
        if !decompressed.is_empty() {
            let header = encode_chunk_header(BitPackerLevel::None, u32::BITS as u8);
            // Note: Not convinced about the performance of writing a single
            // byte followed by a larger write. However, we will use this
            // codec with a BufWriter or directly with a Vec of bytes.
            writer.write_all(&[header])?;
            writer.write_all(bytemuck::cast_slice(decompressed))?;
        }

        Ok(())
    }
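
    // A worked example of the cascade above (illustrative only): with the
    // bitpacking crate, BitPacker8x, BitPacker4x and BitPacker1x pack blocks of
    // 256, 128 and 32 integers respectively. A bitmap of 300 sorted integers is
    // therefore written as one 8x block (256 values), no 4x block (the 44
    // leftovers are fewer than 128), one 1x block (32 values), and the last 12
    // integers appended raw, each block preceded by its one-byte chunk header
    // and the whole payload by the two-byte magic header.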

    /// Same as [Self::deserialize_from] but accepts a buffer to avoid allocating one.
    ///
    /// The `filter_block` function is used to filter out blocks. It takes the first
    /// and last u32 values of a block and returns `true` if the block must be skipped.
    pub fn deserialize_from_with_tmp_buffer<F>(
        input: &[u8],
        filter_block: F,
        tmp_buffer: &mut Vec<u32>,
    ) -> io::Result<RoaringBitmap>
    where
        F: Fn(u32, u32) -> bool,
    {
        let Some((header, mut compressed)) = input.split_at_checked(size_of_val(&MAGIC_HEADER))
        else {
            return Err(io::Error::new(ErrorKind::UnexpectedEof, "expecting a two-bytes header"));
        };

        // Safety: This unwrap cannot happen as the header buffer is the right size
        let header = u16::from_ne_bytes(header.try_into().unwrap());

        if header != MAGIC_HEADER {
            return Err(io::Error::other("invalid header value"));
        }

        let bitpacker8x = BitPacker8x::new();
        let bitpacker4x = BitPacker4x::new();
        let bitpacker1x = BitPacker1x::new();

        let mut bitmap = RoaringBitmap::new();
        tmp_buffer.resize(BitPacker8x::BLOCK_LEN, 0u32);
        let decompressed = &mut tmp_buffer[..];
        let mut initial = None;

        while let Some((&chunk_header, encoded)) = compressed.split_first() {
            let (level, num_bits) = decode_chunk_header(chunk_header);
            let (bytes_read, decompressed) = match level {
                BitPackerLevel::None => {
                    if num_bits != u32::BITS as u8 {
                        return Err(io::Error::new(
                            ErrorKind::InvalidData,
                            "invalid number of bits to encode non-compressed u32s",
                        ));
                    }

                    let chunks = encoded.chunks_exact(size_of::<u32>());
                    if !chunks.remainder().is_empty() {
                        return Err(io::Error::new(
                            io::ErrorKind::InvalidData,
                            "expecting last chunk to be a multiple of the size of an u32",
                        ));
                    }

                    let integers = chunks
                        // Safety: This unwrap cannot happen as
                        // the size of u32 is set correctly.
                        .map(|b| b.try_into().unwrap())
                        .map(u32::from_ne_bytes);

                    if let Some((first, last)) =
                        integers.clone().next().zip(integers.clone().next_back())
                    {
                        if !(filter_block)(first, last) {
                            bitmap
                                .append(integers)
                                .map_err(|e| io::Error::new(ErrorKind::InvalidData, e))?;
                        }
                    }

                    // This is basically always the last chunk that exists in
                    // this delta-encoded format as the raw u32s are appended
                    // when there are not enough of them to fit in a bitpacker.
                    break;
                }
                BitPackerLevel::BitPacker1x => {
                    decode_with_packer(&bitpacker1x, decompressed, initial, encoded, num_bits)
                }
                BitPackerLevel::BitPacker4x => {
                    decode_with_packer(&bitpacker4x, decompressed, initial, encoded, num_bits)
                }
                BitPackerLevel::BitPacker8x => {
                    decode_with_packer(&bitpacker8x, decompressed, initial, encoded, num_bits)
                }
            };

            initial = decompressed.iter().last().copied();
            if let Some((first, last)) = decompressed.first().copied().zip(initial) {
                if !(filter_block)(first, last) {
                    // TODO investigate perf
                    // Safety: Bitpackers cannot output unsorted integers when
                    // used with the compress_strictly_sorted function.
                    bitmap.append(decompressed.iter().copied()).unwrap();
                }
            }
            // What the delta-decoding read plus the chunk header size
            compressed = &compressed[bytes_read + 1..];
        }

        Ok(bitmap)
    }

    /// Returns the length (number of integers) stored in the serialized
    /// delta-encoded roaring bitmap, without fully decoding it.
    pub fn deserialize_length_from(input: &[u8]) -> io::Result<u64> {
        let Some((header, mut compressed)) = input.split_at_checked(size_of_val(&MAGIC_HEADER))
        else {
            return Err(io::Error::new(ErrorKind::UnexpectedEof, "expecting a two-bytes header"));
        };

        // Safety: This unwrap cannot happen as the header buffer is the right size
        let header = u16::from_ne_bytes(header.try_into().unwrap());

        if header != MAGIC_HEADER {
            return Err(io::Error::other("invalid header value"));
        }

        let mut length = 0;
        while let Some((&chunk_header, encoded)) = compressed.split_first() {
            let (level, num_bits) = decode_chunk_header(chunk_header);
            let bytes_read = match level {
                BitPackerLevel::None => {
                    if num_bits != u32::BITS as u8 {
                        return Err(io::Error::new(
                            ErrorKind::InvalidData,
                            "invalid number of bits to encode non-compressed u32s",
                        ));
                    }

                    let chunks = encoded.chunks_exact(size_of::<u32>());
                    if !chunks.remainder().is_empty() {
                        return Err(io::Error::new(
                            io::ErrorKind::InvalidData,
                            "expecting last chunk to be a multiple of the size of an u32",
                        ));
                    }

                    // This call is optimized for performance
                    // and will not iterate over the chunks.
                    length += chunks.count() as u64;

                    // This is basically always the last chunk that exists in
                    // this delta-encoded format as the raw u32s are appended
                    // when there are not enough of them to fit in a bitpacker.
                    break;
                }
                BitPackerLevel::BitPacker1x => {
                    length += BitPacker1x::BLOCK_LEN as u64;
                    BitPacker1x::compressed_block_size(num_bits)
                }
                BitPackerLevel::BitPacker4x => {
                    length += BitPacker4x::BLOCK_LEN as u64;
                    BitPacker4x::compressed_block_size(num_bits)
                }
                BitPackerLevel::BitPacker8x => {
                    length += BitPacker8x::BLOCK_LEN as u64;
                    BitPacker8x::compressed_block_size(num_bits)
                }
            };

            // What the delta-decoding read plus the chunk header size
            compressed = &compressed[bytes_read + 1..];
        }

        Ok(length)
    }
}

/// A utility function to take all blocks.
pub fn take_all_blocks(_first: u32, _last: u32) -> bool {
    false
}

/// Takes a strictly sorted list of u32s and outputs delta-encoded
/// bytes with a chunk header. We expect the output buffer to be
/// at least BLOCK_LEN + 1.
fn encode_with_packer<'c, B: BitPackerExt>(
    bitpacker: &B,
    decompressed: &[u32],
    initial: Option<u32>,
    output: &'c mut [u8],
) -> &'c [u8] {
    let num_bits = bitpacker.num_bits_strictly_sorted(initial, decompressed);
    let compressed_len = B::compressed_block_size(num_bits);
    let chunk_header = encode_chunk_header(B::level(), num_bits);
    let buffer = &mut output[..compressed_len + 1];
    // Safety: The buffer is at least one byte
    let (header_in_buffer, encoded) = buffer.split_first_mut().unwrap();
    *header_in_buffer = chunk_header;
    bitpacker.compress_strictly_sorted(initial, decompressed, encoded, num_bits);
    buffer
}

/// Returns the number of bytes read and the decoded unsigned integers.
fn decode_with_packer<'d, B: BitPacker>(
    bitpacker: &B,
    decompressed: &'d mut [u32],
    initial: Option<u32>,
    compressed: &[u8],
    num_bits: u8,
) -> (usize, &'d [u32]) {
    let decompressed = &mut decompressed[..B::BLOCK_LEN];
    let read = bitpacker.decompress_strictly_sorted(initial, compressed, decompressed, num_bits);
    (read, decompressed)
}

/// An identifier for the bitpacker to be able
/// to correctly decode the compressed integers.
#[derive(Debug, PartialEq, Eq)]
#[repr(u8)]
enum BitPackerLevel {
    /// The remaining bytes are raw native-endian encoded u32s.
    None,
    /// The remaining bits are encoded using a `BitPacker1x`.
    BitPacker1x,
    /// The remaining bits are encoded using a `BitPacker4x`.
    BitPacker4x,
    /// The remaining bits are encoded using a `BitPacker8x`.
    BitPacker8x,
}

/// Returns the chunk header based on the bitpacker level
/// and the number of bits to encode the list of integers.
fn encode_chunk_header(level: BitPackerLevel, num_bits: u8) -> u8 {
    debug_assert!(num_bits as u32 <= 2_u32.pow(6));
    let level = level as u8;
    debug_assert!(level <= 3);
    num_bits | (level << 6)
}

/// Decodes the chunk header and outputs the bitpacker level
/// and the number of bits to decode the following bytes.
fn decode_chunk_header(data: u8) -> (BitPackerLevel, u8) {
    let num_bits = data & 0b00111111;
    let level = match data >> 6 {
        0 => BitPackerLevel::None,
        1 => BitPackerLevel::BitPacker1x,
        2 => BitPackerLevel::BitPacker4x,
        3 => BitPackerLevel::BitPacker8x,
        invalid => panic!("Invalid bitpacker level: {invalid}"),
    };
    debug_assert!(num_bits as u32 <= 2_u32.pow(6));
    (level, num_bits)
}
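
// A worked example of the header layout above (illustrative, not part of the
// patch): the two high bits carry the bitpacker level and the six low bits the
// bit width, so encode_chunk_header(BitPackerLevel::BitPacker4x, 13) yields
// 0b10_001101 = 141, and decode_chunk_header(141) returns (BitPacker4x, 13).
// The debug_assert is slightly loose (2^6 = 64 would need a seventh bit), but
// real bit widths never exceed u32::BITS = 32, which fits in six bits.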

/// A simple helper trait to get the BitPackerLevel
/// and correctly generate the chunk header.
trait BitPackerExt: BitPacker {
    /// Returns the level of the bitpacker: an identifier to be
    /// able to decode the numbers with the right bitpacker.
    fn level() -> BitPackerLevel;
}

impl BitPackerExt for BitPacker8x {
    fn level() -> BitPackerLevel {
        BitPackerLevel::BitPacker8x
    }
}

impl BitPackerExt for BitPacker4x {
    fn level() -> BitPackerLevel {
        BitPackerLevel::BitPacker4x
    }
}

impl BitPackerExt for BitPacker1x {
    fn level() -> BitPackerLevel {
        BitPackerLevel::BitPacker1x
    }
}

#[cfg(test)]
mod tests {
    use quickcheck::quickcheck;
    use roaring::RoaringBitmap;

    use super::{take_all_blocks, DeRoaringBitmapCodec};

    quickcheck! {
        fn qc_random(xs: Vec<u32>) -> bool {
            let bitmap = RoaringBitmap::from_iter(xs);
            let mut compressed = Vec::new();
            let mut tmp_buffer = Vec::new();
            DeRoaringBitmapCodec::serialize_into_with_tmp_buffer(&bitmap, &mut compressed, &mut tmp_buffer).unwrap();
            let length = DeRoaringBitmapCodec::deserialize_length_from(&compressed[..]).unwrap();
            let decompressed = DeRoaringBitmapCodec::deserialize_from_with_tmp_buffer(&compressed[..], take_all_blocks, &mut tmp_buffer).unwrap();
            length == bitmap.len() && decompressed == bitmap
        }
    }

    quickcheck! {
        fn qc_random_check_serialized_size(xs: Vec<u32>) -> bool {
            let bitmap = RoaringBitmap::from_iter(xs);
            let mut compressed = Vec::new();
            let mut tmp_buffer = Vec::new();
            DeRoaringBitmapCodec::serialize_into_with_tmp_buffer(&bitmap, &mut compressed, &mut tmp_buffer).unwrap();
            let length = DeRoaringBitmapCodec::deserialize_length_from(&compressed).unwrap();
            let expected_len = DeRoaringBitmapCodec::serialized_size_with_tmp_buffer(&bitmap, &mut tmp_buffer);
            length == bitmap.len() && compressed.len() == expected_len
        }
    }

    quickcheck! {
        fn qc_random_intersection_with_serialized(lhs: Vec<u32>, rhs: Vec<u32>) -> bool {
            let mut compressed = Vec::new();
            let mut tmp_buffer = Vec::new();

            let lhs = RoaringBitmap::from_iter(lhs);
            let rhs = RoaringBitmap::from_iter(rhs);
            DeRoaringBitmapCodec::serialize_into_with_tmp_buffer(&lhs, &mut compressed, &mut tmp_buffer).unwrap();

            let sub_lhs = DeRoaringBitmapCodec::deserialize_from_with_tmp_buffer(&compressed, |first, last| {
                rhs.range_cardinality(first..=last) == 0
            }, &mut tmp_buffer).unwrap();

            let intersection = sub_lhs & rhs.clone();
            let expected_intersection = lhs & rhs;

            intersection == expected_intersection
        }
    }
}

@@ -1,7 +1,9 @@
-mod bo_roaring_bitmap_codec;
-pub mod cbo_roaring_bitmap_codec;
+pub(super) mod cbo_roaring_bitmap_codec;
+mod de_cbo_roaring_bitmap_codec;
+pub(super) mod de_roaring_bitmap_codec;
 mod roaring_bitmap_codec;
 
-pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
-pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
+pub use self::cbo_roaring_bitmap_codec::THRESHOLD;
+pub use self::de_cbo_roaring_bitmap_codec::{DeCboRoaringBitmapCodec, DELTA_ENCODING_STATUS};
+pub use self::de_roaring_bitmap_codec::take_all_blocks;
 pub use self::roaring_bitmap_codec::RoaringBitmapCodec;

@@ -3,7 +3,7 @@ use std::mem;
 use heed::{BoxedError, BytesDecode};
 
 use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
-use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
+use crate::heed_codec::roaring_bitmap::THRESHOLD;
 use crate::heed_codec::BytesDecodeOwned;
 
 pub struct CboRoaringBitmapLenCodec;

@@ -0,0 +1,42 @@
use std::io::ErrorKind;

use heed::{BoxedError, BytesDecode};

use super::BoRoaringBitmapLenCodec;
use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
use crate::heed_codec::roaring_bitmap::de_roaring_bitmap_codec::DeRoaringBitmapCodec;
use crate::heed_codec::roaring_bitmap_length::CboRoaringBitmapLenCodec;
use crate::heed_codec::BytesDecodeOwned;

pub struct DeCboRoaringBitmapLenCodec;

impl BytesDecode<'_> for DeCboRoaringBitmapLenCodec {
    type DItem = u64;

    fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        if CboRoaringBitmapCodec::bytes_deserialize_as_raw_u32s(bytes) {
            // If there are at most THRESHOLD integers that can fit into this
            // array of bytes, it means that we used the ByteOrder codec
            // serializer.
            BoRoaringBitmapLenCodec::bytes_decode(bytes)
        } else {
            match DeRoaringBitmapCodec::deserialize_length_from(bytes) {
                Ok(bitmap) => Ok(bitmap),
                // If the error kind is Other it means that the delta-decoder found
                // an invalid magic header. We fall back to the CboRoaringBitmap version.
                Err(e) if e.kind() == ErrorKind::Other => {
                    CboRoaringBitmapLenCodec::bytes_decode(bytes)
                }
                Err(e) => Err(e.into()),
            }
        }
    }
}

impl BytesDecodeOwned for DeCboRoaringBitmapLenCodec {
    type DItem = u64;

    fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
        Self::bytes_decode(bytes)
    }
}
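
The point of this length codec is that callers can count the entries of a stored
bitmap without materialising it. A minimal usage sketch, mirroring how
`Index::word_documents_count` uses it later in this diff (the `index`, `rtxn` and
`"word"` values are illustrative assumptions):

    let count: Option<u64> = index
        .word_docids
        .remap_data_type::<DeCboRoaringBitmapLenCodec>()
        .get(&rtxn, "word")?;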

@@ -1,7 +1,9 @@
 mod bo_roaring_bitmap_len_codec;
 mod cbo_roaring_bitmap_len_codec;
+mod de_cbo_roaring_bitmap_len_codec;
 mod roaring_bitmap_len_codec;
 
-pub use self::bo_roaring_bitmap_len_codec::BoRoaringBitmapLenCodec;
-pub use self::cbo_roaring_bitmap_len_codec::CboRoaringBitmapLenCodec;
-pub use self::roaring_bitmap_len_codec::RoaringBitmapLenCodec;
+use self::bo_roaring_bitmap_len_codec::BoRoaringBitmapLenCodec;
+use self::cbo_roaring_bitmap_len_codec::CboRoaringBitmapLenCodec;
+pub use self::de_cbo_roaring_bitmap_len_codec::DeCboRoaringBitmapLenCodec;
+use self::roaring_bitmap_len_codec::RoaringBitmapLenCodec;

@@ -72,16 +72,15 @@ impl BytesDecodeOwned for RoaringBitmapLenCodec {
 
 #[cfg(test)]
 mod tests {
-    use heed::BytesEncode;
     use roaring::RoaringBitmap;
 
     use super::*;
-    use crate::heed_codec::RoaringBitmapCodec;
 
     #[test]
     fn deserialize_roaring_bitmap_length() {
         let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect();
-        let bytes = RoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
+        let mut bytes = Vec::new();
+        bitmap.serialize_into(&mut bytes).unwrap();
         let len = RoaringBitmapLenCodec::deserialize_from_slice(&bytes).unwrap();
         assert_eq!(bitmap.len(), len);
     }

@@ -34,11 +34,10 @@ use crate::update::new::StdResult;
 use crate::vector::db::IndexEmbeddingConfigs;
 use crate::vector::{Embedding, VectorStore, VectorStoreBackend, VectorStoreStats};
 use crate::{
-    default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
-    FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
-    FieldidsWeightsMap, FilterableAttributesRule, GeoPoint, LocalizedAttributesRule, ObkvCodec,
-    Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32,
-    BEU64,
+    default_criteria, Criterion, DeCboRoaringBitmapCodec, DeCboRoaringBitmapLenCodec, DocumentId,
+    ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry,
+    FieldIdWordCountCodec, FieldidsWeightsMap, FilterableAttributesRule, GeoPoint,
+    LocalizedAttributesRule, ObkvCodec, Result, Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64,
 };
 
 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;

@@ -133,38 +132,38 @@ pub struct Index {
     pub external_documents_ids: Database<Str, BEU32>,
 
     /// A word and all the documents ids containing the word.
-    pub word_docids: Database<Str, CboRoaringBitmapCodec>,
+    pub word_docids: Database<Str, DeCboRoaringBitmapCodec>,
 
     /// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
-    pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
+    pub exact_word_docids: Database<Str, DeCboRoaringBitmapCodec>,
 
     /// A prefix of word and all the documents ids containing this prefix.
-    pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
+    pub word_prefix_docids: Database<Str, DeCboRoaringBitmapCodec>,
 
     /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
-    pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
+    pub exact_word_prefix_docids: Database<Str, DeCboRoaringBitmapCodec>,
 
     /// Maps the proximity between a pair of words with all the docids where this relation appears.
-    pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
+    pub word_pair_proximity_docids: Database<U8StrStrCodec, DeCboRoaringBitmapCodec>,
 
     /// Maps the word and the position with the docids that corresponds to it.
-    pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    pub word_position_docids: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
     /// Maps the word and the field id with the docids that corresponds to it.
-    pub word_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    pub word_fid_docids: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
 
     /// Maps the field id and the word count with the docids that corresponds to it.
-    pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
+    pub field_id_word_count_docids: Database<FieldIdWordCountCodec, DeCboRoaringBitmapCodec>,
     /// Maps the word prefix and a position with all the docids where the prefix appears at the position.
-    pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    pub word_prefix_position_docids: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
     /// Maps the word prefix and a field id with all the docids where the prefix appears inside the field
-    pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    pub word_prefix_fid_docids: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
 
     /// Maps the facet field id and the docids for which this field exists
-    pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
+    pub facet_id_exists_docids: Database<FieldIdCodec, DeCboRoaringBitmapCodec>,
     /// Maps the facet field id and the docids for which this field is set as null
-    pub facet_id_is_null_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
+    pub facet_id_is_null_docids: Database<FieldIdCodec, DeCboRoaringBitmapCodec>,
     /// Maps the facet field id and the docids for which this field is considered empty
-    pub facet_id_is_empty_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>,
+    pub facet_id_is_empty_docids: Database<FieldIdCodec, DeCboRoaringBitmapCodec>,
 
     /// Maps the facet field id and ranges of numbers with the docids that corresponds to them.
     pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
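
Switching these database definitions to `DeCboRoaringBitmapCodec` only changes how
new bitmaps are written; on the read side the codec dispatches on the stored bytes
(plain u32 list under the threshold, delta-encoding magic header, or legacy
CboRoaringBitmap as a fallback), so values written before this change should still
decode. A hedged sketch of that expectation (variable names are illustrative):

    // bytes previously written with the legacy codec
    let legacy_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap)?;
    // the new codec is expected to read them unchanged
    let decoded = DeCboRoaringBitmapCodec::bytes_decode(&legacy_bytes)?;
    assert_eq!(decoded, bitmap);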

@@ -505,7 +504,7 @@ impl Index {
         wtxn: &mut RwTxn<'_>,
         docids: &RoaringBitmap,
     ) -> heed::Result<()> {
-        self.main.remap_types::<Str, RoaringBitmapCodec>().put(
+        self.main.remap_types::<Str, DeCboRoaringBitmapCodec>().put(
             wtxn,
             main_key::DOCUMENTS_IDS_KEY,
             docids,

@@ -516,7 +515,7 @@ impl Index {
     pub fn documents_ids(&self, rtxn: &RoTxn<'_>) -> heed::Result<RoaringBitmap> {
         Ok(self
             .main
-            .remap_types::<Str, RoaringBitmapCodec>()
+            .remap_types::<Str, DeCboRoaringBitmapCodec>()
             .get(rtxn, main_key::DOCUMENTS_IDS_KEY)?
             .unwrap_or_default())
     }

@@ -525,7 +524,7 @@ impl Index {
     pub fn number_of_documents(&self, rtxn: &RoTxn<'_>) -> Result<u64> {
         let count = self
             .main
-            .remap_types::<Str, RoaringBitmapLenCodec>()
+            .remap_types::<Str, DeCboRoaringBitmapLenCodec>()
             .get(rtxn, main_key::DOCUMENTS_IDS_KEY)?;
         Ok(count.unwrap_or_default())
     }

@@ -726,7 +725,7 @@ impl Index {
         wtxn: &mut RwTxn<'_>,
         docids: &RoaringBitmap,
     ) -> heed::Result<()> {
-        self.main.remap_types::<Str, RoaringBitmapCodec>().put(
+        self.main.remap_types::<Str, DeCboRoaringBitmapCodec>().put(
             wtxn,
             main_key::GEO_FACETED_DOCUMENTS_IDS_KEY,
             docids,

@@ -745,7 +744,7 @@ impl Index {
     pub fn geo_faceted_documents_ids(&self, rtxn: &RoTxn<'_>) -> heed::Result<RoaringBitmap> {
         match self
             .main
-            .remap_types::<Str, RoaringBitmapCodec>()
+            .remap_types::<Str, DeCboRoaringBitmapCodec>()
             .get(rtxn, main_key::GEO_FACETED_DOCUMENTS_IDS_KEY)?
         {
             Some(docids) => Ok(docids),

@@ -1398,7 +1397,7 @@ impl Index {
     /// Returns the number of documents ids associated with the given word,
     /// it is much faster than deserializing the bitmap and getting the length of it.
     pub fn word_documents_count(&self, rtxn: &RoTxn<'_>, word: &str) -> heed::Result<Option<u64>> {
-        self.word_docids.remap_data_type::<RoaringBitmapLenCodec>().get(rtxn, word)
+        self.word_docids.remap_data_type::<DeCboRoaringBitmapLenCodec>().get(rtxn, word)
     }

     /* documents */

@@ -72,9 +72,8 @@ pub use self::filterable_attributes_rules::{
     FilterableAttributesRule,
 };
 pub use self::heed_codec::{
-    BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
-    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
-    RoaringBitmapCodec, RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec,
+    BEU16StrCodec, BEU32StrCodec, DeCboRoaringBitmapCodec, DeCboRoaringBitmapLenCodec,
+    FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, StrBEU32Codec, U8StrStrCodec,
     UncheckedU8StrStrCodec,
 };
 pub use self::index::Index;

@@ -10,7 +10,7 @@ use crate::heed_codec::facet::{
     FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
 };
 use crate::heed_codec::BytesRefCodec;
-use crate::{CboRoaringBitmapCodec, DocumentId};
+use crate::{DeCboRoaringBitmapCodec, DocumentId};
 
 /// Call the given closure on the facet distribution of the candidate documents.
 ///

@@ -88,7 +88,7 @@ where
             if key.field_id != field_id {
                 break;
             }
-            let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
+            let intersection = DeCboRoaringBitmapCodec::intersection_with_serialized(
                 value.bitmap_bytes,
                 candidates,
             )?;

@@ -120,7 +120,7 @@ where
             if key.field_id != field_id {
                 break;
             }
-            let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
+            let intersection = DeCboRoaringBitmapCodec::intersection_with_serialized(
                 value.bitmap_bytes,
                 candidates,
             )?;

@@ -173,7 +173,7 @@ where
         if key.field_id != self.field_id {
             return Ok(ControlFlow::Break(()));
         }
-        let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
+        let docids_in_common = DeCboRoaringBitmapCodec::intersection_with_serialized(
             value.bitmap_bytes,
             candidates,
         )?;

@@ -210,7 +210,7 @@ where
         if key.field_id != self.field_id {
             return Ok(ControlFlow::Break(()));
         }
-        let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
+        let docids_in_common = DeCboRoaringBitmapCodec::intersection_with_serialized(
             value.bitmap_bytes,
             candidates,
         )?;

@@ -8,7 +8,7 @@ use crate::heed_codec::facet::{
     FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
 };
 use crate::heed_codec::BytesRefCodec;
-use crate::{CboRoaringBitmapCodec, Result};
+use crate::{DeCboRoaringBitmapCodec, Result};
 
 /// Find all the document ids for which the given field contains a value contained within
 /// the two bounds.

@@ -114,11 +114,11 @@ impl<'t> FacetRangeSearch<'t, '_, '_> {
 
             if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) {
                 *self.docids |= match self.universe {
-                    Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
+                    Some(universe) => DeCboRoaringBitmapCodec::intersection_with_serialized(
                         value.bitmap_bytes,
                         universe,
                     )?,
-                    None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?,
+                    None => DeCboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?,
                 };
             }
         }

@@ -211,11 +211,11 @@ impl<'t> FacetRangeSearch<'t, '_, '_> {
             };
             if should_take_whole_group {
                 *self.docids |= match self.universe {
-                    Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
+                    Some(universe) => DeCboRoaringBitmapCodec::intersection_with_serialized(
                         previous_value.bitmap_bytes,
                         universe,
                     )?,
-                    None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
+                    None => DeCboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
                 };
                 previous_key = next_key;
                 previous_value = next_value;

@@ -313,11 +313,11 @@ impl<'t> FacetRangeSearch<'t, '_, '_> {
             };
             if should_take_whole_group {
                 *self.docids |= match self.universe {
-                    Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
+                    Some(universe) => DeCboRoaringBitmapCodec::intersection_with_serialized(
                         previous_value.bitmap_bytes,
                         universe,
                     )?,
-                    None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
+                    None => DeCboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
                 };
             } else {
                 let level = level - 1;

@@ -12,9 +12,9 @@ use super::interner::Interned;
 use super::Word;
 use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
 use crate::proximity::ProximityPrecision;
-use crate::update::MergeCboRoaringBitmaps;
+use crate::update::MergeDeCboRoaringBitmaps;
 use crate::{
-    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
+    DeCboRoaringBitmapCodec, DeCboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
 };
 
 /// A cache storing pointers to values in the LMDB databases.

@@ -72,11 +72,11 @@ impl<'ctx> DatabaseCache<'ctx> {
 
         match (bitmap_bytes, universe) {
             (bytes, Some(universe)) => {
-                CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
+                DeCboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
                     .map(Some)
                     .map_err(Into::into)
             }
-            (bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
+            (bytes, None) => DeCboRoaringBitmapCodec::bytes_decode_owned(bytes)
                 .map(Some)
                 .map_err(heed::Error::Decoding)
                 .map_err(Into::into),

@@ -105,7 +105,7 @@ impl<'ctx> DatabaseCache<'ctx> {
             None => return Ok(None),
         };
 
-        CboRoaringBitmapLenCodec::bytes_decode_owned(bitmap_bytes)
+        DeCboRoaringBitmapLenCodec::bytes_decode_owned(bitmap_bytes)
             .map(Some)
             .map_err(heed::Error::Decoding)
             .map_err(Into::into)

@@ -157,11 +157,11 @@ impl<'ctx> DatabaseCache<'ctx> {
 
         match (bitmap_bytes, universe) {
             (bytes, Some(universe)) => {
-                CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
+                DeCboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)
                     .map(Some)
                     .map_err(Into::into)
             }
-            (bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes)
+            (bytes, None) => DeCboRoaringBitmapCodec::bytes_decode_owned(bytes)
                 .map(Some)
                 .map_err(heed::Error::Decoding)
                 .map_err(Into::into),

@@ -223,7 +223,7 @@ impl<'ctx> SearchContext<'ctx> {
                     &mut self.db_cache.word_docids,
                     self.index.word_fid_docids.remap_data_type::<Bytes>(),
                     universe,
-                    MergeCboRoaringBitmaps,
+                    MergeDeCboRoaringBitmaps,
                 )
             }
             None => DatabaseCache::get_value(

@@ -255,7 +255,7 @@ impl<'ctx> SearchContext<'ctx> {
                     &mut self.db_cache.exact_word_docids,
                     self.index.word_fid_docids.remap_data_type::<Bytes>(),
                     universe,
-                    MergeCboRoaringBitmaps,
+                    MergeDeCboRoaringBitmaps,
                 )
             }
             None => DatabaseCache::get_value(

@@ -312,7 +312,7 @@ impl<'ctx> SearchContext<'ctx> {
                     &mut self.db_cache.word_prefix_docids,
                     self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
                     universe,
-                    MergeCboRoaringBitmaps,
+                    MergeDeCboRoaringBitmaps,
                 )
             }
             None => DatabaseCache::get_value(

@@ -344,7 +344,7 @@ impl<'ctx> SearchContext<'ctx> {
                     &mut self.db_cache.exact_word_prefix_docids,
                     self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
                     universe,
-                    MergeCboRoaringBitmaps,
+                    MergeDeCboRoaringBitmaps,
                 )
             }
             None => DatabaseCache::get_value(

@@ -377,7 +377,7 @@ impl<'ctx> SearchContext<'ctx> {
         {
             docids
                 .as_ref()
-                .map(|d| CboRoaringBitmapCodec::bytes_decode_owned(d))
+                .map(|d| DeCboRoaringBitmapCodec::bytes_decode_owned(d))
                 .transpose()
                 .map_err(heed::Error::Decoding)?
         } else {

@@ -395,7 +395,7 @@ impl<'ctx> SearchContext<'ctx> {
                     docids |= word1_docids & word2_docids;
                 }
             }
-            let encoded = CboRoaringBitmapCodec::bytes_encode(&docids)
+            let encoded = DeCboRoaringBitmapCodec::bytes_encode(&docids)
                 .map(Cow::into_owned)
                 .map(Cow::Owned)
                 .map(Some)

@@ -6,7 +6,7 @@ use super::ranking_rules::{RankingRule, RankingRuleOutput};
 use crate::score_details::{self, ScoreDetails};
 use crate::search::new::query_graph::QueryNodeData;
 use crate::search::new::query_term::ExactTerm;
-use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger, TimeBudget};
+use crate::{DeCboRoaringBitmapCodec, Result, SearchContext, SearchLogger, TimeBudget};
 
 /// A ranking rule that produces 3 disjoint buckets:
 ///

@@ -219,7 +219,7 @@ impl State {
 
                 match bitmap_bytes {
                     Some(bytes) => {
-                        CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)?
+                        DeCboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)?
                     }
                     None => RoaringBitmap::default(),
                 }

@@ -14,8 +14,8 @@ use crate::heed_codec::facet::{
 use crate::heed_codec::BytesRefCodec;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
-use crate::update::MergeDeladdCboRoaringBitmaps;
-use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
+use crate::update::MergeDeladdDeCboRoaringBitmaps;
+use crate::{DeCboRoaringBitmapCodec, DeCboRoaringBitmapLenCodec, FieldId, Index, Result};
 
 /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases
 /// by rebuilding the database "from scratch".

@@ -29,7 +29,7 @@ pub struct FacetsUpdateBulk<'i> {
     facet_type: FacetType,
     field_ids: Vec<FieldId>,
     // None if level 0 does not need to be updated
-    delta_data: Option<Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>>,
+    delta_data: Option<Merger<BufReader<File>, MergeDeladdDeCboRoaringBitmaps>>,
 }
 
 impl<'i> FacetsUpdateBulk<'i> {

@@ -37,7 +37,7 @@ impl<'i> FacetsUpdateBulk<'i> {
         index: &'i Index,
         field_ids: Vec<FieldId>,
         facet_type: FacetType,
-        delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
+        delta_data: Merger<BufReader<File>, MergeDeladdDeCboRoaringBitmaps>,
         group_size: u8,
         min_level_size: u8,
     ) -> FacetsUpdateBulk<'i> {

@@ -90,7 +90,7 @@ impl<'i> FacetsUpdateBulk<'i> {
 /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
 pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
     pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
-    pub delta_data: Option<Merger<R, MergeDeladdCboRoaringBitmaps>>,
+    pub delta_data: Option<Merger<R, MergeDeladdDeCboRoaringBitmaps>>,
     pub group_size: u8,
     pub min_level_size: u8,
 }

@@ -143,6 +143,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
             }
         } else {
             let mut buffer = Vec::new();
+            let mut tmp_buffer = Vec::new();
             let database = self.db.remap_types::<Bytes, Bytes>();
 
             let mut iter = delta_data.into_stream_merger_iter()?;

@@ -162,7 +163,12 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                     Some(prev_value) => {
                         // prev_value is the group size for level 0, followed by the previous bitmap.
                         let old_bitmap = &prev_value[1..];
-                        CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
+                        DeCboRoaringBitmapCodec::merge_deladd_into(
+                            value,
+                            old_bitmap,
+                            &mut buffer,
+                            &mut tmp_buffer,
+                        )?;
                     }
                     None => {
                         // it is safe to ignore the del in that case.

@@ -176,7 +182,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                 };
                 let new_bitmap = &buffer[1..];
                 // if the new bitmap is empty, let's remove it
-                if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
+                if DeCboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
                     database.delete(wtxn, key)?;
                 } else {
                     database.put(wtxn, key, &buffer)?;
@@ -15,8 +15,8 @@ use crate::heed_codec::BytesRefCodec;
 use crate::search::facet::get_highest_level;
 use crate::update::del_add::DelAdd;
 use crate::update::index_documents::valid_lmdb_key;
-use crate::update::MergeDeladdCboRoaringBitmaps;
+use crate::update::MergeDeladdDeCboRoaringBitmaps;
-use crate::{CboRoaringBitmapCodec, Index, Result};
+use crate::{DeCboRoaringBitmapCodec, Index, Result};

 /// Enum used as a return value for the facet incremental indexing.
 ///
@@ -58,14 +58,14 @@ enum ModificationResult {
 /// `facet_id_(string/f64)_docids` databases.
 pub struct FacetsUpdateIncremental {
 inner: FacetsUpdateIncrementalInner,
-delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
+delta_data: Merger<BufReader<File>, MergeDeladdDeCboRoaringBitmaps>,
 }

 impl FacetsUpdateIncremental {
 pub fn new(
 index: &Index,
 facet_type: FacetType,
-delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
+delta_data: Merger<BufReader<File>, MergeDeladdDeCboRoaringBitmaps>,
 group_size: u8,
 min_level_size: u8,
 max_group_size: u8,
@@ -112,13 +112,13 @@ impl FacetsUpdateIncremental {
 let value = KvReader::from_slice(value);
 let docids_to_delete = value
 .get(DelAdd::Deletion)
-.map(CboRoaringBitmapCodec::bytes_decode)
+.map(DeCboRoaringBitmapCodec::bytes_decode)
 .map(|o| o.map_err(heed::Error::Encoding))
 .transpose()?;

 let docids_to_add = value
 .get(DelAdd::Addition)
-.map(CboRoaringBitmapCodec::bytes_decode)
+.map(DeCboRoaringBitmapCodec::bytes_decode)
 .map(|o| o.map_err(heed::Error::Encoding))
 .transpose()?;

@@ -90,7 +90,7 @@ use tracing::debug;

 use self::incremental::FacetsUpdateIncremental;
 use super::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
-use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps};
+use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdDeCboRoaringBitmaps};
 use crate::facet::FacetType;
 use crate::heed_codec::facet::{
 FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec,
@@ -112,7 +112,7 @@ pub struct FacetsUpdate<'i> {
 index: &'i Index,
 database: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
 facet_type: FacetType,
-delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
+delta_data: Merger<BufReader<File>, MergeDeladdDeCboRoaringBitmaps>,
 normalized_delta_data: Option<Merger<BufReader<File>, MergeDeladdBtreesetString>>,
 group_size: u8,
 max_group_size: u8,
@@ -124,7 +124,7 @@ impl<'i> FacetsUpdate<'i> {
 pub fn new(
 index: &'i Index,
 facet_type: FacetType,
-delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
+delta_data: Merger<BufReader<File>, MergeDeladdDeCboRoaringBitmaps>,
 normalized_delta_data: Option<Merger<BufReader<File>, MergeDeladdBtreesetString>>,
 data_size: u64,
 ) -> Self {
@@ -364,9 +364,9 @@ pub(crate) mod test_helpers {
 use crate::search::facet::get_highest_level;
 use crate::snapshot_tests::display_bitmap;
 use crate::update::del_add::{DelAdd, KvWriterDelAdd};
-use crate::update::index_documents::MergeDeladdCboRoaringBitmaps;
+use crate::update::index_documents::MergeDeladdDeCboRoaringBitmaps;
 use crate::update::FacetsUpdateIncrementalInner;
-use crate::CboRoaringBitmapCodec;
+use crate::DeCboRoaringBitmapCodec;

 /// Utility function to generate a string whose position in a lexicographically
 /// ordered list is `i`.
@@ -496,13 +496,13 @@ pub(crate) mod test_helpers {
 FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
 let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key).unwrap();
 let mut inner_writer = KvWriterDelAdd::memory();
-let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
+let value = DeCboRoaringBitmapCodec::bytes_encode(docids).unwrap();
 inner_writer.insert(DelAdd::Addition, value).unwrap();
 writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
 }
 writer.finish().unwrap();
 let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
-let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
 builder.push(reader.into_cursor().unwrap());
 let merger = builder.build();

@@ -4,7 +4,7 @@ use std::io::{self, BufReader};
 use heed::{BytesDecode, BytesEncode};

 use super::helpers::{
-create_sorter, sorter_into_reader, GrenadParameters, MergeDeladdCboRoaringBitmaps,
+create_sorter, sorter_into_reader, GrenadParameters, MergeDeladdDeCboRoaringBitmaps,
 };
 use crate::heed_codec::facet::{
 FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
@@ -27,7 +27,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(

 let mut facet_number_docids_sorter = create_sorter(
 grenad::SortAlgorithm::Unstable,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 indexer.chunk_compression_type,
 indexer.chunk_compression_level,
 indexer.max_nb_chunks,

@@ -14,7 +14,7 @@ use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
 use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::helpers::{
-MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
+MergeDeladdBtreesetString, MergeDeladdDeCboRoaringBitmaps,
 };
 use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
 use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
@@ -54,7 +54,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(

 let mut facet_string_docids_sorter = create_sorter(
 grenad::SortAlgorithm::Stable,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 indexer.chunk_compression_type,
 indexer.chunk_compression_level,
 indexer.max_nb_chunks,
@@ -154,7 +154,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(

 let mut facet_string_docids_sorter = create_sorter(
 grenad::SortAlgorithm::Stable,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 indexer.chunk_compression_type,
 indexer.chunk_compression_level,
 indexer.max_nb_chunks,

@@ -19,7 +19,7 @@ use crate::facet::value_encoding::f64_into_bytes;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{create_writer, writer_into_reader};
 use crate::update::settings::InnerIndexSettingsDiff;
-use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, MAX_FACET_VALUE_LENGTH};
+use crate::{DeCboRoaringBitmapCodec, DocumentId, FieldId, Result, MAX_FACET_VALUE_LENGTH};

 /// The length of the elements that are always in the buffer when inserting new values.
 const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
@@ -311,8 +311,8 @@ fn deladd_obkv_cbo_roaring_bitmaps(
 ) -> io::Result<()> {
 buffer.clear();
 let mut obkv = KvWriterDelAdd::new(buffer);
-let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
+let del_bitmap_bytes = DeCboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
-let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
+let add_bitmap_bytes = DeCboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
 obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
 obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
 obkv.finish()

@@ -5,7 +5,7 @@ use obkv::KvReaderU16;

 use super::helpers::{
 create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
@@ -30,7 +30,7 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(

 let mut fid_word_count_docids_sorter = create_sorter(
 grenad::SortAlgorithm::Unstable,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 indexer.chunk_compression_type,
 indexer.chunk_compression_level,
 indexer.max_nb_chunks,

@@ -7,7 +7,7 @@ use obkv::KvReaderU16;

 use super::helpers::{
 create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 };
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
@@ -38,7 +38,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(

 let mut word_fid_docids_sorter = create_sorter(
 grenad::SortAlgorithm::Unstable,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 indexer.chunk_compression_type,
 indexer.chunk_compression_level,
 indexer.max_nb_chunks,
@@ -93,7 +93,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(

 let mut word_docids_sorter = create_sorter(
 grenad::SortAlgorithm::Unstable,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 indexer.chunk_compression_type,
 indexer.chunk_compression_level,
 indexer.max_nb_chunks,
@@ -103,7 +103,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(

 let mut exact_word_docids_sorter = create_sorter(
 grenad::SortAlgorithm::Unstable,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 indexer.chunk_compression_type,
 indexer.chunk_compression_level,
 indexer.max_nb_chunks,
@@ -166,7 +166,7 @@ fn words_into_sorter(
 key_buffer: &mut Vec<u8>,
 del_words: &BTreeSet<Vec<u8>>,
 add_words: &BTreeSet<Vec<u8>>,
-word_fid_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
+word_fid_docids_sorter: &mut grenad::Sorter<MergeDeladdDeCboRoaringBitmaps>,
 ) -> Result<()> {
 use itertools::merge_join_by;
 use itertools::EitherOrBoth::{Both, Left, Right};

@@ -7,7 +7,7 @@ use obkv::KvReaderU16;

 use super::helpers::{
 create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
@@ -44,7 +44,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 .map(|_| {
 create_sorter(
 grenad::SortAlgorithm::Unstable,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 indexer.chunk_compression_type,
 indexer.chunk_compression_level,
 indexer.max_nb_chunks,
@@ -198,7 +198,7 @@ fn document_word_positions_into_sorter(
 document_id: DocumentId,
 del_word_pair_proximity: &BTreeMap<(String, String), u8>,
 add_word_pair_proximity: &BTreeMap<(String, String), u8>,
-word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeDeladdCboRoaringBitmaps>],
+word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeDeladdDeCboRoaringBitmaps>],
 ) -> Result<()> {
 use itertools::merge_join_by;
 use itertools::EitherOrBoth::{Both, Left, Right};

@@ -6,7 +6,7 @@ use obkv::KvReaderU16;

 use super::helpers::{
 create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
@@ -28,7 +28,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(

 let mut word_position_docids_sorter = create_sorter(
 grenad::SortAlgorithm::Unstable,
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 indexer.chunk_compression_type,
 indexer.chunk_compression_level,
 indexer.max_nb_chunks,
@@ -100,7 +100,7 @@ fn words_position_into_sorter(
 key_buffer: &mut Vec<u8>,
 del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
 add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
-word_position_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
+word_position_docids_sorter: &mut grenad::Sorter<MergeDeladdDeCboRoaringBitmaps>,
 ) -> Result<()> {
 use itertools::merge_join_by;
 use itertools::EitherOrBoth::{Both, Left, Right};

@@ -7,7 +7,7 @@ use either::Either;
 use grenad::MergeFunction;
 use roaring::RoaringBitmap;

-use crate::heed_codec::CboRoaringBitmapCodec;
+use crate::heed_codec::DeCboRoaringBitmapCodec;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::transform::Operation;
 use crate::Result;
@@ -189,10 +189,10 @@ impl MergeFunction for ObkvsKeepLastAdditionMergeDeletions {
 }
 }

-/// Do a union of all the CboRoaringBitmaps in the values.
+/// Do a union of all the DeCboRoaringBitmaps in the values.
-pub struct MergeCboRoaringBitmaps;
+pub struct MergeDeCboRoaringBitmaps;

-impl MergeFunction for MergeCboRoaringBitmaps {
+impl MergeFunction for MergeDeCboRoaringBitmaps {
 type Error = crate::Error;

 fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
@@ -200,17 +200,17 @@ impl MergeFunction for MergeCboRoaringBitmaps {
 Ok(values[0].clone())
 } else {
 let mut vec = Vec::new();
-CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
+DeCboRoaringBitmapCodec::merge_into(values, &mut vec)?;
 Ok(Cow::from(vec))
 }
 }
 }

-/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
+/// Do a union of DeCboRoaringBitmaps on both sides of a DelAdd obkv
 /// separately and outputs a new DelAdd with both unions.
-pub struct MergeDeladdCboRoaringBitmaps;
+pub struct MergeDeladdDeCboRoaringBitmaps;

-impl MergeFunction for MergeDeladdCboRoaringBitmaps {
+impl MergeFunction for MergeDeladdDeCboRoaringBitmaps {
 type Error = crate::Error;

 fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
@@ -232,10 +232,10 @@ impl MergeFunction for MergeDeladdCboRoaringBitmaps {

 let mut output_deladd_obkv = KvWriterDelAdd::memory();
 let mut buffer = Vec::new();
-CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
+DeCboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
 output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
 buffer.clear();
-CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
+DeCboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
 output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
 output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
 }
@@ -246,15 +246,16 @@ impl MergeFunction for MergeDeladdCboRoaringBitmaps {
 ///
 /// The first argument is the DelAdd obkv of CboRoaringBitmaps and
 /// the second one is the CboRoaringBitmap to merge into.
-pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
+pub fn merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
 deladd_obkv: &[u8],
 previous: &[u8],
 buffer: &'a mut Vec<u8>,
 ) -> Result<Option<&'a [u8]>> {
-Ok(CboRoaringBitmapCodec::merge_deladd_into(
+Ok(DeCboRoaringBitmapCodec::merge_deladd_into(
 KvReaderDelAdd::from_slice(deladd_obkv),
 previous,
 buffer,
+&mut Vec::new(), // tmp_buffer
 )?)
 }

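The hunks above rename the grenad merge helpers to their delta-encoded counterparts and thread an extra scratch buffer through `merge_deladd_into`. As a point of reference only, here is a minimal sketch of how the renamed DelAdd-aware merge function is wired into a grenad sorter elsewhere in this changeset; the `milli::update` visibility of the struct is an assumption inferred from the `use crate::update::MergeDeladdDeCboRoaringBitmaps;` imports shown earlier, not something this diff confirms.

```rust
// Hypothetical usage sketch, not part of the diff: build a grenad sorter that
// merges DelAdd obkvs of delta-encoded CBO roaring bitmaps.
use grenad::SorterBuilder;
use milli::update::MergeDeladdDeCboRoaringBitmaps; // assumed public re-export

fn delta_spill_sorter() -> grenad::Sorter<MergeDeladdDeCboRoaringBitmaps> {
    // Values stored under the same key are DelAdd obkvs; the merge function
    // unions the Deletion and Addition sides separately, as the doc comment
    // in the hunk above states.
    let mut builder = SorterBuilder::new(MergeDeladdDeCboRoaringBitmaps);
    builder.dump_threshold(0);    // spill eagerly, mirroring the SpillingCaches setup later on
    builder.allow_realloc(false); // keep the in-memory buffer at a fixed size
    builder.build()
}
```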
@@ -40,7 +40,7 @@ use crate::update::{
 };
 use crate::vector::db::EmbedderInfo;
 use crate::vector::{RuntimeEmbedders, VectorStore};
-use crate::{CboRoaringBitmapCodec, Index, Result, UserError};
+use crate::{DeCboRoaringBitmapCodec, Index, Result, UserError};

 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 4;
@@ -400,7 +400,7 @@ where
 let cloneable_chunk =
 unsafe { as_cloneable_grenad(&word_docids_reader)? };
 let word_docids = word_docids.get_or_insert_with(|| {
-MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
+MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps)
 });
 word_docids.push(cloneable_chunk.into_cursor()?);
 let cloneable_chunk =
@@ -408,14 +408,14 @@ where
 let exact_word_docids =
 exact_word_docids.get_or_insert_with(|| {
 MergerBuilder::new(
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 )
 });
 exact_word_docids.push(cloneable_chunk.into_cursor()?);
 let cloneable_chunk =
 unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
 let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
-MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
+MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps)
 });
 word_fid_docids.push(cloneable_chunk.into_cursor()?);
 TypedChunk::WordDocids {
@@ -429,7 +429,7 @@ where
 let word_position_docids =
 word_position_docids.get_or_insert_with(|| {
 MergerBuilder::new(
-MergeDeladdCboRoaringBitmaps,
+MergeDeladdDeCboRoaringBitmaps,
 )
 });
 word_position_docids.push(cloneable_chunk.into_cursor()?);
@@ -562,10 +562,10 @@ where
 )]
 pub fn execute_prefix_databases(
 self,
-word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
+word_docids: Option<Merger<CursorClonableMmap, MergeDeladdDeCboRoaringBitmaps>>,
-exact_word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
+exact_word_docids: Option<Merger<CursorClonableMmap, MergeDeladdDeCboRoaringBitmaps>>,
-word_position_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
+word_position_docids: Option<Merger<CursorClonableMmap, MergeDeladdDeCboRoaringBitmaps>>,
-word_fid_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
+word_fid_docids: Option<Merger<CursorClonableMmap, MergeDeladdDeCboRoaringBitmaps>>,
 ) -> Result<()>
 where
 FP: Fn(UpdateIndexingStep) + Sync,
@@ -763,9 +763,9 @@ where
 )]
 fn execute_word_prefix_docids(
 txn: &mut heed::RwTxn<'_>,
-merger: Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>,
+merger: Merger<CursorClonableMmap, MergeDeladdDeCboRoaringBitmaps>,
-word_docids_db: Database<Str, CboRoaringBitmapCodec>,
+word_docids_db: Database<Str, DeCboRoaringBitmapCodec>,
-word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
+word_prefix_docids_db: Database<Str, DeCboRoaringBitmapCodec>,
 indexer_config: &IndexerConfig,
 new_prefix_fst_words: &[String],
 common_prefix_fst_words: &[&[String]],

@@ -12,8 +12,8 @@ use obkv::{KvReader, KvWriter};
 use roaring::RoaringBitmap;

 use super::helpers::{
-self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+self, merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
-CursorClonableMmap, KeepFirst, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
+CursorClonableMmap, KeepFirst, MergeDeladdBtreesetString, MergeDeladdDeCboRoaringBitmaps,
 MergeIgnoreValues,
 };
 use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
@@ -29,7 +29,7 @@ use crate::update::settings::InnerIndexSettingsDiff;
 use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig};
 use crate::vector::VectorStore;
 use crate::{
-lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
+lat_lng_to_xyz, DeCboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
 Result, SerializationError, U8StrStrCodec, UserError,
 };

@@ -241,7 +241,7 @@ pub(crate) fn write_typed_chunk_into_index(
 tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids");
 let _entered = span.enter();

-let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
 for typed_chunk in typed_chunks {
 let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else {
 unreachable!();
@@ -256,7 +256,7 @@ pub(crate) fn write_typed_chunk_into_index(
 &index.field_id_word_count_docids,
 wtxn,
 deladd_serialize_add_side,
-merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
 )?;
 is_merged_database = true;
 }
@@ -264,9 +264,9 @@ pub(crate) fn write_typed_chunk_into_index(
 let span = tracing::trace_span!(target: "indexing::write_db", "word_docids");
 let _entered = span.enter();

-let mut word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut word_docids_builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
-let mut exact_word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut exact_word_docids_builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
-let mut word_fid_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut word_fid_docids_builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
 let mut fst_merger_builder = MergerBuilder::new(MergeIgnoreValues);
 for typed_chunk in typed_chunks {
 let TypedChunk::WordDocids {
@@ -291,7 +291,7 @@ pub(crate) fn write_typed_chunk_into_index(
 &index.word_docids,
 wtxn,
 deladd_serialize_add_side,
-merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
 )?;

 let exact_word_docids_merger = exact_word_docids_builder.build();
@@ -300,7 +300,7 @@ pub(crate) fn write_typed_chunk_into_index(
 &index.exact_word_docids,
 wtxn,
 deladd_serialize_add_side,
-merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
 )?;

 let word_fid_docids_merger = word_fid_docids_builder.build();
@@ -309,7 +309,7 @@ pub(crate) fn write_typed_chunk_into_index(
 &index.word_fid_docids,
 wtxn,
 deladd_serialize_add_side,
-merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
 )?;

 // create fst from word docids
@@ -329,7 +329,7 @@ pub(crate) fn write_typed_chunk_into_index(
 let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids");
 let _entered = span.enter();

-let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
 for typed_chunk in typed_chunks {
 let TypedChunk::WordPositionDocids(chunk) = typed_chunk else {
 unreachable!();
@@ -344,7 +344,7 @@ pub(crate) fn write_typed_chunk_into_index(
 &index.word_position_docids,
 wtxn,
 deladd_serialize_add_side,
-merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
 )?;
 is_merged_database = true;
 }
@@ -353,7 +353,7 @@ pub(crate) fn write_typed_chunk_into_index(
 tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids");
 let _entered = span.enter();

-let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
 let mut data_size = 0;
 for typed_chunk in typed_chunks {
 let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk
@@ -375,7 +375,7 @@ pub(crate) fn write_typed_chunk_into_index(
 tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids");
 let _entered = span.enter();

-let mut facet_id_string_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut facet_id_string_builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
 let mut normalized_facet_id_string_builder =
 MergerBuilder::new(MergeDeladdBtreesetString);
 let mut data_size = 0;
@@ -411,7 +411,7 @@ pub(crate) fn write_typed_chunk_into_index(
 tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids");
 let _entered = span.enter();

-let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
 for typed_chunk in typed_chunks {
 let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else {
 unreachable!();
@@ -426,7 +426,7 @@ pub(crate) fn write_typed_chunk_into_index(
 &index.facet_id_exists_docids,
 wtxn,
 deladd_serialize_add_side,
-merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
 )?;
 is_merged_database = true;
 }
@@ -435,7 +435,7 @@ pub(crate) fn write_typed_chunk_into_index(
 tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids");
 let _entered = span.enter();

-let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
 for typed_chunk in typed_chunks {
 let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else {
 unreachable!();
@@ -450,7 +450,7 @@ pub(crate) fn write_typed_chunk_into_index(
 &index.facet_id_is_null_docids,
 wtxn,
 deladd_serialize_add_side,
-merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
 )?;
 is_merged_database = true;
 }
@@ -458,7 +458,7 @@ pub(crate) fn write_typed_chunk_into_index(
 let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids");
 let _entered = span.enter();

-let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
 for typed_chunk in typed_chunks {
 let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else {
 unreachable!();
@@ -473,7 +473,7 @@ pub(crate) fn write_typed_chunk_into_index(
 &index.facet_id_is_empty_docids,
 wtxn,
 deladd_serialize_add_side,
-merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
 )?;
 is_merged_database = true;
 }
@@ -482,7 +482,7 @@ pub(crate) fn write_typed_chunk_into_index(
 tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids");
 let _entered = span.enter();

-let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
+let mut builder = MergerBuilder::new(MergeDeladdDeCboRoaringBitmaps);
 for typed_chunk in typed_chunks {
 let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else {
 unreachable!();
@@ -504,7 +504,7 @@ pub(crate) fn write_typed_chunk_into_index(
 &index.word_pair_proximity_docids,
 wtxn,
 deladd_serialize_add_side,
-merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
 )?;
 }

@@ -866,7 +866,7 @@ where
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
 fn write_proximity_entries_into_database_additional_searchables<R, MF>(
 merger: Merger<R, MF>,
-database: &heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
+database: &heed::Database<U8StrStrCodec, DeCboRoaringBitmapCodec>,
 wtxn: &mut RwTxn<'_>,
 ) -> Result<()>
 where
@@ -881,7 +881,7 @@ where
 U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?;
 let data_to_insert = match KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
 Some(value) => {
-CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
+DeCboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
 }
 None => continue,
 };

@@ -27,7 +27,7 @@ use crate::index::db_name;
 use crate::index::main_key::{GEO_FACETED_DOCUMENTS_IDS_KEY, GEO_RTREE_KEY};
 use crate::update::new::KvReaderFieldId;
 use crate::vector::Embedding;
-use crate::{CboRoaringBitmapCodec, DocumentId, Error, Index, InternalError};
+use crate::{DeCboRoaringBitmapCodec, DocumentId, Error, Index, InternalError};

 /// Note that the FrameProducer requires up to 9 bytes to
 /// encode the length, the max grant has been computed accordingly.
@@ -971,7 +971,9 @@ pub struct WordDocidsSender<'a, 'b, D> {

 impl<D: DatabaseType> WordDocidsSender<'_, '_, D> {
 pub fn write(&self, key: &[u8], bitmap: &RoaringBitmap) -> crate::Result<()> {
-let value_length = CboRoaringBitmapCodec::serialized_size(bitmap);
+let mut tmp_buffer = Vec::new();
+let value_length =
+    DeCboRoaringBitmapCodec::serialized_size_with_tmp_buffer(bitmap, &mut tmp_buffer);
 let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| {
 InternalError::StorePut {
 database_name: D::DATABASE.database_name(),
@@ -986,7 +988,10 @@ impl<D: DatabaseType> WordDocidsSender<'_, '_, D> {
 value_length,
 |key_buffer, value_buffer| {
 key_buffer.copy_from_slice(key);
-CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_buffer)?;
+DeCboRoaringBitmapCodec::serialize_into(
+    bitmap,
+    &mut io::Cursor::new(value_buffer),
+)?;
 Ok(())
 },
 )
@@ -1007,7 +1012,9 @@ impl FacetDocidsSender<'_, '_> {
 let (facet_kind, key) = FacetKind::extract_from_key(key);
 let database = Database::from(facet_kind);

-let value_length = CboRoaringBitmapCodec::serialized_size(bitmap);
+let mut tmp_buffer = Vec::new();
+let value_length =
+    DeCboRoaringBitmapCodec::serialized_size_with_tmp_buffer(bitmap, &mut tmp_buffer);
 let value_length = match facet_kind {
 // We must take the facet group size into account
 // when we serialize strings and numbers.
@@ -1041,7 +1048,7 @@ impl FacetDocidsSender<'_, '_> {
 FacetKind::Null | FacetKind::Empty | FacetKind::Exists => value_out,
 };

-CboRoaringBitmapCodec::serialize_into_writer(bitmap, value_out)?;
+DeCboRoaringBitmapCodec::serialize_into(bitmap, &mut io::Cursor::new(value_out))?;

 Ok(())
 },

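The `WordDocidsSender` and `FacetDocidsSender` hunks above replace `serialized_size`/`serialize_into_writer` with a size-then-serialize pattern that threads an explicit scratch buffer. A minimal sketch of that pattern in isolation follows; only the two `DeCboRoaringBitmapCodec` calls mirror the diff, while the public `milli::heed_codec` path, the scratch buffer's element type, and the `io::Result` signature are assumptions.

```rust
// Hypothetical sketch of the size-then-write pattern used by the senders above.
use std::io;

use milli::heed_codec::DeCboRoaringBitmapCodec; // assumed public path
use roaring::RoaringBitmap;

fn encode_bitmap(bitmap: &RoaringBitmap) -> io::Result<Vec<u8>> {
    // Reusable scratch space so repeated writes avoid reallocations.
    let mut tmp_buffer = Vec::new();
    let len = DeCboRoaringBitmapCodec::serialized_size_with_tmp_buffer(bitmap, &mut tmp_buffer);

    // Reserve exactly `len` bytes, then serialize into that slice, as the
    // frame-based senders above do with their granted `value_buffer`.
    let mut value = vec![0u8; len];
    DeCboRoaringBitmapCodec::serialize_into(bitmap, &mut io::Cursor::new(&mut value[..]))?;
    Ok(value)
}
```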
@@ -81,8 +81,8 @@ use rustc_hash::FxBuildHasher;
|
|||||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||||
use crate::update::new::thread_local::MostlySend;
|
use crate::update::new::thread_local::MostlySend;
|
||||||
use crate::update::new::KvReaderDelAdd;
|
use crate::update::new::KvReaderDelAdd;
|
||||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
use crate::update::MergeDeladdDeCboRoaringBitmaps;
|
||||||
use crate::{CboRoaringBitmapCodec, Result};
|
use crate::{DeCboRoaringBitmapCodec, Result};
|
||||||
|
|
||||||
/// A cache that stores bytes keys associated to CboDelAddRoaringBitmaps.
|
/// A cache that stores bytes keys associated to CboDelAddRoaringBitmaps.
|
||||||
///
|
///
|
||||||
@@ -320,9 +320,10 @@ struct SpillingCaches<'extractor> {
|
|||||||
&'extractor Bump,
|
&'extractor Bump,
|
||||||
>,
|
>,
|
||||||
>,
|
>,
|
||||||
spilled_entries: Vec<grenad::Sorter<MergeDeladdCboRoaringBitmaps>>,
|
spilled_entries: Vec<grenad::Sorter<MergeDeladdDeCboRoaringBitmaps>>,
|
||||||
deladd_buffer: Vec<u8>,
|
deladd_buffer: Vec<u8>,
|
||||||
cbo_buffer: Vec<u8>,
|
cbo_buffer: Vec<u8>,
|
||||||
|
tmp_buffer: Vec<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'extractor> SpillingCaches<'extractor> {
|
impl<'extractor> SpillingCaches<'extractor> {
|
||||||
@@ -338,7 +339,7 @@ impl<'extractor> SpillingCaches<'extractor> {
|
|||||||
) -> SpillingCaches<'extractor> {
|
) -> SpillingCaches<'extractor> {
|
||||||
SpillingCaches {
|
SpillingCaches {
|
||||||
spilled_entries: iter::repeat_with(|| {
|
spilled_entries: iter::repeat_with(|| {
|
||||||
let mut builder = grenad::SorterBuilder::new(MergeDeladdCboRoaringBitmaps);
|
let mut builder = grenad::SorterBuilder::new(MergeDeladdDeCboRoaringBitmaps);
|
||||||
builder.dump_threshold(0);
|
builder.dump_threshold(0);
|
||||||
builder.allow_realloc(false);
|
builder.allow_realloc(false);
|
||||||
builder.build()
|
builder.build()
|
||||||
@@ -348,6 +349,7 @@ impl<'extractor> SpillingCaches<'extractor> {
|
|||||||
caches,
|
caches,
|
||||||
deladd_buffer: Vec::new(),
|
deladd_buffer: Vec::new(),
|
||||||
cbo_buffer: Vec::new(),
|
cbo_buffer: Vec::new(),
|
||||||
|
tmp_buffer: Vec::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -370,6 +372,7 @@ impl<'extractor> SpillingCaches<'extractor> {
|
|||||||
&mut self.spilled_entries[bucket],
|
&mut self.spilled_entries[bucket],
|
||||||
&mut self.deladd_buffer,
|
&mut self.deladd_buffer,
             &mut self.cbo_buffer,
+            &mut self.tmp_buffer,
             key,
             DelAddRoaringBitmap::new_del_u32(n),
         ),
@@ -395,6 +398,7 @@ impl<'extractor> SpillingCaches<'extractor> {
                 &mut self.spilled_entries[bucket],
                 &mut self.deladd_buffer,
                 &mut self.cbo_buffer,
+                &mut self.tmp_buffer,
                 key,
                 DelAddRoaringBitmap::new_add_u32(n),
             ),
@@ -408,9 +412,10 @@ fn compute_bucket_from_hash(buckets: usize, hash: u64) -> usize {
 }
 
 fn spill_entry_to_sorter(
-    spilled_entries: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
+    spilled_entries: &mut grenad::Sorter<MergeDeladdDeCboRoaringBitmaps>,
     deladd_buffer: &mut Vec<u8>,
     cbo_buffer: &mut Vec<u8>,
+    tmp_buffer: &mut Vec<u32>,
     key: &[u8],
     deladd: DelAddRoaringBitmap,
 ) -> Result<()> {
@@ -420,21 +425,21 @@ fn spill_entry_to_sorter(
     match deladd {
         DelAddRoaringBitmap { del: Some(del), add: None } => {
             cbo_buffer.clear();
-            CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer);
+            DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(&del, cbo_buffer, tmp_buffer)?;
             value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
         }
         DelAddRoaringBitmap { del: None, add: Some(add) } => {
             cbo_buffer.clear();
-            CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer);
+            DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(&add, cbo_buffer, tmp_buffer)?;
             value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
         }
         DelAddRoaringBitmap { del: Some(del), add: Some(add) } => {
             cbo_buffer.clear();
-            CboRoaringBitmapCodec::serialize_into_vec(&del, cbo_buffer);
+            DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(&del, cbo_buffer, tmp_buffer)?;
             value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
 
             cbo_buffer.clear();
-            CboRoaringBitmapCodec::serialize_into_vec(&add, cbo_buffer);
+            DeCboRoaringBitmapCodec::serialize_into_with_tmp_buffer(&add, cbo_buffer, tmp_buffer)?;
             value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
         }
         DelAddRoaringBitmap { del: None, add: None } => return Ok(()),
@@ -637,15 +642,22 @@ pub struct DelAddRoaringBitmap {
 
 impl DelAddRoaringBitmap {
     fn from_bytes(bytes: &[u8]) -> io::Result<DelAddRoaringBitmap> {
+        let mut tmp_buffer = Vec::new();
         let reader = KvReaderDelAdd::from_slice(bytes);
 
         let del = match reader.get(DelAdd::Deletion) {
-            Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
+            Some(bytes) => {
+                DeCboRoaringBitmapCodec::deserialize_from_with_tmp_buffer(bytes, &mut tmp_buffer)
+                    .map(Some)?
+            }
             None => None,
         };
 
         let add = match reader.get(DelAdd::Addition) {
-            Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
+            Some(bytes) => {
+                DeCboRoaringBitmapCodec::deserialize_from_with_tmp_buffer(bytes, &mut tmp_buffer)
+                    .map(Some)?
+            }
             None => None,
         };
 
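The hunks above thread a reusable Vec<u32> scratch buffer (tmp_buffer) through the spilling path and switch it to the fallible DeCboRoaringBitmapCodec API. The on-disk layout of that codec is not shown in this diff; the sketch below only illustrates the gap (delta) encoding idea its De prefix presumably stands for, with made-up helper names, so that runs of consecutive doc ids become small gaps that compress well and decode back through a scratch buffer.

    // Standalone sketch of gap (delta) encoding over sorted doc ids; `delta_encode` and
    // `delta_decode` are illustrative names, not part of milli or of DeCboRoaringBitmapCodec.
    fn delta_encode(sorted_ids: &[u32], tmp_buffer: &mut Vec<u32>) {
        tmp_buffer.clear();
        let mut previous = 0u32;
        for &id in sorted_ids {
            // Consecutive doc ids become gaps of 1, which compress very well afterwards.
            tmp_buffer.push(id - previous);
            previous = id;
        }
    }

    fn delta_decode(gaps: &[u32]) -> Vec<u32> {
        let mut ids = Vec::with_capacity(gaps.len());
        let mut current = 0u32;
        for &gap in gaps {
            current += gap;
            ids.push(current);
        }
        ids
    }

    fn main() {
        let ids = [10u32, 11, 12, 13, 500, 501, 1_000_000];
        let mut tmp_buffer = Vec::new();
        delta_encode(&ids, &mut tmp_buffer);
        assert_eq!(delta_decode(&tmp_buffer), ids);
    }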
@@ -14,7 +14,7 @@ use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValu
 use crate::heed_codec::BytesRefCodec;
 use crate::update::facet::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
 use crate::update::{create_writer, writer_into_reader};
-use crate::{CboRoaringBitmapCodec, FieldId, Index};
+use crate::{DeCboRoaringBitmapCodec, FieldId, Index};
 
 /// Generate the facet level based on the level 0.
 ///
@@ -123,7 +123,7 @@ fn compute_level(
                 ser_buffer.push(group_len);
                 let group_docids = mem::take(&mut group_docids);
                 let docids = group_docids.into_iter().union();
-                CboRoaringBitmapCodec::serialize_into_vec(&docids, &mut ser_buffer);
+                DeCboRoaringBitmapCodec::serialize_into(&docids, &mut ser_buffer)?;
                 writer.insert(left_bound, &ser_buffer)?;
             }
             left_bound = Some(key.left_bound);
@@ -142,7 +142,7 @@ fn compute_level(
         let group_len: u8 = group_docids.len().try_into().unwrap();
         ser_buffer.push(group_len);
         let group_docids = group_docids.into_iter().union();
-        CboRoaringBitmapCodec::serialize_into_vec(&group_docids, &mut ser_buffer);
+        DeCboRoaringBitmapCodec::serialize_into(&group_docids, &mut ser_buffer)?;
         writer.insert(left_bound, &ser_buffer)?;
     }
 
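In the facet level builder the only functional change is that serialization can now fail, so call sites gain a `?`. The written value keeps the same shape: a one-byte group length followed by the serialized union of the group's bitmaps. A minimal sketch of that layout, using the plain roaring serialization format as a stand-in for milli's codec, whose exact byte layout is not part of this diff:

    use roaring::RoaringBitmap;

    // Sketch of the per-group value assembled in `compute_level` above: one length byte,
    // then the union of the group's doc ids.
    fn encode_facet_group(group: &[RoaringBitmap], ser_buffer: &mut Vec<u8>) -> std::io::Result<()> {
        ser_buffer.clear();
        let group_len: u8 = group.len().try_into().expect("facet groups stay small");
        ser_buffer.push(group_len);

        let mut docids = RoaringBitmap::new();
        for bitmap in group {
            docids |= bitmap;
        }
        docids.serialize_into(&mut *ser_buffer)?;
        Ok(())
    }

    fn main() -> std::io::Result<()> {
        let first: RoaringBitmap = (0u32..10).collect();
        let second: RoaringBitmap = (5u32..20).collect();
        let mut ser_buffer = Vec::new();
        encode_facet_group(&[first, second], &mut ser_buffer)?;
        assert_eq!(ser_buffer[0], 2); // two bitmaps were grouped
        Ok(())
    }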
@@ -14,7 +14,7 @@ use super::extract::{
 };
 use crate::update::facet::new_incremental::FacetFieldIdChange;
 use crate::update::new::extract::cellulite::GeoJsonExtractorData;
-use crate::{CboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
+use crate::{DeCboRoaringBitmapCodec, FieldId, GeoPoint, Index, InternalError, Result};
 
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
 pub fn merge_and_send_rtree<'extractor, MSP>(
@@ -106,7 +106,7 @@ where
     }
     merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| {
         let current = database.get(&rtxn, key)?;
-        match merge_cbo_bitmaps(current, del, add)? {
+        match merge_de_cbo_bitmaps(current, del, add)? {
             Operation::Write(bitmap) => docids_sender.write(key, &bitmap),
             Operation::Delete => docids_sender.delete(key),
             Operation::Ignore => Ok(()),
@@ -134,8 +134,8 @@ pub fn merge_and_send_facet_docids(
         FacetFieldIdsDelta::new(max_string_count, max_number_count);
     let rtxn = index.read_txn()?;
     merge_caches_sorted(frozen, |key, DelAddRoaringBitmap { del, add }| {
-        let current = database.get_cbo_roaring_bytes_value(&rtxn, key)?;
-        match merge_cbo_bitmaps(current, del, add)? {
+        let current = database.get_de_cbo_roaring_bytes_value(&rtxn, key)?;
+        match merge_de_cbo_bitmaps(current, del, add)? {
             Operation::Write(bitmap) => {
                 facet_field_ids_delta.register_from_key(key);
                 docids_sender.write(key, &bitmap)?;
@@ -166,7 +166,7 @@ impl<'a> FacetDatabases<'a> {
         Self { index }
     }
 
-    fn get_cbo_roaring_bytes_value<'t>(
+    fn get_de_cbo_roaring_bytes_value<'t>(
        &self,
        rtxn: &'t RoTxn<'_>,
        key: &[u8],
@@ -320,12 +320,12 @@ enum Operation {
 }
 
 /// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap.
-fn merge_cbo_bitmaps(
+fn merge_de_cbo_bitmaps(
     current: Option<&[u8]>,
     del: Option<RoaringBitmap>,
     add: Option<RoaringBitmap>,
 ) -> Result<Operation> {
-    let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
+    let current = current.map(DeCboRoaringBitmapCodec::deserialize_from).transpose()?;
     match (current, del, add) {
         (None, None, None) => Ok(Operation::Ignore), // but it's strange
         (None, None, Some(add)) => Ok(Operation::Write(add)),
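The renamed merge_de_cbo_bitmaps keeps the Operation contract of its predecessor: compare the bitmap currently stored under the key with the del/add sides produced by the extractors and decide whether to write, delete, or leave the entry alone. Only the first two match arms are visible above; the sketch below fills in the remaining cases with the usual "(current - del) | add" rule, which is an assumption about the unchanged part of the function:

    use roaring::RoaringBitmap;

    enum Operation {
        Write(RoaringBitmap),
        Delete,
        Ignore,
    }

    // Hedged sketch of the decision table; the arms not shown in the diff are assumed.
    fn merge_bitmaps(
        current: Option<RoaringBitmap>,
        del: Option<RoaringBitmap>,
        add: Option<RoaringBitmap>,
    ) -> Operation {
        match (current, del, add) {
            (None, None, None) => Operation::Ignore, // nothing stored, nothing to apply
            (None, None, Some(add)) => Operation::Write(add),
            (None, Some(_), None) => Operation::Ignore, // deleting from an absent entry
            (None, Some(_), Some(add)) => Operation::Write(add),
            (Some(_), None, None) => Operation::Ignore, // nothing to change
            (Some(current), del, add) => {
                let new = (current - del.unwrap_or_default()) | add.unwrap_or_default();
                if new.is_empty() {
                    Operation::Delete
                } else {
                    Operation::Write(new)
                }
            }
        }
    }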
@@ -14,12 +14,12 @@ use thread_local::ThreadLocal;
 use super::ref_cell_ext::RefCellExt as _;
 use crate::heed_codec::StrBEU16Codec;
 use crate::update::GrenadParameters;
-use crate::{CboRoaringBitmapCodec, Index, Prefix, Result};
+use crate::{DeCboRoaringBitmapCodec, Index, Prefix, Result};
 
 struct WordPrefixDocids<'i> {
     index: &'i Index,
-    database: Database<Bytes, CboRoaringBitmapCodec>,
-    prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
+    database: Database<Bytes, DeCboRoaringBitmapCodec>,
+    prefix_database: Database<Bytes, DeCboRoaringBitmapCodec>,
     max_memory_by_thread: Option<usize>,
     /// Do not use an experimental LMDB feature to read uncommitted data in parallel.
     no_experimental_post_processing: bool,
@@ -28,8 +28,8 @@ struct WordPrefixDocids<'i> {
 impl<'i> WordPrefixDocids<'i> {
     fn new(
         index: &'i Index,
-        database: Database<Bytes, CboRoaringBitmapCodec>,
-        prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
+        database: Database<Bytes, DeCboRoaringBitmapCodec>,
+        prefix_database: Database<Bytes, DeCboRoaringBitmapCodec>,
         grenad_parameters: &GrenadParameters,
     ) -> WordPrefixDocids<'i> {
         WordPrefixDocids {
@@ -87,12 +87,12 @@ impl<'i> WordPrefixDocids<'i> {
             let output = self
                 .database
                 .prefix_iter(&rtxn, prefix.as_bytes())?
-                .remap_types::<Str, CboRoaringBitmapCodec>()
+                .remap_types::<Str, DeCboRoaringBitmapCodec>()
                 .map(|result| result.map(|(_word, bitmap)| bitmap))
                 .union()?;
 
             buffer.clear();
-            CboRoaringBitmapCodec::serialize_into_vec(&output, &mut buffer);
+            DeCboRoaringBitmapCodec::serialize_into(&output, &mut buffer)?;
             indexes.push(PrefixEntry { prefix, serialized_length: buffer.len() });
             file.write_all(&buffer)?;
         }
@@ -150,11 +150,11 @@ impl<'i> WordPrefixDocids<'i> {
                     .bitmaps(prefix)
                     .unwrap()
                     .iter()
-                    .map(|bytes| CboRoaringBitmapCodec::deserialize_from(bytes))
+                    .map(|bytes| DeCboRoaringBitmapCodec::deserialize_from(bytes))
                     .union()?;
 
                 buffer.clear();
-                CboRoaringBitmapCodec::serialize_into_vec(&output, buffer);
+                DeCboRoaringBitmapCodec::serialize_into(&output, buffer)?;
                 index.push(PrefixEntry { prefix, serialized_length: buffer.len() });
                 file.write_all(buffer)
             })?;
@@ -203,7 +203,7 @@ struct FrozenPrefixBitmaps<'a, 'rtxn> {
 impl<'a, 'rtxn> FrozenPrefixBitmaps<'a, 'rtxn> {
     #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
     pub fn from_prefixes(
-        database: Database<Bytes, CboRoaringBitmapCodec>,
+        database: Database<Bytes, DeCboRoaringBitmapCodec>,
         rtxn: &'rtxn RoTxn,
         prefixes: &'a BTreeSet<Prefix>,
     ) -> heed::Result<Self> {
@@ -231,8 +231,8 @@ unsafe impl Sync for FrozenPrefixBitmaps<'_, '_> {}
 
 struct WordPrefixIntegerDocids<'i> {
     index: &'i Index,
-    database: Database<Bytes, CboRoaringBitmapCodec>,
-    prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
+    database: Database<Bytes, DeCboRoaringBitmapCodec>,
+    prefix_database: Database<Bytes, DeCboRoaringBitmapCodec>,
     max_memory_by_thread: Option<usize>,
     /// Do not use an experimental LMDB feature to read uncommitted data in parallel.
     no_experimental_post_processing: bool,
@@ -241,8 +241,8 @@ struct WordPrefixIntegerDocids<'i> {
 impl<'i> WordPrefixIntegerDocids<'i> {
     fn new(
         index: &'i Index,
-        database: Database<Bytes, CboRoaringBitmapCodec>,
-        prefix_database: Database<Bytes, CboRoaringBitmapCodec>,
+        database: Database<Bytes, DeCboRoaringBitmapCodec>,
+        prefix_database: Database<Bytes, DeCboRoaringBitmapCodec>,
         grenad_parameters: &'_ GrenadParameters,
     ) -> WordPrefixIntegerDocids<'i> {
         WordPrefixIntegerDocids {
@@ -338,10 +338,10 @@ impl<'i> WordPrefixIntegerDocids<'i> {
             } else {
                 let output = bitmaps_bytes
                     .into_iter()
-                    .map(CboRoaringBitmapCodec::deserialize_from)
+                    .map(DeCboRoaringBitmapCodec::deserialize_from)
                     .union()?;
                 buffer.clear();
-                CboRoaringBitmapCodec::serialize_into_vec(&output, &mut buffer);
+                DeCboRoaringBitmapCodec::serialize_into(&output, &mut buffer)?;
                 indexes.push(PrefixIntegerEntry {
                     prefix,
                     pos,
@@ -419,10 +419,10 @@ impl<'i> WordPrefixIntegerDocids<'i> {
             } else {
                 let output = bitmaps_bytes
                     .iter()
-                    .map(|bytes| CboRoaringBitmapCodec::deserialize_from(bytes))
+                    .map(|bytes| DeCboRoaringBitmapCodec::deserialize_from(bytes))
                     .union()?;
                 buffer.clear();
-                CboRoaringBitmapCodec::serialize_into_vec(&output, buffer);
+                DeCboRoaringBitmapCodec::serialize_into(&output, buffer)?;
                 index.push(PrefixIntegerEntry {
                     prefix,
                     pos,
@@ -486,7 +486,7 @@ struct FrozenPrefixIntegerBitmaps<'a, 'rtxn> {
 impl<'a, 'rtxn> FrozenPrefixIntegerBitmaps<'a, 'rtxn> {
     #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
     pub fn from_prefixes(
-        database: Database<Bytes, CboRoaringBitmapCodec>,
+        database: Database<Bytes, DeCboRoaringBitmapCodec>,
         rtxn: &'rtxn RoTxn,
         prefixes: &'a BTreeSet<Prefix>,
     ) -> heed::Result<Self> {
@@ -516,7 +516,7 @@ unsafe impl Sync for FrozenPrefixIntegerBitmaps<'_, '_> {}
 #[tracing::instrument(level = "trace", skip_all, target = "indexing::prefix")]
 fn delete_prefixes(
     wtxn: &mut RwTxn,
-    prefix_database: &Database<Bytes, CboRoaringBitmapCodec>,
+    prefix_database: &Database<Bytes, DeCboRoaringBitmapCodec>,
     prefixes: &BTreeSet<Prefix>,
 ) -> Result<()> {
     // We remove all the entries that are no more required in this word prefix docids database.
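The post-processing pass above computes each prefix entry as the union of every word_docids bitmap whose key starts with the prefix, then re-serializes the result into the prefix database. A minimal in-memory sketch of that aggregation, with a BTreeMap standing in for the LMDB database and plain RoaringBitmaps standing in for the codec:

    use std::collections::BTreeMap;
    use roaring::RoaringBitmap;

    // The docids of a prefix are the union of the docids of every word starting with it.
    // A BTreeMap range scan plays the role of heed's `prefix_iter` here.
    fn prefix_docids(word_docids: &BTreeMap<String, RoaringBitmap>, prefix: &str) -> RoaringBitmap {
        let mut output = RoaringBitmap::new();
        for (_word, bitmap) in word_docids
            .range(prefix.to_string()..)
            .take_while(|(word, _)| word.starts_with(prefix))
        {
            output |= bitmap;
        }
        output
    }

    fn main() {
        let mut word_docids: BTreeMap<String, RoaringBitmap> = BTreeMap::new();
        word_docids.insert("hello".to_string(), (0u32..3).collect());
        word_docids.insert("help".to_string(), (2u32..5).collect());
        word_docids.insert("world".to_string(), (7u32..9).collect());
        assert_eq!(prefix_docids(&word_docids, "hel").len(), 5);
    }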
@@ -6,15 +6,15 @@ use heed::Database;
 
 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
-    write_sorter_into_database, CursorClonableMmap, MergeDeladdCboRoaringBitmaps,
+    create_sorter, merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeDeladdDeCboRoaringBitmaps,
 };
-use crate::{CboRoaringBitmapCodec, Result};
+use crate::{DeCboRoaringBitmapCodec, Result};
 
 pub struct WordPrefixDocids<'t, 'i> {
     wtxn: &'t mut heed::RwTxn<'i>,
-    word_docids: Database<Str, CboRoaringBitmapCodec>,
-    word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
+    word_docids: Database<Str, DeCboRoaringBitmapCodec>,
+    word_prefix_docids: Database<Str, DeCboRoaringBitmapCodec>,
     pub(crate) chunk_compression_type: CompressionType,
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) max_nb_chunks: Option<usize>,
@@ -24,8 +24,8 @@ pub struct WordPrefixDocids<'t, 'i> {
 impl<'t, 'i> WordPrefixDocids<'t, 'i> {
     pub fn new(
         wtxn: &'t mut heed::RwTxn<'i>,
-        word_docids: Database<Str, CboRoaringBitmapCodec>,
-        word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
+        word_docids: Database<Str, DeCboRoaringBitmapCodec>,
+        word_prefix_docids: Database<Str, DeCboRoaringBitmapCodec>,
     ) -> WordPrefixDocids<'t, 'i> {
         WordPrefixDocids {
             wtxn,
@@ -46,7 +46,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
     )]
     pub fn execute(
         self,
-        new_word_docids: grenad::Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>,
+        new_word_docids: grenad::Merger<CursorClonableMmap, MergeDeladdDeCboRoaringBitmaps>,
         new_prefix_fst_words: &[String],
         common_prefix_fst_words: &[&[String]],
         del_prefix_fst_words: &HashSet<Vec<u8>>,
@@ -55,7 +55,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
         // and write into it at the same time, therefore we write into another file.
         let mut prefix_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            MergeDeladdCboRoaringBitmaps,
+            MergeDeladdDeCboRoaringBitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -130,7 +130,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
             self.wtxn,
             database_is_empty,
             deladd_serialize_add_side,
-            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+            merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
 
         Ok(())
@@ -139,7 +139,7 @@ impl<'t, 'i> WordPrefixDocids<'t, 'i> {
 
 fn write_prefixes_in_sorter(
     prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
-    sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
+    sorter: &mut grenad::Sorter<MergeDeladdDeCboRoaringBitmaps>,
 ) -> Result<()> {
     for (key, data_slices) in prefixes.drain() {
         for data in data_slices {
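MergeDeladdDeCboRoaringBitmaps and merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap appear to keep the same job as the types they replace: values in the sorter carry a deletion side and an addition side, and the write step applies both to whatever the database already holds. A plain-bitmap sketch of that resolution, without the grenad/heed plumbing or the serialized KvDelAdd envelope:

    use roaring::RoaringBitmap;

    // Resolve a del/add pair against the bitmap currently stored under a key:
    // remove the deletions first, then add the additions.
    fn apply_del_add(
        previous: Option<&RoaringBitmap>,
        del: &RoaringBitmap,
        add: &RoaringBitmap,
    ) -> RoaringBitmap {
        let mut output = previous.cloned().unwrap_or_default();
        output -= del;
        output |= add;
        output
    }

    fn main() {
        let previous: RoaringBitmap = (0u32..4).collect(); // {0, 1, 2, 3}
        let del: RoaringBitmap = [1u32, 2].into_iter().collect();
        let add: RoaringBitmap = [10u32].into_iter().collect();
        let merged = apply_del_add(Some(&previous), &del, &add);
        assert_eq!(merged.iter().collect::<Vec<_>>(), vec![0, 3, 10]);
    }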
@@ -11,15 +11,15 @@ use crate::heed_codec::StrBEU16Codec;
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
 use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
-    write_sorter_into_database, CursorClonableMmap, MergeDeladdCboRoaringBitmaps,
+    create_sorter, merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeDeladdDeCboRoaringBitmaps,
 };
-use crate::{CboRoaringBitmapCodec, Result};
+use crate::{DeCboRoaringBitmapCodec, Result};
 
 pub struct WordPrefixIntegerDocids<'t, 'i> {
     wtxn: &'t mut heed::RwTxn<'i>,
-    prefix_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
-    word_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    prefix_database: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
+    word_database: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
     pub(crate) chunk_compression_type: CompressionType,
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) max_nb_chunks: Option<usize>,
@@ -29,8 +29,8 @@ pub struct WordPrefixIntegerDocids<'t, 'i> {
 impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
     pub fn new(
         wtxn: &'t mut heed::RwTxn<'i>,
-        prefix_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
-        word_database: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+        prefix_database: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
+        word_database: Database<StrBEU16Codec, DeCboRoaringBitmapCodec>,
     ) -> WordPrefixIntegerDocids<'t, 'i> {
         WordPrefixIntegerDocids {
             wtxn,
@@ -51,7 +51,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
     )]
     pub fn execute(
         self,
-        new_word_integer_docids: grenad::Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>,
+        new_word_integer_docids: grenad::Merger<CursorClonableMmap, MergeDeladdDeCboRoaringBitmaps>,
         new_prefix_fst_words: &[String],
         common_prefix_fst_words: &[&[String]],
         del_prefix_fst_words: &HashSet<Vec<u8>>,
@@ -60,7 +60,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
 
         let mut prefix_integer_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            MergeDeladdCboRoaringBitmaps,
+            MergeDeladdDeCboRoaringBitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -164,7 +164,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
             self.wtxn,
             database_is_empty,
             deladd_serialize_add_side,
-            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+            merge_deladd_de_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
 
         Ok(())
@@ -173,7 +173,7 @@ impl<'t, 'i> WordPrefixIntegerDocids<'t, 'i> {
 
 fn write_prefixes_in_sorter(
     prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
-    sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
+    sorter: &mut grenad::Sorter<MergeDeladdDeCboRoaringBitmaps>,
 ) -> Result<()> {
     // TODO: Merge before insertion.
     for (key, data_slices) in prefixes.drain() {
@@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize};
 
 use crate::vector::settings::RemoveFragments;
 use crate::vector::EmbeddingConfig;
-use crate::{CboRoaringBitmapCodec, DocumentId, UserError};
+use crate::{DeCboRoaringBitmapCodec, DocumentId, UserError};
 
 /// DB representation of an embedder configuration.
 ///
@@ -273,9 +273,9 @@ impl<'a> heed::BytesDecode<'a> for EmbedderInfoCodec {
         }
         let first_bitmap_size = bytes.read_u32::<BigEndian>()?;
         let first_bitmap_bytes = &bytes[..first_bitmap_size as usize];
-        let user_provided = CboRoaringBitmapCodec::bytes_decode(first_bitmap_bytes)?;
+        let user_provided = DeCboRoaringBitmapCodec::bytes_decode(first_bitmap_bytes)?;
         let skip_regenerate_different_from_user_provided =
-            CboRoaringBitmapCodec::bytes_decode(&bytes[first_bitmap_size as usize..])?;
+            DeCboRoaringBitmapCodec::bytes_decode(&bytes[first_bitmap_size as usize..])?;
         Ok(EmbedderInfo {
             embedder_id,
             embedding_status: EmbeddingStatus {
@@ -290,20 +290,21 @@ impl<'a> heed::BytesEncode<'a> for EmbedderInfoCodec {
     type EItem = EmbedderInfo;
 
     fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, heed::BoxedError> {
-        let first_bitmap_size =
-            CboRoaringBitmapCodec::serialized_size(&item.embedding_status.user_provided);
-        let second_bitmap_size = CboRoaringBitmapCodec::serialized_size(
+        let mut tmp_buffer = Vec::new();
+        let first_bitmap_size = DeCboRoaringBitmapCodec::serialized_size_with_tmp_buffer(
+            &item.embedding_status.user_provided,
+            &mut tmp_buffer,
+        );
+        let second_bitmap_size = DeCboRoaringBitmapCodec::serialized_size_with_tmp_buffer(
             &item.embedding_status.skip_regenerate_different_from_user_provided,
+            &mut tmp_buffer,
         );
 
         let mut bytes = Vec::with_capacity(1 + 4 + first_bitmap_size + second_bitmap_size);
         bytes.write_u8(item.embedder_id)?;
         bytes.write_u32::<BigEndian>(first_bitmap_size.try_into()?)?;
-        CboRoaringBitmapCodec::serialize_into_writer(
-            &item.embedding_status.user_provided,
-            &mut bytes,
-        )?;
-        CboRoaringBitmapCodec::serialize_into_writer(
+        DeCboRoaringBitmapCodec::serialize_into(&item.embedding_status.user_provided, &mut bytes)?;
+        DeCboRoaringBitmapCodec::serialize_into(
             &item.embedding_status.skip_regenerate_different_from_user_provided,
             &mut bytes,
         )?;
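EmbedderInfoCodec shows the pattern every bitmap codec in these hunks follows: a unit struct implementing heed's BytesEncode/BytesDecode so that a Database<_, Codec> reads and writes RoaringBitmaps directly (as the remap_types call earlier relies on). A minimal codec of that shape, using the plain portable roaring format rather than milli's CBO/delta layout:

    use std::borrow::Cow;
    use roaring::RoaringBitmap;

    /// Illustrative codec; milli's DeCboRoaringBitmapCodec uses its own byte layout.
    struct PlainRoaringBitmapCodec;

    impl<'a> heed::BytesDecode<'a> for PlainRoaringBitmapCodec {
        type DItem = RoaringBitmap;

        fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, heed::BoxedError> {
            RoaringBitmap::deserialize_from(bytes).map_err(Into::into)
        }
    }

    impl<'a> heed::BytesEncode<'a> for PlainRoaringBitmapCodec {
        type EItem = RoaringBitmap;

        fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, heed::BoxedError> {
            let mut bytes = Vec::with_capacity(item.serialized_size());
            item.serialize_into(&mut bytes)?;
            Ok(Cow::Owned(bytes))
        }
    }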
qc_loop.sh (new executable file, 9 lines)
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+while true
+do
+    cargo test qc_
+    if [[ x$? != x0 ]] ; then
+        exit $?
+    fi
+done