Add facet deletion tests that use both the incremental and bulk methods

+ update deletion snapshots to the new database format
This commit is contained in:
Loïc Lecrenier
2022-10-12 12:32:33 +02:00
parent e3ba1fc883
commit f198b20c42
19 changed files with 302 additions and 146 deletions

View File

@ -4,9 +4,7 @@ use std::fs::File;
use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::{BytesEncode, Error, RoTxn, RwTxn};
use log::debug;
use roaring::RoaringBitmap;
use time::OffsetDateTime;
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::facet::FacetType;
@ -71,8 +69,6 @@ impl<'i> FacetsUpdateBulk<'i> {
#[logging_timer::time("FacetsUpdateBulk::{}")]
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self;
let db = match facet_type {
@ -84,8 +80,6 @@ impl<'i> FacetsUpdateBulk<'i> {
}
};
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {

View File

@ -1,15 +1,21 @@
use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::{
facet::FacetType,
heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec},
heed_codec::ByteSliceRefCodec,
update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner},
FieldId, Index, Result,
};
use heed::RwTxn;
use roaring::RoaringBitmap;
use std::collections::{HashMap, HashSet};
use heed::RwTxn;
use log::debug;
use roaring::RoaringBitmap;
use time::OffsetDateTime;
use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner};
use crate::{FieldId, Index, Result};
/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases.
///
/// Depending on the number of removed elements and the existing size of the database, we use either
/// a bulk delete method or an incremental delete method.
pub struct FacetsDelete<'i, 'b> {
index: &'i Index,
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
@ -48,8 +54,18 @@ impl<'i, 'b> FacetsDelete<'i, 'b> {
}
pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> {
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
for (field_id, affected_facet_values) in self.affected_facet_values {
if affected_facet_values.len() >= (self.database.len(wtxn)? / 50) {
// This is an incorrect condition, since we assume that the length of the database is equal
// to the number of facet values for the given field_id. It means that in some cases, we might
// wrongly choose the incremental indexer over the bulk indexer. But the only case where that could
// really be a performance problem is when we fully delete a large ratio of all facet values for
// each field id. This would almost never happen. Still, to be overly cautious, I have added a
// 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance
// penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead.
if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) {
// Bulk delete
let mut modified = false;
@ -91,3 +107,133 @@ impl<'i, 'b> FacetsDelete<'i, 'b> {
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::iter::FromIterator;
use big_s::S;
use maplit::hashset;
use roaring::RoaringBitmap;
use crate::db_snap;
use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex;
use crate::update::DeleteDocuments;
#[test]
fn delete_mixed_incremental_and_bulk() {
// The point of this test is to create an index populated with documents
// containing different filterable attributes. Then, we delete a bunch of documents
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
index
.update_settings(|settings| {
settings.set_filterable_fields(
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
);
})
.unwrap();
let mut documents = vec![];
for i in 0..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"label": i / 10,
"colour": i / 100,
"timestamp": i / 2,
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, 1);
db_snap!(index, number_faceted_documents_ids, 1);
let mut wtxn = index.env.write_txn().unwrap();
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.disable_soft_deletion(true);
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
// by deleting the first 100 documents, we expect that:
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
builder.execute().unwrap();
wtxn.commit().unwrap();
db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_f64_docids, 2);
db_snap!(index, number_faceted_documents_ids, 2);
}
}
#[allow(unused)]
#[cfg(test)]
mod comparison_bench {
use std::iter::once;
use rand::Rng;
use roaring::RoaringBitmap;
use crate::heed_codec::facet::OrderedF64Codec;
use crate::update::facet::tests::FacetIndex;
// This is a simple test to get an intuition on the relative speed
// of the incremental vs. bulk indexer.
//
// The benchmark shows the worst-case scenario for the incremental indexer, since
// each facet value contains only one document ID.
//
// In that scenario, it appears that the incremental indexer is about 70 times slower than the
// bulk indexer.
// #[test]
fn benchmark_facet_indexing_delete() {
let mut r = rand::thread_rng();
for i in 1..=20 {
let size = 50_000 * i;
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
let mut txn = index.env.write_txn().unwrap();
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
for i in 0..size {
// field id = 0, left_bound = i, docids = [i]
elements.push(((0, i as f64), once(i).collect()));
}
let timer = std::time::Instant::now();
index.bulk_insert(&mut txn, &[0], elements.iter());
let time_spent = timer.elapsed().as_millis();
println!("bulk {size} : {time_spent}ms");
txn.commit().unwrap();
for nbr_doc in [1, 100, 1000, 10_000] {
let mut txn = index.env.write_txn().unwrap();
let timer = std::time::Instant::now();
//
// delete one document
//
for _ in 0..nbr_doc {
let deleted_u32 = r.gen::<u32>() % size;
let deleted_f64 = deleted_u32 as f64;
index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32)
}
let time_spent = timer.elapsed().as_millis();
println!(" delete {nbr_doc} : {time_spent}ms");
txn.abort().unwrap();
}
}
}
}

View File

@ -78,6 +78,9 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::fs::File;
use log::debug;
use time::OffsetDateTime;
use self::incremental::FacetsUpdateIncremental;
use super::FacetsUpdateBulk;
use crate::facet::FacetType;
@ -89,6 +92,10 @@ pub mod bulk;
pub mod delete;
pub mod incremental;
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
///
/// Depending on the number of new elements and the existing size of the database, we use either
/// a bulk update method or an incremental update method.
pub struct FacetsUpdate<'i> {
index: &'i Index,
database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
@ -123,6 +130,10 @@ impl<'i> FacetsUpdate<'i> {
if self.new_data.is_empty() {
return Ok(());
}
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
// See self::comparison_bench::benchmark_facet_indexing
if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
let field_ids =
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
@ -204,7 +215,7 @@ pub(crate) mod tests {
let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17
let mut options = heed::EnvOpenOptions::new();
let options = options.map_size(4096 * 4 * 10 * 100);
let options = options.map_size(4096 * 4 * 10 * 1000);
unsafe {
options.flag(heed::flags::Flags::MdbAlwaysFreePages);
}
@ -230,7 +241,7 @@ pub(crate) mod tests {
let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127
let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf
let mut options = heed::EnvOpenOptions::new();
let options = options.map_size(4096 * 4 * 1000);
let options = options.map_size(4096 * 4 * 1000 * 100);
let tempdir = tempfile::TempDir::new().unwrap();
let env = options.open(tempdir.path()).unwrap();
let content = env.create_database(None).unwrap();
@ -440,12 +451,14 @@ mod comparison_bench {
// This is a simple test to get an intuition on the relative speed
// of the incremental vs. bulk indexer.
// It appears that the incremental indexer is about 50 times slower than the
//
// The benchmark shows the worst-case scenario for the incremental indexer, since
// each facet value contains only one document ID.
//
// In that scenario, it appears that the incremental indexer is about 50 times slower than the
// bulk indexer.
// #[test]
fn benchmark_facet_indexing() {
// then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it
let mut facet_value = 0;
let mut r = rand::thread_rng();

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/delete.rs
---
550cd138d6fe31ccdd42cd5392fbd576

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/delete.rs
---
9a0ea88e7c9dcf6dc0ef0b601736ffcf

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/delete.rs
---
d4d5f14e7f1e1f09b86821a0b6defcc6

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/delete.rs
---
3570e0ac0fdb21be9ebe433f59264b56