Add facet deletion tests that use both the incremental and bulk methods

+ update deletion snapshots to the new database format
2025-10-30 07:26:26 +00:00 · 2022-10-12 12:32:33 +02:00
parent e3ba1fc883
commit f198b20c42
19 changed files with 302 additions and 146 deletions
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@@ -4,9 +4,7 @@ use std::fs::File;
 use grenad::CompressionType;
 use heed::types::ByteSlice;
 use heed::{BytesEncode, Error, RoTxn, RwTxn};
-use log::debug;
 use roaring::RoaringBitmap;
-use time::OffsetDateTime;

 use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
 use crate::facet::FacetType;
@@ -71,8 +69,6 @@ impl<'i> FacetsUpdateBulk<'i> {

    #[logging_timer::time("FacetsUpdateBulk::{}")]
    pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
-        debug!("Computing and writing the facet values levels docids into LMDB on disk...");
-
        let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self;

        let db = match facet_type {
@@ -84,8 +80,6 @@ impl<'i> FacetsUpdateBulk<'i> {
            }
        };

-        index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
-
        let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };

        inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
--- a/milli/src/update/facet/delete.rs
+++ b/milli/src/update/facet/delete.rs
@@ -1,15 +1,21 @@
-use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
-use crate::{
-    facet::FacetType,
-    heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec},
-    heed_codec::ByteSliceRefCodec,
-    update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner},
-    FieldId, Index, Result,
-};
-use heed::RwTxn;
-use roaring::RoaringBitmap;
 use std::collections::{HashMap, HashSet};

+use heed::RwTxn;
+use log::debug;
+use roaring::RoaringBitmap;
+use time::OffsetDateTime;
+
+use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
+use crate::facet::FacetType;
+use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
+use crate::heed_codec::ByteSliceRefCodec;
+use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner};
+use crate::{FieldId, Index, Result};
+
+/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases.
+///
+/// Depending on the number of removed elements and the existing size of the database, we use either
+/// a bulk delete method or an incremental delete method.
 pub struct FacetsDelete<'i, 'b> {
    index: &'i Index,
    database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
@@ -48,8 +54,18 @@ impl<'i, 'b> FacetsDelete<'i, 'b> {
    }

    pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> {
+        debug!("Computing and writing the facet values levels docids into LMDB on disk...");
+        self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
+
        for (field_id, affected_facet_values) in self.affected_facet_values {
-            if affected_facet_values.len() >= (self.database.len(wtxn)? / 50) {
+            // This condition is only an approximation, since we assume that the length of the database is
+            // equal to the number of facet values for the given field_id. It means that in some cases, we might
+            // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could
+            // really be a performance problem is when we fully delete a large ratio of all facet values for
+            // each field id. This would almost never happen. Still, to be overly cautious, a roughly 2x penalty
+            // is applied to the incremental indexer. That is, instead of assuming the 70x worst-case performance
+            // penalty seen in `comparison_bench`, we assume a 150x worst-case performance penalty instead.
+            if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) {
                // Bulk delete
                let mut modified = false;

@@ -91,3 +107,133 @@ impl<'i, 'b> FacetsDelete<'i, 'b> {
        Ok(())
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::iter::FromIterator;
+
+    use big_s::S;
+    use maplit::hashset;
+    use roaring::RoaringBitmap;
+
+    use crate::db_snap;
+    use crate::documents::documents_batch_reader_from_objects;
+    use crate::index::tests::TempIndex;
+    use crate::update::DeleteDocuments;
+
+    #[test]
+    fn delete_mixed_incremental_and_bulk() {
+        // The point of this test is to create an index populated with documents
+        // containing different filterable attributes. Then, we delete a bunch of documents
+        // such that a mix of the incremental and bulk methods is used (depending on the field id).
+        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
+
+        index
+            .update_settings(|settings| {
+                settings.set_filterable_fields(
+                    hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
+                );
+            })
+            .unwrap();
+
+        let mut documents = vec![];
+        for i in 0..1000 {
+            documents.push(
+                serde_json::json! {
+                    {
+                        "id": i,
+                        "label": i / 10,
+                        "colour": i / 100,
+                        "timestamp": i / 2,
+                    }
+                }
+                .as_object()
+                .unwrap()
+                .clone(),
+            );
+        }
+
+        let documents = documents_batch_reader_from_objects(documents);
+        index.add_documents(documents).unwrap();
+
+        db_snap!(index, facet_id_f64_docids, 1);
+        db_snap!(index, number_faceted_documents_ids, 1);
+
+        let mut wtxn = index.env.write_txn().unwrap();
+
+        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
+        builder.disable_soft_deletion(true);
+        builder.delete_documents(&RoaringBitmap::from_iter(0..100));
+        // by deleting the first 100 documents, we expect that:
+        // - the "id" part of the DB will be updated in bulk, since #affected_facet_values = 100 which is >= database_len / 150 (= 13)
+        // - the "label" part will be updated incrementally, since #affected_facet_values = 10 which is < 13
+        // - the "colour" part will also be updated incrementally, since #affected_facet_values = 1 which is < 13
+        // - the "timestamp" part will be updated in bulk, since #affected_facet_values = 50 which is >= 13
+        // This has to be verified manually by inserting breakpoints or adding print statements to the code when running the test
+        builder.execute().unwrap();
+        wtxn.commit().unwrap();
+
+        db_snap!(index, soft_deleted_documents_ids, @"[]");
+        db_snap!(index, facet_id_f64_docids, 2);
+        db_snap!(index, number_faceted_documents_ids, 2);
+    }
+}
+
+#[allow(unused)]
+#[cfg(test)]
+mod comparison_bench {
+    use std::iter::once;
+
+    use rand::Rng;
+    use roaring::RoaringBitmap;
+
+    use crate::heed_codec::facet::OrderedF64Codec;
+    use crate::update::facet::tests::FacetIndex;
+
+    // This is a simple test to get an intuition on the relative speed
+    // of the incremental vs. bulk indexer.
+    //
+    // The benchmark shows the worst-case scenario for the incremental indexer, since
+    // each facet value contains only one document ID.
+    //
+    // In that scenario, it appears that the incremental indexer is about 70 times slower than the
+    // bulk indexer.
+    // #[test]
+    fn benchmark_facet_indexing_delete() {
+        let mut r = rand::thread_rng();
+
+        for i in 1..=20 {
+            let size = 50_000 * i;
+            let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
+
+            let mut txn = index.env.write_txn().unwrap();
+            let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
+            for i in 0..size {
+                // field id = 0, left_bound = i, docids = [i]
+                elements.push(((0, i as f64), once(i).collect()));
+            }
+            let timer = std::time::Instant::now();
+            index.bulk_insert(&mut txn, &[0], elements.iter());
+            let time_spent = timer.elapsed().as_millis();
+            println!("bulk {size} : {time_spent}ms");
+
+            txn.commit().unwrap();
+
+            for nbr_doc in [1, 100, 1000, 10_000] {
+                let mut txn = index.env.write_txn().unwrap();
+                let timer = std::time::Instant::now();
+                //
+                // delete `nbr_doc` randomly chosen docids, one at a time
+                //
+                for _ in 0..nbr_doc {
+                    let deleted_u32 = r.gen::<u32>() % size;
+                    let deleted_f64 = deleted_u32 as f64;
+                    index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32)
+                }
+                let time_spent = timer.elapsed().as_millis();
+                println!("    delete {nbr_doc} : {time_spent}ms");
+                txn.abort().unwrap();
+            }
+        }
+    }
+}
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -78,6 +78,9 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;

 use std::fs::File;

+use log::debug;
+use time::OffsetDateTime;
+
 use self::incremental::FacetsUpdateIncremental;
 use super::FacetsUpdateBulk;
 use crate::facet::FacetType;
@@ -89,6 +92,10 @@ pub mod bulk;
 pub mod delete;
 pub mod incremental;

+/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
+///
+/// Depending on the number of new elements and the existing size of the database, we use either
+/// a bulk update method or an incremental update method.
 pub struct FacetsUpdate<'i> {
    index: &'i Index,
    database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
@@ -123,6 +130,10 @@ impl<'i> FacetsUpdate<'i> {
        if self.new_data.is_empty() {
            return Ok(());
        }
+        debug!("Computing and writing the facet values levels docids into LMDB on disk...");
+        self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
+
+        // See self::comparison_bench::benchmark_facet_indexing
        if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
            let field_ids =
                self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
@@ -204,7 +215,7 @@ pub(crate) mod tests {
            let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17

            let mut options = heed::EnvOpenOptions::new();
-            let options = options.map_size(4096 * 4 * 10 * 100);
+            let options = options.map_size(4096 * 4 * 10 * 1000);
            unsafe {
                options.flag(heed::flags::Flags::MdbAlwaysFreePages);
            }
@@ -230,7 +241,7 @@ pub(crate) mod tests {
            let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127
            let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf
            let mut options = heed::EnvOpenOptions::new();
-            let options = options.map_size(4096 * 4 * 1000);
+            let options = options.map_size(4096 * 4 * 1000 * 100);
            let tempdir = tempfile::TempDir::new().unwrap();
            let env = options.open(tempdir.path()).unwrap();
            let content = env.create_database(None).unwrap();
@@ -440,12 +451,14 @@ mod comparison_bench {

    // This is a simple test to get an intuition on the relative speed
    // of the incremental vs. bulk indexer.
-    // It appears that the incremental indexer is about 50 times slower than the
+    //
+    // The benchmark shows the worst-case scenario for the incremental indexer, since
+    // each facet value contains only one document ID.
+    //
+    // In that scenario, it appears that the incremental indexer is about 50 times slower than the
    // bulk indexer.
    // #[test]
    fn benchmark_facet_indexing() {
-        // then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it
-
        let mut facet_value = 0;

        let mut r = rand::thread_rng();
--- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap
+++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/delete.rs
+---
+550cd138d6fe31ccdd42cd5392fbd576
--- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap
+++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/delete.rs
+---
+9a0ea88e7c9dcf6dc0ef0b601736ffcf
--- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap
+++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/delete.rs
+---
+d4d5f14e7f1e1f09b86821a0b6defcc6
--- a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap
+++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/delete.rs
+---
+3570e0ac0fdb21be9ebe433f59264b56