Add more tests and allow disabling of soft-deletion outside of tests

Also allow disabling soft-deletion in the IndexDocumentsConfig
This commit is contained in:
Loïc Lecrenier
2022-12-05 10:33:31 +01:00
parent d3731dda48
commit f2cf981641
89 changed files with 2171 additions and 54 deletions

View File

@ -365,9 +365,9 @@ mod tests {
use crate::documents::documents_batch_reader_from_objects;
use crate::heed_codec::facet::OrderedF64Codec;
use crate::index::tests::TempIndex;
use crate::update::facet::tests::FacetIndex;
use crate::{db_snap, milli_snap};
use crate::heed_codec::StrRefCodec;
use crate::milli_snap;
use crate::update::facet::test_helpers::{ordered_string, FacetIndex};
#[test]
fn insert() {
@ -491,4 +491,38 @@ mod tests {
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
}
#[test]
fn insert_string() {
let test = |name: &str, group_size: u8, min_level_size: u8| {
let index = FacetIndex::<StrRefCodec>::new(group_size, 0 /*NA*/, min_level_size);
let strings = (0..1_000).map(|i| ordered_string(i as usize)).collect::<Vec<_>>();
let mut elements = Vec::<((u16, &str), RoaringBitmap)>::new();
for i in 0..1_000u32 {
// field id = 0, left_bound = i, docids = [i]
elements.push(((0, &strings[i as usize]), once(i).collect()));
}
for i in 0..100u32 {
// field id = 1, left_bound = i, docids = [i]
elements.push(((1, &strings[i as usize]), once(i).collect()));
}
let mut wtxn = index.env.write_txn().unwrap();
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
index.verify_structure_validity(&wtxn, 0);
index.verify_structure_validity(&wtxn, 1);
wtxn.commit().unwrap();
milli_snap!(format!("{index}"), name);
};
test("default", 4, 5);
test("small_group_small_min_level", 2, 2);
test("small_group_large_min_level", 2, 128);
test("large_group_small_min_level", 16, 2);
test("odd_group_odd_min_level", 7, 3);
}
}

View File

@ -114,11 +114,14 @@ mod tests {
use big_s::S;
use maplit::hashset;
use rand::seq::SliceRandom;
use rand::SeedableRng;
use roaring::RoaringBitmap;
use crate::db_snap;
use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex;
use crate::update::facet::test_helpers::ordered_string;
use crate::update::DeleteDocuments;
#[test]
@ -156,8 +159,8 @@ mod tests {
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, 1);
db_snap!(index, number_faceted_documents_ids, 1);
db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf");
let mut wtxn = index.env.write_txn().unwrap();
@ -174,8 +177,126 @@ mod tests {
wtxn.commit().unwrap();
db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_f64_docids, 2);
db_snap!(index, number_faceted_documents_ids, 2);
db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56");
}
// Same test as above but working with string values for the facets
#[test]
fn delete_mixed_incremental_and_bulk_string() {
// The point of this test is to create an index populated with documents
// containing different filterable attributes. Then, we delete a bunch of documents
// such that a mix of the incremental and bulk indexer is used (depending on the field id)
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
index
.update_settings(|settings| {
settings.set_filterable_fields(
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
);
})
.unwrap();
let mut documents = vec![];
for i in 0..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"label": ordered_string(i / 10),
"colour": ordered_string(i / 100),
"timestamp": ordered_string(i / 2),
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
let mut wtxn = index.env.write_txn().unwrap();
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.disable_soft_deletion(true);
builder.delete_documents(&RoaringBitmap::from_iter(0..100));
// by deleting the first 100 documents, we expect that:
// - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
// - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
// - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
// - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
// This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
builder.execute().unwrap();
wtxn.commit().unwrap();
db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f");
}
#[test]
fn delete_almost_all_incrementally_string() {
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
index
.update_settings(|settings| {
settings.set_filterable_fields(
hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
);
})
.unwrap();
let mut documents = vec![];
for i in 0..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"label": ordered_string(i / 10),
"colour": ordered_string(i / 100),
"timestamp": ordered_string(i / 2),
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
// Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
let mut docids_to_delete = (0..1000).collect::<Vec<_>>();
docids_to_delete.shuffle(&mut rng);
for docid in docids_to_delete.into_iter().take(990) {
let mut wtxn = index.env.write_txn().unwrap();
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
builder.disable_soft_deletion(true);
builder.delete_documents(&RoaringBitmap::from_iter([docid]));
builder.execute().unwrap();
wtxn.commit().unwrap();
}
db_snap!(index, soft_deleted_documents_ids, @"[]");
db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
db_snap!(index, string_faceted_documents_ids, 2, @r###"
0 []
1 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
2 [292, 324, 358, 381, 493, 839, 852, ]
3 [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
"###);
}
}
@ -188,7 +309,7 @@ mod comparison_bench {
use roaring::RoaringBitmap;
use crate::heed_codec::facet::OrderedF64Codec;
use crate::update::facet::tests::FacetIndex;
use crate::update::facet::test_helpers::FacetIndex;
// This is a simple test to get an intuition on the relative speed
// of the incremental vs. bulk indexer.

View File

@ -648,7 +648,7 @@ mod tests {
use crate::heed_codec::facet::OrderedF64Codec;
use crate::heed_codec::StrRefCodec;
use crate::milli_snap;
use crate::update::facet::tests::FacetIndex;
use crate::update::facet::test_helpers::FacetIndex;
#[test]
fn append() {
@ -1053,7 +1053,7 @@ mod fuzz {
use tempfile::TempDir;
use super::*;
use crate::update::facet::tests::FacetIndex;
use crate::update::facet::test_helpers::FacetIndex;
#[derive(Default)]
pub struct TrivialDatabase<T> {
pub elements: BTreeMap<u16, BTreeMap<T, RoaringBitmap>>,

View File

@ -162,7 +162,7 @@ impl<'i> FacetsUpdate<'i> {
}
#[cfg(test)]
pub(crate) mod tests {
pub(crate) mod test_helpers {
use std::cell::Cell;
use std::fmt::Display;
use std::iter::FromIterator;
@ -183,6 +183,23 @@ pub(crate) mod tests {
use crate::update::FacetsUpdateIncrementalInner;
use crate::CboRoaringBitmapCodec;
/// Utility function to generate a string whose position in a lexicographically
/// ordered list is `i`.
pub fn ordered_string(mut i: usize) -> String {
// The first string is empty
if i == 0 {
return String::new();
}
// The others are 5 char long, each between 'a' and 'z'
let mut s = String::new();
for _ in 0..5 {
let (digit, next) = (i % 26, i / 26);
s.insert(0, char::from_u32('a' as u32 + digit as u32).unwrap());
i = next;
}
s
}
/// A dummy index that only contains the facet database, used for testing
pub struct FacetIndex<BoundCodec>
where
@ -438,6 +455,98 @@ pub(crate) mod tests {
}
}
#[cfg(test)]
mod tests {
use big_s::S;
use maplit::hashset;
use crate::db_snap;
use crate::documents::documents_batch_reader_from_objects;
use crate::index::tests::TempIndex;
#[test]
fn replace_all_identical_soft_deletion_then_hard_deletion() {
let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100);
index
.update_settings(|settings| {
settings.set_primary_key("id".to_owned());
settings.set_filterable_fields(hashset! { S("size") });
})
.unwrap();
let mut documents = vec![];
for i in 0..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"size": i % 250,
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9");
db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");
let mut documents = vec![];
for i in 0..999 {
documents.push(
serde_json::json! {
{
"id": i,
"size": i % 250,
"other": 0,
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06");
db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");
// Then replace the last document while disabling soft_deletion
index.index_documents_config.disable_soft_deletion = true;
let mut documents = vec![];
for i in 999..1000 {
documents.push(
serde_json::json! {
{
"id": i,
"size": i % 250,
"other": 0,
}
}
.as_object()
.unwrap()
.clone(),
);
}
let documents = documents_batch_reader_from_objects(documents);
index.add_documents(documents).unwrap();
db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028");
db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
}
}
#[allow(unused)]
#[cfg(test)]
mod comparison_bench {
@ -446,7 +555,7 @@ mod comparison_bench {
use rand::Rng;
use roaring::RoaringBitmap;
use super::tests::FacetIndex;
use super::test_helpers::FacetIndex;
use crate::heed_codec::facet::OrderedF64Codec;
// This is a simple test to get an intuition on the relative speed

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/bulk.rs
---
353d70f52eea66e5031dca989ea8a037

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/bulk.rs
---
52a093c909133d84023a4a7b83864808

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/bulk.rs
---
9d86c72ddb241d0aeca2995d61a3648a

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/bulk.rs
---
c0943177594534bfe5527cbf40fe388e

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/facet/bulk.rs
---
6ed86f234028ae3df5881bee5512f11e

View File

@ -1,4 +0,0 @@
---
source: milli/src/update/facet/delete.rs
---
550cd138d6fe31ccdd42cd5392fbd576

View File

@ -1,4 +0,0 @@
---
source: milli/src/update/facet/delete.rs
---
9a0ea88e7c9dcf6dc0ef0b601736ffcf

View File

@ -1,4 +0,0 @@
---
source: milli/src/update/facet/delete.rs
---
d4d5f14e7f1e1f09b86821a0b6defcc6

View File

@ -1,4 +0,0 @@
---
source: milli/src/update/facet/delete.rs
---
3570e0ac0fdb21be9ebe433f59264b56