Fasten the document deletion

When a document deletion occurs, instead of deleting the document we mark it as deleted
in the new “soft deleted” bitmap. It is then removed from the search, and all the other
endpoints.
This commit is contained in:
Tamo
2022-06-13 17:59:34 +02:00
parent 77c837fc1b
commit 3b309f654a
10 changed files with 413 additions and 284 deletions

View File

@@ -27,6 +27,8 @@ pub enum Error {
#[derive(Error, Debug)] #[derive(Error, Debug)]
pub enum InternalError { pub enum InternalError {
#[error("Tried to access a soft deleted documents.")]
AccessingSoftDeletedDocument { document_id: DocumentId },
#[error("{}", HeedError::DatabaseClosing)] #[error("{}", HeedError::DatabaseClosing)]
DatabaseClosing, DatabaseClosing,
#[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))] #[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))]

View File

@@ -5,26 +5,30 @@ use std::{fmt, str};
use fst::map::IndexedValue; use fst::map::IndexedValue;
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use roaring::RoaringBitmap;
const DELETED_ID: u64 = u64::MAX; const DELETED_ID: u64 = u64::MAX;
pub struct ExternalDocumentsIds<'a> { pub struct ExternalDocumentsIds<'a> {
pub(crate) hard: fst::Map<Cow<'a, [u8]>>, pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
pub(crate) soft: fst::Map<Cow<'a, [u8]>>, pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
soft_deleted_docids: RoaringBitmap,
} }
impl<'a> ExternalDocumentsIds<'a> { impl<'a> ExternalDocumentsIds<'a> {
pub fn new( pub fn new(
hard: fst::Map<Cow<'a, [u8]>>, hard: fst::Map<Cow<'a, [u8]>>,
soft: fst::Map<Cow<'a, [u8]>>, soft: fst::Map<Cow<'a, [u8]>>,
soft_deleted_docids: RoaringBitmap,
) -> ExternalDocumentsIds<'a> { ) -> ExternalDocumentsIds<'a> {
ExternalDocumentsIds { hard, soft } ExternalDocumentsIds { hard, soft, soft_deleted_docids }
} }
pub fn into_static(self) -> ExternalDocumentsIds<'static> { pub fn into_static(self) -> ExternalDocumentsIds<'static> {
ExternalDocumentsIds { ExternalDocumentsIds {
hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(), soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
soft_deleted_docids: self.soft_deleted_docids,
} }
} }
@@ -36,7 +40,9 @@ impl<'a> ExternalDocumentsIds<'a> {
pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> { pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
let external_id = external_id.as_ref(); let external_id = external_id.as_ref();
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) { match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
Some(id) if id != DELETED_ID => Some(id.try_into().unwrap()), Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
Some(id.try_into().unwrap())
}
_otherwise => None, _otherwise => None,
} }
} }
@@ -134,6 +140,7 @@ impl Default for ExternalDocumentsIds<'static> {
ExternalDocumentsIds { ExternalDocumentsIds {
hard: fst::Map::default().map_data(Cow::Owned).unwrap(), hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft: fst::Map::default().map_data(Cow::Owned).unwrap(), soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft_deleted_docids: RoaringBitmap::new(),
} }
} }
} }

View File

@@ -32,6 +32,7 @@ pub mod main_key {
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids";
pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields"; pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields";
pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields"; pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
@@ -254,6 +255,29 @@ impl Index {
Ok(count.unwrap_or_default()) Ok(count.unwrap_or_default())
} }
/* deleted documents ids */
/// Writes the soft deleted documents ids.
pub(crate) fn put_soft_deleted_documents_ids(
&self,
wtxn: &mut RwTxn,
docids: &RoaringBitmap,
) -> heed::Result<()> {
self.main.put::<_, Str, RoaringBitmapCodec>(
wtxn,
main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY,
docids,
)
}
/// Returns the soft deleted documents ids.
pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
Ok(self
.main
.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)?
.unwrap_or_default())
}
/* primary key */ /* primary key */
/// Writes the documents primary key, this is the field name that is used to store the id. /// Writes the documents primary key, this is the field name that is used to store the id.
@@ -280,7 +304,7 @@ impl Index {
wtxn: &mut RwTxn, wtxn: &mut RwTxn,
external_documents_ids: &ExternalDocumentsIds<'a>, external_documents_ids: &ExternalDocumentsIds<'a>,
) -> heed::Result<()> { ) -> heed::Result<()> {
let ExternalDocumentsIds { hard, soft } = external_documents_ids; let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids;
let hard = hard.as_fst().as_bytes(); let hard = hard.as_fst().as_bytes();
let soft = soft.as_fst().as_bytes(); let soft = soft.as_fst().as_bytes();
self.main.put::<_, Str, ByteSlice>( self.main.put::<_, Str, ByteSlice>(
@@ -311,7 +335,8 @@ impl Index {
Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?, Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?,
None => fst::Map::default().map_data(Cow::Owned)?, None => fst::Map::default().map_data(Cow::Owned)?,
}; };
Ok(ExternalDocumentsIds::new(hard, soft)) let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;
Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids))
} }
/* fields ids map */ /* fields ids map */
@@ -929,9 +954,13 @@ impl Index {
rtxn: &'t RoTxn, rtxn: &'t RoTxn,
ids: impl IntoIterator<Item = DocumentId>, ids: impl IntoIterator<Item = DocumentId>,
) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> { ) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
let mut documents = Vec::new(); let mut documents = Vec::new();
for id in ids { for id in ids {
if soft_deleted_documents.contains(id) {
return Err(InternalError::AccessingSoftDeletedDocument { document_id: id })?;
}
let kv = self let kv = self
.documents .documents
.get(rtxn, &BEU32::new(id))? .get(rtxn, &BEU32::new(id))?
@@ -947,11 +976,16 @@ impl Index {
&self, &self,
rtxn: &'t RoTxn, rtxn: &'t RoTxn,
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReaderU16<'t>)>>> { ) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReaderU16<'t>)>>> {
let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;
Ok(self Ok(self
.documents .documents
.iter(rtxn)? .iter(rtxn)?
// we cast the BEU32 to a DocumentId // we cast the BEU32 to a DocumentId
.map(|document| document.map(|(id, obkv)| (id.get(), obkv)))) .map(|document| document.map(|(id, obkv)| (id.get(), obkv)))
.filter(move |document| {
document.as_ref().map_or(true, |(id, _)| !soft_deleted_docids.contains(*id))
}))
} }
pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> { pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {

View File

@@ -10,9 +10,7 @@ use roaring::RoaringBitmap;
use super::FacetNumberRange; use super::FacetNumberRange;
use crate::error::{Error, UserError}; use crate::error::{Error, UserError};
use crate::heed_codec::facet::{ use crate::heed_codec::facet::FacetLevelValueF64Codec;
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
};
use crate::{ use crate::{
distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result,
}; };
@@ -266,11 +264,12 @@ impl<'a> Filter<'a> {
fn evaluate_operator( fn evaluate_operator(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
index: &Index, index: &Index,
numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
strings_db: heed::Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
field_id: FieldId, field_id: FieldId,
operator: &Condition<'a>, operator: &Condition<'a>,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;
// Make sure we always bound the ranges with the field id and the level, // Make sure we always bound the ranges with the field id and the level,
// as the facets values are all in the same database and prefixed by the // as the facets values are all in the same database and prefixed by the
// field id and the level. // field id and the level.
@@ -309,9 +308,7 @@ impl<'a> Filter<'a> {
let all_numbers_ids = index.number_faceted_documents_ids(rtxn, field_id)?; let all_numbers_ids = index.number_faceted_documents_ids(rtxn, field_id)?;
let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?;
let operator = Condition::Equal(val.clone()); let operator = Condition::Equal(val.clone());
let docids = Self::evaluate_operator( let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?;
rtxn, index, numbers_db, strings_db, field_id, &operator,
)?;
return Ok((all_numbers_ids | all_strings_ids) - docids); return Ok((all_numbers_ids | all_strings_ids) - docids);
} }
}; };
@@ -342,17 +339,27 @@ impl<'a> Filter<'a> {
} }
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids; // to avoid doing this for each recursive call we're going to do it ONCE ahead of time
let strings_db = index.facet_id_string_docids; let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
let filterable_fields = index.filterable_fields(rtxn)?;
// and finally we delete all the soft_deleted_documents, again, only once at the very end
self.inner_evaluate(rtxn, index, &filterable_fields)
.map(|result| result - soft_deleted_documents)
}
fn inner_evaluate(
&self,
rtxn: &heed::RoTxn,
index: &Index,
filterable_fields: &HashSet<String>,
) -> Result<RoaringBitmap> {
match &self.condition { match &self.condition {
FilterCondition::Condition { fid, op } => { FilterCondition::Condition { fid, op } => {
let filterable_fields = index.filterable_fields(rtxn)?; if crate::is_faceted(fid.value(), filterable_fields) {
if crate::is_faceted(fid.value(), &filterable_fields) {
let field_ids_map = index.fields_ids_map(rtxn)?; let field_ids_map = index.fields_ids_map(rtxn)?;
if let Some(fid) = field_ids_map.id(fid.value()) { if let Some(fid) = field_ids_map.id(fid.value()) {
Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op) Self::evaluate_operator(rtxn, index, fid, &op)
} else { } else {
return Ok(RoaringBitmap::new()); return Ok(RoaringBitmap::new());
} }
@@ -371,7 +378,7 @@ impl<'a> Filter<'a> {
return Err(fid.as_external_error( return Err(fid.as_external_error(
FilterError::AttributeNotFilterable { FilterError::AttributeNotFilterable {
attribute, attribute,
filterable_fields, filterable_fields: filterable_fields.clone(),
}, },
))?; ))?;
} }
@@ -379,17 +386,39 @@ impl<'a> Filter<'a> {
} }
} }
FilterCondition::Or(lhs, rhs) => { FilterCondition::Or(lhs, rhs) => {
let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?; let lhs = Self::inner_evaluate(
let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?; &(lhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
let rhs = Self::inner_evaluate(
&(rhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
Ok(lhs | rhs) Ok(lhs | rhs)
} }
FilterCondition::And(lhs, rhs) => { FilterCondition::And(lhs, rhs) => {
let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?; let lhs = Self::inner_evaluate(
let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?; &(lhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
if lhs.is_empty() {
return Ok(lhs);
}
let rhs = Self::inner_evaluate(
&(rhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
Ok(lhs & rhs) Ok(lhs & rhs)
} }
FilterCondition::GeoLowerThan { point, radius } => { FilterCondition::GeoLowerThan { point, radius } => {
let filterable_fields = index.filterable_fields(rtxn)?;
if filterable_fields.contains("_geo") { if filterable_fields.contains("_geo") {
let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?]; let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?];
if !(-90.0..=90.0).contains(&base_point[0]) { if !(-90.0..=90.0).contains(&base_point[0]) {
@@ -422,16 +451,17 @@ impl<'a> Filter<'a> {
} else { } else {
return Err(point[0].as_external_error(FilterError::AttributeNotFilterable { return Err(point[0].as_external_error(FilterError::AttributeNotFilterable {
attribute: "_geo", attribute: "_geo",
filterable_fields, filterable_fields: filterable_fields.clone(),
}))?; }))?;
} }
} }
FilterCondition::GeoGreaterThan { point, radius } => { FilterCondition::GeoGreaterThan { point, radius } => {
let result = Self::evaluate( let result = Self::inner_evaluate(
&FilterCondition::GeoLowerThan { point: point.clone(), radius: radius.clone() } &FilterCondition::GeoLowerThan { point: point.clone(), radius: radius.clone() }
.into(), .into(),
rtxn, rtxn,
index, index,
filterable_fields,
)?; )?;
let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?;
Ok(geo_faceted_doc_ids - result) Ok(geo_faceted_doc_ids - result)

View File

@@ -214,7 +214,7 @@ impl<'a> Search<'a> {
) -> Result<SearchResult> { ) -> Result<SearchResult> {
let mut offset = self.offset; let mut offset = self.offset;
let mut initial_candidates = RoaringBitmap::new(); let mut initial_candidates = RoaringBitmap::new();
let mut excluded_candidates = RoaringBitmap::new(); let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?;
let mut documents_ids = Vec::new(); let mut documents_ids = Vec::new();
while let Some(FinalResult { candidates, bucket_candidates, .. }) = while let Some(FinalResult { candidates, bucket_candidates, .. }) =

View File

@@ -8,11 +8,16 @@ pub struct AvailableDocumentsIds {
} }
impl AvailableDocumentsIds { impl AvailableDocumentsIds {
pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds { pub fn from_documents_ids(
match docids.max() { docids: &RoaringBitmap,
soft_deleted_docids: &RoaringBitmap,
) -> AvailableDocumentsIds {
let used_docids = docids | soft_deleted_docids;
match used_docids.max() {
Some(last_id) => { Some(last_id) => {
let mut available = RoaringBitmap::from_iter(0..last_id); let mut available = RoaringBitmap::from_iter(0..last_id);
available -= docids; available -= used_docids;
let iter = match last_id.checked_add(1) { let iter = match last_id.checked_add(1) {
Some(id) => id..=u32::max_value(), Some(id) => id..=u32::max_value(),
@@ -44,7 +49,7 @@ mod tests {
#[test] #[test]
fn empty() { fn empty() {
let base = RoaringBitmap::new(); let base = RoaringBitmap::new();
let left = AvailableDocumentsIds::from_documents_ids(&base); let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
let right = 0..=u32::max_value(); let right = 0..=u32::max_value();
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
} }
@@ -57,8 +62,28 @@ mod tests {
base.insert(100); base.insert(100);
base.insert(405); base.insert(405);
let left = AvailableDocumentsIds::from_documents_ids(&base); let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405); let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r)); left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
} }
#[test]
fn soft_deleted() {
let mut base = RoaringBitmap::new();
base.insert(0);
base.insert(10);
base.insert(100);
base.insert(405);
let mut soft_deleted = RoaringBitmap::new();
soft_deleted.insert(1);
soft_deleted.insert(11);
soft_deleted.insert(101);
soft_deleted.insert(406);
let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted);
let right =
(0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n));
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}
} }

View File

@@ -35,6 +35,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
documents, documents,
} = self.index; } = self.index;
let empty_roaring = RoaringBitmap::default();
// We retrieve the number of documents ids that we are deleting. // We retrieve the number of documents ids that we are deleting.
let number_of_documents = self.index.number_of_documents(self.wtxn)?; let number_of_documents = self.index.number_of_documents(self.wtxn)?;
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
@@ -43,16 +45,16 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_words_fst(self.wtxn, &fst::Set::default())?; self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?; self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
// We clean all the faceted documents ids. // We clean all the faceted documents ids.
let empty = RoaringBitmap::default();
for field_id in faceted_fields { for field_id in faceted_fields {
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?; self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?; self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?;
} }
// Clear the other databases. // Clear the other databases.

View File

@@ -1,5 +1,4 @@
use std::collections::btree_map::Entry; use std::collections::btree_map::Entry;
use std::collections::HashMap;
use fst::IntoStreamer; use fst::IntoStreamer;
use heed::types::{ByteSlice, Str}; use heed::types::{ByteSlice, Str};
@@ -17,15 +16,17 @@ use crate::heed_codec::facet::{
use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::{db_name, main_key}; use crate::index::{db_name, main_key};
use crate::{ use crate::{
DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32, DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result,
BEU32, RoaringBitmapCodec, SmallString32, BEU32,
}; };
const DELETE_DOCUMENTS_THRESHOLD: u64 = 100_000;
pub struct DeleteDocuments<'t, 'u, 'i> { pub struct DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>, wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index, index: &'i Index,
external_documents_ids: ExternalDocumentsIds<'static>, external_documents_ids: ExternalDocumentsIds<'static>,
documents_ids: RoaringBitmap, to_delete_docids: RoaringBitmap,
} }
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -45,16 +46,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
wtxn, wtxn,
index, index,
external_documents_ids, external_documents_ids,
documents_ids: RoaringBitmap::new(), to_delete_docids: RoaringBitmap::new(),
}) })
} }
pub fn delete_document(&mut self, docid: u32) { pub fn delete_document(&mut self, docid: u32) {
self.documents_ids.insert(docid); self.to_delete_docids.insert(docid);
} }
pub fn delete_documents(&mut self, docids: &RoaringBitmap) { pub fn delete_documents(&mut self, docids: &RoaringBitmap) {
self.documents_ids |= docids; self.to_delete_docids |= docids;
} }
pub fn delete_external_id(&mut self, external_id: &str) -> Option<u32> { pub fn delete_external_id(&mut self, external_id: &str) -> Option<u32> {
@@ -63,28 +64,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
Some(docid) Some(docid)
} }
pub fn execute(self) -> Result<DocumentDeletionResult> { pub fn execute(mut self) -> Result<DocumentDeletionResult> {
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
// We retrieve the current documents ids that are in the database. // We retrieve the current documents ids that are in the database.
let mut documents_ids = self.index.documents_ids(self.wtxn)?; let mut documents_ids = self.index.documents_ids(self.wtxn)?;
let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?;
let current_documents_ids_len = documents_ids.len(); let current_documents_ids_len = documents_ids.len();
// We can and must stop removing documents in a database that is empty. // We can and must stop removing documents in a database that is empty.
if documents_ids.is_empty() { if documents_ids.is_empty() {
return Ok(DocumentDeletionResult { // but if there was still documents to delete we clear the database entirely
deleted_documents: 0, if !soft_deleted_docids.is_empty() {
remaining_documents: current_documents_ids_len, ClearDocuments::new(self.wtxn, self.index).execute()?;
}); }
return Ok(DocumentDeletionResult { deleted_documents: 0, remaining_documents: 0 });
} }
// We remove the documents ids that we want to delete // We remove the documents ids that we want to delete
// from the documents in the database and write them back. // from the documents in the database and write them back.
documents_ids -= &self.documents_ids; documents_ids -= &self.to_delete_docids;
self.index.put_documents_ids(self.wtxn, &documents_ids)?; self.index.put_documents_ids(self.wtxn, &documents_ids)?;
// We can execute a ClearDocuments operation when the number of documents // We can execute a ClearDocuments operation when the number of documents
// to delete is exactly the number of documents in the database. // to delete is exactly the number of documents in the database.
if current_documents_ids_len == self.documents_ids.len() { if current_documents_ids_len == self.to_delete_docids.len() {
let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?; let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?;
return Ok(DocumentDeletionResult { return Ok(DocumentDeletionResult {
deleted_documents: current_documents_ids_len, deleted_documents: current_documents_ids_len,
@@ -93,6 +96,50 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
} }
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let mut field_distribution = self.index.field_distribution(self.wtxn)?;
// we update the field distribution
for docid in self.to_delete_docids.iter() {
let key = BEU32::new(docid);
let document =
self.index.documents.get(self.wtxn, &key)?.ok_or(
InternalError::DatabaseMissingEntry { db_name: "documents", key: None },
)?;
for (fid, _value) in document.iter() {
let field_name =
fields_ids_map.name(fid).ok_or(FieldIdMapMissingEntry::FieldId {
field_id: fid,
process: "delete documents",
})?;
if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string())
{
match entry.get().checked_sub(1) {
Some(0) | None => entry.remove(),
Some(count) => entry.insert(count),
};
}
}
}
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
soft_deleted_docids |= &self.to_delete_docids;
// if we have less documents to delete than the threshold we simply save them in
// the `soft_deleted_documents_ids` bitmap and early exit.
if soft_deleted_docids.len() < DELETE_DOCUMENTS_THRESHOLD {
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
return Ok(DocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(),
});
}
// There is more than documents to delete than the threshold we needs to delete them all
self.to_delete_docids = soft_deleted_docids;
// and we can reset the soft deleted bitmap
self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
let primary_key = self.index.primary_key(self.wtxn)?.ok_or_else(|| { let primary_key = self.index.primary_key(self.wtxn)?.ok_or_else(|| {
InternalError::DatabaseMissingEntry { InternalError::DatabaseMissingEntry {
db_name: db_name::MAIN, db_name: db_name::MAIN,
@@ -127,23 +174,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
documents, documents,
} = self.index; } = self.index;
// Number of fields for each document that has been deleted.
let mut fields_ids_distribution_diff = HashMap::new();
// Retrieve the words and the external documents ids contained in the documents. // Retrieve the words and the external documents ids contained in the documents.
let mut words = Vec::new(); let mut words = Vec::new();
let mut external_ids = Vec::new(); let mut external_ids = Vec::new();
for docid in &self.documents_ids { for docid in &self.to_delete_docids {
// We create an iterator to be able to get the content and delete the document // We create an iterator to be able to get the content and delete the document
// content itself. It's faster to acquire a cursor to get and delete, // content itself. It's faster to acquire a cursor to get and delete,
// as we avoid traversing the LMDB B-Tree two times but only once. // as we avoid traversing the LMDB B-Tree two times but only once.
let key = BEU32::new(docid); let key = BEU32::new(docid);
let mut iter = documents.range_mut(self.wtxn, &(key..=key))?; let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
if let Some((_key, obkv)) = iter.next().transpose()? { if let Some((_key, obkv)) = iter.next().transpose()? {
for (field_id, _) in obkv.iter() {
*fields_ids_distribution_diff.entry(field_id).or_default() += 1;
}
if let Some(content) = obkv.get(id_field) { if let Some(content) = obkv.get(id_field) {
let external_id = match serde_json::from_slice(content).unwrap() { let external_id = match serde_json::from_slice(content).unwrap() {
Value::String(string) => SmallString32::from(string.as_str()), Value::String(string) => SmallString32::from(string.as_str()),
@@ -171,24 +211,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
} }
} }
let mut field_distribution = self.index.field_distribution(self.wtxn)?;
// We use pre-calculated number of fields occurrences that needs to be deleted
// to reflect deleted documents.
// If all field occurrences are removed, delete the entry from distribution.
// Otherwise, insert new number of occurrences (current_count - count_diff).
for (field_id, count_diff) in fields_ids_distribution_diff {
let field_name = fields_ids_map.name(field_id).unwrap();
if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) {
match entry.get().checked_sub(count_diff) {
Some(0) | None => entry.remove(),
Some(count) => entry.insert(count),
};
}
}
self.index.put_field_distribution(self.wtxn, &field_distribution)?;
// We create the FST map of the external ids that we must delete. // We create the FST map of the external ids that we must delete.
external_ids.sort_unstable(); external_ids.sort_unstable();
let external_ids_to_delete = fst::Set::from_iter(external_ids)?; let external_ids_to_delete = fst::Set::from_iter(external_ids)?;
@@ -214,7 +236,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_docids, word_docids,
word.as_str(), word.as_str(),
must_remove, must_remove,
&self.documents_ids, &self.to_delete_docids,
)?; )?;
remove_from_word_docids( remove_from_word_docids(
@@ -222,7 +244,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
exact_word_docids, exact_word_docids,
word.as_str(), word.as_str(),
must_remove, must_remove,
&self.documents_ids, &self.to_delete_docids,
)?; )?;
} }
@@ -256,12 +278,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
self.index.put_words_fst(self.wtxn, &new_words_fst)?; self.index.put_words_fst(self.wtxn, &new_words_fst)?;
let prefixes_to_delete = let prefixes_to_delete =
remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.documents_ids)?; remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.to_delete_docids)?;
let exact_prefix_to_delete = remove_from_word_prefix_docids( let exact_prefix_to_delete = remove_from_word_prefix_docids(
self.wtxn, self.wtxn,
exact_word_prefix_docids, exact_word_prefix_docids,
&self.documents_ids, &self.to_delete_docids,
)?; )?;
let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union(); let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union();
@@ -293,7 +315,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (key, mut docids) = result?; let (key, mut docids) = result?;
let previous_len = docids.len(); let previous_len = docids.len();
docids -= &self.documents_ids; docids -= &self.to_delete_docids;
if docids.is_empty() { if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database. // safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? }; unsafe { iter.del_current()? };
@@ -314,7 +336,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (bytes, mut docids) = result?; let (bytes, mut docids) = result?;
let previous_len = docids.len(); let previous_len = docids.len();
docids -= &self.documents_ids; docids -= &self.to_delete_docids;
if docids.is_empty() { if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database. // safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? }; unsafe { iter.del_current()? };
@@ -332,7 +354,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (bytes, mut docids) = result?; let (bytes, mut docids) = result?;
let previous_len = docids.len(); let previous_len = docids.len();
docids -= &self.documents_ids; docids -= &self.to_delete_docids;
if docids.is_empty() { if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database. // safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? }; unsafe { iter.del_current()? };
@@ -351,7 +373,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
while let Some(result) = iter.next() { while let Some(result) = iter.next() {
let (bytes, mut docids) = result?; let (bytes, mut docids) = result?;
let previous_len = docids.len(); let previous_len = docids.len();
docids -= &self.documents_ids; docids -= &self.to_delete_docids;
if docids.is_empty() { if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database. // safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? }; unsafe { iter.del_current()? };
@@ -368,7 +390,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?; let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
while let Some((key, mut docids)) = iter.next().transpose()? { while let Some((key, mut docids)) = iter.next().transpose()? {
let previous_len = docids.len(); let previous_len = docids.len();
docids -= &self.documents_ids; docids -= &self.to_delete_docids;
if docids.is_empty() { if docids.is_empty() {
// safety: we don't keep references from inside the LMDB database. // safety: we don't keep references from inside the LMDB database.
unsafe { iter.del_current()? }; unsafe { iter.del_current()? };
@@ -386,7 +408,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree
.iter() .iter()
.filter(|&point| self.documents_ids.contains(point.data.0)) .filter(|&point| self.to_delete_docids.contains(point.data.0))
.cloned() .cloned()
.map(|point| (point, point.data.0)) .map(|point| (point, point.data.0))
.unzip(); .unzip();
@@ -403,46 +425,46 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
remove_docids_from_facet_field_id_number_docids( remove_docids_from_facet_field_id_number_docids(
self.wtxn, self.wtxn,
facet_id_f64_docids, facet_id_f64_docids,
&self.documents_ids, &self.to_delete_docids,
)?; )?;
remove_docids_from_facet_field_id_string_docids( remove_docids_from_facet_field_id_string_docids(
self.wtxn, self.wtxn,
facet_id_string_docids, facet_id_string_docids,
&self.documents_ids, &self.to_delete_docids,
)?; )?;
// Remove the documents ids from the faceted documents ids. // Remove the documents ids from the faceted documents ids.
for field_id in self.index.faceted_fields_ids(self.wtxn)? { for field_id in self.index.faceted_fields_ids(self.wtxn)? {
// Remove docids from the number faceted documents ids // Remove docids from the number faceted documents ids
let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?; let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?;
docids -= &self.documents_ids; docids -= &self.to_delete_docids;
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?; self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?;
remove_docids_from_field_id_docid_facet_value( remove_docids_from_field_id_docid_facet_value(
self.wtxn, self.wtxn,
field_id_docid_facet_f64s, field_id_docid_facet_f64s,
field_id, field_id,
&self.documents_ids, &self.to_delete_docids,
|(_fid, docid, _value)| docid, |(_fid, docid, _value)| docid,
)?; )?;
// Remove docids from the string faceted documents ids // Remove docids from the string faceted documents ids
let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?; let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?;
docids -= &self.documents_ids; docids -= &self.to_delete_docids;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?; self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?;
remove_docids_from_field_id_docid_facet_value( remove_docids_from_field_id_docid_facet_value(
self.wtxn, self.wtxn,
field_id_docid_facet_strings, field_id_docid_facet_strings,
field_id, field_id,
&self.documents_ids, &self.to_delete_docids,
|(_fid, docid, _value)| docid, |(_fid, docid, _value)| docid,
)?; )?;
} }
Ok(DocumentDeletionResult { Ok(DocumentDeletionResult {
deleted_documents: self.documents_ids.len(), deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(), remaining_documents: documents_ids.len(),
}) })
} }
@@ -741,26 +763,26 @@ mod tests {
builder.execute(|_| ()).unwrap(); builder.execute(|_| ()).unwrap();
let content = documents!([ let content = documents!([
{"docid":"1_4","label":"sign"}, { "docid": "1_4", "label": "sign" },
{"docid":"1_5","label":"letter"}, { "docid": "1_5", "label": "letter" },
{"docid":"1_7","label":"abstract,cartoon,design,pattern"}, { "docid": "1_7", "label": "abstract,cartoon,design,pattern" },
{"docid":"1_36","label":"drawing,painting,pattern"}, { "docid": "1_36", "label": "drawing,painting,pattern" },
{"docid":"1_37","label":"art,drawing,outdoor"}, { "docid": "1_37", "label": "art,drawing,outdoor" },
{"docid":"1_38","label":"aquarium,art,drawing"}, { "docid": "1_38", "label": "aquarium,art,drawing" },
{"docid":"1_39","label":"abstract"}, { "docid": "1_39", "label": "abstract" },
{"docid":"1_40","label":"cartoon"}, { "docid": "1_40", "label": "cartoon" },
{"docid":"1_41","label":"art,drawing"}, { "docid": "1_41", "label": "art,drawing" },
{"docid":"1_42","label":"art,pattern"}, { "docid": "1_42", "label": "art,pattern" },
{"docid":"1_43","label":"abstract,art,drawing,pattern"}, { "docid": "1_43", "label": "abstract,art,drawing,pattern" },
{"docid":"1_44","label":"drawing"}, { "docid": "1_44", "label": "drawing" },
{"docid":"1_45","label":"art"}, { "docid": "1_45", "label": "art" },
{"docid":"1_46","label":"abstract,colorfulness,pattern"}, { "docid": "1_46", "label": "abstract,colorfulness,pattern" },
{"docid":"1_47","label":"abstract,pattern"}, { "docid": "1_47", "label": "abstract,pattern" },
{"docid":"1_52","label":"abstract,cartoon"}, { "docid": "1_52", "label": "abstract,cartoon" },
{"docid":"1_57","label":"abstract,drawing,pattern"}, { "docid": "1_57", "label": "abstract,drawing,pattern" },
{"docid":"1_58","label":"abstract,art,cartoon"}, { "docid": "1_58", "label": "abstract,art,cartoon" },
{"docid":"1_68","label":"design"}, { "docid": "1_68", "label": "design" },
{"docid":"1_69","label":"geometry"} { "docid": "1_69", "label": "geometry" }
]); ]);
insert_documents(&mut wtxn, &index, content); insert_documents(&mut wtxn, &index, content);
@@ -788,26 +810,26 @@ mod tests {
builder.execute(|_| ()).unwrap(); builder.execute(|_| ()).unwrap();
let content = documents!([ let content = documents!([
{"docid":"1_4","label":"sign"}, { "docid": "1_4", "label": "sign" },
{"docid":"1_5","label":"letter"}, { "docid": "1_5", "label": "letter" },
{"docid":"1_7","label":"abstract,cartoon,design,pattern"}, { "docid": "1_7", "label": "abstract,cartoon,design,pattern" },
{"docid":"1_36","label":"drawing,painting,pattern"}, { "docid": "1_36", "label": "drawing,painting,pattern" },
{"docid":"1_37","label":"art,drawing,outdoor"}, { "docid": "1_37", "label": "art,drawing,outdoor" },
{"docid":"1_38","label":"aquarium,art,drawing"}, { "docid": "1_38", "label": "aquarium,art,drawing" },
{"docid":"1_39","label":"abstract"}, { "docid": "1_39", "label": "abstract" },
{"docid":"1_40","label":"cartoon"}, { "docid": "1_40", "label": "cartoon" },
{"docid":"1_41","label":"art,drawing"}, { "docid": "1_41", "label": "art,drawing" },
{"docid":"1_42","label":"art,pattern"}, { "docid": "1_42", "label": "art,pattern" },
{"docid":"1_43","label":"abstract,art,drawing,pattern"}, { "docid": "1_43", "label": "abstract,art,drawing,pattern" },
{"docid":"1_44","label":"drawing"}, { "docid": "1_44", "label": "drawing" },
{"docid":"1_45","label":"art"}, { "docid": "1_45", "label": "art" },
{"docid":"1_46","label":"abstract,colorfulness,pattern"}, { "docid": "1_46", "label": "abstract,colorfulness,pattern" },
{"docid":"1_47","label":"abstract,pattern"}, { "docid": "1_47", "label": "abstract,pattern" },
{"docid":"1_52","label":"abstract,cartoon"}, { "docid": "1_52", "label": "abstract,cartoon" },
{"docid":"1_57","label":"abstract,drawing,pattern"}, { "docid": "1_57", "label": "abstract,drawing,pattern" },
{"docid":"1_58","label":"abstract,art,cartoon"}, { "docid": "1_58", "label": "abstract,art,cartoon" },
{"docid":"1_68","label":"design"}, { "docid": "1_68", "label": "design" },
{"docid":"1_69","label":"geometry"} { "docid": "1_69", "label": "geometry" }
]); ]);
insert_documents(&mut wtxn, &index, content); insert_documents(&mut wtxn, &index, content);
@@ -841,26 +863,26 @@ mod tests {
builder.execute(|_| ()).unwrap(); builder.execute(|_| ()).unwrap();
let content = documents!([ let content = documents!([
{"docid":"1_4","label":"sign"}, {"docid": "1_4", "label": "sign"},
{"docid":"1_5","label":"letter"}, {"docid": "1_5", "label": "letter"},
{"docid":"1_7","label":"abstract,cartoon,design,pattern"}, {"docid": "1_7", "label": "abstract,cartoon,design,pattern"},
{"docid":"1_36","label":"drawing,painting,pattern"}, {"docid": "1_36","label": "drawing,painting,pattern"},
{"docid":"1_37","label":"art,drawing,outdoor"}, {"docid": "1_37","label": "art,drawing,outdoor"},
{"docid":"1_38","label":"aquarium,art,drawing"}, {"docid": "1_38","label": "aquarium,art,drawing"},
{"docid":"1_39","label":"abstract"}, {"docid": "1_39","label": "abstract"},
{"docid":"1_40","label":"cartoon"}, {"docid": "1_40","label": "cartoon"},
{"docid":"1_41","label":"art,drawing"}, {"docid": "1_41","label": "art,drawing"},
{"docid":"1_42","label":"art,pattern"}, {"docid": "1_42","label": "art,pattern"},
{"docid":"1_43","label":"abstract,art,drawing,pattern"}, {"docid": "1_43","label": "abstract,art,drawing,pattern"},
{"docid":"1_44","label":"drawing"}, {"docid": "1_44","label": "drawing"},
{"docid":"1_45","label":"art"}, {"docid": "1_45","label": "art"},
{"docid":"1_46","label":"abstract,colorfulness,pattern"}, {"docid": "1_46","label": "abstract,colorfulness,pattern"},
{"docid":"1_47","label":"abstract,pattern"}, {"docid": "1_47","label": "abstract,pattern"},
{"docid":"1_52","label":"abstract,cartoon"}, {"docid": "1_52","label": "abstract,cartoon"},
{"docid":"1_57","label":"abstract,drawing,pattern"}, {"docid": "1_57","label": "abstract,drawing,pattern"},
{"docid":"1_58","label":"abstract,art,cartoon"}, {"docid": "1_58","label": "abstract,art,cartoon"},
{"docid":"1_68","label":"design"}, {"docid": "1_68","label": "design"},
{"docid":"1_69","label":"geometry"} {"docid": "1_69","label": "geometry"}
]); ]);
insert_documents(&mut wtxn, &index, content); insert_documents(&mut wtxn, &index, content);
@@ -896,26 +918,26 @@ mod tests {
builder.execute(|_| ()).unwrap(); builder.execute(|_| ()).unwrap();
let content = documents!([ let content = documents!([
{"id":"1","city":"Lille", "_geo": { "lat": 50.6299 as f32, "lng": 3.0569 as f32 } }, { "id": "1", "city": "Lille", "_geo": { "lat": 50.6299, "lng": 3.0569 } },
{"id":"2","city":"Mons-en-Barœul", "_geo": { "lat": 50.6415 as f32, "lng": 3.1106 as f32 } }, { "id": "2", "city": "Mons-en-Barœul", "_geo": { "lat": 50.6415, "lng": 3.1106 } },
{"id":"3","city":"Hellemmes", "_geo": { "lat": 50.6312 as f32, "lng": 3.1106 as f32 } }, { "id": "3", "city": "Hellemmes", "_geo": { "lat": 50.6312, "lng": 3.1106 } },
{"id":"4","city":"Villeneuve-d'Ascq", "_geo": { "lat": 50.6224 as f32, "lng": 3.1476 as f32 } }, { "id": "4", "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
{"id":"5","city":"Hem", "_geo": { "lat": 50.6552 as f32, "lng": 3.1897 as f32 } }, { "id": "5", "city": "Hem", "_geo": { "lat": 50.6552, "lng": 3.1897 } },
{"id":"6","city":"Roubaix", "_geo": { "lat": 50.6924 as f32, "lng": 3.1763 as f32 } }, { "id": "6", "city": "Roubaix", "_geo": { "lat": 50.6924, "lng": 3.1763 } },
{"id":"7","city":"Tourcoing", "_geo": { "lat": 50.7263 as f32, "lng": 3.1541 as f32 } }, { "id": "7", "city": "Tourcoing", "_geo": { "lat": 50.7263, "lng": 3.1541 } },
{"id":"8","city":"Mouscron", "_geo": { "lat": 50.7453 as f32, "lng": 3.2206 as f32 } }, { "id": "8", "city": "Mouscron", "_geo": { "lat": 50.7453, "lng": 3.2206 } },
{"id":"9","city":"Tournai", "_geo": { "lat": 50.6053 as f32, "lng": 3.3758 as f32 } }, { "id": "9", "city": "Tournai", "_geo": { "lat": 50.6053, "lng": 3.3758 } },
{"id":"10","city":"Ghent", "_geo": { "lat": 51.0537 as f32, "lng": 3.6957 as f32 } }, { "id": "10", "city": "Ghent", "_geo": { "lat": 51.0537, "lng": 3.6957 } },
{"id":"11","city":"Brussels", "_geo": { "lat": 50.8466 as f32, "lng": 4.3370 as f32 } }, { "id": "11", "city": "Brussels", "_geo": { "lat": 50.8466, "lng": 4.3370 } },
{"id":"12","city":"Charleroi", "_geo": { "lat": 50.4095 as f32, "lng": 4.4347 as f32 } }, { "id": "12", "city": "Charleroi", "_geo": { "lat": 50.4095, "lng": 4.4347 } },
{"id":"13","city":"Mons", "_geo": { "lat": 50.4502 as f32, "lng": 3.9623 as f32 } }, { "id": "13", "city": "Mons", "_geo": { "lat": 50.4502, "lng": 3.9623 } },
{"id":"14","city":"Valenciennes", "_geo": { "lat": 50.3518 as f32, "lng": 3.5326 as f32 } }, { "id": "14", "city": "Valenciennes", "_geo": { "lat": 50.3518, "lng": 3.5326 } },
{"id":"15","city":"Arras", "_geo": { "lat": 50.2844 as f32, "lng": 2.7637 as f32 } }, { "id": "15", "city": "Arras", "_geo": { "lat": 50.2844, "lng": 2.7637 } },
{"id":"16","city":"Cambrai", "_geo": { "lat": 50.1793 as f32, "lng": 3.2189 as f32 } }, { "id": "16", "city": "Cambrai", "_geo": { "lat": 50.1793, "lng": 3.2189 } },
{"id":"17","city":"Bapaume", "_geo": { "lat": 50.1112 as f32, "lng": 2.8547 as f32 } }, { "id": "17", "city": "Bapaume", "_geo": { "lat": 50.1112, "lng": 2.8547 } },
{"id":"18","city":"Amiens", "_geo": { "lat": 49.9314 as f32, "lng": 2.2710 as f32 } }, { "id": "18", "city": "Amiens", "_geo": { "lat": 49.9314, "lng": 2.2710 } },
{"id":"19","city":"Compiègne", "_geo": { "lat": 49.4449 as f32, "lng": 2.7913 as f32 } }, { "id": "19", "city": "Compiègne", "_geo": { "lat": 49.4449, "lng": 2.7913 } },
{"id":"20","city":"Paris", "_geo": { "lat": 48.9021 as f32, "lng": 2.3708 as f32 } } { "id": "20", "city": "Paris", "_geo": { "lat": 48.9021, "lng": 2.3708 } }
]); ]);
let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
@@ -951,26 +973,26 @@ mod tests {
builder.execute(|_| ()).unwrap(); builder.execute(|_| ()).unwrap();
let content = documents!([ let content = documents!([
{"docid":"1_4","label":"sign"}, { "docid": "1_4", "label": "sign" },
{"docid":"1_5","label":"letter"}, { "docid": "1_5", "label": "letter" },
{"docid":"1_7","label":"abstract,cartoon,design,pattern"}, { "docid": "1_7", "label": "abstract,cartoon,design,pattern" },
{"docid":"1_36","label":"drawing,painting,pattern"}, { "docid": "1_36", "label": "drawing,painting,pattern" },
{"docid":"1_37","label":"art,drawing,outdoor"}, { "docid": "1_37", "label": "art,drawing,outdoor" },
{"docid":"1_38","label":"aquarium,art,drawing"}, { "docid": "1_38", "label": "aquarium,art,drawing" },
{"docid":"1_39","label":"abstract"}, { "docid": "1_39", "label": "abstract" },
{"docid":"1_40","label":"cartoon"}, { "docid": "1_40", "label": "cartoon" },
{"docid":"1_41","label":"art,drawing"}, { "docid": "1_41", "label": "art,drawing" },
{"docid":"1_42","label":"art,pattern"}, { "docid": "1_42", "label": "art,pattern" },
{"docid":"1_43","label":"abstract,art,drawing,pattern"}, { "docid": "1_43", "label": "abstract,art,drawing,pattern" },
{"docid":"1_44","label":"drawing"}, { "docid": "1_44", "label": "drawing" },
{"docid":"1_45","label":"art"}, { "docid": "1_45", "label": "art" },
{"docid":"1_46","label":"abstract,colorfulness,pattern"}, { "docid": "1_46", "label": "abstract,colorfulness,pattern" },
{"docid":"1_47","label":"abstract,pattern"}, { "docid": "1_47", "label": "abstract,pattern" },
{"docid":"1_52","label":"abstract,cartoon"}, { "docid": "1_52", "label": "abstract,cartoon" },
{"docid":"1_57","label":"abstract,drawing,pattern"}, { "docid": "1_57", "label": "abstract,drawing,pattern" },
{"docid":"1_58","label":"abstract,art,cartoon"}, { "docid": "1_58", "label": "abstract,art,cartoon" },
{"docid":"1_68","label":"design"}, { "docid": "1_68", "label": "design" },
{"docid":"1_69","label":"geometry"} { "docid": "1_69", "label": "geometry" }
]); ]);
insert_documents(&mut wtxn, &index, content); insert_documents(&mut wtxn, &index, content);
@@ -1021,26 +1043,26 @@ mod tests {
builder.execute(|_| ()).unwrap(); builder.execute(|_| ()).unwrap();
let content = documents!([ let content = documents!([
{"docid":"1_4","label":"sign"}, { "docid": "1_4", "label": "sign"},
{"docid":"1_5","label":"letter"}, { "docid": "1_5", "label": "letter"},
{"docid":"1_7","label":"abstract,cartoon,design,pattern", "title": "Mickey Mouse"}, { "docid": "1_7", "label": "abstract,cartoon,design,pattern", "title": "Mickey Mouse"},
{"docid":"1_36","label":"drawing,painting,pattern"}, { "docid": "1_36", "label": "drawing,painting,pattern"},
{"docid":"1_37","label":"art,drawing,outdoor"}, { "docid": "1_37", "label": "art,drawing,outdoor"},
{"docid":"1_38","label":"aquarium,art,drawing", "title": "Nemo"}, { "docid": "1_38", "label": "aquarium,art,drawing", "title": "Nemo"},
{"docid":"1_39","label":"abstract"}, { "docid": "1_39", "label": "abstract"},
{"docid":"1_40","label":"cartoon"}, { "docid": "1_40", "label": "cartoon"},
{"docid":"1_41","label":"art,drawing"}, { "docid": "1_41", "label": "art,drawing"},
{"docid":"1_42","label":"art,pattern"}, { "docid": "1_42", "label": "art,pattern"},
{"docid":"1_43","label":"abstract,art,drawing,pattern", "number": 32i32}, { "docid": "1_43", "label": "abstract,art,drawing,pattern", "number": 32i32},
{"docid":"1_44","label":"drawing", "number": 44i32}, { "docid": "1_44", "label": "drawing", "number": 44i32},
{"docid":"1_45","label":"art"}, { "docid": "1_45", "label": "art"},
{"docid":"1_46","label":"abstract,colorfulness,pattern"}, { "docid": "1_46", "label": "abstract,colorfulness,pattern"},
{"docid":"1_47","label":"abstract,pattern"}, { "docid": "1_47", "label": "abstract,pattern"},
{"docid":"1_52","label":"abstract,cartoon"}, { "docid": "1_52", "label": "abstract,cartoon"},
{"docid":"1_57","label":"abstract,drawing,pattern"}, { "docid": "1_57", "label": "abstract,drawing,pattern"},
{"docid":"1_58","label":"abstract,art,cartoon"}, { "docid": "1_58", "label": "abstract,art,cartoon"},
{"docid":"1_68","label":"design"}, { "docid": "1_68", "label": "design"},
{"docid":"1_69","label":"geometry"} { "docid": "1_69", "label": "geometry"}
]); ]);
insert_documents(&mut wtxn, &index, content); insert_documents(&mut wtxn, &index, content);

View File

@@ -719,10 +719,11 @@ mod tests {
assert_eq!(count, 1); assert_eq!(count, 1);
// Check that we get only one document from the database. // Check that we get only one document from the database.
let docs = index.documents(&rtxn, Some(0)).unwrap(); // Since the document has been deleted and re-inserted, its internal docid has has been incremented to 1
let docs = index.documents(&rtxn, Some(1)).unwrap();
assert_eq!(docs.len(), 1); assert_eq!(docs.len(), 1);
let (id, doc) = docs[0]; let (id, doc) = docs[0];
assert_eq!(id, 0); assert_eq!(id, 1);
// Check that this document is equal to the last one sent. // Check that this document is equal to the last one sent.
let mut doc_iter = doc.iter(); let mut doc_iter = doc.iter();
@@ -809,11 +810,12 @@ mod tests {
let count = index.number_of_documents(&rtxn).unwrap(); let count = index.number_of_documents(&rtxn).unwrap();
assert_eq!(count, 3); assert_eq!(count, 3);
let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap(); // the document 0 has been deleted and reinserted with the id 3
let (kevin_id, _) = let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap();
docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap(); let kevin_position =
let (id, doc) = docs[*kevin_id as usize]; docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
assert_eq!(id, *kevin_id); assert_eq!(kevin_position, 2);
let (_, doc) = docs[kevin_position];
// Check that this document is equal to the last // Check that this document is equal to the last
// one sent and that an UUID has been generated. // one sent and that an UUID has been generated.

View File

@@ -4,7 +4,6 @@ use std::collections::{HashMap, HashSet};
use std::fs::File; use std::fs::File;
use std::io::{Read, Seek, SeekFrom}; use std::io::{Read, Seek, SeekFrom};
use byteorder::ReadBytesExt;
use fxhash::FxHashMap; use fxhash::FxHashMap;
use heed::RoTxn; use heed::RoTxn;
use itertools::Itertools; use itertools::Itertools;
@@ -57,7 +56,7 @@ pub struct Transform<'a, 'i> {
flattened_sorter: grenad::Sorter<MergeFn>, flattened_sorter: grenad::Sorter<MergeFn>,
replaced_documents_ids: RoaringBitmap, replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap, new_documents_ids: RoaringBitmap,
// To increase the cache locality and the heap usage we use smartstring. // To increase the cache locality and decrease the heap usage we use compact smartstring.
new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>, new_external_documents_ids_builder: FxHashMap<SmartString<smartstring::Compact>, u64>,
documents_count: usize, documents_count: usize,
} }
@@ -130,13 +129,17 @@ impl<'a, 'i> Transform<'a, 'i> {
indexer_settings.max_memory.map(|mem| mem / 2), indexer_settings.max_memory.map(|mem| mem / 2),
); );
let documents_ids = index.documents_ids(wtxn)?; let documents_ids = index.documents_ids(wtxn)?;
let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?;
Ok(Transform { Ok(Transform {
index, index,
fields_ids_map: index.fields_ids_map(wtxn)?, fields_ids_map: index.fields_ids_map(wtxn)?,
indexer_settings, indexer_settings,
autogenerate_docids, autogenerate_docids,
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), available_documents_ids: AvailableDocumentsIds::from_documents_ids(
&documents_ids,
&soft_deleted_documents_ids,
),
original_sorter, original_sorter,
flattened_sorter, flattened_sorter,
index_documents_method, index_documents_method,
@@ -248,45 +251,39 @@ impl<'a, 'i> Transform<'a, 'i> {
writer.insert(*k, v)?; writer.insert(*k, v)?;
} }
let (docid, should_insert_original_document) = let mut original_docid = None;
match external_documents_ids.get(&*external_id) {
// if the document is in the db but has already been inserted let docid = match self.new_external_documents_ids_builder.entry(external_id.into()) {
// (ie: already exists in the list of replaced documents ids), Entry::Occupied(entry) => *entry.get() as u32,
// we should not add the original document a second time. Entry::Vacant(entry) => {
Some(docid) => (docid, !self.replaced_documents_ids.contains(docid)), // If the document was already in the db we mark it as a replaced document.
None => { // It'll be deleted later. We keep its original docid to insert it in the grenad.
// if the document has already been inserted in this if let Some(docid) = external_documents_ids.get(entry.key()) {
// batch we need to get its docid self.replaced_documents_ids.insert(docid);
match self.new_external_documents_ids_builder.entry(external_id.into()) { original_docid = Some(docid);
Entry::Occupied(entry) => (*entry.get() as u32, false),
// if the document has never been encountered we give it a new docid
// and push this new docid to the external documents ids builder
Entry::Vacant(entry) => {
let new_docid = self
.available_documents_ids
.next()
.ok_or(UserError::DocumentLimitReached)?;
entry.insert(new_docid as u64);
(new_docid, false)
}
}
} }
}; let docid = self
.available_documents_ids
.next()
.ok_or(UserError::DocumentLimitReached)?;
entry.insert(docid as u64);
docid
}
};
if should_insert_original_document { if let Some(original_docid) = original_docid {
self.replaced_documents_ids.insert(docid); let original_key = BEU32::new(original_docid);
let key = BEU32::new(docid);
let base_obkv = self let base_obkv = self
.index .index
.documents .documents
.remap_data_type::<heed::types::ByteSlice>() .remap_data_type::<heed::types::ByteSlice>()
.get(wtxn, &key)? .get(wtxn, &original_key)?
.ok_or(InternalError::DatabaseMissingEntry { .ok_or(InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS, db_name: db_name::DOCUMENTS,
key: None, key: None,
})?; })?;
// we associate the base document with the new key, everything will get merged later.
self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?; self.original_sorter.insert(&docid.to_be_bytes(), base_obkv)?;
match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? { match self.flatten_from_fields_ids_map(KvReader::new(&base_obkv))? {
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
@@ -506,6 +503,39 @@ impl<'a, 'i> Transform<'a, 'i> {
Ok(()) Ok(())
} }
fn remove_deleted_documents_from_field_distribution(
&self,
rtxn: &RoTxn,
field_distribution: &mut FieldDistribution,
) -> Result<()> {
for deleted_docid in self.replaced_documents_ids.iter() {
let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or(
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?;
for (key, _) in obkv.iter() {
let name =
self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
field_id: key,
process: "Computing field distribution in transform.",
})?;
// We checked that the document was in the db earlier. If we can't find it it means
// there is an inconsistency between the field distribution and the field id map.
let field =
field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId {
field_id: key,
process: "Accessing field distribution in transform.",
})?;
*field -= 1;
if *field == 0 {
// since we were able to get the field right before it's safe to unwrap here
field_distribution.remove(name).unwrap();
}
}
}
Ok(())
}
/// Generate the `TransformOutput` based on the given sorter that can be generated from any /// Generate the `TransformOutput` based on the given sorter that can be generated from any
/// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
/// id for the user side and the value must be an obkv where keys are valid fields ids. /// id for the user side and the value must be an obkv where keys are valid fields ids.
@@ -532,9 +562,14 @@ impl<'a, 'i> Transform<'a, 'i> {
tempfile::tempfile()?, tempfile::tempfile()?,
); );
// Once we have all the documents in the sorter, we write the documents // To compute the field distribution we need to;
// in the writer. We also generate the field distribution. // 1. Remove all the deleted documents from the field distribution
// 2. Add all the new documents to the field distribution
let mut field_distribution = self.index.field_distribution(wtxn)?; let mut field_distribution = self.index.field_distribution(wtxn)?;
self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?;
// Here we are going to do the document count + field distribution + `write_into_stream_writer`
let mut iter = self.original_sorter.into_stream_merger_iter()?; let mut iter = self.original_sorter.into_stream_merger_iter()?;
// used only for the callback // used only for the callback
let mut documents_count = 0; let mut documents_count = 0;
@@ -547,36 +582,6 @@ impl<'a, 'i> Transform<'a, 'i> {
total_documents: self.documents_count, total_documents: self.documents_count,
}); });
let u32_key = key.clone().read_u32::<byteorder::BigEndian>()?;
// if the document was already in the db we remove all of its field
// from the field distribution.
if self.replaced_documents_ids.contains(u32_key) {
let obkv = self.index.documents.get(wtxn, &BEU32::new(u32_key))?.ok_or(
InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
)?;
for (key, _) in obkv.iter() {
let name =
self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
field_id: key,
process: "Computing field distribution in transform.",
})?;
// We checked that the document was in the db earlier. If we can't find it it means
// there is an inconsistency between the field distribution and the field id map.
let field = field_distribution.get_mut(name).ok_or(
FieldIdMapMissingEntry::FieldId {
field_id: key,
process: "Accessing field distribution in transform.",
},
)?;
*field -= 1;
if *field == 0 {
// since we were able to get the field right before it's safe to unwrap here
field_distribution.remove(name).unwrap();
}
}
}
// We increment all the field of the current document in the field distribution. // We increment all the field of the current document in the field distribution.
let obkv = KvReader::new(val); let obkv = KvReader::new(val);