Return the original string values for the inverted facet index database

This commit is contained in:
Clément Renault
2021-07-17 12:50:01 +02:00
committed by Kerollmops
parent 03a01166ba
commit 0227254a65
15 changed files with 242 additions and 58 deletions

View File

@ -9,6 +9,7 @@ use serde_json::Value;
use super::ClearDocuments;
use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
use crate::heed_codec::facet::FacetStringLevelZeroValueCodec;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::{db_name, main_key};
use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
@ -374,13 +375,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// We delete the documents ids that are under the facet field id values.
remove_docids_from_facet_field_id_value_docids(
remove_docids_from_facet_field_id_number_docids(
self.wtxn,
facet_id_f64_docids,
&self.documents_ids,
)?;
remove_docids_from_facet_field_id_value_docids(
remove_docids_from_facet_field_id_string_docids(
self.wtxn,
facet_id_string_docids,
&self.documents_ids,
@ -447,7 +448,33 @@ where
Ok(())
}
fn remove_docids_from_facet_field_id_value_docids<'a, C>(
/// Removes the given documents ids from every entry of a facet string docids database.
///
/// Each value is decoded as an `(original string, docids)` pair. Entries whose bitmap
/// becomes empty once `to_remove` has been subtracted are deleted, entries whose bitmap
/// shrank are rewritten in place with the same original string, and entries that do not
/// contain any of the removed ids are left untouched.
///
/// # Errors
///
/// Returns any `heed` error raised while iterating, deleting or rewriting an entry.
fn remove_docids_from_facet_field_id_string_docids<'a, C>(
    wtxn: &'a mut heed::RwTxn,
    db: &heed::Database<C, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>,
    to_remove: &RoaringBitmap,
) -> heed::Result<()>
where
    C: heed::BytesDecode<'a> + heed::BytesEncode<'a>,
{
    let mut iter = db.remap_key_type::<ByteSlice>().iter_mut(wtxn)?;
    // A `for` loop cannot be used here: the iterator is needed inside the body
    // to delete or overwrite the entry it currently points to.
    while let Some(result) = iter.next() {
        let (bytes, (original_value, mut docids)) = result?;
        let previous_len = docids.len();
        docids -= to_remove;
        if docids.is_empty() {
            // SAFETY: neither `bytes` nor `original_value` is read after the deletion,
            // so no data borrowed from the LMDB database is used past this point.
            unsafe { iter.del_current()? };
        } else if docids.len() != previous_len {
            // Both the key and the original string are borrowed from the database,
            // copy them so that nothing points into LMDB memory during the write.
            let bytes = bytes.to_owned();
            let original_value = original_value.to_owned();
            // SAFETY: the key and the value handed to `put_current` are owned copies,
            // we don't keep references from inside the LMDB database.
            unsafe { iter.put_current(&bytes, &(original_value.as_str(), docids))? };
        }
    }
    Ok(())
}
fn remove_docids_from_facet_field_id_number_docids<'a, C>(
wtxn: &'a mut heed::RwTxn,
db: &heed::Database<C, CboRoaringBitmapCodec>,
to_remove: &RoaringBitmap,

View File

@ -12,7 +12,7 @@ use roaring::RoaringBitmap;
use crate::error::InternalError;
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec,
FacetStringZeroBoundsValueCodec,
FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec,
};
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::index_documents::{
@ -75,7 +75,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
)?;
// Compute and store the faceted strings documents ids.
let string_documents_ids = compute_faceted_documents_ids(
let string_documents_ids = compute_faceted_strings_documents_ids(
self.wtxn,
self.index.facet_id_string_docids.remap_key_type::<ByteSlice>(),
field_id,
@ -96,7 +96,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?;
// Compute and store the faceted numbers documents ids.
let number_documents_ids = compute_faceted_documents_ids(
let number_documents_ids = compute_faceted_numbers_documents_ids(
self.wtxn,
self.index.facet_id_f64_docids.remap_key_type::<ByteSlice>(),
field_id,
@ -237,13 +237,26 @@ fn write_number_entry(
Ok(())
}
fn compute_faceted_documents_ids(
/// Computes the union of the documents ids of every facet string entry of a field.
///
/// Walks all the entries whose key starts with the big-endian bytes of `field_id`,
/// ignores the original string stored next to each bitmap and merges the bitmaps
/// into a single one. Stops at the first entry that fails to be read.
fn compute_faceted_strings_documents_ids(
    rtxn: &heed::RoTxn,
    db: heed::Database<ByteSlice, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>,
    field_id: FieldId,
) -> Result<RoaringBitmap> {
    let prefix = field_id.to_be_bytes();
    db.prefix_iter(rtxn, &prefix)?.try_fold(
        RoaringBitmap::new(),
        |mut all_docids, entry| -> Result<RoaringBitmap> {
            let (_, (_, docids)) = entry?;
            all_docids |= docids;
            Ok(all_docids)
        },
    )
}
fn compute_faceted_numbers_documents_ids(
rtxn: &heed::RoTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: FieldId,
) -> Result<RoaringBitmap> {
let mut documents_ids = RoaringBitmap::new();
for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? {
let (_key, docids) = result?;
documents_ids |= docids;
@ -265,7 +278,10 @@ fn clear_field_string_levels<'t>(
fn compute_facet_string_levels<'t>(
rtxn: &'t heed::RoTxn,
db: heed::Database<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
db: heed::Database<
FacetStringLevelZeroCodec,
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
>,
compression_type: CompressionType,
compression_level: Option<u32>,
shrink_size: Option<u64>,
@ -299,7 +315,7 @@ fn compute_facet_string_levels<'t>(
// Because we know the size of the level 0 we can use a range iterator that starts
// at the first value of the level and goes to the last by simply counting.
for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() {
let ((_field_id, value), docids) = result?;
let ((_field_id, value), (_original_value, docids)) = result?;
if i == 0 {
left = (i as u32, value);

View File

@ -2,8 +2,11 @@ use std::borrow::Cow;
use std::result::Result as StdResult;
use fst::IntoStreamer;
use heed::{BytesDecode, BytesEncode};
use roaring::RoaringBitmap;
use crate::error::SerializationError;
use crate::heed_codec::facet::FacetStringLevelZeroValueCodec;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::Result;
@ -69,6 +72,26 @@ pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>
Ok(vec)
}
/// Uses the FacetStringLevelZeroValueCodec to merge the values.
pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
let (head, tail) = values.split_first().unwrap();
let (head_string, mut head_rb) =
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&head[..])
.ok_or(SerializationError::Decoding { db_name: None })?;
for value in tail {
let (_string, rb) =
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&value[..])
.ok_or(SerializationError::Decoding { db_name: None })?;
head_rb |= rb;
}
FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&(head_string, head_rb))
.map(|cow| cow.into_owned())
.ok_or(SerializationError::Encoding { db_name: None })
.map_err(Into::into)
}
pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
let (head, tail) = values.split_first().unwrap();
let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?;

View File

@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize};
pub use self::merge_function::{
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
tuple_string_cbo_roaring_bitmap_merge,
};
use self::store::{Readers, Store};
pub use self::transform::{Transform, TransformOutput};
@ -655,7 +656,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
*self.index.facet_id_string_docids.as_polymorph(),
facet_field_strings_docids_readers,
cbo_roaring_bitmap_merge,
tuple_string_cbo_roaring_bitmap_merge,
write_method,
)?;

View File

@ -22,12 +22,13 @@ use tempfile::tempfile;
use super::merge_function::{
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
tuple_string_cbo_roaring_bitmap_merge,
};
use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
use crate::error::{Error, InternalError, SerializationError};
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec,
FieldDocIdFacetStringCodec,
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
};
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::update::UpdateIndexingStep;
@ -153,7 +154,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_memory,
);
let facet_field_strings_docids_sorter = create_sorter(
cbo_roaring_bitmap_merge,
tuple_string_cbo_roaring_bitmap_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
@ -528,17 +529,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Error: From<E>,
{
let mut key_buffer = Vec::new();
let mut data_buffer = Vec::new();
for ((field_id, normalized_value), (original_value, docids)) in iter {
key_buffer.clear();
data_buffer.clear();
FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer);
CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer);
let data = (original_value.as_str(), docids);
let data = FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&data)
.ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?;
if lmdb_key_valid_size(&key_buffer) {
sorter.insert(&key_buffer, &data_buffer)?;
sorter.insert(&key_buffer, &data)?;
} else {
warn!("facet value {:?} is too large to be saved", original_value);
}