Return the original string values for the inverted facet index database

This commit is contained in:
Clément Renault
2021-07-17 12:50:01 +02:00
committed by Kerollmops
parent 03a01166ba
commit 0227254a65
15 changed files with 242 additions and 58 deletions

View File

@ -2,8 +2,11 @@ use std::borrow::Cow;
use std::result::Result as StdResult;
use fst::IntoStreamer;
use heed::{BytesDecode, BytesEncode};
use roaring::RoaringBitmap;
use crate::error::SerializationError;
use crate::heed_codec::facet::FacetStringLevelZeroValueCodec;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::Result;
@ -69,6 +72,26 @@ pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>
Ok(vec)
}
/// Merges values that were encoded with `FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>`.
///
/// Every value decodes to a `(string, bitmap)` pair. The string of the first value is
/// kept as-is and the bitmaps of all the values are unioned together; the resulting pair
/// is then re-encoded with the same codec.
///
/// # Errors
///
/// Returns a `SerializationError::Decoding` when one of the values cannot be decoded and
/// a `SerializationError::Encoding` when the merged pair cannot be encoded.
///
/// # Panics
///
/// Panics when `values` is empty.
pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
    let (first, rest) = values.split_first().unwrap();

    // The string stored alongside the first bitmap is the one written back.
    let (original, mut docids) =
        FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&first[..])
            .ok_or(SerializationError::Decoding { db_name: None })?;

    for bytes in rest {
        let decoded =
            FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&bytes[..]);
        match decoded {
            // Only the bitmap of the subsequent values is used, their string is ignored.
            Some((_, bitmap)) => docids |= bitmap,
            None => return Err(SerializationError::Decoding { db_name: None }.into()),
        }
    }

    let merged = (original, docids);
    match FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&merged) {
        Some(encoded) => Ok(encoded.into_owned()),
        None => Err(SerializationError::Encoding { db_name: None }.into()),
    }
}
pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
let (head, tail) = values.split_first().unwrap();
let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?;

View File

@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize};
pub use self::merge_function::{
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
tuple_string_cbo_roaring_bitmap_merge,
};
use self::store::{Readers, Store};
pub use self::transform::{Transform, TransformOutput};
@ -655,7 +656,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
self.wtxn,
*self.index.facet_id_string_docids.as_polymorph(),
facet_field_strings_docids_readers,
cbo_roaring_bitmap_merge,
tuple_string_cbo_roaring_bitmap_merge,
write_method,
)?;

View File

@ -22,12 +22,13 @@ use tempfile::tempfile;
use super::merge_function::{
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
tuple_string_cbo_roaring_bitmap_merge,
};
use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
use crate::error::{Error, InternalError, SerializationError};
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec,
FieldDocIdFacetStringCodec,
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
};
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::update::UpdateIndexingStep;
@ -153,7 +154,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
max_memory,
);
let facet_field_strings_docids_sorter = create_sorter(
cbo_roaring_bitmap_merge,
tuple_string_cbo_roaring_bitmap_merge,
chunk_compression_type,
chunk_compression_level,
chunk_fusing_shrink_size,
@ -528,17 +529,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Error: From<E>,
{
let mut key_buffer = Vec::new();
let mut data_buffer = Vec::new();
for ((field_id, normalized_value), (original_value, docids)) in iter {
key_buffer.clear();
data_buffer.clear();
FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer);
CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer);
let data = (original_value.as_str(), docids);
let data = FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&data)
.ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?;
if lmdb_key_valid_size(&key_buffer) {
sorter.insert(&key_buffer, &data_buffer)?;
sorter.insert(&key_buffer, &data)?;
} else {
warn!("facet value {:?} is too large to be saved", original_value);
}