Return the original string values for the inverted facet index database

This commit is contained in:
Clément Renault
2021-07-17 12:50:01 +02:00
committed by Kerollmops
parent 03a01166ba
commit 0227254a65
15 changed files with 242 additions and 58 deletions

View File

@ -1,5 +1,6 @@
use std::mem::size_of;
use concat_arrays::concat_arrays;
use heed::types::{ByteSlice, Str, Unit};
use roaring::RoaringBitmap;
@ -43,7 +44,10 @@ pub struct FacetDistinctIter<'a> {
impl<'a> FacetDistinctIter<'a> {
/// Looks up the entry stored in `facet_id_string_docids` for the
/// `(self.distinct, key)` pair and returns its docids bitmap, if any.
///
/// NOTE(review): this listing looks like a diff whose +/- markers were
/// stripped. The one-line `get` call is presumably the removed version and
/// the chained call right after it its replacement; confirm against the commit.
fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key))
self.index
.facet_id_string_docids
.get(self.txn, &(self.distinct, key))
// The decoded value is an `(original, docids)` pair; the original string
// is dropped here and only the docids are handed back to the caller.
.map(|result| result.map(|(_original, docids)| docids))
}
fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
@ -116,10 +120,7 @@ impl<'a> FacetDistinctIter<'a> {
}
/// Builds a `FID_SIZE + DOCID_SIZE` bytes key from the big-endian bytes of
/// `distinct` and the big-endian bytes of `id`.
///
/// NOTE(review): this listing looks like a diff whose +/- markers were
/// stripped. The manual buffer filling is presumably the removed version and
/// the `concat_arrays!` call its replacement; confirm against the commit.
fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] {
// Manual form: zeroed buffer, field id bytes first, then document id bytes.
let mut key = [0; FID_SIZE + DOCID_SIZE];
key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
key
// Macro form: presumably yields the same layout (field id, then document id)
// — verify against the `concat_arrays` documentation.
concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
}
fn facet_number_values<'a>(

View File

@ -47,7 +47,7 @@ mod test {
let mut documents = Vec::new();
let txts = ["toto", "titi", "tata"];
let txts = ["Toto", "Titi", "Tata"];
let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
let cat_ints = (1..10).collect::<Vec<_>>();
@ -90,7 +90,6 @@ mod test {
addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
addition.update_format(UpdateFormat::Json);
addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap();
let fields_map = index.fields_ids_map(&txn).unwrap();

View File

@ -23,7 +23,7 @@ const MAX_VALUES_BY_FACET: usize = 1000;
/// Threshold on the number of candidates that makes
/// the system choose between one algorithm or another.
const CANDIDATES_THRESHOLD: u64 = 35_000;
const CANDIDATES_THRESHOLD: u64 = 3000;
pub struct FacetDistribution<'a> {
facets: Option<HashSet<String>>,
@ -72,6 +72,7 @@ impl<'a> FacetDistribution<'a> {
FacetType::Number => {
let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect();
let distribution_prelength = distribution.len();
let db = self.index.field_id_docid_facet_f64s;
for docid in candidates.into_iter() {
key_buffer.truncate(mem::size_of::<FieldId>());
@ -84,6 +85,9 @@ impl<'a> FacetDistribution<'a> {
for result in iter {
let ((_, _, value), ()) = result?;
*distribution.entry(value.to_string()).or_insert(0) += 1;
if distribution.len() - distribution_prelength == self.max_values_by_facet {
break;
}
}
}
}
@ -106,6 +110,10 @@ impl<'a> FacetDistribution<'a> {
.entry(normalized_value)
.or_insert_with(|| (original_value, 0));
*count += 1;
if normalized_distribution.len() == self.max_values_by_facet {
break;
}
}
}
@ -154,10 +162,10 @@ impl<'a> FacetDistribution<'a> {
FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
for result in iter {
let (value, mut docids) = result?;
let (_normalized, original, mut docids) = result?;
docids &= candidates;
if !docids.is_empty() {
distribution.insert(value.to_string(), docids.len());
distribution.insert(original.to_string(), docids.len());
}
if distribution.len() == self.max_values_by_facet {
break;
@ -193,14 +201,20 @@ impl<'a> FacetDistribution<'a> {
.prefix_iter(self.rtxn, &field_id.to_be_bytes())?
.remap_key_type::<FacetStringLevelZeroCodec>();
let mut normalized_distribution = BTreeMap::new();
for result in iter {
let ((_, value), docids) = result?;
distribution.insert(value.to_string(), docids.len());
let ((_, normalized_value), (original_value, docids)) = result?;
normalized_distribution.insert(normalized_value, (original_value, docids.len()));
if distribution.len() == self.max_values_by_facet {
break;
}
}
let iter = normalized_distribution
.into_iter()
.map(|(_normalized, (original, count))| (original.to_string(), count));
distribution.extend(iter);
Ok(distribution)
}

View File

@ -135,7 +135,8 @@ use heed::{Database, LazyDecode, RoRange};
use roaring::RoaringBitmap;
use crate::heed_codec::facet::{
FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec,
FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
FacetStringZeroBoundsValueCodec,
};
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::{FieldId, Index};
@ -209,7 +210,11 @@ impl<'t> Iterator for FacetStringGroupRange<'t> {
///
/// It yields the normalized facet string, its original form and the roaring bitmap associated with it.
pub struct FacetStringLevelZeroRange<'t> {
// NOTE(review): two `iter` declarations appear because this listing looks like
// a diff whose +/- markers were stripped. The first is presumably the removed
// field (values decoded with `CboRoaringBitmapCodec` alone), the second its
// replacement; confirm against the commit.
iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
// Read-only range whose values are decoded through
// `FacetStringLevelZeroValueCodec` wrapping `CboRoaringBitmapCodec`.
iter: RoRange<
't,
FacetStringLevelZeroCodec,
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
>,
}
impl<'t> FacetStringLevelZeroRange<'t> {
@ -252,18 +257,23 @@ impl<'t> FacetStringLevelZeroRange<'t> {
let iter = db
.remap_key_type::<ByteSlice>()
.range(rtxn, &(left_bound, right_bound))?
.remap_types::<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>();
.remap_types::<
FacetStringLevelZeroCodec,
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>
>();
Ok(FacetStringLevelZeroRange { iter })
}
}
impl<'t> Iterator for FacetStringLevelZeroRange<'t> {
type Item = heed::Result<(&'t str, RoaringBitmap)>;
type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;
fn next(&mut self) -> Option<Self::Item> {
match self.iter.next() {
Some(Ok(((_fid, value), docids))) => Some(Ok((value, docids))),
Some(Ok(((_fid, normalized), (original, docids)))) => {
Some(Ok((normalized, original, docids)))
}
Some(Err(e)) => Some(Err(e)),
None => None,
}
@ -326,7 +336,7 @@ impl<'t> FacetStringIter<'t> {
}
impl<'t> Iterator for FacetStringIter<'t> {
type Item = heed::Result<(&'t str, RoaringBitmap)>;
type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;
fn next(&mut self) -> Option<Self::Item> {
'outer: loop {
@ -377,11 +387,11 @@ impl<'t> Iterator for FacetStringIter<'t> {
// level zero only
for result in last {
match result {
Ok((value, mut docids)) => {
Ok((normalized, original, mut docids)) => {
docids &= &*documents_ids;
if !docids.is_empty() {
*documents_ids -= &docids;
return Some(Ok((value, docids)));
return Some(Ok((normalized, original, docids)));
}
}
Err(e) => return Some(Err(e)),

View File

@ -17,7 +17,9 @@ use self::Operator::*;
use super::parser::{FilterParser, Rule, PREC_CLIMBER};
use super::FacetNumberRange;
use crate::error::UserError;
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetStringLevelZeroCodec};
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
};
use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result};
#[derive(Debug, Clone, PartialEq)]
@ -363,7 +365,10 @@ impl FilterCondition {
rtxn: &heed::RoTxn,
index: &Index,
numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
strings_db: heed::Database<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
strings_db: heed::Database<
FacetStringLevelZeroCodec,
FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
>,
field_id: FieldId,
operator: &Operator,
) -> Result<RoaringBitmap> {
@ -374,7 +379,8 @@ impl FilterCondition {
GreaterThan(val) => (Excluded(*val), Included(f64::MAX)),
GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)),
Equal(number, string) => {
let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default();
let (_original_value, string_docids) =
strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default();
let number_docids = match number {
Some(n) => {
let n = Included(*n);