implement distinct attribute

distinct can return error

facet distinct on numbers

return distinct error

review fixes

make get_facet_value more generic

fixes
This commit is contained in:
Marin Postma
2021-04-07 12:38:48 +02:00
parent 6e126c96a9
commit 45c45e11dd
13 changed files with 525 additions and 53 deletions

View File

@ -0,0 +1,192 @@
use std::mem::size_of;
use roaring::RoaringBitmap;
use crate::heed_codec::facet::*;
use crate::{facet::FacetType, DocumentId, FieldId, Index};
use super::{Distinct, DocIter};
/// Distinct implementation that uses the facet databases of the index to find
/// the documents sharing a value of the distinct field with a given document.
pub struct FacetDistinct<'a> {
/// Id of the field on which the distinct is applied.
distinct: FieldId,
/// Index whose facet databases are queried.
index: &'a Index,
/// Read transaction used for every database access.
txn: &'a heed::RoTxn<'a>,
/// Kind of facet stored for the distinct field; the iterator dispatches on it
/// to pick the string, integer or float lookup.
facet_type: FacetType,
}
impl<'a> FacetDistinct<'a> {
pub fn new(
distinct: FieldId,
index: &'a Index,
txn: &'a heed::RoTxn<'a>,
facet_type: FacetType,
) -> Self {
Self {
distinct,
index,
txn,
facet_type,
}
}
}
/// Iterator returned by `FacetDistinct::distinct`, yielding the ids of the
/// candidate documents that are kept by the distinct.
pub struct FacetDistinctIter<'a> {
/// Documents that can still be returned; the excluded ones are removed from
/// this bitmap at the start of every iteration.
candidates: RoaringBitmap,
/// Id of the field on which the distinct is applied.
distinct: FieldId,
/// Documents that must not be returned. It grows with every document sharing
/// a facet value with a document that has been returned.
excluded: RoaringBitmap,
/// Kind of facet stored for the distinct field.
facet_type: FacetType,
/// Index whose facet databases are queried.
index: &'a Index,
/// Number of documents returned so far; they are still in `candidates` and
/// are skipped when looking for the next document.
iter_offset: usize,
/// Read transaction used for every database access.
txn: &'a heed::RoTxn<'a>,
}
impl<'a> FacetDistinctIter<'a> {
    /// Reads, through the `KC` key codec, the bitmap of document ids stored
    /// under `key` in the `facet_field_id_value_docids` database.
    ///
    /// # Errors
    /// Returns the error of the database lookup, if any.
    ///
    /// # Panics
    /// Panics when the database holds no entry for `key`.
    fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result<RoaringBitmap>
    where
        KC: heed::BytesEncode<'c>,
    {
        let database = self.index.facet_field_id_value_docids.remap_key_type::<KC>();
        let docids = database.get(self.txn, key)?;
        Ok(docids.expect("Corrupted data: Facet values must exist"))
    }

    /// Adds to `excluded` every document sharing a string facet value of the
    /// distinct field with document `id`, except `id` itself.
    fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> {
        let facet_values = get_facet_values::<FieldDocIdFacetStringCodec>(
            id,
            self.distinct,
            self.index,
            self.txn,
        )?;

        for entry in facet_values {
            let ((_, _, facet_value), _) = entry?;
            let docids_key = (self.distinct, facet_value);
            let same_value_docids =
                self.get_facet_docids::<FacetValueStringCodec>(&docids_key)?;
            self.excluded.union_with(&same_value_docids);
        }

        // The document itself is part of the bitmaps above, but it is the one we keep.
        self.excluded.remove(id);
        Ok(())
    }

    /// Adds to `excluded` every document sharing an integer facet value of the
    /// distinct field with document `id`, except `id` itself.
    fn distinct_integer(&mut self, id: DocumentId) -> anyhow::Result<()> {
        let facet_values = get_facet_values::<FieldDocIdFacetI64Codec>(
            id,
            self.distinct,
            self.index,
            self.txn,
        )?;

        for entry in facet_values {
            let ((_, _, facet_value), _) = entry?;
            // The docids are looked up on level 0, with the value as both bounds.
            let docids_key = (self.distinct, 0, facet_value, facet_value);
            let same_value_docids =
                self.get_facet_docids::<FacetLevelValueI64Codec>(&docids_key)?;
            self.excluded.union_with(&same_value_docids);
        }

        // The document itself is part of the bitmaps above, but it is the one we keep.
        self.excluded.remove(id);
        Ok(())
    }

    /// Adds to `excluded` every document sharing a float facet value of the
    /// distinct field with document `id`, except `id` itself.
    fn distinct_float(&mut self, id: DocumentId) -> anyhow::Result<()> {
        let facet_values = get_facet_values::<FieldDocIdFacetF64Codec>(
            id,
            self.distinct,
            self.index,
            self.txn,
        )?;

        for entry in facet_values {
            let ((_, _, facet_value), _) = entry?;
            // The docids are looked up on level 0, with the value as both bounds.
            let docids_key = (self.distinct, 0, facet_value, facet_value);
            let same_value_docids =
                self.get_facet_docids::<FacetLevelValueF64Codec>(&docids_key)?;
            self.excluded.union_with(&same_value_docids);
        }

        // The document itself is part of the bitmaps above, but it is the one we keep.
        self.excluded.remove(id);
        Ok(())
    }

    /// Computes the next distinct document id, or `None` when the candidates
    /// are exhausted. `Iterator::next` calls this and transposes the result.
    fn next_inner(&mut self) -> anyhow::Result<Option<DocumentId>> {
        // Start by dropping every excluded document from the candidates.
        self.candidates.difference_with(&self.excluded);

        // The documents already returned are still in `candidates`: jump over them.
        let id = match self.candidates.iter().nth(self.iter_offset) {
            Some(id) => id,
            // No candidate left at this offset: the iteration is over.
            None => return Ok(None),
        };

        match self.facet_type {
            FacetType::String => self.distinct_string(id)?,
            FacetType::Integer => self.distinct_integer(id)?,
            FacetType::Float => self.distinct_float(id)?,
        }

        // The document found at the offset was not discarded by the difference
        // above, so it is a distinct one. Moving the offset forward makes the
        // next call start right after it.
        self.iter_offset += 1;
        Ok(Some(id))
    }
}
/// Returns an iterator over the entries of the `field_id_docid_facet_values`
/// database whose key starts with the big-endian bytes of `distinct` followed
/// by the big-endian bytes of `id`. Keys are decoded with the `KC` codec.
///
/// # Errors
/// Returns the error of the database if the prefix iterator cannot be created.
fn get_facet_values<'a, KC>(
    id: DocumentId,
    distinct: FieldId,
    index: &Index,
    txn: &'a heed::RoTxn,
) -> anyhow::Result<heed::RoPrefix<'a, KC, heed::types::Unit>>
where
    KC: heed::BytesDecode<'a>,
{
    const FID_SIZE: usize = size_of::<FieldId>();
    const DOCID_SIZE: usize = size_of::<DocumentId>();

    // Prefix layout: [field id bytes | document id bytes].
    let mut prefix = [0u8; FID_SIZE + DOCID_SIZE];
    let (fid_bytes, docid_bytes) = prefix.split_at_mut(FID_SIZE);
    fid_bytes.copy_from_slice(&distinct.to_be_bytes());
    docid_bytes.copy_from_slice(&id.to_be_bytes());

    let prefix_iter = index
        .field_id_docid_facet_values
        .prefix_iter(txn, &prefix)?;
    Ok(prefix_iter.remap_key_type::<KC>())
}
impl Iterator for FacetDistinctIter<'_> {
    type Item = anyhow::Result<DocumentId>;

    /// Yields the next distinct document id, or the error met while computing it.
    fn next(&mut self) -> Option<Self::Item> {
        match self.next_inner() {
            Ok(Some(id)) => Some(Ok(id)),
            Ok(None) => None,
            Err(error) => Some(Err(error)),
        }
    }
}
impl DocIter for FacetDistinctIter<'_> {
fn into_excluded(self) -> RoaringBitmap {
self.excluded
}
}
impl<'a> Distinct<'_> for FacetDistinct<'a> {
type Iter = FacetDistinctIter<'a>;
fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
FacetDistinctIter {
candidates,
distinct: self.distinct,
excluded,
facet_type: self.facet_type,
index: self.index,
iter_offset: 0,
txn: self.txn,
}
}
}

View File

@ -0,0 +1,109 @@
use std::collections::HashMap;
use roaring::RoaringBitmap;
use serde_json::Value;
use super::{Distinct, DocIter};
use crate::{DocumentId, FieldId, Index};
/// Distinct implementation that reads the distinct field directly from the
/// documents and remembers the values it has seen in a map.
pub struct MapDistinct<'a> {
/// Id of the field on which the distinct is applied.
distinct: FieldId,
/// Number of times each value, rendered as a JSON string, has been seen.
/// The map is borrowed by every iterator created by `distinct`, so it is
/// shared between them.
map: HashMap<String, usize>,
/// Index from which the documents are fetched.
index: &'a Index,
/// Read transaction used to fetch the documents.
txn: &'a heed::RoTxn<'a>,
}
impl<'a> MapDistinct<'a> {
pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
let map = HashMap::new();
Self {
distinct,
map,
index,
txn,
}
}
}
/// Iterator returned by `MapDistinct::distinct`, yielding the ids of the
/// candidate documents that are accepted by the distinct.
pub struct MapDistinctIter<'a, 'b> {
/// Id of the field on which the distinct is applied.
distinct: FieldId,
/// Map of seen values, borrowed from the `MapDistinct` that created this iterator.
map: &'b mut HashMap<String, usize>,
/// Index from which the documents are fetched.
index: &'a Index,
/// Read transaction used to fetch the documents.
txn: &'a heed::RoTxn<'a>,
/// Remaining candidate document ids.
candidates: roaring::bitmap::IntoIter,
/// Documents to discard; every rejected candidate is added to it.
excluded: RoaringBitmap,
}
impl<'a, 'b> MapDistinctIter<'a, 'b> {
    /// Returns the next candidate accepted by the distinct, or `None` when the
    /// candidates are exhausted. Rejected candidates are added to `excluded`.
    /// `Iterator::next` calls this and transposes the result.
    ///
    /// # Errors
    /// Fails if the document cannot be fetched or if the value of the distinct
    /// field is not valid JSON.
    fn next_inner(&mut self) -> anyhow::Result<Option<DocumentId>> {
        let seen = &mut self.map;
        // Records one more occurrence of `value` (keyed by its JSON rendering)
        // and tells whether this was the first time it was seen.
        let mut is_first_occurrence = |value: Value| {
            let count = seen.entry(value.to_string()).or_insert(0);
            *count += 1;
            *count <= 1
        };

        for id in self.candidates.by_ref() {
            let document = self.index.documents(&self.txn, Some(id))?[0].1;
            let value = match document.get(self.distinct) {
                Some(bytes) => Some(serde_json::from_slice::<Value>(bytes)?),
                None => None,
            };

            let accept = match value {
                // An array is accepted only if all of its elements are new. Every
                // element is recorded, even after one of them was found to be
                // already known (`&` does not short-circuit).
                // NOTE(review): elements of a rejected document are recorded as
                // seen too — confirm this is the intended behavior.
                Some(Value::Array(values)) => values
                    .into_iter()
                    .fold(true, |all_new, value| all_new & is_first_occurrence(value)),
                // Null and object values cannot be used for the distinct, and a
                // document without the field has nothing to compare: accept them.
                Some(Value::Null) | Some(Value::Object(_)) | None => true,
                Some(value) => is_first_occurrence(value),
            };

            if accept {
                return Ok(Some(id));
            }
            self.excluded.insert(id);
        }

        Ok(None)
    }
}
impl Iterator for MapDistinctIter<'_, '_> {
    type Item = anyhow::Result<DocumentId>;

    /// Yields the next accepted document id, or the error met while looking for it.
    fn next(&mut self) -> Option<Self::Item> {
        match self.next_inner() {
            Err(error) => Some(Err(error)),
            Ok(None) => None,
            Ok(Some(id)) => Some(Ok(id)),
        }
    }
}
impl DocIter for MapDistinctIter<'_, '_> {
fn into_excluded(self) -> RoaringBitmap {
self.excluded
}
}
impl<'a, 'b> Distinct<'b> for MapDistinct<'a> {
    type Iter = MapDistinctIter<'a, 'b>;

    /// Creates an iterator over the distinct documents of `candidates`,
    /// starting with `excluded` as the set of documents to discard. The
    /// iterator borrows the map of seen values for as long as it lives.
    fn distinct(&'b mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
        let candidates = candidates.into_iter();

        MapDistinctIter {
            candidates,
            excluded,
            distinct: self.distinct,
            // The `&'a` references are `Copy`: copy them instead of re-borrowing.
            index: self.index,
            txn: self.txn,
            map: &mut self.map,
        }
    }
}

View File

@ -0,0 +1,21 @@
mod facet_distinct;
mod map_distinct;
mod noop_distinct;
use roaring::RoaringBitmap;
pub use facet_distinct::FacetDistinct;
pub use map_distinct::MapDistinct;
pub use noop_distinct::NoopDistinct;
use crate::DocumentId;
/// A fallible iterator over document ids that can give back the bitmap of
/// the documents to exclude once the iteration is done.
pub trait DocIter: Iterator<Item=anyhow::Result<DocumentId>> {
/// Consumes the iterator and returns ownership of its internal `excluded`
/// RoaringBitmap.
fn into_excluded(self) -> RoaringBitmap;
}
/// A type able to create a `DocIter` over a set of candidate documents.
/// The lifetime `'a` is the one of the mutable borrow taken by `distinct`.
pub trait Distinct<'a> {
/// Iterator type created by `distinct`.
type Iter: DocIter;
/// Creates the iterator from the `candidates` documents and the `excluded`
/// documents known so far.
fn distinct(&'a mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter;
}

View File

@ -0,0 +1,36 @@
use roaring::RoaringBitmap;
use crate::DocumentId;
use super::{DocIter, Distinct};
/// `Distinct` implementation that filters nothing: its iterator yields every
/// candidate it is given.
pub struct NoopDistinct;
/// Iterator created by `NoopDistinct::distinct`.
pub struct NoopDistinctIter {
/// Remaining candidate document ids, yielded as they are.
candidates: roaring::bitmap::IntoIter,
/// Excluded documents received at creation, given back by `into_excluded`.
excluded: RoaringBitmap,
}
impl Iterator for NoopDistinctIter {
    type Item = anyhow::Result<DocumentId>;

    /// Yields the next candidate as is; this iterator never produces an error.
    fn next(&mut self) -> Option<Self::Item> {
        let id = self.candidates.next()?;
        Some(Ok(id))
    }
}
impl DocIter for NoopDistinctIter {
fn into_excluded(self) -> RoaringBitmap {
self.excluded
}
}
impl Distinct<'_> for NoopDistinct {
    type Iter = NoopDistinctIter;

    /// Creates an iterator yielding every document of `candidates`; `excluded`
    /// is only stored so that it can be given back later.
    fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
        let candidates = candidates.into_iter();
        NoopDistinctIter { candidates, excluded }
    }
}