mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-12-07 05:05:42 +00:00
Compare commits
21 Commits
v1.9.0-rc.
...
refactor-s
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9874efc352 | ||
|
|
a838f39fce | ||
|
|
7add7d053c | ||
|
|
7559dfc814 | ||
|
|
6c6c4732a1 | ||
|
|
3976fe660e | ||
|
|
50f8218a5d | ||
|
|
19585f1a4f | ||
|
|
8ec6e175e5 | ||
|
|
75b2e02cd2 | ||
|
|
40f05fe156 | ||
|
|
52d0d35b39 | ||
|
|
5432776132 | ||
|
|
66470b27e6 | ||
|
|
0a9bd398c7 | ||
|
|
7967e93c16 | ||
|
|
a6f3a01c6a | ||
|
|
4ca4a3f954 | ||
|
|
e4a69c5ac3 | ||
|
|
ff2e498267 | ||
|
|
531e3d7d6a |
11
Cargo.lock
generated
11
Cargo.lock
generated
@@ -4377,12 +4377,6 @@ dependencies = [
|
||||
"winreg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "retain_mut"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086"
|
||||
|
||||
[[package]]
|
||||
name = "ring"
|
||||
version = "0.17.8"
|
||||
@@ -4400,13 +4394,12 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "roaring"
|
||||
version = "0.10.2"
|
||||
version = "0.10.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873"
|
||||
checksum = "7699249cc2c7d71939f30868f47e9d7add0bdc030d90ee10bfd16887ff8bb1c8"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
"retain_mut",
|
||||
"serde",
|
||||
]
|
||||
|
||||
|
||||
@@ -40,8 +40,9 @@ pub struct Permit {
|
||||
|
||||
impl Drop for Permit {
|
||||
fn drop(&mut self) {
|
||||
let sender = self.sender.clone();
|
||||
// if the channel is closed then the whole instance is down
|
||||
let _ = futures::executor::block_on(self.sender.send(()));
|
||||
std::mem::drop(tokio::spawn(async move { sender.send(()).await }));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -47,6 +47,12 @@ pub struct FacetGroupValue {
|
||||
pub bitmap: RoaringBitmap,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FacetGroupLazyValue<'b> {
|
||||
pub size: u8,
|
||||
pub bitmap_bytes: &'b [u8],
|
||||
}
|
||||
|
||||
pub struct FacetGroupKeyCodec<T> {
|
||||
_phantom: PhantomData<T>,
|
||||
}
|
||||
@@ -69,6 +75,7 @@ where
|
||||
Ok(Cow::Owned(v))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T>
|
||||
where
|
||||
T: BytesDecode<'a>,
|
||||
@@ -84,6 +91,7 @@ where
|
||||
}
|
||||
|
||||
pub struct FacetGroupValueCodec;
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
|
||||
type EItem = FacetGroupValue;
|
||||
|
||||
@@ -93,11 +101,23 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
|
||||
Ok(Cow::Owned(v))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
|
||||
type DItem = FacetGroupValue;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let size = bytes[0];
|
||||
let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?;
|
||||
Ok(FacetGroupValue { size, bitmap })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetGroupLazyValueCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for FacetGroupLazyValueCodec {
|
||||
type DItem = FacetGroupLazyValue<'a>;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Ok(FacetGroupLazyValue { size: bytes[0], bitmap_bytes: &bytes[1..] })
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::borrow::Cow;
|
||||
use std::io;
|
||||
use std::io::{self, Cursor};
|
||||
use std::mem::size_of;
|
||||
|
||||
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||
@@ -57,6 +57,24 @@ impl CboRoaringBitmapCodec {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn intersection_with_serialized(
|
||||
mut bytes: &[u8],
|
||||
other: &RoaringBitmap,
|
||||
) -> io::Result<RoaringBitmap> {
|
||||
// See above `deserialize_from` method for implementation details.
|
||||
if bytes.len() <= THRESHOLD * size_of::<u32>() {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
|
||||
if other.contains(integer) {
|
||||
bitmap.insert(integer);
|
||||
}
|
||||
}
|
||||
Ok(bitmap)
|
||||
} else {
|
||||
other.intersection_with_serialized_unchecked(Cursor::new(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge serialized CboRoaringBitmaps in a buffer.
|
||||
///
|
||||
/// if the merged values length is under the threshold, values are directly
|
||||
|
||||
@@ -38,7 +38,7 @@ where
|
||||
field_id,
|
||||
)?;
|
||||
|
||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
|
||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
|
||||
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
|
||||
Ok(())
|
||||
} else {
|
||||
@@ -81,7 +81,7 @@ where
|
||||
field_id,
|
||||
)?;
|
||||
|
||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
|
||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
|
||||
// We first fill the heap with values from the highest level
|
||||
let starting_key =
|
||||
FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
|
||||
|
||||
@@ -4,9 +4,11 @@ use heed::BytesEncode;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::Result;
|
||||
use crate::{CboRoaringBitmapCodec, Result};
|
||||
|
||||
/// Find all the document ids for which the given field contains a value contained within
|
||||
/// the two bounds.
|
||||
@@ -16,6 +18,7 @@ pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>(
|
||||
field_id: u16,
|
||||
left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
|
||||
right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
docids: &mut RoaringBitmap,
|
||||
) -> Result<()>
|
||||
where
|
||||
@@ -46,13 +49,15 @@ where
|
||||
}
|
||||
Bound::Unbounded => Bound::Unbounded,
|
||||
};
|
||||
let db = db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>();
|
||||
let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids };
|
||||
let db = db.remap_types::<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>();
|
||||
let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids };
|
||||
let highest_level = get_highest_level(rtxn, db, field_id)?;
|
||||
|
||||
if let Some(starting_left_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
|
||||
if let Some(starting_left_bound) =
|
||||
get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?
|
||||
{
|
||||
let rightmost_bound =
|
||||
Bound::Included(get_last_facet_value::<BytesRefCodec>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded
|
||||
Bound::Included(get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded
|
||||
let group_size = usize::MAX;
|
||||
f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?;
|
||||
Ok(())
|
||||
@@ -64,12 +69,16 @@ where
|
||||
/// Fetch the document ids that have a facet with a value between the two given bounds
|
||||
struct FacetRangeSearch<'t, 'b, 'bitmap> {
|
||||
rtxn: &'t heed::RoTxn<'t>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
|
||||
field_id: u16,
|
||||
left: Bound<&'b [u8]>,
|
||||
right: Bound<&'b [u8]>,
|
||||
/// The subset of documents ids that are useful for this search.
|
||||
/// Great performance optimizations can be achieved by only fetching values matching this subset.
|
||||
universe: Option<&'bitmap RoaringBitmap>,
|
||||
docids: &'bitmap mut RoaringBitmap,
|
||||
}
|
||||
|
||||
impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
|
||||
fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> {
|
||||
let left_key =
|
||||
@@ -104,7 +113,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
|
||||
}
|
||||
|
||||
if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) {
|
||||
*self.docids |= value.bitmap;
|
||||
*self.docids |= match self.universe {
|
||||
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
|
||||
value.bitmap_bytes,
|
||||
universe,
|
||||
)?,
|
||||
None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?,
|
||||
};
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
@@ -195,7 +210,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
|
||||
left_condition && right_condition
|
||||
};
|
||||
if should_take_whole_group {
|
||||
*self.docids |= &previous_value.bitmap;
|
||||
*self.docids |= match self.universe {
|
||||
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
|
||||
previous_value.bitmap_bytes,
|
||||
universe,
|
||||
)?,
|
||||
None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
|
||||
};
|
||||
previous_key = next_key;
|
||||
previous_value = next_value;
|
||||
continue;
|
||||
@@ -291,7 +312,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
|
||||
left_condition && right_condition
|
||||
};
|
||||
if should_take_whole_group {
|
||||
*self.docids |= &previous_value.bitmap;
|
||||
*self.docids |= match self.universe {
|
||||
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
|
||||
previous_value.bitmap_bytes,
|
||||
universe,
|
||||
)?,
|
||||
None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
|
||||
};
|
||||
} else {
|
||||
let level = level - 1;
|
||||
let starting_left_bound = previous_key.left_bound;
|
||||
@@ -365,6 +392,7 @@ mod tests {
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -384,6 +412,7 @@ mod tests {
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -418,6 +447,7 @@ mod tests {
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -439,6 +469,7 @@ mod tests {
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -474,6 +505,7 @@ mod tests {
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -499,6 +531,7 @@ mod tests {
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -537,6 +570,7 @@ mod tests {
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -556,6 +590,7 @@ mod tests {
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -571,6 +606,7 @@ mod tests {
|
||||
0,
|
||||
&Bound::Unbounded,
|
||||
&Bound::Unbounded,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -586,6 +622,7 @@ mod tests {
|
||||
1,
|
||||
&Bound::Unbounded,
|
||||
&Bound::Unbounded,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -621,6 +658,7 @@ mod tests {
|
||||
0,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
@@ -634,6 +672,7 @@ mod tests {
|
||||
1,
|
||||
&start,
|
||||
&end,
|
||||
None,
|
||||
&mut docids,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
@@ -36,7 +36,7 @@ pub fn ascending_facet_sort<'t>(
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
|
||||
let highest_level = get_highest_level(rtxn, db, field_id)?;
|
||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
|
||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
|
||||
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
|
||||
let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX);
|
||||
|
||||
|
||||
@@ -19,9 +19,9 @@ pub fn descending_facet_sort<'t>(
|
||||
candidates: RoaringBitmap,
|
||||
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
|
||||
let highest_level = get_highest_level(rtxn, db, field_id)?;
|
||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? {
|
||||
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
|
||||
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
|
||||
let last_bound = get_last_facet_value::<BytesRefCodec>(rtxn, db, field_id)?.unwrap();
|
||||
let last_bound = get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap();
|
||||
let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound };
|
||||
let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX);
|
||||
Ok(itertools::Either::Left(DescendingFacetSort {
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::ops::Bound::{self, Excluded, Included};
|
||||
|
||||
use either::Either;
|
||||
pub use filter_parser::{Condition, Error as FPError, FilterCondition, Token};
|
||||
use roaring::RoaringBitmap;
|
||||
use roaring::{MultiOps, RoaringBitmap};
|
||||
use serde_json::Value;
|
||||
|
||||
use super::facet_range_search;
|
||||
@@ -224,14 +224,14 @@ impl<'a> Filter<'a> {
|
||||
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
|
||||
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
|
||||
let filterable_fields = index.filterable_fields(rtxn)?;
|
||||
|
||||
self.inner_evaluate(rtxn, index, &filterable_fields)
|
||||
self.inner_evaluate(rtxn, index, &filterable_fields, None)
|
||||
}
|
||||
|
||||
fn evaluate_operator(
|
||||
rtxn: &heed::RoTxn,
|
||||
index: &Index,
|
||||
field_id: FieldId,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
operator: &Condition<'a>,
|
||||
) -> Result<RoaringBitmap> {
|
||||
let numbers_db = index.facet_id_f64_docids;
|
||||
@@ -291,14 +291,22 @@ impl<'a> Filter<'a> {
|
||||
}
|
||||
Condition::NotEqual(val) => {
|
||||
let operator = Condition::Equal(val.clone());
|
||||
let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?;
|
||||
let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?;
|
||||
let all_ids = index.documents_ids(rtxn)?;
|
||||
return Ok(all_ids - docids);
|
||||
}
|
||||
};
|
||||
|
||||
let mut output = RoaringBitmap::new();
|
||||
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, left, right, &mut output)?;
|
||||
Self::explore_facet_number_levels(
|
||||
rtxn,
|
||||
numbers_db,
|
||||
field_id,
|
||||
left,
|
||||
right,
|
||||
universe,
|
||||
&mut output,
|
||||
)?;
|
||||
Ok(output)
|
||||
}
|
||||
|
||||
@@ -310,6 +318,7 @@ impl<'a> Filter<'a> {
|
||||
field_id: FieldId,
|
||||
left: Bound<f64>,
|
||||
right: Bound<f64>,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
output: &mut RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
match (left, right) {
|
||||
@@ -321,7 +330,7 @@ impl<'a> Filter<'a> {
|
||||
(_, _) => (),
|
||||
}
|
||||
facet_range_search::find_docids_of_facet_within_bounds::<OrderedF64Codec>(
|
||||
rtxn, db, field_id, &left, &right, output,
|
||||
rtxn, db, field_id, &left, &right, universe, output,
|
||||
)?;
|
||||
|
||||
Ok(())
|
||||
@@ -332,31 +341,37 @@ impl<'a> Filter<'a> {
|
||||
rtxn: &heed::RoTxn,
|
||||
index: &Index,
|
||||
filterable_fields: &HashSet<String>,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
) -> Result<RoaringBitmap> {
|
||||
if universe.map_or(false, |u| u.is_empty()) {
|
||||
return Ok(RoaringBitmap::new());
|
||||
}
|
||||
|
||||
match &self.condition {
|
||||
FilterCondition::Not(f) => {
|
||||
let all_ids = index.documents_ids(rtxn)?;
|
||||
let selected = Self::inner_evaluate(
|
||||
&(f.as_ref().clone()).into(),
|
||||
rtxn,
|
||||
index,
|
||||
filterable_fields,
|
||||
universe,
|
||||
)?;
|
||||
Ok(all_ids - selected)
|
||||
match universe {
|
||||
Some(universe) => Ok(universe - selected),
|
||||
None => {
|
||||
let all_ids = index.documents_ids(rtxn)?;
|
||||
Ok(all_ids - selected)
|
||||
}
|
||||
}
|
||||
}
|
||||
FilterCondition::In { fid, els } => {
|
||||
if crate::is_faceted(fid.value(), filterable_fields) {
|
||||
let field_ids_map = index.fields_ids_map(rtxn)?;
|
||||
|
||||
if let Some(fid) = field_ids_map.id(fid.value()) {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
|
||||
for el in els {
|
||||
let op = Condition::Equal(el.clone());
|
||||
let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?;
|
||||
bitmap |= el_bitmap;
|
||||
}
|
||||
Ok(bitmap)
|
||||
els.iter()
|
||||
.map(|el| Condition::Equal(el.clone()))
|
||||
.map(|op| Self::evaluate_operator(rtxn, index, fid, universe, &op))
|
||||
.union()
|
||||
} else {
|
||||
Ok(RoaringBitmap::new())
|
||||
}
|
||||
@@ -371,7 +386,7 @@ impl<'a> Filter<'a> {
|
||||
if crate::is_faceted(fid.value(), filterable_fields) {
|
||||
let field_ids_map = index.fields_ids_map(rtxn)?;
|
||||
if let Some(fid) = field_ids_map.id(fid.value()) {
|
||||
Self::evaluate_operator(rtxn, index, fid, op)
|
||||
Self::evaluate_operator(rtxn, index, fid, universe, op)
|
||||
} else {
|
||||
Ok(RoaringBitmap::new())
|
||||
}
|
||||
@@ -382,14 +397,11 @@ impl<'a> Filter<'a> {
|
||||
}))?
|
||||
}
|
||||
}
|
||||
FilterCondition::Or(subfilters) => {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
for f in subfilters {
|
||||
bitmap |=
|
||||
Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?;
|
||||
}
|
||||
Ok(bitmap)
|
||||
}
|
||||
FilterCondition::Or(subfilters) => subfilters
|
||||
.iter()
|
||||
.cloned()
|
||||
.map(|f| Self::inner_evaluate(&f.into(), rtxn, index, filterable_fields, universe))
|
||||
.union(),
|
||||
FilterCondition::And(subfilters) => {
|
||||
let mut subfilters_iter = subfilters.iter();
|
||||
if let Some(first_subfilter) = subfilters_iter.next() {
|
||||
@@ -398,16 +410,21 @@ impl<'a> Filter<'a> {
|
||||
rtxn,
|
||||
index,
|
||||
filterable_fields,
|
||||
universe,
|
||||
)?;
|
||||
for f in subfilters_iter {
|
||||
if bitmap.is_empty() {
|
||||
return Ok(bitmap);
|
||||
}
|
||||
// TODO We are doing the intersections two times,
|
||||
// it could be more efficient
|
||||
// Can't I just replace this `&=` by an `=`?
|
||||
bitmap &= Self::inner_evaluate(
|
||||
&(f.clone()).into(),
|
||||
rtxn,
|
||||
index,
|
||||
filterable_fields,
|
||||
Some(&bitmap),
|
||||
)?;
|
||||
}
|
||||
Ok(bitmap)
|
||||
@@ -507,6 +524,7 @@ impl<'a> Filter<'a> {
|
||||
rtxn,
|
||||
index,
|
||||
filterable_fields,
|
||||
universe,
|
||||
)?;
|
||||
|
||||
let geo_lng_token = Token::new(
|
||||
@@ -539,6 +557,7 @@ impl<'a> Filter<'a> {
|
||||
rtxn,
|
||||
index,
|
||||
filterable_fields,
|
||||
universe,
|
||||
)?;
|
||||
|
||||
let condition_right = FilterCondition::Condition {
|
||||
@@ -552,6 +571,7 @@ impl<'a> Filter<'a> {
|
||||
rtxn,
|
||||
index,
|
||||
filterable_fields,
|
||||
universe,
|
||||
)?;
|
||||
|
||||
left | right
|
||||
@@ -567,6 +587,7 @@ impl<'a> Filter<'a> {
|
||||
rtxn,
|
||||
index,
|
||||
filterable_fields,
|
||||
universe,
|
||||
)?
|
||||
};
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ use roaring::RoaringBitmap;
|
||||
pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET};
|
||||
pub use self::filter::{BadGeoError, Filter};
|
||||
pub use self::search::{FacetValueHit, SearchForFacetValues};
|
||||
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec};
|
||||
use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::{Index, Result};
|
||||
|
||||
@@ -54,9 +54,9 @@ pub fn facet_max_value<'t>(
|
||||
}
|
||||
|
||||
/// Get the first facet value in the facet database
|
||||
pub(crate) fn get_first_facet_value<'t, BoundCodec>(
|
||||
pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>(
|
||||
txn: &'t RoTxn,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
|
||||
field_id: u16,
|
||||
) -> heed::Result<Option<BoundCodec::DItem>>
|
||||
where
|
||||
@@ -78,9 +78,9 @@ where
|
||||
}
|
||||
|
||||
/// Get the last facet value in the facet database
|
||||
pub(crate) fn get_last_facet_value<'t, BoundCodec>(
|
||||
pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>(
|
||||
txn: &'t RoTxn,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
|
||||
field_id: u16,
|
||||
) -> heed::Result<Option<BoundCodec::DItem>>
|
||||
where
|
||||
@@ -102,9 +102,9 @@ where
|
||||
}
|
||||
|
||||
/// Get the height of the highest level in the facet database
|
||||
pub(crate) fn get_highest_level<'t>(
|
||||
pub(crate) fn get_highest_level<'t, DC>(
|
||||
txn: &'t RoTxn<'t>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
|
||||
field_id: u16,
|
||||
) -> heed::Result<u8> {
|
||||
let field_id_prefix = &field_id.to_be_bytes();
|
||||
|
||||
@@ -548,6 +548,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search")]
|
||||
pub fn filtered_universe(
|
||||
index: &Index,
|
||||
txn: &RoTxn<'_>,
|
||||
|
||||
@@ -8,6 +8,7 @@ mod extract_vector_points;
|
||||
mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod extract_word_position_docids;
|
||||
// mod searchable;
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
@@ -0,0 +1,211 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use charabia::normalizer::NormalizedTokenIter;
|
||||
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::update::settings::InnerIndexSettings;
|
||||
use crate::{InternalError, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||
|
||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
|
||||
|
||||
pub struct FieldWordPositionExtractorBuilder<'a> {
|
||||
max_positions_per_attributes: u16,
|
||||
stop_words: Option<&'a fst::Set<Vec<u8>>>,
|
||||
separators: Option<Vec<&'a str>>,
|
||||
dictionary: Option<Vec<&'a str>>,
|
||||
}
|
||||
|
||||
impl<'a> FieldWordPositionExtractorBuilder<'a> {
|
||||
pub fn new(
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
settings: &'a InnerIndexSettings,
|
||||
) -> Result<Self> {
|
||||
let stop_words = settings.stop_words.as_ref();
|
||||
let separators: Option<Vec<_>> =
|
||||
settings.allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary: Option<Vec<_>> =
|
||||
settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
Ok(Self {
|
||||
max_positions_per_attributes: max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE as u16, |max| {
|
||||
max.min(MAX_POSITION_PER_ATTRIBUTE) as u16
|
||||
}),
|
||||
stop_words,
|
||||
separators,
|
||||
dictionary,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn build(&'a self) -> FieldWordPositionExtractor<'a> {
|
||||
let builder = tokenizer_builder(
|
||||
self.stop_words,
|
||||
self.separators.as_deref(),
|
||||
self.dictionary.as_deref(),
|
||||
None,
|
||||
);
|
||||
|
||||
FieldWordPositionExtractor {
|
||||
tokenizer: builder.into_tokenizer(),
|
||||
max_positions_per_attributes: self.max_positions_per_attributes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FieldWordPositionExtractor<'a> {
|
||||
tokenizer: Tokenizer<'a>,
|
||||
max_positions_per_attributes: u16,
|
||||
}
|
||||
|
||||
impl<'a> FieldWordPositionExtractor<'a> {
|
||||
pub fn extract<'b>(
|
||||
&'a self,
|
||||
field_bytes: &[u8],
|
||||
buffer: &'b mut String,
|
||||
) -> Result<ExtractedFieldWordPosition<'a, 'b>> {
|
||||
let field_value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
Ok(ExtractedFieldWordPosition {
|
||||
tokenizer: &self.tokenizer,
|
||||
max_positions_per_attributes: self.max_positions_per_attributes,
|
||||
field_value,
|
||||
buffer: buffer,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExtractedFieldWordPosition<'a, 'b> {
|
||||
tokenizer: &'a Tokenizer<'a>,
|
||||
max_positions_per_attributes: u16,
|
||||
field_value: Value,
|
||||
buffer: &'b mut String,
|
||||
}
|
||||
|
||||
impl<'a> ExtractedFieldWordPosition<'a, '_> {
|
||||
pub fn iter<'o>(&'o mut self) -> FieldWordPositionIter<'o> {
|
||||
self.buffer.clear();
|
||||
let inner = match json_to_string(&self.field_value, &mut self.buffer) {
|
||||
Some(field) => Some(self.tokenizer.tokenize(field)),
|
||||
None => None,
|
||||
};
|
||||
|
||||
// create an iterator of token with their positions.
|
||||
FieldWordPositionIter {
|
||||
inner,
|
||||
max_positions_per_attributes: self.max_positions_per_attributes,
|
||||
position: 0,
|
||||
prev_kind: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FieldWordPositionIter<'a> {
|
||||
inner: Option<NormalizedTokenIter<'a, 'a>>,
|
||||
max_positions_per_attributes: u16,
|
||||
position: u16,
|
||||
prev_kind: Option<TokenKind>,
|
||||
}
|
||||
|
||||
impl<'a> Iterator for FieldWordPositionIter<'a> {
|
||||
type Item = (u16, Token<'a>);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.position >= self.max_positions_per_attributes {
|
||||
return None;
|
||||
}
|
||||
|
||||
let token = self.inner.as_mut().map(|i| i.next()).flatten()?;
|
||||
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
||||
self.position += match self.prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
self.prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(_) if self.position == 0 => {
|
||||
return self.next();
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
self.prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if self.prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
self.prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => return self.next(),
|
||||
}
|
||||
|
||||
if !token.is_word() {
|
||||
return self.next();
|
||||
}
|
||||
|
||||
// keep a word only if it is not empty and fit in a LMDB key.
|
||||
let lemma = token.lemma().trim();
|
||||
if !lemma.is_empty() && lemma.len() <= MAX_WORD_LENGTH {
|
||||
Some((self.position, token))
|
||||
} else {
|
||||
self.next()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
pub fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
|
||||
) -> TokenizerBuilder<'a, Vec<u8>> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
tokenizer_builder.words_dict(dictionary);
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
|
||||
if let Some(script_language) = script_language {
|
||||
tokenizer_builder.allow_list(script_language);
|
||||
}
|
||||
|
||||
tokenizer_builder
|
||||
}
|
||||
|
||||
/// Transform a JSON value into a string that can be indexed.
|
||||
fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> {
|
||||
fn inner(value: &Value, output: &mut String) -> bool {
|
||||
use std::fmt::Write;
|
||||
match value {
|
||||
Value::Null | Value::Object(_) => false,
|
||||
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
|
||||
Value::Number(number) => write!(output, "{}", number).is_ok(),
|
||||
Value::String(string) => write!(output, "{}", string).is_ok(),
|
||||
Value::Array(array) => {
|
||||
let mut count = 0;
|
||||
for value in array {
|
||||
if inner(value, output) {
|
||||
output.push_str(". ");
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Value::String(string) = value {
|
||||
Some(string)
|
||||
} else if inner(value, buffer) {
|
||||
Some(buffer)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
114
milli/src/update/index_documents/extract/searchable/mod.rs
Normal file
114
milli/src/update/index_documents/extract/searchable/mod.rs
Normal file
@@ -0,0 +1,114 @@
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::BufReader;
|
||||
|
||||
use field_word_position::FieldWordPositionExtractorBuilder;
|
||||
use obkv::KvReader;
|
||||
use roaring::RoaringBitmap;
|
||||
use word_docids::{WordDocidsDump, WordDocidsExtractor};
|
||||
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::index_documents::extract::extract_docid_word_positions::ScriptLanguageDocidsMap;
|
||||
use crate::update::index_documents::GrenadParameters;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result, SerializationError};
|
||||
|
||||
mod field_word_position;
|
||||
mod word_docids;
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_searchable_data<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||
let searchable_fields_to_index = settings_diff.searchable_fields_to_index();
|
||||
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
|
||||
let add_builder =
|
||||
FieldWordPositionExtractorBuilder::new(max_positions_per_attributes, &settings_diff.new)?;
|
||||
let add_token_positions_extractor = add_builder.build();
|
||||
let del_builder;
|
||||
let del_token_positions_extractor = if settings_diff.settings_update_only {
|
||||
del_builder = FieldWordPositionExtractorBuilder::new(
|
||||
max_positions_per_attributes,
|
||||
&settings_diff.old,
|
||||
)?;
|
||||
del_builder.build()
|
||||
} else {
|
||||
add_builder.build()
|
||||
};
|
||||
let token_positions_extractor = &[del_token_positions_extractor, add_token_positions_extractor];
|
||||
|
||||
let mut word_map = BTreeMap::new();
|
||||
let mut word_docids_extractor = WordDocidsExtractor::new(settings_diff);
|
||||
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
// loop over documents
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let document_id = key
|
||||
.try_into()
|
||||
.map(u32::from_be_bytes)
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !settings_diff.reindex_searchable()
|
||||
&& !searchable_fields_changed(&obkv, &searchable_fields_to_index)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
documents_ids.push(document_id);
|
||||
|
||||
let mut buffer = String::new();
|
||||
for field_id in searchable_fields_to_index.iter() {
|
||||
let Some(field_obkv) = obkv.get(*field_id).map(KvReaderDelAdd::new) else { continue };
|
||||
|
||||
for (deladd, field_bytes) in field_obkv {
|
||||
let mut extracted_positions =
|
||||
token_positions_extractor[deladd as usize].extract(field_bytes, &mut buffer)?;
|
||||
for (position, token) in extracted_positions.iter() {
|
||||
let word = token.lemma().trim();
|
||||
if !word_map.contains_key(word) {
|
||||
word_map.insert(word.to_string(), word_map.len() as u32);
|
||||
}
|
||||
let word_id = word_map.get(word).unwrap();
|
||||
word_docids_extractor.insert(*word_id, *field_id, document_id, deladd);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if word_docids_extractor.rough_size_estimate()
|
||||
> indexer.max_memory.map_or(512 * 1024 * 1024, |s| s.min(512 * 1024 * 1024))
|
||||
{
|
||||
let WordDocidsDump { .. } =
|
||||
word_docids_extractor.dump(&word_map, &searchable_fields_to_index, indexer)?;
|
||||
}
|
||||
}
|
||||
|
||||
todo!()
|
||||
}
|
||||
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &BTreeSet<FieldId>,
|
||||
) -> bool {
|
||||
for field_id in searchable_fields {
|
||||
let Some(field_obkv) = obkv.get(*field_id).map(KvReaderDelAdd::new) else { continue };
|
||||
match (field_obkv.get(DelAdd::Deletion), field_obkv.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
(None, None) => (),
|
||||
// if both contains a value and values are the same, check the next field.
|
||||
(Some(del), Some(add)) if del == add => (),
|
||||
// otherwise the fields are different, return true.
|
||||
_otherwise => return true,
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
@@ -0,0 +1,203 @@
|
||||
use std::collections::hash_map::Entry::{Occupied, Vacant};
|
||||
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
|
||||
use std::fs::File;
|
||||
use std::hash::Hash;
|
||||
use std::io::BufReader;
|
||||
use std::mem::size_of;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::update::del_add::KvWriterDelAdd;
|
||||
use crate::update::index_documents::extract::searchable::DelAdd;
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
|
||||
|
||||
pub struct WordDocidsExtractor<'a> {
|
||||
word_fid_docids: RevertedIndex<(u32, FieldId)>,
|
||||
settings_diff: &'a InnerIndexSettingsDiff,
|
||||
}
|
||||
|
||||
impl<'a> WordDocidsExtractor<'a> {
|
||||
pub fn new(settings_diff: &'a InnerIndexSettingsDiff) -> Self {
|
||||
Self { word_fid_docids: RevertedIndex::new(), settings_diff }
|
||||
}
|
||||
pub fn insert(&mut self, wordid: u32, fieldid: FieldId, docid: DocumentId, del_add: DelAdd) {
|
||||
self.word_fid_docids.insert((wordid, fieldid), docid, del_add);
|
||||
}
|
||||
|
||||
pub fn rough_size_estimate(&self) -> usize {
|
||||
self.word_fid_docids.rough_size_estimate()
|
||||
}
|
||||
|
||||
pub fn dump(
|
||||
&mut self,
|
||||
word_map: &BTreeMap<String, u32>,
|
||||
fields: &BTreeSet<FieldId>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<WordDocidsDump> {
|
||||
let mut word_fid_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut word_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut exact_word_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut exact_word_deletion = RoaringBitmap::new();
|
||||
let mut exact_word_addition = RoaringBitmap::new();
|
||||
let mut word_deletion = RoaringBitmap::new();
|
||||
let mut word_addition = RoaringBitmap::new();
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut bitmap_buffer = Vec::new();
|
||||
let mut obkv_buffer = Vec::new();
|
||||
for (word, wid) in word_map {
|
||||
exact_word_deletion.clear();
|
||||
exact_word_addition.clear();
|
||||
word_deletion.clear();
|
||||
word_addition.clear();
|
||||
for fid in fields {
|
||||
if let Some((deletion, addition)) = self.word_fid_docids.inner.get(&(*wid, *fid)) {
|
||||
if self.settings_diff.old.exact_attributes.contains(&fid) {
|
||||
exact_word_deletion |= deletion;
|
||||
} else {
|
||||
word_deletion |= deletion;
|
||||
}
|
||||
|
||||
if self.settings_diff.new.exact_attributes.contains(&fid) {
|
||||
exact_word_addition |= addition;
|
||||
} else {
|
||||
word_addition |= addition;
|
||||
}
|
||||
|
||||
if deletion != addition {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
let value = bitmaps_into_deladd_obkv(
|
||||
deletion,
|
||||
addition,
|
||||
&mut obkv_buffer,
|
||||
&mut bitmap_buffer,
|
||||
)?;
|
||||
word_fid_docids_writer.insert(&key_buffer, value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word.as_bytes());
|
||||
if exact_word_deletion != exact_word_addition {
|
||||
let value = bitmaps_into_deladd_obkv(
|
||||
&exact_word_deletion,
|
||||
&exact_word_addition,
|
||||
&mut obkv_buffer,
|
||||
&mut bitmap_buffer,
|
||||
)?;
|
||||
exact_word_docids_writer.insert(&key_buffer, value)?;
|
||||
}
|
||||
|
||||
if word_deletion != word_addition {
|
||||
let value = bitmaps_into_deladd_obkv(
|
||||
&word_deletion,
|
||||
&word_addition,
|
||||
&mut obkv_buffer,
|
||||
&mut bitmap_buffer,
|
||||
)?;
|
||||
word_docids_writer.insert(&key_buffer, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
self.word_fid_docids.clear();
|
||||
|
||||
Ok(WordDocidsDump {
|
||||
word_fid_docids: writer_into_reader(word_fid_docids_writer)?,
|
||||
word_docids: writer_into_reader(word_docids_writer)?,
|
||||
exact_word_docids: writer_into_reader(exact_word_docids_writer)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn bitmaps_into_deladd_obkv<'a>(
|
||||
deletion: &RoaringBitmap,
|
||||
addition: &RoaringBitmap,
|
||||
obkv_buffer: &'a mut Vec<u8>,
|
||||
bitmap_buffer: &mut Vec<u8>,
|
||||
) -> Result<&'a mut Vec<u8>> {
|
||||
obkv_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(obkv_buffer);
|
||||
if !deletion.is_empty() {
|
||||
bitmap_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(deletion, bitmap_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, &*bitmap_buffer)?;
|
||||
}
|
||||
if !addition.is_empty() {
|
||||
bitmap_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(addition, bitmap_buffer);
|
||||
value_writer.insert(DelAdd::Addition, &*bitmap_buffer)?;
|
||||
}
|
||||
Ok(value_writer.into_inner()?)
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct RevertedIndex<K> {
|
||||
inner: HashMap<K, (RoaringBitmap, RoaringBitmap)>,
|
||||
max_value_size: usize,
|
||||
}
|
||||
|
||||
impl<K: PartialEq + Eq + Hash> RevertedIndex<K> {
|
||||
pub fn insert(&mut self, key: K, docid: DocumentId, del_add: DelAdd) {
|
||||
let size = match self.inner.entry(key) {
|
||||
Occupied(mut entry) => {
|
||||
let (ref mut del, ref mut add) = entry.get_mut();
|
||||
match del_add {
|
||||
DelAdd::Deletion => del.insert(docid),
|
||||
DelAdd::Addition => add.insert(docid),
|
||||
};
|
||||
del.serialized_size() + add.serialized_size()
|
||||
}
|
||||
Vacant(entry) => {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
bitmap.insert(docid);
|
||||
let size = bitmap.serialized_size();
|
||||
match del_add {
|
||||
DelAdd::Deletion => entry.insert((bitmap, RoaringBitmap::new())),
|
||||
DelAdd::Addition => entry.insert((RoaringBitmap::new(), bitmap)),
|
||||
};
|
||||
size * 2
|
||||
}
|
||||
};
|
||||
|
||||
self.max_value_size = self.max_value_size.max(size);
|
||||
}
|
||||
|
||||
pub fn new() -> Self {
|
||||
Self { inner: HashMap::new(), max_value_size: 0 }
|
||||
}
|
||||
|
||||
pub fn rough_size_estimate(&self) -> usize {
|
||||
self.inner.len() * size_of::<K>() + self.inner.len() * self.max_value_size
|
||||
}
|
||||
|
||||
fn clear(&mut self) {
|
||||
self.max_value_size = 0;
|
||||
self.inner.clear();
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordDocidsDump {
|
||||
pub word_fid_docids: grenad::Reader<BufReader<File>>,
|
||||
pub word_docids: grenad::Reader<BufReader<File>>,
|
||||
pub exact_word_docids: grenad::Reader<BufReader<File>>,
|
||||
}
|
||||
@@ -1162,6 +1162,18 @@ impl InnerIndexSettingsDiff {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn searchable_fields_to_index(&self) -> BTreeSet<FieldId> {
|
||||
if self.settings_update_only {
|
||||
self.new
|
||||
.fields_ids_map
|
||||
.ids()
|
||||
.filter(|id| self.reindex_searchable_id(*id).is_some())
|
||||
.collect()
|
||||
} else {
|
||||
self.new.searchable_fields_ids.iter().copied().collect()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn any_reindexing_needed(&self) -> bool {
|
||||
self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user