mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-10-26 21:46:27 +00:00
Merge branch 'indexer-edition-2024' into indexer-edition-2024-doc-chunks
This commit is contained in:
0
crates/milli/src/update/available_documents_ids.rs
Normal file
0
crates/milli/src/update/available_documents_ids.rs
Normal file
65
crates/milli/src/update/available_ids.rs
Normal file
65
crates/milli/src/update/available_ids.rs
Normal file
@@ -0,0 +1,65 @@
|
||||
use std::iter::{Chain, FromIterator};
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
use roaring::bitmap::{IntoIter, RoaringBitmap};
|
||||
|
||||
pub struct AvailableIds {
|
||||
iter: Chain<IntoIter, RangeInclusive<u32>>,
|
||||
}
|
||||
|
||||
impl AvailableIds {
|
||||
pub fn new(docids: &RoaringBitmap) -> AvailableIds {
|
||||
match docids.max() {
|
||||
Some(last_id) => {
|
||||
let mut available = RoaringBitmap::from_iter(0..last_id);
|
||||
available -= docids;
|
||||
|
||||
let iter = match last_id.checked_add(1) {
|
||||
Some(id) => id..=u32::MAX,
|
||||
#[allow(clippy::reversed_empty_ranges)]
|
||||
None => 1..=0, // empty range iterator
|
||||
};
|
||||
|
||||
AvailableIds { iter: available.into_iter().chain(iter) }
|
||||
}
|
||||
None => {
|
||||
let empty = RoaringBitmap::new().into_iter();
|
||||
AvailableIds { iter: empty.chain(0..=u32::MAX) }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for AvailableIds {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
self.iter.next()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn empty() {
|
||||
let base = RoaringBitmap::new();
|
||||
let left = AvailableIds::new(&base);
|
||||
let right = 0..=u32::MAX;
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn scattered() {
|
||||
let mut base = RoaringBitmap::new();
|
||||
base.insert(0);
|
||||
base.insert(10);
|
||||
base.insert(100);
|
||||
base.insert(405);
|
||||
|
||||
let left = AvailableIds::new(&base);
|
||||
let right = (0..=u32::MAX).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
}
|
||||
149
crates/milli/src/update/clear_documents.rs
Normal file
149
crates/milli/src/update/clear_documents.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
use heed::RwTxn;
|
||||
use roaring::RoaringBitmap;
|
||||
use time::OffsetDateTime;
|
||||
|
||||
use crate::{FieldDistribution, Index, Result};
|
||||
|
||||
pub struct ClearDocuments<'t, 'i> {
|
||||
wtxn: &'t mut RwTxn<'i>,
|
||||
index: &'i Index,
|
||||
}
|
||||
|
||||
impl<'t, 'i> ClearDocuments<'t, 'i> {
|
||||
pub fn new(wtxn: &'t mut RwTxn<'i>, index: &'i Index) -> ClearDocuments<'t, 'i> {
|
||||
ClearDocuments { wtxn, index }
|
||||
}
|
||||
|
||||
#[tracing::instrument(
|
||||
level = "trace",
|
||||
skip(self),
|
||||
target = "indexing::documents",
|
||||
name = "clear_documents"
|
||||
)]
|
||||
pub fn execute(self) -> Result<u64> {
|
||||
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
|
||||
let Index {
|
||||
env: _env,
|
||||
main: _main,
|
||||
external_documents_ids,
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_prefix_docids,
|
||||
exact_word_prefix_docids,
|
||||
word_pair_proximity_docids,
|
||||
word_position_docids,
|
||||
word_fid_docids,
|
||||
field_id_word_count_docids,
|
||||
word_prefix_position_docids,
|
||||
word_prefix_fid_docids,
|
||||
facet_id_f64_docids,
|
||||
facet_id_string_docids,
|
||||
facet_id_normalized_string_strings,
|
||||
facet_id_string_fst,
|
||||
facet_id_exists_docids,
|
||||
facet_id_is_null_docids,
|
||||
facet_id_is_empty_docids,
|
||||
field_id_docid_facet_f64s,
|
||||
field_id_docid_facet_strings,
|
||||
vector_arroy,
|
||||
embedder_category_id: _,
|
||||
documents,
|
||||
} = self.index;
|
||||
|
||||
let empty_roaring = RoaringBitmap::default();
|
||||
|
||||
// We retrieve the number of documents ids that we are deleting.
|
||||
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
|
||||
|
||||
// We clean some of the main engine datastructures.
|
||||
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
|
||||
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
|
||||
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
|
||||
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
|
||||
self.index.delete_geo_rtree(self.wtxn)?;
|
||||
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
|
||||
|
||||
// Remove all user-provided bits from the configs
|
||||
let mut configs = self.index.embedding_configs(self.wtxn)?;
|
||||
for config in configs.iter_mut() {
|
||||
config.user_provided.clear();
|
||||
}
|
||||
self.index.put_embedding_configs(self.wtxn, configs)?;
|
||||
|
||||
// Clear the other databases.
|
||||
external_documents_ids.clear(self.wtxn)?;
|
||||
word_docids.clear(self.wtxn)?;
|
||||
exact_word_docids.clear(self.wtxn)?;
|
||||
word_prefix_docids.clear(self.wtxn)?;
|
||||
exact_word_prefix_docids.clear(self.wtxn)?;
|
||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||
word_position_docids.clear(self.wtxn)?;
|
||||
word_fid_docids.clear(self.wtxn)?;
|
||||
field_id_word_count_docids.clear(self.wtxn)?;
|
||||
word_prefix_position_docids.clear(self.wtxn)?;
|
||||
word_prefix_fid_docids.clear(self.wtxn)?;
|
||||
facet_id_f64_docids.clear(self.wtxn)?;
|
||||
facet_id_normalized_string_strings.clear(self.wtxn)?;
|
||||
facet_id_string_fst.clear(self.wtxn)?;
|
||||
facet_id_exists_docids.clear(self.wtxn)?;
|
||||
facet_id_is_null_docids.clear(self.wtxn)?;
|
||||
facet_id_is_empty_docids.clear(self.wtxn)?;
|
||||
facet_id_string_docids.clear(self.wtxn)?;
|
||||
field_id_docid_facet_f64s.clear(self.wtxn)?;
|
||||
field_id_docid_facet_strings.clear(self.wtxn)?;
|
||||
// vector
|
||||
vector_arroy.clear(self.wtxn)?;
|
||||
|
||||
documents.clear(self.wtxn)?;
|
||||
|
||||
Ok(number_of_documents)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::index::tests::TempIndex;
|
||||
|
||||
#[test]
|
||||
fn clear_documents() {
|
||||
let index = TempIndex::new();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
index
|
||||
.add_documents_using_wtxn(&mut wtxn, documents!([
|
||||
{ "id": 0, "name": "kevin", "age": 20 },
|
||||
{ "id": 1, "name": "kevina" },
|
||||
{ "id": 2, "name": "benoit", "country": "France", "_geo": { "lng": 42, "lat": 35 } }
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
// Clear all documents from the database.
|
||||
let builder = ClearDocuments::new(&mut wtxn, &index);
|
||||
assert_eq!(builder.execute().unwrap(), 3);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
// the value is 7 because there is `[id, name, age, country, _geo, _geo.lng, _geo.lat]`
|
||||
assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 7);
|
||||
|
||||
assert!(index.words_fst(&rtxn).unwrap().is_empty());
|
||||
assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
|
||||
assert!(index.external_documents_ids().is_empty(&rtxn).unwrap());
|
||||
assert!(index.documents_ids(&rtxn).unwrap().is_empty());
|
||||
assert!(index.field_distribution(&rtxn).unwrap().is_empty());
|
||||
assert!(index.geo_rtree(&rtxn).unwrap().is_none());
|
||||
assert!(index.geo_faceted_documents_ids(&rtxn).unwrap().is_empty());
|
||||
|
||||
assert!(index.word_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
|
||||
assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
|
||||
assert!(index.field_id_docid_facet_strings.is_empty(&rtxn).unwrap());
|
||||
assert!(index.documents.is_empty(&rtxn).unwrap());
|
||||
}
|
||||
}
|
||||
59
crates/milli/src/update/concurrent_available_ids.rs
Normal file
59
crates/milli/src/update/concurrent_available_ids.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
/// A concurrent ID generate that will never return the same ID twice.
|
||||
#[derive(Debug)]
|
||||
pub struct ConcurrentAvailableIds {
|
||||
/// The current tree node ID we should use if there is no other IDs available.
|
||||
current: AtomicU32,
|
||||
/// The total number of tree node IDs used.
|
||||
used: AtomicU64,
|
||||
|
||||
/// A list of IDs to exhaust before picking IDs from `current`.
|
||||
available: RoaringBitmap,
|
||||
/// The current Nth ID to select in the bitmap.
|
||||
select_in_bitmap: AtomicU32,
|
||||
/// Tells if you should look in the roaring bitmap or if all the IDs are already exhausted.
|
||||
look_into_bitmap: AtomicBool,
|
||||
}
|
||||
|
||||
impl ConcurrentAvailableIds {
|
||||
/// Creates an ID generator returning unique IDs, avoiding the specified used IDs.
|
||||
pub fn new(used: RoaringBitmap) -> ConcurrentAvailableIds {
|
||||
let last_id = used.max().map_or(0, |id| id + 1);
|
||||
let used_ids = used.len();
|
||||
let available = RoaringBitmap::from_sorted_iter(0..last_id).unwrap() - used;
|
||||
|
||||
ConcurrentAvailableIds {
|
||||
current: AtomicU32::new(last_id),
|
||||
used: AtomicU64::new(used_ids),
|
||||
select_in_bitmap: AtomicU32::new(0),
|
||||
look_into_bitmap: AtomicBool::new(!available.is_empty()),
|
||||
available,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a new unique ID and increase the count of IDs used.
|
||||
pub fn next(&self) -> Option<u32> {
|
||||
if self.used.fetch_add(1, Ordering::Relaxed) > u32::MAX as u64 {
|
||||
None
|
||||
} else if self.look_into_bitmap.load(Ordering::Relaxed) {
|
||||
let current = self.select_in_bitmap.fetch_add(1, Ordering::Relaxed);
|
||||
match self.available.select(current) {
|
||||
Some(id) => Some(id),
|
||||
None => {
|
||||
self.look_into_bitmap.store(false, Ordering::Relaxed);
|
||||
Some(self.current.fetch_add(1, Ordering::Relaxed))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Some(self.current.fetch_add(1, Ordering::Relaxed))
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of used ids in total.
|
||||
pub fn used(&self) -> u64 {
|
||||
self.used.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
140
crates/milli/src/update/del_add.rs
Normal file
140
crates/milli/src/update/del_add.rs
Normal file
@@ -0,0 +1,140 @@
|
||||
use obkv::Key;
|
||||
|
||||
pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
|
||||
pub type KvReaderDelAdd = obkv::KvReader<DelAdd>;
|
||||
|
||||
/// DelAdd defines the new value to add in the database and old value to delete from the database.
|
||||
///
|
||||
/// Its used in an OBKV to be serialized in grenad files.
|
||||
#[repr(u8)]
|
||||
#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)]
|
||||
pub enum DelAdd {
|
||||
Deletion = 0,
|
||||
Addition = 1,
|
||||
}
|
||||
|
||||
impl Key for DelAdd {
|
||||
const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
|
||||
type BYTES = [u8; Self::BYTES_SIZE];
|
||||
|
||||
fn to_be_bytes(&self) -> Self::BYTES {
|
||||
u8::to_be_bytes(*self as u8)
|
||||
}
|
||||
|
||||
fn from_be_bytes(array: Self::BYTES) -> Self {
|
||||
match u8::from_be_bytes(array) {
|
||||
0 => Self::Deletion,
|
||||
1 => Self::Addition,
|
||||
otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
|
||||
///
|
||||
/// Deletion: put all the values under DelAdd::Deletion
|
||||
/// Addition: put all the values under DelAdd::Addition,
|
||||
/// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition,
|
||||
pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
|
||||
reader: &obkv::KvReader<K>,
|
||||
operation: DelAddOperation,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
into_del_add_obkv_conditional_operation(reader, buffer, |_| operation)
|
||||
}
|
||||
|
||||
/// Akin to the [into_del_add_obkv] function but lets you
|
||||
/// conditionally define the `DelAdd` variant based on the obkv key.
|
||||
pub fn into_del_add_obkv_conditional_operation<K, F>(
|
||||
reader: &obkv::KvReader<K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
operation: F,
|
||||
) -> std::io::Result<()>
|
||||
where
|
||||
K: obkv::Key + PartialOrd,
|
||||
F: Fn(K) -> DelAddOperation,
|
||||
{
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
for (key, value) in reader.iter() {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let operation = operation(key);
|
||||
if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) {
|
||||
value_writer.insert(DelAdd::Deletion, value)?;
|
||||
}
|
||||
if matches!(operation, DelAddOperation::Addition | DelAddOperation::DeletionAndAddition) {
|
||||
value_writer.insert(DelAdd::Addition, value)?;
|
||||
}
|
||||
value_writer.finish()?;
|
||||
writer.insert(key, &value_buffer)?;
|
||||
}
|
||||
|
||||
writer.finish()
|
||||
}
|
||||
|
||||
/// Enum controlling the side of the DelAdd obkv in which the provided value will be written.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum DelAddOperation {
|
||||
Deletion,
|
||||
Addition,
|
||||
DeletionAndAddition,
|
||||
}
|
||||
|
||||
/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
|
||||
///
|
||||
/// putting each deletion obkv's keys under an DelAdd::Deletion
|
||||
/// and putting each addition obkv's keys under an DelAdd::Addition
|
||||
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
|
||||
deletion: &obkv::KvReader<K>,
|
||||
addition: &obkv::KvReader<K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
value_buffer.clear();
|
||||
match eob {
|
||||
Left((k, v)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, v).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
Right((k, v)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Addition, v).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
Both((k, deletion), (_, addition)) => {
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish()
|
||||
}
|
||||
|
||||
pub fn is_noop_del_add_obkv(del_add: &KvReaderDelAdd) -> bool {
|
||||
del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
|
||||
}
|
||||
|
||||
/// A function that extracts and returns the Add side of a DelAdd obkv.
|
||||
/// This is useful when there are no previous value in the database and
|
||||
/// therefore we don't need to do a diff with what's already there.
|
||||
///
|
||||
/// If there is no Add side we currently write an empty buffer
|
||||
/// which is a valid CboRoaringBitmap.
|
||||
#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
|
||||
pub fn deladd_serialize_add_side<'a>(
|
||||
obkv: &'a [u8],
|
||||
_buffer: &mut Vec<u8>,
|
||||
) -> crate::Result<&'a [u8]> {
|
||||
Ok(KvReaderDelAdd::from_slice(obkv).get(DelAdd::Addition).unwrap_or_default())
|
||||
}
|
||||
533
crates/milli/src/update/facet/bulk.rs
Normal file
533
crates/milli/src/update/facet/bulk.rs
Normal file
@@ -0,0 +1,533 @@
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::{CompressionType, Merger};
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
|
||||
|
||||
/// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases
|
||||
/// by rebuilding the database "from scratch".
|
||||
///
|
||||
/// First, the new elements are inserted into the level 0 of the database. Then, the
|
||||
/// higher levels are cleared and recomputed from the content of level 0.
|
||||
pub struct FacetsUpdateBulk<'i> {
|
||||
index: &'i Index,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
facet_type: FacetType,
|
||||
field_ids: Vec<FieldId>,
|
||||
// None if level 0 does not need to be updated
|
||||
delta_data: Option<Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateBulk<'i> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
FacetsUpdateBulk {
|
||||
index,
|
||||
field_ids,
|
||||
group_size,
|
||||
min_level_size,
|
||||
facet_type,
|
||||
delta_data: Some(delta_data),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn new_not_updating_level_0(
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
FacetsUpdateBulk {
|
||||
index,
|
||||
field_ids,
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
delta_data: None,
|
||||
}
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::facets::bulk")]
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn<'_>) -> Result<()> {
|
||||
let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
|
||||
|
||||
let db = match facet_type {
|
||||
FacetType::String => {
|
||||
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
FacetType::Number => {
|
||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
};
|
||||
|
||||
let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
|
||||
|
||||
inner.update(wtxn, &field_ids)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
||||
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
||||
pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
pub delta_data: Option<Merger<R, MergeDeladdCboRoaringBitmaps>>,
|
||||
pub group_size: u8,
|
||||
pub min_level_size: u8,
|
||||
}
|
||||
impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
pub fn update(mut self, wtxn: &mut RwTxn<'_>, field_ids: &[u16]) -> Result<()> {
|
||||
self.update_level0(wtxn)?;
|
||||
for &field_id in field_ids.iter() {
|
||||
self.clear_levels(wtxn, field_id)?;
|
||||
}
|
||||
|
||||
for &field_id in field_ids.iter() {
|
||||
let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
|
||||
|
||||
for level_reader in level_readers {
|
||||
let mut cursor = level_reader.into_cursor()?;
|
||||
while let Some((k, v)) = cursor.move_on_next()? {
|
||||
self.db.remap_types::<Bytes, Bytes>().put(wtxn, k, v)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn clear_levels(&self, wtxn: &mut heed::RwTxn<'_>, field_id: FieldId) -> Result<()> {
|
||||
let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] };
|
||||
let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] };
|
||||
let range = left..=right;
|
||||
self.db.delete_range(wtxn, &range).map(drop)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn update_level0(&mut self, wtxn: &mut RwTxn<'_>) -> Result<()> {
|
||||
let delta_data = match self.delta_data.take() {
|
||||
Some(x) => x,
|
||||
None => return Ok(()),
|
||||
};
|
||||
if self.db.is_empty(wtxn)? {
|
||||
let mut buffer = Vec::new();
|
||||
let mut database = self.db.iter_mut(wtxn)?.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
let value = KvReaderDelAdd::from_slice(value);
|
||||
|
||||
// DB is empty, it is safe to ignore Del operations
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.clear();
|
||||
// the group size for level 0
|
||||
buffer.push(1);
|
||||
// then we extend the buffer with the docids bitmap
|
||||
buffer.extend_from_slice(value);
|
||||
unsafe {
|
||||
database.put_current_with_options::<Bytes>(PutFlags::APPEND, key, &buffer)?
|
||||
};
|
||||
}
|
||||
} else {
|
||||
let mut buffer = Vec::new();
|
||||
let database = self.db.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut iter = delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = KvReaderDelAdd::from_slice(value);
|
||||
|
||||
// the value is a CboRoaringBitmap, but I still need to prepend the
|
||||
// group size for level 0 (= 1) to it
|
||||
buffer.clear();
|
||||
buffer.push(1);
|
||||
// then we extend the buffer with the docids bitmap
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => {
|
||||
// prev_value is the group size for level 0, followed by the previous bitmap.
|
||||
let old_bitmap = &prev_value[1..];
|
||||
CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
|
||||
}
|
||||
None => {
|
||||
// it is safe to ignore the del in that case.
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
// won't put the key in DB as the value would be empty
|
||||
continue;
|
||||
};
|
||||
|
||||
buffer.extend_from_slice(value);
|
||||
}
|
||||
};
|
||||
let new_bitmap = &buffer[1..];
|
||||
// if the new bitmap is empty, let's remove it
|
||||
if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
|
||||
database.delete(wtxn, key)?;
|
||||
} else {
|
||||
database.put(wtxn, key, &buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn compute_levels_for_field_id(
|
||||
&self,
|
||||
field_id: FieldId,
|
||||
txn: &RoTxn<'_>,
|
||||
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||
let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
|
||||
|
||||
Ok(subwriters)
|
||||
}
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn read_level_0<'t>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn<'t>,
|
||||
field_id: u16,
|
||||
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
// we read the elements one by one and
|
||||
// 1. keep track of the left bound
|
||||
// 2. fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read
|
||||
let mut bitmaps = vec![];
|
||||
|
||||
let mut level_0_prefix = vec![];
|
||||
level_0_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
level_0_prefix.push(0);
|
||||
|
||||
let level_0_iter = self
|
||||
.db
|
||||
.remap_types::<Bytes, Bytes>()
|
||||
.prefix_iter(rtxn, level_0_prefix.as_slice())?
|
||||
.remap_types::<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>();
|
||||
|
||||
let mut left_bound: &[u8] = &[];
|
||||
let mut first_iteration_for_new_group = true;
|
||||
for el in level_0_iter {
|
||||
let (key, value) = el?;
|
||||
let bound = key.left_bound;
|
||||
let docids = value.bitmap;
|
||||
|
||||
if first_iteration_for_new_group {
|
||||
left_bound = bound;
|
||||
first_iteration_for_new_group = false;
|
||||
}
|
||||
bitmaps.push(docids);
|
||||
|
||||
if bitmaps.len() == self.group_size as usize {
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
first_iteration_for_new_group = true;
|
||||
bitmaps.clear();
|
||||
}
|
||||
}
|
||||
// don't forget to give the leftover bitmaps as well
|
||||
if !bitmaps.is_empty() {
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
bitmaps.clear();
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compute the content of the database levels from its level 0 for the given field id.
|
||||
///
|
||||
/// ## Returns:
|
||||
/// A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1`
|
||||
/// that must be inserted into the database.
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn compute_higher_levels<'t>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn<'t>,
|
||||
field_id: u16,
|
||||
level: u8,
|
||||
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
||||
) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
|
||||
if level == 0 {
|
||||
self.read_level_0(rtxn, field_id, handle_group)?;
|
||||
// Level 0 is already in the database
|
||||
return Ok(vec![]);
|
||||
}
|
||||
// level >= 1
|
||||
// we compute each element of this level based on the elements of the level below it
|
||||
// once we have computed `level_group_size` elements, we give the left bound
|
||||
// of those elements, and their bitmaps, to the level above
|
||||
|
||||
let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
|
||||
let mut cur_writer_len: usize = 0;
|
||||
|
||||
let mut group_sizes = vec![];
|
||||
let mut left_bounds = vec![];
|
||||
let mut bitmaps = vec![];
|
||||
|
||||
// compute the levels below
|
||||
// in the callback, we fill `cur_writer` with the correct elements for this level
|
||||
let mut sub_writers = self.compute_higher_levels(
|
||||
rtxn,
|
||||
field_id,
|
||||
level - 1,
|
||||
&mut |sub_bitmaps, left_bound| {
|
||||
let mut combined_bitmap = RoaringBitmap::default();
|
||||
for bitmap in sub_bitmaps {
|
||||
combined_bitmap |= bitmap;
|
||||
}
|
||||
// The conversion of sub_bitmaps.len() to a u8 will always be correct
|
||||
// since its length is bounded by max_group_size, which is a u8.
|
||||
group_sizes.push(sub_bitmaps.len() as u8);
|
||||
left_bounds.push(left_bound);
|
||||
|
||||
bitmaps.push(combined_bitmap);
|
||||
if bitmaps.len() != self.group_size as usize {
|
||||
return Ok(());
|
||||
}
|
||||
let left_bound = left_bounds.first().unwrap();
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
|
||||
for ((bitmap, left_bound), group_size) in
|
||||
bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..))
|
||||
{
|
||||
let key = FacetGroupKey { field_id, level, left_bound };
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key)
|
||||
.map_err(Error::Encoding)?;
|
||||
let value = FacetGroupValue { size: group_size, bitmap };
|
||||
let value =
|
||||
FacetGroupValueCodec::bytes_encode(&value).map_err(Error::Encoding)?;
|
||||
cur_writer.insert(key, value)?;
|
||||
cur_writer_len += 1;
|
||||
}
|
||||
Ok(())
|
||||
},
|
||||
)?;
|
||||
// don't forget to insert the leftover elements into the writer as well
|
||||
|
||||
// but only do so if the current number of elements to be inserted into this
|
||||
// levelcould grow to the minimum level size
|
||||
|
||||
if !bitmaps.is_empty() && (cur_writer_len >= self.min_level_size as usize - 1) {
|
||||
// the length of bitmaps is between 0 and group_size
|
||||
assert!(bitmaps.len() < self.group_size as usize);
|
||||
assert!(cur_writer_len > 0);
|
||||
|
||||
let left_bound = left_bounds.first().unwrap();
|
||||
handle_group(&bitmaps, left_bound)?;
|
||||
|
||||
// Note: how many bitmaps are there here?
|
||||
for ((bitmap, left_bound), group_size) in
|
||||
bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..))
|
||||
{
|
||||
let key = FacetGroupKey { field_id, level, left_bound };
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key)
|
||||
.map_err(Error::Encoding)?;
|
||||
let value = FacetGroupValue { size: group_size, bitmap };
|
||||
let value = FacetGroupValueCodec::bytes_encode(&value).map_err(Error::Encoding)?;
|
||||
cur_writer.insert(key, value)?;
|
||||
cur_writer_len += 1;
|
||||
}
|
||||
}
|
||||
// if we inserted enough elements to reach the minimum level size, then we push the writer
|
||||
if cur_writer_len >= self.min_level_size as usize {
|
||||
sub_writers.push(writer_into_reader(cur_writer)?);
|
||||
} else {
|
||||
// otherwise, if there are still leftover elements, we give them to the level above
|
||||
// this is necessary in order to get the union of all docids
|
||||
if !bitmaps.is_empty() {
|
||||
handle_group(&bitmaps, left_bounds.first().unwrap())?;
|
||||
}
|
||||
}
|
||||
Ok(sub_writers)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::iter::once;
|
||||
|
||||
use big_s::S;
|
||||
use maplit::hashset;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::documents::documents_batch_reader_from_objects;
|
||||
use crate::heed_codec::facet::OrderedF64Codec;
|
||||
use crate::heed_codec::StrRefCodec;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::update::facet::test_helpers::{ordered_string, FacetIndex};
|
||||
use crate::{db_snap, milli_snap};
|
||||
|
||||
#[test]
|
||||
fn insert() {
|
||||
let test = |name: &str, group_size: u8, min_level_size: u8| {
|
||||
let index =
|
||||
FacetIndex::<OrderedF64Codec>::new(group_size, 0 /*NA*/, min_level_size);
|
||||
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..1_000u32 {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, i as f64), once(i).collect()));
|
||||
}
|
||||
for i in 0..100u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, i as f64), once(i).collect()));
|
||||
}
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
milli_snap!(format!("{index}"), name);
|
||||
};
|
||||
|
||||
test("default", 4, 5);
|
||||
test("small_group_small_min_level", 2, 2);
|
||||
test("small_group_large_min_level", 2, 128);
|
||||
test("large_group_small_min_level", 16, 2);
|
||||
test("odd_group_odd_min_level", 7, 3);
|
||||
}
|
||||
#[test]
|
||||
fn insert_delete_field_insert() {
|
||||
let test = |name: &str, group_size: u8, min_level_size: u8| {
|
||||
let index =
|
||||
FacetIndex::<OrderedF64Codec>::new(group_size, 0 /*NA*/, min_level_size);
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..100u32 {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, i as f64), once(i).collect()));
|
||||
}
|
||||
for i in 0..100u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, i as f64), once(i).collect()));
|
||||
}
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
// delete all the elements for the facet id 0
|
||||
for i in 0..100u32 {
|
||||
index.delete_single_docid(&mut wtxn, 0, &(i as f64), i);
|
||||
}
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
// then add some elements again for the facet id 1
|
||||
for i in 0..110u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, i as f64), once(i).collect()));
|
||||
}
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
milli_snap!(format!("{index}"), name);
|
||||
};
|
||||
|
||||
test("default", 4, 5);
|
||||
test("small_group_small_min_level", 2, 2);
|
||||
test("small_group_large_min_level", 2, 128);
|
||||
test("large_group_small_min_level", 16, 2);
|
||||
test("odd_group_odd_min_level", 7, 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bug_3165() {
|
||||
// Indexing a number of facet values that falls within certains ranges (e.g. 22_540 qualifies)
|
||||
// would lead to a facet DB which was missing some levels.
|
||||
// That was because before writing a level into the database, we would
|
||||
// check that its size was higher than the minimum level size using
|
||||
// a lossy integer conversion: `level_size as u8 >= min_level_size`.
|
||||
//
|
||||
// This missing level in the facet DBs would make the incremental indexer
|
||||
// (and other search algorithms) crash.
|
||||
//
|
||||
// https://github.com/meilisearch/meilisearch/issues/3165
|
||||
let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_primary_key("id".to_owned());
|
||||
settings.set_filterable_fields(hashset! { S("id") });
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut documents = vec![];
|
||||
for i in 0..=22_540 {
|
||||
documents.push(
|
||||
serde_json::json! {
|
||||
{
|
||||
"id": i as u64,
|
||||
}
|
||||
}
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
}
|
||||
|
||||
let documents = documents_batch_reader_from_objects(documents);
|
||||
index.add_documents(documents).unwrap();
|
||||
|
||||
db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn insert_string() {
|
||||
let test = |name: &str, group_size: u8, min_level_size: u8| {
|
||||
let index = FacetIndex::<StrRefCodec>::new(group_size, 0 /*NA*/, min_level_size);
|
||||
|
||||
let strings = (0..1_000).map(|i| ordered_string(i as usize)).collect::<Vec<_>>();
|
||||
let mut elements = Vec::<((u16, &str), RoaringBitmap)>::new();
|
||||
for i in 0..1_000u32 {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, &strings[i as usize]), once(i).collect()));
|
||||
}
|
||||
for i in 0..100u32 {
|
||||
// field id = 1, left_bound = i, docids = [i]
|
||||
elements.push(((1, &strings[i as usize]), once(i).collect()));
|
||||
}
|
||||
let mut wtxn = index.env.write_txn().unwrap();
|
||||
index.bulk_insert(&mut wtxn, &[0, 1], elements.iter());
|
||||
|
||||
index.verify_structure_validity(&wtxn, 0);
|
||||
index.verify_structure_validity(&wtxn, 1);
|
||||
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
milli_snap!(format!("{index}"), name);
|
||||
};
|
||||
|
||||
test("default", 4, 5);
|
||||
test("small_group_small_min_level", 2, 2);
|
||||
test("small_group_large_min_level", 2, 128);
|
||||
test("large_group_small_min_level", 16, 2);
|
||||
test("odd_group_odd_min_level", 7, 3);
|
||||
}
|
||||
}
|
||||
1266
crates/milli/src/update/facet/incremental.rs
Normal file
1266
crates/milli/src/update/facet/incremental.rs
Normal file
File diff suppressed because it is too large
Load Diff
640
crates/milli/src/update/facet/mod.rs
Normal file
640
crates/milli/src/update/facet/mod.rs
Normal file
@@ -0,0 +1,640 @@
|
||||
/*!
|
||||
This module implements two different algorithms for updating the `facet_id_string_docids`
|
||||
and `facet_id_f64_docids` databases. The first algorithm is a "bulk" algorithm, meaning that
|
||||
it recreates the database from scratch when new elements are added to it. The second algorithm
|
||||
is incremental: it modifies the database as little as possible.
|
||||
|
||||
The databases must be able to return results for queries such as:
|
||||
1. Filter : find all the document ids that have a facet value greater than X and/or smaller than Y
|
||||
2. Min/Max : find the minimum/maximum facet value among these document ids
|
||||
3. Sort : sort these document ids by increasing/decreasing facet values
|
||||
4. Distribution : given some document ids, make a list of each facet value
|
||||
found in these documents along with the number of documents that contain it
|
||||
|
||||
The algorithms that implement these queries are found in the `src/search/facet` folder.
|
||||
|
||||
To make these queries fast to compute, the database adopts a tree structure:
|
||||
```text
|
||||
┌───────────────────────────────┬───────────────────────────────┬───────────────┐
|
||||
┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │
|
||||
│Level 2│ │ │ │ │
|
||||
└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │
|
||||
├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤
|
||||
┌───────┐ │ "ab" (2) │ "ba" (2) │ "gaf" (2) │ "form" (2) │ "woz" (2) │
|
||||
│Level 1│ │ │ │ │ │ │
|
||||
└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │
|
||||
├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤
|
||||
┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │
|
||||
│Level 0│ │ │ │ │ │ │ │ │ │ │ │
|
||||
└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │
|
||||
└───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘
|
||||
```
|
||||
In the diagram above, each cell corresponds to a node in the tree. The first line of the cell
|
||||
contains the left bound of the range of facet values as well as the number of children of the node.
|
||||
The second line contains the document ids which have a facet value within the range of the node.
|
||||
The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range.
|
||||
|
||||
In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because
|
||||
`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`.
|
||||
These documents all contain a facet value that is contained within `ab .. gaf`.
|
||||
|
||||
In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
|
||||
[`FacetGroupValue`], which have the following format:
|
||||
|
||||
```text
|
||||
FacetGroupKey:
|
||||
- field id : u16
|
||||
- level : u8
|
||||
- left bound: [u8] // the facet value encoded using either OrderedF64Codec or Str
|
||||
|
||||
FacetGroupValue:
|
||||
- #children : u8
|
||||
- docids : RoaringBitmap
|
||||
```
|
||||
|
||||
When the database is first created using the "bulk" method, each node has a fixed number of children
|
||||
(except for possibly the last one) given by the `group_size` parameter (default to `FACET_GROUP_SIZE`).
|
||||
The tree is also built such that the highest level has more than `min_level_size`
|
||||
(default to `FACET_MIN_LEVEL_SIZE`) elements in it.
|
||||
|
||||
When the database is incrementally updated, the number of children of a node can vary between
|
||||
1 and `max_group_size`. This is done so that most incremental operations do not need to change
|
||||
the structure of the tree. When the number of children of a node reaches `max_group_size`,
|
||||
we split the node in two and update the number of children of its parent.
|
||||
|
||||
When adding documents to the databases, it is important to determine which method to use to
|
||||
minimise indexing time. The incremental method is faster when adding few new facet values, but the
|
||||
bulk method is faster when a large part of the database is modified. Empirically, it seems that
|
||||
it takes 50x more time to incrementally add N facet values to an existing database than it is to
|
||||
construct a database of N facet values. This is the heuristic that is used to choose between the
|
||||
two methods.
|
||||
|
||||
Related PR: https://github.com/meilisearch/milli/pull/619
|
||||
*/
|
||||
|
||||
pub const FACET_MAX_GROUP_SIZE: u8 = 8;
|
||||
pub const FACET_GROUP_SIZE: u8 = 4;
|
||||
pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
||||
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
|
||||
use grenad::Merger;
|
||||
use heed::types::{Bytes, DecodeIgnore};
|
||||
use time::OffsetDateTime;
|
||||
use tracing::debug;
|
||||
|
||||
use self::incremental::FacetsUpdateIncremental;
|
||||
use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::{try_split_array_at, FieldId, Index, Result};
|
||||
|
||||
pub mod bulk;
|
||||
pub mod incremental;
|
||||
|
||||
/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
|
||||
///
|
||||
/// Depending on the number of new elements and the existing size of the database, we use either
|
||||
/// a bulk update method or an incremental update method.
|
||||
pub struct FacetsUpdate<'i> {
|
||||
index: &'i Index,
|
||||
database: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeDeladdBtreesetString>>,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
data_size: u64,
|
||||
}
|
||||
impl<'i> FacetsUpdate<'i> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeDeladdBtreesetString>>,
|
||||
data_size: u64,
|
||||
) -> Self {
|
||||
let database = match facet_type {
|
||||
FacetType::String => {
|
||||
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
FacetType::Number => {
|
||||
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>()
|
||||
}
|
||||
};
|
||||
Self {
|
||||
index,
|
||||
database,
|
||||
group_size: FACET_GROUP_SIZE,
|
||||
max_group_size: FACET_MAX_GROUP_SIZE,
|
||||
min_level_size: FACET_MIN_LEVEL_SIZE,
|
||||
facet_type,
|
||||
delta_data,
|
||||
normalized_delta_data,
|
||||
data_size,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn execute(self, wtxn: &mut heed::RwTxn<'_>) -> Result<()> {
|
||||
if self.data_size == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
debug!("Computing and writing the facet values levels docids into LMDB on disk...");
|
||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||
|
||||
// See self::comparison_bench::benchmark_facet_indexing
|
||||
if self.data_size >= (self.database.len(wtxn)? / 500) {
|
||||
let field_ids =
|
||||
self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
|
||||
let bulk_update = FacetsUpdateBulk::new(
|
||||
self.index,
|
||||
field_ids,
|
||||
self.facet_type,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
);
|
||||
bulk_update.execute(wtxn)?;
|
||||
} else {
|
||||
let incremental_update = FacetsUpdateIncremental::new(
|
||||
self.index,
|
||||
self.facet_type,
|
||||
self.delta_data,
|
||||
self.group_size,
|
||||
self.min_level_size,
|
||||
self.max_group_size,
|
||||
);
|
||||
incremental_update.execute(wtxn)?;
|
||||
}
|
||||
|
||||
match self.normalized_delta_data {
|
||||
Some(data) => index_facet_search(wtxn, data, self.index),
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn index_facet_search(
|
||||
wtxn: &mut heed::RwTxn<'_>,
|
||||
normalized_delta_data: Merger<BufReader<File>, MergeDeladdBtreesetString>,
|
||||
index: &Index,
|
||||
) -> Result<()> {
|
||||
let mut iter = normalized_delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key_bytes, delta_bytes)) = iter.next()? {
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(delta_bytes);
|
||||
|
||||
let database_set = index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.get(wtxn, key_bytes)?
|
||||
.unwrap_or_default();
|
||||
|
||||
let add_set = deladd_reader
|
||||
.get(DelAdd::Addition)
|
||||
.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
|
||||
.unwrap_or_default();
|
||||
|
||||
let del_set = match deladd_reader
|
||||
.get(DelAdd::Deletion)
|
||||
.and_then(|bytes| serde_json::from_slice::<BTreeSet<String>>(bytes).ok())
|
||||
{
|
||||
Some(del_set) => {
|
||||
let (field_id_bytes, _) = try_split_array_at(key_bytes).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
let mut set = BTreeSet::new();
|
||||
for facet in del_set {
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: facet.as_str() };
|
||||
// Check if the referenced value doesn't exist anymore before deleting it.
|
||||
if index
|
||||
.facet_id_string_docids
|
||||
.remap_data_type::<DecodeIgnore>()
|
||||
.get(wtxn, &key)?
|
||||
.is_none()
|
||||
{
|
||||
set.insert(facet);
|
||||
}
|
||||
}
|
||||
set
|
||||
}
|
||||
None => BTreeSet::new(),
|
||||
};
|
||||
|
||||
let set: BTreeSet<_> =
|
||||
database_set.difference(&del_set).chain(add_set.iter()).cloned().collect();
|
||||
|
||||
if set.is_empty() {
|
||||
index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.delete(wtxn, key_bytes)?;
|
||||
} else {
|
||||
index
|
||||
.facet_id_normalized_string_strings
|
||||
.remap_key_type::<Bytes>()
|
||||
.put(wtxn, key_bytes, &set)?;
|
||||
}
|
||||
}
|
||||
|
||||
// We clear the FST of normalized-for-search to compute everything from scratch.
|
||||
index.facet_id_string_fst.clear(wtxn)?;
|
||||
// We compute one FST by string facet
|
||||
let mut text_fsts = vec![];
|
||||
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
|
||||
let database = index.facet_id_normalized_string_strings.remap_data_type::<DecodeIgnore>();
|
||||
for result in database.iter(wtxn)? {
|
||||
let ((field_id, normalized_facet), _) = result?;
|
||||
current_fst = match current_fst.take() {
|
||||
Some((fid, fst_builder)) if fid != field_id => {
|
||||
let fst = fst_builder.into_set();
|
||||
text_fsts.push((fid, fst));
|
||||
Some((field_id, fst::SetBuilder::memory()))
|
||||
}
|
||||
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
|
||||
None => Some((field_id, fst::SetBuilder::memory())),
|
||||
};
|
||||
|
||||
if let Some((_, fst_builder)) = current_fst.as_mut() {
|
||||
fst_builder.insert(normalized_facet)?;
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((field_id, fst_builder)) = current_fst {
|
||||
let fst = fst_builder.into_set();
|
||||
text_fsts.push((field_id, fst));
|
||||
}
|
||||
|
||||
// We write those FSTs in LMDB now
|
||||
for (field_id, fst) in text_fsts {
|
||||
index.facet_id_string_fst.put(wtxn, &field_id, &fst)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test_helpers {
|
||||
use std::cell::Cell;
|
||||
use std::fmt::Display;
|
||||
use std::iter::FromIterator;
|
||||
use std::marker::PhantomData;
|
||||
use std::rc::Rc;
|
||||
|
||||
use grenad::MergerBuilder;
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::bulk::FacetsUpdateBulkInner;
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::update::FacetsUpdateIncrementalInner;
|
||||
use crate::CboRoaringBitmapCodec;
|
||||
|
||||
/// Utility function to generate a string whose position in a lexicographically
|
||||
/// ordered list is `i`.
|
||||
pub fn ordered_string(mut i: usize) -> String {
|
||||
// The first string is empty
|
||||
if i == 0 {
|
||||
return String::new();
|
||||
}
|
||||
// The others are 5 char long, each between 'a' and 'z'
|
||||
let mut s = String::new();
|
||||
for _ in 0..5 {
|
||||
let (digit, next) = (i % 26, i / 26);
|
||||
s.insert(0, char::from_u32('a' as u32 + digit as u32).unwrap());
|
||||
i = next;
|
||||
}
|
||||
s
|
||||
}
|
||||
|
||||
/// A dummy index that only contains the facet database, used for testing
|
||||
pub struct FacetIndex<BoundCodec>
|
||||
where
|
||||
for<'a> BoundCodec:
|
||||
BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
|
||||
{
|
||||
pub env: Env,
|
||||
pub content: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
pub group_size: Cell<u8>,
|
||||
pub min_level_size: Cell<u8>,
|
||||
pub max_group_size: Cell<u8>,
|
||||
_tempdir: Rc<tempfile::TempDir>,
|
||||
_phantom: PhantomData<BoundCodec>,
|
||||
}
|
||||
|
||||
impl<BoundCodec> FacetIndex<BoundCodec>
|
||||
where
|
||||
for<'a> BoundCodec:
|
||||
BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
|
||||
{
|
||||
#[cfg(all(test, fuzzing))]
|
||||
pub fn open_from_tempdir(
|
||||
tempdir: Rc<tempfile::TempDir>,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetIndex<BoundCodec> {
|
||||
let group_size = std::cmp::min(16, std::cmp::max(group_size, 2)); // 2 <= x <= 16
|
||||
let max_group_size = std::cmp::min(16, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 16
|
||||
let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17
|
||||
|
||||
let mut options = heed::EnvOpenOptions::new();
|
||||
let options = options.map_size(4096 * 4 * 10 * 1000);
|
||||
unsafe {
|
||||
options.flag(heed::flags::Flags::MdbAlwaysFreePages);
|
||||
}
|
||||
let env = options.open(tempdir.path()).unwrap();
|
||||
let content = env.open_database(None).unwrap().unwrap();
|
||||
|
||||
FacetIndex {
|
||||
content,
|
||||
group_size: Cell::new(group_size),
|
||||
max_group_size: Cell::new(max_group_size),
|
||||
min_level_size: Cell::new(min_level_size),
|
||||
_tempdir: tempdir,
|
||||
env,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
pub fn new(
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetIndex<BoundCodec> {
|
||||
let group_size = group_size.clamp(2, 127);
|
||||
let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127
|
||||
let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf
|
||||
let mut options = heed::EnvOpenOptions::new();
|
||||
let options = options.map_size(4096 * 4 * 1000 * 100);
|
||||
let tempdir = tempfile::TempDir::new().unwrap();
|
||||
let env = unsafe { options.open(tempdir.path()) }.unwrap();
|
||||
let mut wtxn = env.write_txn().unwrap();
|
||||
let content = env.create_database(&mut wtxn, None).unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
FacetIndex {
|
||||
content,
|
||||
group_size: Cell::new(group_size),
|
||||
max_group_size: Cell::new(max_group_size),
|
||||
min_level_size: Cell::new(min_level_size),
|
||||
_tempdir: Rc::new(tempdir),
|
||||
env,
|
||||
_phantom: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, fuzzing))]
|
||||
pub fn set_group_size(&self, group_size: u8) {
|
||||
// 2 <= x <= 64
|
||||
self.group_size.set(std::cmp::min(64, std::cmp::max(group_size, 2)));
|
||||
}
|
||||
#[cfg(all(test, fuzzing))]
|
||||
pub fn set_max_group_size(&self, max_group_size: u8) {
|
||||
// 2*group_size <= x <= 128
|
||||
let max_group_size = std::cmp::max(4, std::cmp::min(128, max_group_size));
|
||||
self.max_group_size.set(max_group_size);
|
||||
if self.group_size.get() < max_group_size / 2 {
|
||||
self.group_size.set(max_group_size / 2);
|
||||
}
|
||||
}
|
||||
#[cfg(all(test, fuzzing))]
|
||||
pub fn set_min_level_size(&self, min_level_size: u8) {
|
||||
// 1 <= x <= inf
|
||||
self.min_level_size.set(std::cmp::max(1, min_level_size));
|
||||
}
|
||||
|
||||
pub fn insert<'a>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_id: u16,
|
||||
key: &'a <BoundCodec as BytesEncode<'a>>::EItem,
|
||||
docids: &RoaringBitmap,
|
||||
) {
|
||||
let update = FacetsUpdateIncrementalInner {
|
||||
db: self.content,
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
max_group_size: self.max_group_size.get(),
|
||||
};
|
||||
let key_bytes = BoundCodec::bytes_encode(key).unwrap();
|
||||
update.modify(wtxn, field_id, &key_bytes, Some(docids), None).unwrap();
|
||||
update.add_or_delete_level(wtxn, field_id).unwrap();
|
||||
}
|
||||
pub fn delete_single_docid<'a>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_id: u16,
|
||||
key: &'a <BoundCodec as BytesEncode<'a>>::EItem,
|
||||
docid: u32,
|
||||
) {
|
||||
self.delete(wtxn, field_id, key, &RoaringBitmap::from_iter(std::iter::once(docid)))
|
||||
}
|
||||
|
||||
pub fn delete<'a>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_id: u16,
|
||||
key: &'a <BoundCodec as BytesEncode<'a>>::EItem,
|
||||
docids: &RoaringBitmap,
|
||||
) {
|
||||
let update = FacetsUpdateIncrementalInner {
|
||||
db: self.content,
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
max_group_size: self.max_group_size.get(),
|
||||
};
|
||||
let key_bytes = BoundCodec::bytes_encode(key).unwrap();
|
||||
update.modify(wtxn, field_id, &key_bytes, None, Some(docids)).unwrap();
|
||||
update.add_or_delete_level(wtxn, field_id).unwrap();
|
||||
}
|
||||
|
||||
pub fn bulk_insert<'a, 'b>(
|
||||
&self,
|
||||
wtxn: &'a mut RwTxn<'_>,
|
||||
field_ids: &[u16],
|
||||
els: impl IntoIterator<
|
||||
Item = &'a ((u16, <BoundCodec as BytesEncode<'a>>::EItem), RoaringBitmap),
|
||||
>,
|
||||
) where
|
||||
for<'c> <BoundCodec as BytesEncode<'c>>::EItem: Sized,
|
||||
{
|
||||
let mut new_data = vec![];
|
||||
let mut writer = grenad::Writer::new(&mut new_data);
|
||||
for ((field_id, left_bound), docids) in els {
|
||||
let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned();
|
||||
let key: FacetGroupKey<&[u8]> =
|
||||
FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_encode(&key).unwrap();
|
||||
let mut inner_writer = KvWriterDelAdd::memory();
|
||||
let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
|
||||
inner_writer.insert(DelAdd::Addition, value).unwrap();
|
||||
writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
|
||||
}
|
||||
writer.finish().unwrap();
|
||||
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
builder.push(reader.into_cursor().unwrap());
|
||||
let merger = builder.build();
|
||||
|
||||
let update = FacetsUpdateBulkInner {
|
||||
db: self.content,
|
||||
delta_data: Some(merger),
|
||||
group_size: self.group_size.get(),
|
||||
min_level_size: self.min_level_size.get(),
|
||||
};
|
||||
|
||||
update.update(wtxn, field_ids).unwrap();
|
||||
}
|
||||
|
||||
pub fn verify_structure_validity(&self, txn: &RoTxn<'_>, field_id: u16) {
|
||||
let mut field_id_prefix = vec![];
|
||||
field_id_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
|
||||
let highest_level = get_highest_level(txn, self.content, field_id).unwrap();
|
||||
|
||||
for level_no in (1..=highest_level).rev() {
|
||||
let mut level_no_prefix = vec![];
|
||||
level_no_prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||
level_no_prefix.push(level_no);
|
||||
|
||||
let iter = self
|
||||
.content
|
||||
.remap_types::<Bytes, FacetGroupValueCodec>()
|
||||
.prefix_iter(txn, &level_no_prefix)
|
||||
.unwrap();
|
||||
for el in iter {
|
||||
let (key, value) = el.unwrap();
|
||||
let key = FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key).unwrap();
|
||||
|
||||
let mut prefix_start_below = vec![];
|
||||
prefix_start_below.extend_from_slice(&field_id.to_be_bytes());
|
||||
prefix_start_below.push(level_no - 1);
|
||||
prefix_start_below.extend_from_slice(key.left_bound);
|
||||
|
||||
let start_below = {
|
||||
let mut start_below_iter = self
|
||||
.content
|
||||
.remap_types::<Bytes, FacetGroupValueCodec>()
|
||||
.prefix_iter(txn, &prefix_start_below)
|
||||
.unwrap();
|
||||
let (key_bytes, _) = start_below_iter.next().unwrap().unwrap();
|
||||
FacetGroupKeyCodec::<BytesRefCodec>::bytes_decode(key_bytes).unwrap()
|
||||
};
|
||||
|
||||
assert!(value.size > 0);
|
||||
|
||||
let mut actual_size = 0;
|
||||
let mut values_below = RoaringBitmap::new();
|
||||
let iter_below = self
|
||||
.content
|
||||
.range(txn, &(start_below..))
|
||||
.unwrap()
|
||||
.take(value.size as usize);
|
||||
for el in iter_below {
|
||||
let (_, value) = el.unwrap();
|
||||
actual_size += 1;
|
||||
values_below |= value.bitmap;
|
||||
}
|
||||
assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}");
|
||||
|
||||
assert_eq!(value.bitmap, values_below);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<BoundCodec> Display for FacetIndex<BoundCodec>
|
||||
where
|
||||
for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Sized + Display,
|
||||
for<'a> BoundCodec:
|
||||
BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
|
||||
{
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let txn = self.env.read_txn().unwrap();
|
||||
let iter = self.content.iter(&txn).unwrap();
|
||||
for el in iter {
|
||||
let (key, value) = el.unwrap();
|
||||
let FacetGroupKey { field_id, level, left_bound: bound } = key;
|
||||
let bound = BoundCodec::bytes_decode(bound).unwrap();
|
||||
let FacetGroupValue { size, bitmap } = value;
|
||||
writeln!(
|
||||
f,
|
||||
"{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}",
|
||||
values = display_bitmap(&bitmap)
|
||||
)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(unused)]
|
||||
#[cfg(test)]
|
||||
mod comparison_bench {
|
||||
use std::iter::once;
|
||||
|
||||
use rand::Rng;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::test_helpers::FacetIndex;
|
||||
use crate::heed_codec::facet::OrderedF64Codec;
|
||||
|
||||
// This is a simple test to get an intuition on the relative speed
|
||||
// of the incremental vs. bulk indexer.
|
||||
//
|
||||
// The benchmark shows the worst-case scenario for the incremental indexer, since
|
||||
// each facet value contains only one document ID.
|
||||
//
|
||||
// In that scenario, it appears that the incremental indexer is about 50 times slower than the
|
||||
// bulk indexer.
|
||||
// #[test]
|
||||
fn benchmark_facet_indexing() {
|
||||
let mut facet_value = 0;
|
||||
|
||||
let mut r = rand::thread_rng();
|
||||
|
||||
for i in 1..=20 {
|
||||
let size = 50_000 * i;
|
||||
let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
|
||||
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
|
||||
for i in 0..size {
|
||||
// field id = 0, left_bound = i, docids = [i]
|
||||
elements.push(((0, facet_value as f64), once(i).collect()));
|
||||
facet_value += 1;
|
||||
}
|
||||
let timer = std::time::Instant::now();
|
||||
index.bulk_insert(&mut txn, &[0], elements.iter());
|
||||
let time_spent = timer.elapsed().as_millis();
|
||||
println!("bulk {size} : {time_spent}ms");
|
||||
|
||||
txn.commit().unwrap();
|
||||
|
||||
for nbr_doc in [1, 100, 1000, 10_000] {
|
||||
let mut txn = index.env.write_txn().unwrap();
|
||||
let timer = std::time::Instant::now();
|
||||
//
|
||||
// insert one document
|
||||
//
|
||||
for _ in 0..nbr_doc {
|
||||
index.insert(&mut txn, 0, &r.gen(), &once(1).collect());
|
||||
}
|
||||
let time_spent = timer.elapsed().as_millis();
|
||||
println!(" add {nbr_doc} : {time_spent}ms");
|
||||
txn.abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
b40dd31a65e033ffc6b35c027ce19506
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
7ee22d8e9387e72758f00918eb67e4c6
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
60f567359382507afdaf45fb075740c3
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
b986d6e6cbf425685f409a8b417010e1
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
ee10dd2ae2b5c6621a89a5d0a9aa8ccc
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
fa877559eef78b383b496c15a364a2dc
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
16a96353bc42f2ff3e91611ca4d5b184
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
be1b08073b9d9788d18080c1320151d7
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
16a96353bc42f2ff3e91611ca4d5b184
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
32a45d555df2e001420fea149818d376
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
353d70f52eea66e5031dca989ea8a037
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
52a093c909133d84023a4a7b83864808
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
9d86c72ddb241d0aeca2995d61a3648a
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
c0943177594534bfe5527cbf40fe388e
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/bulk.rs
|
||||
---
|
||||
6ed86f234028ae3df5881bee5512f11e
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
5dbfa134cc44abeb3ab6242fc182e48e
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
6ed7bf5d440599b3b10b37549a271fdf
|
||||
@@ -0,0 +1,19 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k0 1 "[0, ]"
|
||||
0 0 k1 1 "[1, ]"
|
||||
0 0 k2 1 "[2, ]"
|
||||
0 0 k3 1 "[3, ]"
|
||||
0 0 k4 1 "[4, ]"
|
||||
0 0 k5 1 "[5, ]"
|
||||
0 0 k6 1 "[6, ]"
|
||||
0 0 k7 1 "[7, ]"
|
||||
0 0 k8 1 "[8, ]"
|
||||
0 0 k9 1 "[9, ]"
|
||||
0 0 k10 1 "[10, ]"
|
||||
0 0 k11 1 "[11, ]"
|
||||
0 0 k12 1 "[12, ]"
|
||||
0 0 k13 1 "[13, ]"
|
||||
0 0 k14 1 "[14, ]"
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b5203f0df0036ebaa133dd77d63a00eb
|
||||
@@ -0,0 +1,26 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k0 1 "[0, ]"
|
||||
0 0 k1 1 "[1, ]"
|
||||
0 0 k2 1 "[2, ]"
|
||||
0 0 k3 1 "[3, ]"
|
||||
0 0 k4 1 "[4, ]"
|
||||
0 0 k5 1 "[5, ]"
|
||||
0 0 k6 1 "[6, ]"
|
||||
0 0 k7 1 "[7, ]"
|
||||
0 0 k8 1 "[8, ]"
|
||||
0 0 k9 1 "[9, ]"
|
||||
0 0 k10 1 "[10, ]"
|
||||
0 0 k11 1 "[11, ]"
|
||||
0 0 k12 1 "[12, ]"
|
||||
0 0 k13 1 "[13, ]"
|
||||
0 0 k14 1 "[14, ]"
|
||||
0 0 k15 1 "[15, ]"
|
||||
0 0 k16 1 "[16, ]"
|
||||
0 1 k0 4 "[0, 1, 2, 3, ]"
|
||||
0 1 k4 4 "[4, 5, 6, 7, ]"
|
||||
0 1 k8 4 "[8, 9, 10, 11, ]"
|
||||
0 1 k12 4 "[12, 13, 14, 15, ]"
|
||||
0 1 k16 1 "[16, ]"
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
95497d8579740868ee0bfc655b0bf782
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
d565c2f7bbd9e13e12de40cfbbfba6bb
|
||||
@@ -0,0 +1,54 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k216 1 "[216, ]"
|
||||
0 0 k217 1 "[217, ]"
|
||||
0 0 k218 1 "[218, ]"
|
||||
0 0 k219 1 "[219, ]"
|
||||
0 0 k220 1 "[220, ]"
|
||||
0 0 k221 1 "[221, ]"
|
||||
0 0 k222 1 "[222, ]"
|
||||
0 0 k223 1 "[223, ]"
|
||||
0 0 k224 1 "[224, ]"
|
||||
0 0 k225 1 "[225, ]"
|
||||
0 0 k226 1 "[226, ]"
|
||||
0 0 k227 1 "[227, ]"
|
||||
0 0 k228 1 "[228, ]"
|
||||
0 0 k229 1 "[229, ]"
|
||||
0 0 k230 1 "[230, ]"
|
||||
0 0 k231 1 "[231, ]"
|
||||
0 0 k232 1 "[232, ]"
|
||||
0 0 k233 1 "[233, ]"
|
||||
0 0 k234 1 "[234, ]"
|
||||
0 0 k235 1 "[235, ]"
|
||||
0 0 k236 1 "[236, ]"
|
||||
0 0 k237 1 "[237, ]"
|
||||
0 0 k238 1 "[238, ]"
|
||||
0 0 k239 1 "[239, ]"
|
||||
0 0 k240 1 "[240, ]"
|
||||
0 0 k241 1 "[241, ]"
|
||||
0 0 k242 1 "[242, ]"
|
||||
0 0 k243 1 "[243, ]"
|
||||
0 0 k244 1 "[244, ]"
|
||||
0 0 k245 1 "[245, ]"
|
||||
0 0 k246 1 "[246, ]"
|
||||
0 0 k247 1 "[247, ]"
|
||||
0 0 k248 1 "[248, ]"
|
||||
0 0 k249 1 "[249, ]"
|
||||
0 0 k250 1 "[250, ]"
|
||||
0 0 k251 1 "[251, ]"
|
||||
0 0 k252 1 "[252, ]"
|
||||
0 0 k253 1 "[253, ]"
|
||||
0 0 k254 1 "[254, ]"
|
||||
0 0 k255 1 "[255, ]"
|
||||
0 1 k216 4 "[216, 217, 218, 219, ]"
|
||||
0 1 k220 4 "[220, 221, 222, 223, ]"
|
||||
0 1 k224 4 "[224, 225, 226, 227, ]"
|
||||
0 1 k228 4 "[228, 229, 230, 231, ]"
|
||||
0 1 k232 4 "[232, 233, 234, 235, ]"
|
||||
0 1 k236 4 "[236, 237, 238, 239, ]"
|
||||
0 1 k240 4 "[240, 241, 242, 243, ]"
|
||||
0 1 k244 4 "[244, 245, 246, 247, ]"
|
||||
0 1 k248 4 "[248, 249, 250, 251, ]"
|
||||
0 1 k252 4 "[252, 253, 254, 255, ]"
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
7cb503827ba17e9670296cc9531a1380
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b061f43e379e16f0617c05d3313d0078
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
81fc9489d6b163935b97433477dea63b
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b17b2c4ec87a778aae07854c96c08b48
|
||||
@@ -0,0 +1,20 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
0 0 k0 1 "[3, 435, 583, 849, ]"
|
||||
0 0 k1 1 "[35, 494, 693, 796, ]"
|
||||
0 0 k2 1 "[76, 420, 526, 909, ]"
|
||||
0 0 k3 1 "[133, 451, 653, 806, ]"
|
||||
0 0 k4 1 "[131, 464, 656, 853, ]"
|
||||
0 0 k5 1 "[61, 308, 701, 903, ]"
|
||||
0 0 k6 1 "[144, 449, 674, 794, ]"
|
||||
0 0 k7 1 "[182, 451, 735, 941, ]"
|
||||
0 0 k8 1 "[6, 359, 679, 1003, ]"
|
||||
0 0 k9 1 "[197, 418, 659, 904, ]"
|
||||
0 0 k10 1 "[88, 297, 567, 800, ]"
|
||||
0 0 k11 1 "[150, 309, 530, 946, ]"
|
||||
0 0 k12 1 "[156, 466, 567, 892, ]"
|
||||
0 0 k13 1 "[46, 425, 610, 807, ]"
|
||||
0 0 k14 1 "[236, 433, 549, 891, ]"
|
||||
0 0 k15 1 "[207, 472, 603, 974, ]"
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
7f8aa18d2b3a6422d55c03bede0563db
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
7f8aa18d2b3a6422d55c03bede0563db
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
b3e2de9020d9e0f3941bc3a179c795ba
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
5dbfa134cc44abeb3ab6242fc182e48e
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
9343355bf535ed4a0c956df2b229d5e6
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
4fc800f49201a336295af0542fdf01ab
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/facet/incremental.rs
|
||||
---
|
||||
fd65ce7d96a07aafb0ef6cfb5bf016b8
|
||||
263
crates/milli/src/update/index_documents/enrich.rs
Normal file
263
crates/milli/src/update/index_documents/enrich.rs
Normal file
@@ -0,0 +1,263 @@
|
||||
use std::fmt;
|
||||
use std::io::{BufWriter, Read, Seek};
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::documents::{
|
||||
DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader,
|
||||
EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY,
|
||||
};
|
||||
use crate::error::{GeoError, InternalError, UserError};
|
||||
use crate::update::index_documents::{obkv_to_object, writer_into_reader};
|
||||
use crate::{FieldId, Index, Result};
|
||||
|
||||
/// This function validates and enriches the documents by checking that:
/// - we can infer a primary key,
/// - all the document ids exist and are extracted,
/// - they are valid, and
/// - the `_geo` field is valid depending on the settings.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - if reader.is_empty(), this function may panic in some cases
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents")]
|
||||
pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
rtxn: &heed::RoTxn<'_>,
|
||||
index: &Index,
|
||||
autogenerate_docids: bool,
|
||||
reader: DocumentsBatchReader<R>,
|
||||
) -> Result<StdResult<EnrichedDocumentsBatchReader<R>, UserError>> {
|
||||
let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||
|
||||
let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
|
||||
let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
|
||||
|
||||
// The primary key *field id* that has already been set for this index or the one
// we will guess by searching for the keys whose name ends with "id".
|
||||
let primary_key = match index.primary_key(rtxn)? {
|
||||
Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) {
|
||||
Some(primary_key) => primary_key,
|
||||
None if autogenerate_docids => PrimaryKey::Flat {
|
||||
name: primary_key,
|
||||
field_id: documents_batch_index.insert(primary_key),
|
||||
},
|
||||
None => {
|
||||
return match cursor.next_document()? {
|
||||
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
|
||||
primary_key: primary_key.to_string(),
|
||||
document: obkv_to_object(first_document, &documents_batch_index)?,
|
||||
})),
|
||||
None => unreachable!("Called with reader.is_empty()"),
|
||||
};
|
||||
}
|
||||
},
|
||||
None => {
|
||||
let mut guesses: Vec<(u16, &str)> = documents_batch_index
|
||||
.iter()
|
||||
.filter(|(_, name)| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
|
||||
.map(|(field_id, name)| (*field_id, name.as_str()))
|
||||
.collect();
|
||||
|
||||
// sort the keys in a deterministic, obvious way, so that fields are always in the same order.
|
||||
guesses.sort_by(|(_, left_name), (_, right_name)| {
|
||||
// shortest name first
|
||||
left_name.len().cmp(&right_name.len()).then_with(
|
||||
// then alphabetical order
|
||||
|| left_name.cmp(right_name),
|
||||
)
|
||||
});
|
||||
|
||||
match guesses.as_slice() {
|
||||
[] if autogenerate_docids => PrimaryKey::Flat {
|
||||
name: DEFAULT_PRIMARY_KEY,
|
||||
field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
|
||||
},
|
||||
[] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
||||
[(field_id, name)] => {
|
||||
tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
|
||||
PrimaryKey::Flat { name, field_id: *field_id }
|
||||
}
|
||||
multiple => {
|
||||
return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
||||
candidates: multiple
|
||||
.iter()
|
||||
.map(|(_, candidate)| candidate.to_string())
|
||||
.collect(),
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// If the settings specify that a _geo field must be used, we must check its
// validity in all the documents of this batch, and this is when we return `Some`.
|
||||
let geo_field_id = match documents_batch_index.id("_geo") {
|
||||
Some(geo_field_id)
|
||||
if index.sortable_fields(rtxn)?.contains("_geo")
|
||||
|| index.filterable_fields(rtxn)?.contains("_geo") =>
|
||||
{
|
||||
Some(geo_field_id)
|
||||
}
|
||||
_otherwise => None,
|
||||
};
|
||||
|
||||
let mut count = 0;
|
||||
while let Some(document) = cursor.next_document()? {
|
||||
let document_id = match fetch_or_generate_document_id(
|
||||
document,
|
||||
&documents_batch_index,
|
||||
primary_key,
|
||||
autogenerate_docids,
|
||||
&mut uuid_buffer,
|
||||
count,
|
||||
)? {
|
||||
Ok(document_id) => document_id,
|
||||
Err(user_error) => return Ok(Err(user_error)),
|
||||
};
|
||||
|
||||
if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
|
||||
if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? {
|
||||
return Ok(Err(UserError::from(user_error)));
|
||||
}
|
||||
}
|
||||
|
||||
let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?;
|
||||
external_ids.insert(count.to_be_bytes(), document_id)?;
|
||||
|
||||
count += 1;
|
||||
}
|
||||
|
||||
let external_ids = writer_into_reader(external_ids)?;
|
||||
let primary_key_name = primary_key.name().to_string();
|
||||
let reader = EnrichedDocumentsBatchReader::new(
|
||||
DocumentsBatchReader::new(cursor, documents_batch_index),
|
||||
primary_key_name,
|
||||
external_ids,
|
||||
)?;
|
||||
|
||||
Ok(Ok(reader))
|
||||
}
|
||||
|
||||
/// Retrieve the document id after validating it, returning a `UserError`
|
||||
/// if the id is invalid or can't be guessed.
|
||||
#[tracing::instrument(level = "trace", skip(uuid_buffer, documents_batch_index, document)
|
||||
target = "indexing::documents")]
|
||||
fn fetch_or_generate_document_id(
|
||||
document: &obkv::KvReader<FieldId>,
|
||||
documents_batch_index: &DocumentsBatchIndex,
|
||||
primary_key: PrimaryKey<'_>,
|
||||
autogenerate_docids: bool,
|
||||
uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
|
||||
count: u32,
|
||||
) -> Result<StdResult<DocumentId, UserError>> {
|
||||
Ok(match primary_key.document_id(document, documents_batch_index)? {
|
||||
Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }),
|
||||
Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
|
||||
Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => {
|
||||
let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
|
||||
Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count })
|
||||
}
|
||||
Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId {
|
||||
primary_key: primary_key.name().to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
}),
|
||||
Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
|
||||
Err(UserError::TooManyDocumentIds {
|
||||
primary_key: primary_key.name().to_string(),
|
||||
document: obkv_to_object(document, documents_batch_index)?,
|
||||
})
|
||||
}
|
||||
})
|
||||
}
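// Illustrative sketch of the auto-generation path above: generated ids are lowercase
// hyphenated v4 UUIDs encoded into a reusable stack buffer, avoiding one heap
// allocation per document.
#[test]
fn generated_id_sketch() {
    let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];
    let uuid: &str = uuid::Uuid::new_v4().as_hyphenated().encode_lower(&mut uuid_buffer);
    assert_eq!(uuid.len(), uuid::fmt::Hyphenated::LENGTH);
}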
|
||||
|
||||
/// A type that represents a document id that has been retrieved from a document or auto-generated.
|
||||
///
|
||||
/// In case the document id has been auto-generated, the document nth is kept to help
|
||||
/// users debug if there is an issue with the document itself.
|
||||
#[derive(Serialize, Deserialize, Clone)]
|
||||
pub enum DocumentId {
|
||||
Retrieved { value: String },
|
||||
Generated { value: String, document_nth: u32 },
|
||||
}
|
||||
|
||||
impl DocumentId {
|
||||
fn debug(&self) -> String {
|
||||
format!("{:?}", self)
|
||||
}
|
||||
|
||||
pub fn is_generated(&self) -> bool {
|
||||
matches!(self, DocumentId::Generated { .. })
|
||||
}
|
||||
|
||||
pub fn value(&self) -> &str {
|
||||
match self {
|
||||
DocumentId::Retrieved { value } => value,
|
||||
DocumentId::Generated { value, .. } => value,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for DocumentId {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
match self {
|
||||
DocumentId::Retrieved { value } => write!(f, "{:?}", value),
|
||||
DocumentId::Generated { value, document_nth } => {
|
||||
write!(f, "{{{:?}}} of the {}nth document", value, document_nth)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
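// Illustrative sketch of the Debug output of both variants, which is what ends up in
// the error messages reported to users.
#[test]
fn document_id_debug_sketch() {
    let retrieved = DocumentId::Retrieved { value: "movie-12".to_string() };
    assert_eq!(format!("{retrieved:?}"), "\"movie-12\"");

    let generated = DocumentId::Generated { value: "deadbeef".to_string(), document_nth: 3 };
    assert_eq!(format!("{generated:?}"), "{\"deadbeef\"} of the 3nth document");
}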
|
||||
|
||||
/// Try to extract an `f64` from a JSON `Value` and return the `Value`
|
||||
/// in the `Err` variant if it failed.
|
||||
pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
|
||||
let number = match value {
|
||||
Value::Number(ref n) => match n.as_f64() {
|
||||
Some(number) => number,
|
||||
None => return Err(value),
|
||||
},
|
||||
Value::String(ref s) => match s.parse::<f64>() {
|
||||
Ok(number) => number,
|
||||
Err(_) => return Err(value),
|
||||
},
|
||||
value => return Err(value),
|
||||
};
|
||||
|
||||
if number.is_finite() {
|
||||
Ok(number)
|
||||
} else {
|
||||
Err(value)
|
||||
}
|
||||
}
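// Illustrative sketch: numbers and numeric strings yield a finite f64, while non-finite
// values, non-numeric strings and other JSON types are handed back in the Err variant.
#[test]
fn finite_float_extraction_sketch() {
    use serde_json::json;

    assert_eq!(extract_finite_float_from_value(json!(3.5)), Ok(3.5));
    assert_eq!(extract_finite_float_from_value(json!("3.5")), Ok(3.5));
    assert!(extract_finite_float_from_value(json!("north")).is_err());
    assert!(extract_finite_float_from_value(json!(null)).is_err());
}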
|
||||
|
||||
pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
|
||||
use GeoError::*;
|
||||
let debug_id = || {
|
||||
serde_json::from_slice(id.value().as_bytes()).unwrap_or_else(|_| Value::from(id.debug()))
|
||||
};
|
||||
match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) {
|
||||
(Some(lat), Some(lng)) => {
|
||||
match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) {
|
||||
(Ok(_), Ok(_)) if !object.is_empty() => Ok(Err(UnexpectedExtraFields {
|
||||
document_id: debug_id(),
|
||||
value: object.into(),
|
||||
})),
|
||||
(Ok(_), Ok(_)) => Ok(Ok(())),
|
||||
(Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })),
|
||||
(Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })),
|
||||
(Err(lat), Err(lng)) => {
|
||||
Ok(Err(BadLatitudeAndLongitude { document_id: debug_id(), lat, lng }))
|
||||
}
|
||||
}
|
||||
}
|
||||
(None, Some(_)) => Ok(Err(MissingLatitude { document_id: debug_id() })),
|
||||
(Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })),
|
||||
(None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })),
|
||||
},
|
||||
Value::Null => Ok(Ok(())),
|
||||
value => Ok(Err(NotAnObject { document_id: debug_id(), value })),
|
||||
}
|
||||
}
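// Illustrative sketch of how a `_geo` payload is classified; the document id and the
// JSON samples are made up for the example.
#[test]
fn geo_validation_sketch() {
    let id = DocumentId::Retrieved { value: "42".to_string() };

    // a well-formed object containing only lat/lng passes validation.
    let ok = br#"{ "lat": 12.3, "lng": 4.56 }"#;
    assert!(matches!(validate_geo_from_json(&id, ok), Ok(Ok(()))));

    // a latitude that cannot be parsed as a finite float is reported as a user error.
    let bad = br#"{ "lat": "north", "lng": 4.56 }"#;
    assert!(matches!(validate_geo_from_json(&id, bad), Ok(Err(_))));
}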
|
||||
@@ -0,0 +1,319 @@
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{io, mem, str};
|
||||
|
||||
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
use obkv::{KvReader, KvWriterU16};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepLatestObkv};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||
|
||||
/// Extracts the words and the positions where they appear,
/// prefixed by the document id.
///
/// Returns a grenad reader with the list of extracted words and their positions
/// from the given chunk of documents.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let force_reindexing = settings_diff.reindex_searchable();
|
||||
|
||||
// initialize destination values.
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
let mut docid_word_positions_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
KeepLatestObkv,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
// initialize buffers.
|
||||
let mut del_buffers = Buffers::default();
|
||||
let mut add_buffers = Buffers::default();
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
// initialize tokenizer.
|
||||
let old_stop_words = settings_diff.old.stop_words.as_ref();
|
||||
let old_separators: Option<Vec<_>> = settings_diff
|
||||
.old
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let old_dictionary: Option<Vec<_>> =
|
||||
settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let del_builder =
|
||||
tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref());
|
||||
let del_tokenizer = del_builder.into_tokenizer();
|
||||
|
||||
let new_stop_words = settings_diff.new.stop_words.as_ref();
|
||||
let new_separators: Option<Vec<_>> = settings_diff
|
||||
.new
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let new_dictionary: Option<Vec<_>> =
|
||||
settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let add_builder =
|
||||
tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref());
|
||||
let add_tokenizer = add_builder.into_tokenizer();
|
||||
|
||||
// iterate over documents.
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let document_id = key
|
||||
.try_into()
|
||||
.map(u32::from_be_bytes)
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let obkv = KvReader::<FieldId>::from_slice(value);
|
||||
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !force_reindexing && !searchable_fields_changed(obkv, settings_diff) {
|
||||
continue;
|
||||
}
|
||||
|
||||
documents_ids.push(document_id);
|
||||
|
||||
// Update key buffer prefix.
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||
|
||||
// Tokenize deletions and additions in 2 different threads.
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
// deletions
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
&settings_diff.old,
|
||||
&del_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Deletion,
|
||||
&mut del_buffers,
|
||||
)
|
||||
},
|
||||
|| {
|
||||
// additions
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
&settings_diff.new,
|
||||
&add_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Addition,
|
||||
&mut add_buffers,
|
||||
)
|
||||
},
|
||||
);
|
||||
|
||||
let del_obkv = del?;
|
||||
let add_obkv = add?;
|
||||
|
||||
// merge deletions and additions.
|
||||
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||
value_buffer.clear();
|
||||
del_add_from_two_obkvs(
|
||||
KvReader::<FieldId>::from_slice(del_obkv),
|
||||
KvReader::<FieldId>::from_slice(add_obkv),
|
||||
&mut value_buffer,
|
||||
)?;
|
||||
|
||||
// write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
|
||||
let obkv = KvReader::<FieldId>::from_slice(&value_buffer);
|
||||
for (field_id, value) in obkv.iter() {
|
||||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
// the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
|
||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||
}
|
||||
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> bool {
|
||||
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.contains(&field_id) {
|
||||
let del_add = KvReaderDelAdd::from_slice(field_bytes);
|
||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
(None, None) => (),
|
||||
// if both contain a value and the values are the same, check the next field.
|
||||
(Some(del), Some(add)) if del == add => (),
|
||||
// otherwise the fields are different, return true.
|
||||
_otherwise => return true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
) -> TokenizerBuilder<'a, Vec<u8>> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
tokenizer_builder.words_dict(dictionary);
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
|
||||
tokenizer_builder
|
||||
}
|
||||
|
||||
/// Extract words mapped with their positions of a document.
|
||||
fn tokens_from_document<'a>(
|
||||
obkv: &'a KvReader<FieldId>,
|
||||
settings: &InnerIndexSettings,
|
||||
tokenizer: &Tokenizer<'_>,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
buffers: &'a mut Buffers,
|
||||
) -> Result<&'a [u8]> {
|
||||
buffers.obkv_buffer.clear();
|
||||
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
// if field is searchable.
|
||||
if settings.searchable_fields_ids.contains(&field_id) {
|
||||
// extract deletion or addition only.
|
||||
if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) {
|
||||
// parse json.
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
// prepare writing destination.
|
||||
buffers.obkv_positions_buffer.clear();
|
||||
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
|
||||
|
||||
// convert the json into a single string.
|
||||
buffers.field_buffer.clear();
|
||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||
// create an iterator of tokens with their positions.
|
||||
let locales = settings.localized_searchable_fields_ids.locales(field_id);
|
||||
let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales))
|
||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||
|
||||
for (index, token) in tokens {
|
||||
// keep a word only if it is not empty and fits in an LMDB key.
|
||||
let token = token.lemma().trim();
|
||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||
let position: u16 = index
|
||||
.try_into()
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
writer.insert(position, token.as_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
// write positions into document.
|
||||
let positions = writer.into_inner()?;
|
||||
document_writer.insert(field_id, positions)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// returns a KV<FieldId, KV<u16, String>>
|
||||
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
|
||||
}
|
||||
|
||||
/// Transform a JSON value into a string that can be indexed.
|
||||
fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> {
|
||||
fn inner(value: &Value, output: &mut String) -> bool {
|
||||
use std::fmt::Write;
|
||||
match value {
|
||||
Value::Null | Value::Object(_) => false,
|
||||
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
|
||||
Value::Number(number) => write!(output, "{}", number).is_ok(),
|
||||
Value::String(string) => write!(output, "{}", string).is_ok(),
|
||||
Value::Array(array) => {
|
||||
let mut count = 0;
|
||||
for value in array {
|
||||
if inner(value, output) {
|
||||
output.push_str(". ");
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Value::String(string) = value {
|
||||
Some(string)
|
||||
} else if inner(value, buffer) {
|
||||
Some(buffer)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
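// Illustrative sketch: an array is flattened into a single dot-separated string, while
// nulls and nested objects produce no indexable text.
#[test]
fn json_to_string_sketch() {
    use serde_json::json;

    let mut buffer = String::new();
    let value = json!([1, "hello", true, null]);
    assert_eq!(json_to_string(&value, &mut buffer), Some("1. hello. true. "));
}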
|
||||
|
||||
/// Takes an iterator over tokens and computes their relative positions depending on separator kinds:
/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
/// else we keep the standard proximity of 1 between words.
|
||||
fn process_tokens<'a>(
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator())
|
||||
.scan((0, None), |(offset, prev_kind), mut token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => token.kind = TokenKind::Unknown,
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
struct Buffers {
|
||||
// the field buffer for each field's deserialization; it must be cleared between each field.
|
||||
field_buffer: String,
|
||||
// buffer used to store the value data containing an obkv.
|
||||
obkv_buffer: Vec<u8>,
|
||||
// buffer used to store the value data containing an obkv of tokens with their positions.
|
||||
obkv_positions_buffer: Vec<u8>,
|
||||
}
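// Illustrative sketch of the sorter key layout described above: the big-endian document
// id followed by the big-endian field id.
#[test]
fn sorter_key_layout_sketch() {
    let document_id: u32 = 7;
    let field_id: u16 = 1;
    let mut key = Vec::with_capacity(6);
    key.extend_from_slice(&document_id.to_be_bytes());
    key.extend_from_slice(&field_id.to_be_bytes());
    assert_eq!(key, vec![0, 0, 0, 7, 0, 1]);
}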
|
||||
@@ -0,0 +1,58 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, sorter_into_reader, GrenadParameters, MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
||||
};
|
||||
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
/// Extracts the facet number and the documents ids where this facet number appears.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet numbers and
|
||||
/// documents ids from the given chunk of docid facet number positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
fid_docid_facet_number: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_number_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
||||
while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
|
||||
let (field_id, document_id, number) =
|
||||
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
|
||||
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in KvReaderDelAdd::from_slice(deladd_obkv_bytes).iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
facet_number_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(facet_number_docids_sorter, indexer)
|
||||
}
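// Illustrative sketch: level-0 facet number entries are keyed by (field_id, level,
// left_bound) through the OrderedF64Codec-backed key codec used above.
#[test]
fn facet_number_key_sketch() {
    let key = FacetGroupKey { field_id: 0u16, level: 0, left_bound: 42.5f64 };
    let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
    assert!(!key_bytes.is_empty());
}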
|
||||
@@ -0,0 +1,303 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::iter::FromIterator;
|
||||
use std::{io, str};
|
||||
|
||||
use charabia::normalizer::{Normalize, NormalizerOption};
|
||||
use charabia::{Language, StrDetection, Token};
|
||||
use heed::types::SerdeJson;
|
||||
use heed::BytesEncode;
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
|
||||
use crate::localized_attributes_rules::LocalizedFieldIds;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appears.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
if settings_diff.settings_update_only() {
|
||||
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
|
||||
} else {
|
||||
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
|
||||
extract_facet_string_docids_document_update(
|
||||
docid_fid_facet_string,
|
||||
indexer,
|
||||
localized_field_ids,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appears.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
localized_field_ids: &LocalizedFieldIds,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdBtreesetString,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let locales = localized_field_ids.locales(field_id);
|
||||
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// as the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
obkv.insert(deladd_key, val)?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
|
||||
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
|
||||
}
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appears.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdBtreesetString,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
|
||||
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
|
||||
|
||||
let are_same_locales = old_locales == new_locales;
|
||||
|
||||
if is_same_value && are_same_locales {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
|
||||
let new_hyper_normalized_value = if are_same_locales {
|
||||
&old_hyper_normalized_value
|
||||
} else {
|
||||
&normalize_facet_string(normalized_value, new_locales)
|
||||
};
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// if the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
if old_hyper_normalized_value == new_hyper_normalized_value.as_str() {
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
obkv.insert(deladd_key, val)?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref());
|
||||
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
} else {
|
||||
// if the facet string is different, we need to insert the deletion and addition in different obkvs because the related keys are different.
|
||||
// deletion
|
||||
if deladd_reader.get(DelAdd::Deletion).is_some() {
|
||||
// insert old value
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Deletion, val)?;
|
||||
obkv.finish()?;
|
||||
let key: (u16, &str) = (field_id, old_hyper_normalized_value.as_ref());
|
||||
let key_bytes =
|
||||
BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
// addition
|
||||
if deladd_reader.get(DelAdd::Addition).is_some() {
|
||||
// insert new value
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Addition, val)?;
|
||||
obkv.finish()?;
|
||||
let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref());
|
||||
let key_bytes =
|
||||
BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
|
||||
}
|
||||
|
||||
/// Normalizes the facet string and truncates it to the max length.
|
||||
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
|
||||
let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
|
||||
let mut detection = StrDetection::new(facet_string, locales);
|
||||
|
||||
let script = detection.script();
|
||||
// Detect the language of the facet string only if several locales are explicitly provided.
|
||||
let language = match locales {
|
||||
Some(&[language]) => Some(language),
|
||||
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let token = Token {
|
||||
lemma: std::borrow::Cow::Borrowed(facet_string),
|
||||
script,
|
||||
language,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// truncate the facet string to the max length
|
||||
token
|
||||
.normalize(&options)
|
||||
.lemma
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect()
|
||||
}
|
||||
@@ -0,0 +1,574 @@
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::mem::size_of;
|
||||
|
||||
use bytemuck::bytes_of;
|
||||
use grenad::Sorter;
|
||||
use heed::BytesEncode;
|
||||
use itertools::{merge_join_by, EitherOrBoth, Itertools};
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{from_slice, Value};
|
||||
use FilterableValues::{Empty, Null, Values};
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepFirst};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
|
||||
|
||||
/// The extracted facet values stored in grenad files by type.
|
||||
pub struct ExtractedFacetValues {
|
||||
pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
}
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid, the fid and the original value as key
|
||||
/// and the normalized value as value extracted from the given chunk of documents.
|
||||
/// We need the fid of the geo fields to correctly parse them as numbers if they were sent as strings initially.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<ExtractedFacetValues> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
KeepFirst,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut fid_docid_facet_strings_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
KeepFirst,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
// The tuples represent the Del and Add sides of a bitmap
|
||||
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
|
||||
// We create two buffers to avoid mutable ref issues with closures.
|
||||
let mut numbers_key_buffer = Vec::new();
|
||||
let mut strings_key_buffer = Vec::new();
|
||||
|
||||
let old_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.old.faceted_fields_ids.iter().copied().collect();
|
||||
let new_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.new.faceted_fields_ids.iter().copied().collect();
|
||||
|
||||
if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
let get_document_json_value = move |field_id, side| {
|
||||
obkv.get(field_id)
|
||||
.map(KvReaderDelAdd::from_slice)
|
||||
.and_then(|kv| kv.get(side))
|
||||
.map(from_slice)
|
||||
.transpose()
|
||||
.map_err(InternalError::SerdeJson)
|
||||
};
|
||||
// iterate over the faceted fields instead of over the whole document.
|
||||
for eob in
|
||||
merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| {
|
||||
old.cmp(new)
|
||||
})
|
||||
{
|
||||
let (field_id, del_value, add_value) = match eob {
|
||||
EitherOrBoth::Left(&field_id) => {
|
||||
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
|
||||
|
||||
// deletion only
|
||||
(field_id, del_value, None)
|
||||
}
|
||||
EitherOrBoth::Right(&field_id) => {
|
||||
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
|
||||
|
||||
// addition only
|
||||
(field_id, None, add_value)
|
||||
}
|
||||
EitherOrBoth::Both(&field_id, _) => {
|
||||
// during settings update, recompute the changing settings only.
|
||||
if settings_diff.settings_update_only {
|
||||
continue;
|
||||
}
|
||||
|
||||
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
|
||||
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
|
||||
|
||||
(field_id, del_value, add_value)
|
||||
}
|
||||
};
|
||||
|
||||
if del_value.is_some() || add_value.is_some() {
|
||||
numbers_key_buffer.clear();
|
||||
strings_key_buffer.clear();
|
||||
|
||||
// Set key to the field_id
|
||||
// Note: this encoding is consistent with FieldIdCodec
|
||||
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
|
||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||
let document = DocumentId::from_be_bytes(document);
|
||||
|
||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||
numbers_key_buffer.extend_from_slice(docid_bytes);
|
||||
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
// We insert the document id on the Del and the Add side if the field exists.
|
||||
let (ref mut del_exists, ref mut add_exists) =
|
||||
facet_exists_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_null, ref mut add_is_null) =
|
||||
facet_is_null_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_empty, ref mut add_is_empty) =
|
||||
facet_is_empty_docids.entry(field_id).or_default();
|
||||
|
||||
if del_value.is_some() {
|
||||
del_exists.insert(document);
|
||||
}
|
||||
if add_value.is_some() {
|
||||
add_exists.insert(document);
|
||||
}
|
||||
|
||||
let del_geo_support = settings_diff
|
||||
.old
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let add_geo_support = settings_diff
|
||||
.new
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let del_filterable_values =
|
||||
del_value.map(|value| extract_facet_values(&value, del_geo_support));
|
||||
let add_filterable_values =
|
||||
add_value.map(|value| extract_facet_values(&value, add_geo_support));
|
||||
|
||||
// Those closures are just here to simplify things a bit.
|
||||
let mut insert_numbers_diff = |del_numbers, add_numbers| {
|
||||
insert_numbers_diff(
|
||||
&mut fid_docid_facet_numbers_sorter,
|
||||
&mut numbers_key_buffer,
|
||||
del_numbers,
|
||||
add_numbers,
|
||||
)
|
||||
};
|
||||
let mut insert_strings_diff = |del_strings, add_strings| {
|
||||
insert_strings_diff(
|
||||
&mut fid_docid_facet_strings_sorter,
|
||||
&mut strings_key_buffer,
|
||||
del_strings,
|
||||
add_strings,
|
||||
)
|
||||
};
|
||||
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(None, None) => (),
|
||||
(Some(del_filterable_values), None) => match del_filterable_values {
|
||||
Null => {
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
},
|
||||
(None, Some(add_filterable_values)) => match add_filterable_values {
|
||||
Null => {
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
}
|
||||
},
|
||||
(Some(del_filterable_values), Some(add_filterable_values)) => {
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(Null, Null) | (Empty, Empty) => (),
|
||||
(Null, Empty) => {
|
||||
del_is_null.insert(document);
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
(Empty, Null) => {
|
||||
del_is_empty.insert(document);
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
(Null, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
(Empty, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
(Values { numbers, strings }, Null) => {
|
||||
add_is_null.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(Values { numbers, strings }, Empty) => {
|
||||
add_is_empty.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(
|
||||
Values { numbers: del_numbers, strings: del_strings },
|
||||
Values { numbers: add_numbers, strings: add_strings },
|
||||
) => {
|
||||
insert_numbers_diff(del_numbers, add_numbers)?;
|
||||
insert_strings_diff(del_strings, add_strings)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut facet_exists_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||
|
||||
let mut facet_is_null_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
||||
|
||||
let mut facet_is_empty_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
||||
|
||||
Ok(ExtractedFacetValues {
|
||||
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
||||
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
||||
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
||||
})
|
||||
}
|
||||
|
||||
/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
fn deladd_obkv_cbo_roaring_bitmaps(
    buffer: &mut Vec<u8>,
    del_bitmap: &RoaringBitmap,
    add_bitmap: &RoaringBitmap,
) -> io::Result<()> {
    buffer.clear();
    let mut obkv = KvWriterDelAdd::new(buffer);
    let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
    let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
    obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
    obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
    obkv.finish()
}
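
#[cfg(test)]
mod deladd_obkv_tests {
    // Minimal sketch, not part of the upstream sources: it encodes two bitmaps with
    // `deladd_obkv_cbo_roaring_bitmaps` and reads them back through `KvReaderDelAdd`
    // and `CboRoaringBitmapCodec`, assuming both codecs round-trip.
    use heed::BytesDecode;

    use super::*;

    #[test]
    fn roundtrips_both_sides() {
        let del: RoaringBitmap = (0..3).collect();
        let add: RoaringBitmap = (10..12).collect();

        let mut buffer = Vec::new();
        deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del, &add).unwrap();

        let obkv = KvReaderDelAdd::from_slice(&buffer);
        let del_bytes = obkv.get(DelAdd::Deletion).unwrap();
        let add_bytes = obkv.get(DelAdd::Addition).unwrap();
        assert_eq!(CboRoaringBitmapCodec::bytes_decode(del_bytes).unwrap(), del);
        assert_eq!(CboRoaringBitmapCodec::bytes_decode(add_bytes).unwrap(), add);
    }
}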
|
||||
|
||||
/// Truncates a string to the biggest valid LMDB key size.
fn truncate_str(s: &str) -> &str {
    let index = s
        .char_indices()
        .map(|(idx, _)| idx)
        .chain(std::iter::once(s.len()))
        .take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
        .last();

    &s[..index.unwrap_or(0)]
}
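
#[cfg(test)]
mod truncate_str_tests {
    // Minimal sketch, not part of the upstream sources: it assumes `MAX_FACET_VALUE_LENGTH`
    // is a `usize` of at least a few bytes and checks that `truncate_str` never cuts a
    // string in the middle of a multi-byte UTF-8 character (slicing would panic otherwise).
    use super::*;

    #[test]
    fn never_splits_a_char() {
        let s = "é".repeat(MAX_FACET_VALUE_LENGTH); // 2 bytes per character
        let truncated = truncate_str(&s);
        assert!(truncated.len() <= MAX_FACET_VALUE_LENGTH);
        assert!(s.is_char_boundary(truncated.len()));
    }
}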
|
||||
|
||||
/// Computes the diff between both Del and Add numbers and
/// only inserts the parts that differ in the sorter.
fn insert_numbers_diff(
    fid_docid_facet_numbers_sorter: &mut Sorter<KeepFirst>,
    key_buffer: &mut Vec<u8>,
    mut del_numbers: Vec<f64>,
    mut add_numbers: Vec<f64>,
) -> Result<()> {
    // We sort and dedup the float numbers
    del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
    add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
    del_numbers.dedup_by_key(|f| OrderedFloat(*f));
    add_numbers.dedup_by_key(|f| OrderedFloat(*f));

    let merged_numbers_iter = itertools::merge_join_by(
        del_numbers.into_iter().map(OrderedFloat),
        add_numbers.into_iter().map(OrderedFloat),
        |del, add| del.cmp(add),
    );

    // insert facet numbers in sorter
    for eob in merged_numbers_iter {
        key_buffer.truncate(TRUNCATE_SIZE);
        match eob {
            EitherOrBoth::Both(_, _) => (), // no need to touch anything
            EitherOrBoth::Left(OrderedFloat(number)) => {
                if let Some(value_bytes) = f64_into_bytes(number) {
                    key_buffer.extend_from_slice(&value_bytes);
                    key_buffer.extend_from_slice(&number.to_be_bytes());

                    // We insert only the Del part of the Obkv to inform
                    // that we only want to remove all those numbers.
                    let mut obkv = KvWriterDelAdd::memory();
                    obkv.insert(DelAdd::Deletion, bytes_of(&()))?;
                    let bytes = obkv.into_inner()?;
                    fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
                }
            }
            EitherOrBoth::Right(OrderedFloat(number)) => {
                if let Some(value_bytes) = f64_into_bytes(number) {
                    key_buffer.extend_from_slice(&value_bytes);
                    key_buffer.extend_from_slice(&number.to_be_bytes());

                    // We insert only the Add part of the Obkv to inform
                    // that we only want to add all those numbers.
                    let mut obkv = KvWriterDelAdd::memory();
                    obkv.insert(DelAdd::Addition, bytes_of(&()))?;
                    let bytes = obkv.into_inner()?;
                    fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
                }
            }
        }
    }

    Ok(())
}
|
||||
|
||||
/// Computes the diff between both Del and Add strings and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn insert_strings_diff(
|
||||
fid_docid_facet_strings_sorter: &mut Sorter<KeepFirst>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_strings: Vec<(String, String)>,
|
||||
mut add_strings: Vec<(String, String)>,
|
||||
) -> Result<()> {
|
||||
// We sort and dedup the normalized and original strings
|
||||
del_strings.sort_unstable();
|
||||
add_strings.sort_unstable();
|
||||
del_strings.dedup();
|
||||
add_strings.dedup();
|
||||
|
||||
let del_strings = del_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||
let add_strings = add_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||
|
||||
let merged_strings_iter = itertools::merge_join_by(
|
||||
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||
|(normalized_del, _), (normalized_add, _)| normalized_del.cmp(normalized_add),
|
||||
);
|
||||
|
||||
// insert normalized and original facet string in sorter
|
||||
for eob in merged_strings_iter {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let (side, normalized, original) = match eob {
|
||||
EitherOrBoth::Both((normalized, del), (_, add)) => {
|
||||
let merged_strings_iter =
|
||||
itertools::merge_join_by(del, add, |(_, original_del), (_, original_add)| {
|
||||
original_del.cmp(original_add)
|
||||
});
|
||||
|
||||
// FIXME: we're in a bit of a pickle here, because we're only saving **one** original value per side,
|
||||
// but we possibly have multiple original values that changed in the case where the field is an
|
||||
// array of multiple values that normalize to the same value.
|
||||
// (e.g. "foo" = ["bar", "Bar", "bAr", "baR"]. I'm not judging why you would do that ¯\_(ツ)_/¯)
|
||||
//
|
||||
// We'll work best effort by ignoring when the same value appears in both sides, deleting the first
|
||||
// value that is only in the old version, and adding the first value that is only in the new version
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
let mut del = None;
|
||||
let mut add = None;
|
||||
let mut both = None;
|
||||
|
||||
for eob in merged_strings_iter {
|
||||
match eob {
|
||||
EitherOrBoth::Both((_normalized, original), _) => {
|
||||
both = match both {
|
||||
Some(both) => Some(both),
|
||||
None => Some(original),
|
||||
}
|
||||
}
|
||||
EitherOrBoth::Left((_normalized, original)) => {
|
||||
del = match del {
|
||||
Some(del) => Some(del),
|
||||
None => Some(original),
|
||||
};
|
||||
}
|
||||
EitherOrBoth::Right((_normalized, original)) => {
|
||||
add = match add {
|
||||
Some(add) => Some(add),
|
||||
None => Some(original),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(del) = del {
|
||||
obkv.insert(DelAdd::Deletion, del)?;
|
||||
}
|
||||
if let Some(add) = add
|
||||
// prefer the newly added, but if there is none, keep a value in the list of values
|
||||
// since the normalized value appears both in old and new, we should never remove it.
|
||||
.or(both)
|
||||
{
|
||||
obkv.insert(DelAdd::Addition, add)?;
|
||||
}
|
||||
|
||||
let truncated = truncate_str(normalized);
|
||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||
continue;
|
||||
}
|
||||
EitherOrBoth::Left((_normalized, mut original)) => {
|
||||
// FIXME: we only consider the first value for the purpose of facet search
|
||||
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||
let Some((normalized, original)) = original.next() else {
|
||||
continue;
|
||||
};
|
||||
(DelAdd::Deletion, normalized, original)
|
||||
}
|
||||
EitherOrBoth::Right((_normalized, mut original)) => {
|
||||
// FIXME: we only consider the first value for the purpose of facet search
|
||||
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||
let Some((normalized, original)) = original.next() else {
|
||||
continue;
|
||||
};
|
||||
(DelAdd::Addition, normalized, original)
|
||||
}
|
||||
};
|
||||
let truncated = truncate_str(normalized);
|
||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(side, original)?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Represents what a document field contains.
enum FilterableValues {
    /// Corresponds to the JSON `null` value.
    Null,
    /// Corresponds to either an empty string `""`, an empty array `[]`, or an empty object `{}`.
    Empty,
    /// Represents all the number and string values found in this document field.
    Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}
|
||||
|
||||
/// Extracts the facet values of a JSON field.
|
||||
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
|
||||
fn inner_extract_facet_values(
|
||||
value: &Value,
|
||||
can_recurse: bool,
|
||||
output_numbers: &mut Vec<f64>,
|
||||
output_strings: &mut Vec<(String, String)>,
|
||||
geo_field: bool,
|
||||
) {
|
||||
match value {
|
||||
Value::Null => (),
|
||||
Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())),
|
||||
Value::Number(number) => {
|
||||
if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
}
|
||||
}
|
||||
Value::String(original) => {
|
||||
// if we're working on a geofield it MUST be something we can parse or else there was an internal error
|
||||
// in the enrich pipeline. But since the enrich pipeline worked, we want to avoid crashing at all costs.
|
||||
if geo_field {
|
||||
if let Ok(float) = original.parse() {
|
||||
output_numbers.push(float);
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"Internal error, could not parse a geofield that has been validated. Please open an issue."
|
||||
)
|
||||
}
|
||||
}
|
||||
let normalized = crate::normalize_facet(original);
|
||||
output_strings.push((normalized, original.clone()));
|
||||
}
|
||||
Value::Array(values) => {
|
||||
if can_recurse {
|
||||
for value in values {
|
||||
inner_extract_facet_values(
|
||||
value,
|
||||
false,
|
||||
output_numbers,
|
||||
output_strings,
|
||||
geo_field,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Value::Object(_) => (),
|
||||
}
|
||||
}
|
||||
|
||||
match value {
|
||||
Value::Null => FilterableValues::Null,
|
||||
Value::String(s) if s.is_empty() => FilterableValues::Empty,
|
||||
Value::Array(a) if a.is_empty() => FilterableValues::Empty,
|
||||
Value::Object(o) if o.is_empty() => FilterableValues::Empty,
|
||||
otherwise => {
|
||||
let mut numbers = Vec::new();
|
||||
let mut strings = Vec::new();
|
||||
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings, geo_field);
|
||||
FilterableValues::Values { numbers, strings }
|
||||
}
|
||||
}
|
||||
}
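
#[cfg(test)]
mod extract_facet_values_tests {
    // Minimal sketch, not part of the upstream sources: it only illustrates how
    // `extract_facet_values` classifies a few JSON values, with `geo_field` set to false.
    // The exact normalized form of strings depends on `crate::normalize_facet`, so the
    // test only checks the original side of the pair.
    use serde_json::json;

    use super::{extract_facet_values, FilterableValues};

    #[test]
    fn classifies_json_values() {
        assert!(matches!(extract_facet_values(&json!(null), false), FilterableValues::Null));
        assert!(matches!(extract_facet_values(&json!(""), false), FilterableValues::Empty));
        assert!(matches!(extract_facet_values(&json!([]), false), FilterableValues::Empty));

        match extract_facet_values(&json!(["Bar", 2]), false) {
            FilterableValues::Values { numbers, strings } => {
                assert_eq!(numbers, vec![2.0]);
                assert_eq!(strings.len(), 1);
                assert_eq!(strings[0].1, "Bar");
            }
            _ => panic!("expected FilterableValues::Values"),
        }
    }
}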
|
||||
@@ -0,0 +1,96 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
/// Extracts the field id word count and the documents ids where
|
||||
/// this field id with this amount of words appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted field id word counts
|
||||
/// and documents ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
let deletion = del_add_reader
|
||||
// get deleted words
|
||||
.get(DelAdd::Deletion)
|
||||
// count deleted words
|
||||
.map(|deletion| {
|
||||
KvReaderU16::from_slice(deletion).iter().take(MAX_COUNTED_WORDS + 1).count()
|
||||
})
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
let addition = del_add_reader
|
||||
// get added words
|
||||
.get(DelAdd::Addition)
|
||||
// count added words
|
||||
.map(|addition| {
|
||||
KvReaderU16::from_slice(addition).iter().take(MAX_COUNTED_WORDS + 1).count()
|
||||
})
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
|
||||
if deletion != addition {
|
||||
// Insert the deleted word count in the sorter if it exists.
|
||||
if let Some(word_count) = deletion {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
// Insert the added word count in the sorter if it exists.
|
||||
if let Some(word_count) = addition {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sorter_into_reader(fid_word_count_docids_sorter, indexer)
|
||||
}
|
||||
@@ -0,0 +1,103 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use concat_arrays::concat_arrays;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::GeoError;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::extract_finite_float_from_value;
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, InternalError, Result};
|
||||
|
||||
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_geo_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
primary_key_id: FieldId,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
// since we only need the primary key when we throw an error
|
||||
// we create this getter to lazily get it when needed
|
||||
let document_id = || -> Value {
|
||||
let reader = KvReaderDelAdd::from_slice(obkv.get(primary_key_id).unwrap());
|
||||
let document_id =
|
||||
reader.get(DelAdd::Deletion).or(reader.get(DelAdd::Addition)).unwrap();
|
||||
serde_json::from_slice(document_id).unwrap()
|
||||
};
|
||||
|
||||
// extract old version
|
||||
let del_lat_lng = extract_lat_lng(obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
|
||||
// extract new version
|
||||
let add_lat_lng = extract_lat_lng(obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
|
||||
|
||||
if del_lat_lng != add_lat_lng {
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
if let Some([lat, lng]) = del_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Deletion, bytes)?;
|
||||
}
|
||||
if let Some([lat, lng]) = add_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Addition, bytes)?;
|
||||
}
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(docid_bytes, bytes)?;
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
/// Extracts the finite float lat and lng from two byte slices.
fn extract_lat_lng(
    document: &obkv::KvReader<FieldId>,
    settings: &InnerIndexSettings,
    deladd: DelAdd,
    document_id: impl Fn() -> Value,
) -> Result<Option<[f64; 2]>> {
    match settings.geo_fields_ids {
        Some((lat_fid, lng_fid)) => {
            let lat =
                document.get(lat_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd));
            let lng =
                document.get(lng_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd));
            let (lat, lng) = match (lat, lng) {
                (Some(lat), Some(lng)) => (lat, lng),
                (Some(_), None) => {
                    return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
                }
                (None, Some(_)) => {
                    return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
                }
                (None, None) => return Ok(None),
            };
            let lat = extract_finite_float_from_value(
                serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
            )
            .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;

            let lng = extract_finite_float_from_value(
                serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
            )
            .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
            Ok(Some([lat, lng]))
        }
        None => Ok(None),
    }
}
|
||||
@@ -0,0 +1,841 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::mem::size_of;
|
||||
use std::str::from_utf8;
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytemuck::cast_slice;
|
||||
use grenad::Writer;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::FaultSource;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::prompt::{FieldsIdsMapWithMetadata, Prompt};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
|
||||
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME};
|
||||
use crate::vector::settings::ReindexAction;
|
||||
use crate::vector::{Embedder, Embedding};
|
||||
use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
|
||||
pub struct ExtractedVectorPoints {
|
||||
// docid, _index -> KvWriterDelAdd -> Vector
|
||||
pub manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> ()
|
||||
pub remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> prompt
|
||||
pub prompts: grenad::Reader<BufReader<File>>,
|
||||
|
||||
// embedder
|
||||
pub embedder_name: String,
|
||||
pub embedder: Arc<Embedder>,
|
||||
pub add_to_user_provided: RoaringBitmap,
|
||||
pub remove_from_user_provided: RoaringBitmap,
|
||||
}
|
||||
|
||||
enum VectorStateDelta {
|
||||
NoChange,
|
||||
// Remove all vectors, generated or manual, from this document
|
||||
NowRemoved,
|
||||
|
||||
NowManual(Vec<Vec<f32>>),
|
||||
|
||||
// Add the vector computed from the specified prompt
|
||||
// Remove any previous vector
|
||||
// Note: changing the value of the prompt **does require** recording this delta
|
||||
NowGenerated(String),
|
||||
}
|
||||
|
||||
impl VectorStateDelta {
|
||||
fn into_values(self) -> (bool, String, Vec<Vec<f32>>) {
|
||||
match self {
|
||||
VectorStateDelta::NoChange => Default::default(),
|
||||
VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
|
||||
// We always delete the previous vectors
|
||||
VectorStateDelta::NowManual(add) => (true, Default::default(), add),
|
||||
VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct EmbedderVectorExtractor {
|
||||
embedder_name: String,
|
||||
embedder: Arc<Embedder>,
|
||||
prompt: Arc<Prompt>,
|
||||
|
||||
// (docid) -> (prompt)
|
||||
prompts_writer: Writer<BufWriter<File>>,
|
||||
// (docid) -> ()
|
||||
remove_vectors_writer: Writer<BufWriter<File>>,
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
manual_vectors_writer: Writer<BufWriter<File>>,
|
||||
// The docids of the documents that contains a user defined embedding
|
||||
add_to_user_provided: RoaringBitmap,
|
||||
|
||||
action: ExtractionAction,
|
||||
}
|
||||
|
||||
struct DocumentOperation {
|
||||
// The docids of the documents that contains an auto-generated embedding
|
||||
remove_from_user_provided: RoaringBitmap,
|
||||
}
|
||||
|
||||
enum ExtractionAction {
|
||||
SettingsFullReindex,
|
||||
SettingsRegeneratePrompts { old_prompt: Arc<Prompt> },
|
||||
DocumentOperation(DocumentOperation),
|
||||
}
|
||||
|
||||
struct ManualEmbedderErrors {
|
||||
embedder_name: String,
|
||||
docid: String,
|
||||
other_docids: usize,
|
||||
}
|
||||
|
||||
impl ManualEmbedderErrors {
|
||||
pub fn push_error(
|
||||
errors: &mut Option<ManualEmbedderErrors>,
|
||||
embedder_name: &str,
|
||||
document_id: impl Fn() -> Value,
|
||||
) {
|
||||
match errors {
|
||||
Some(errors) => {
|
||||
if errors.embedder_name == embedder_name {
|
||||
errors.other_docids = errors.other_docids.saturating_add(1)
|
||||
}
|
||||
}
|
||||
None => {
|
||||
*errors = Some(Self {
|
||||
embedder_name: embedder_name.to_owned(),
|
||||
docid: document_id().to_string(),
|
||||
other_docids: 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_result(
|
||||
errors: Option<ManualEmbedderErrors>,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
) -> Result<()> {
|
||||
match errors {
|
||||
Some(errors) => {
|
||||
let embedder_name = &errors.embedder_name;
|
||||
let mut msg = format!(
|
||||
r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document {}{}",
|
||||
errors.docid,
|
||||
if errors.other_docids != 0 {
|
||||
format!(" and at least {} other document(s)", errors.other_docids)
|
||||
} else {
|
||||
"".to_string()
|
||||
}
|
||||
);
|
||||
|
||||
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||
|
||||
let mut hint_count = 0;
|
||||
|
||||
for (vector_misspelling, count) in
|
||||
possible_embedding_mistakes.vector_mistakes().take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||
.embedder_mistakes(embedder_name, unused_vectors_distribution)
|
||||
.take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
if hint_count == 0 {
|
||||
msg += &format!(
|
||||
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||
);
|
||||
}
|
||||
|
||||
Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
|
||||
}
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedders_configs: &[IndexEmbeddingConfig],
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
) -> Result<(Vec<ExtractedVectorPoints>, UnusedVectorsDistribution)> {
|
||||
let mut unused_vectors_distribution = UnusedVectorsDistribution::new();
|
||||
let mut manual_errors = None;
|
||||
let reindex_vectors = settings_diff.reindex_vectors();
|
||||
|
||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||
let old_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids);
|
||||
|
||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||
let new_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids);
|
||||
|
||||
// the vector field id may have changed
|
||||
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let mut extractors = Vec::new();
|
||||
|
||||
let mut configs = settings_diff.new.embedding_configs.clone().into_inner();
|
||||
let old_configs = &settings_diff.old.embedding_configs;
|
||||
|
||||
if reindex_vectors {
|
||||
for (name, action) in settings_diff.embedding_config_updates.iter() {
|
||||
if let Some(action) = action.reindex() {
|
||||
let Some((embedder_name, (embedder, prompt, _quantized))) =
|
||||
configs.remove_entry(name)
|
||||
else {
|
||||
tracing::error!(embedder = name, "Requested embedder config not found");
|
||||
continue;
|
||||
};
|
||||
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> (prompt)
|
||||
let prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let action = match action {
|
||||
ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex,
|
||||
ReindexAction::RegeneratePrompts => {
|
||||
let Some((_, old_prompt, _quantized)) = old_configs.get(name) else {
|
||||
tracing::error!(embedder = name, "Old embedder config not found");
|
||||
continue;
|
||||
};
|
||||
|
||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt }
|
||||
}
|
||||
};
|
||||
|
||||
extractors.push(EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided: RoaringBitmap::new(),
|
||||
action,
|
||||
});
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// document operation
|
||||
|
||||
for (embedder_name, (embedder, prompt, _quantized)) in configs.into_iter() {
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> (prompt)
|
||||
let prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
extractors.push(EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided: RoaringBitmap::new(),
|
||||
action: ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided: RoaringBitmap::new(),
|
||||
}),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
// this must always be serialized as (docid, external_docid);
|
||||
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
|
||||
let (docid_bytes, external_id_bytes) =
|
||||
try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
|
||||
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||
let docid = DocumentId::from_be_bytes(docid_bytes);
|
||||
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(docid_bytes.as_slice());
|
||||
|
||||
// since we only need the primary key when we throw an error we create this getter to
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
let mut parsed_vectors = ParsedVectorsDiff::new(
|
||||
docid,
|
||||
embedders_configs,
|
||||
obkv,
|
||||
old_vectors_fid,
|
||||
new_vectors_fid,
|
||||
)
|
||||
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
|
||||
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided,
|
||||
action,
|
||||
} in extractors.iter_mut()
|
||||
{
|
||||
let embedder_is_manual = matches!(**embedder, Embedder::UserProvided(_));
|
||||
|
||||
let (old, new) = parsed_vectors.remove(embedder_name);
|
||||
let delta = match action {
|
||||
ExtractionAction::SettingsFullReindex => match old {
|
||||
// A full reindex can be triggered either by:
|
||||
// 1. a new embedder
|
||||
// 2. an existing embedder changed so that it must regenerate all generated embeddings.
|
||||
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
|
||||
VectorState::Inline(vectors) => {
|
||||
if !vectors.must_regenerate() {
|
||||
add_to_user_provided.insert(docid);
|
||||
}
|
||||
|
||||
match vectors.into_array_of_vectors() {
|
||||
Some(add_vectors) => {
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(
|
||||
crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
),
|
||||
));
|
||||
}
|
||||
VectorStateDelta::NowManual(add_vectors)
|
||||
}
|
||||
None => VectorStateDelta::NoChange,
|
||||
}
|
||||
}
|
||||
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
|
||||
VectorState::Manual => VectorStateDelta::NoChange,
|
||||
// generated vectors must be regenerated
|
||||
VectorState::Generated => {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(
|
||||
&mut manual_errors,
|
||||
embedder_name.as_str(),
|
||||
document_id,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
regenerate_prompt(obkv, prompt, &new_fields_ids_map)?
|
||||
}
|
||||
},
|
||||
// prompt regeneration is only triggered for existing embedders
|
||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
|
||||
if old.must_regenerate() {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(
|
||||
&mut manual_errors,
|
||||
embedder_name.as_str(),
|
||||
document_id,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
regenerate_if_prompt_changed(
|
||||
obkv,
|
||||
(old_prompt, prompt),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
)?
|
||||
} else {
|
||||
// we can simply ignore user provided vectors as they are not regenerated and are
|
||||
// already in the DB since this is an existing embedder
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
}
|
||||
ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided,
|
||||
}) => extract_vector_document_diff(
|
||||
docid,
|
||||
obkv,
|
||||
prompt,
|
||||
(add_to_user_provided, remove_from_user_provided),
|
||||
(old, new),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
document_id,
|
||||
embedder_name,
|
||||
embedder_is_manual,
|
||||
&mut manual_errors,
|
||||
)?,
|
||||
};
|
||||
// and we finally push the unique vectors into the writer
|
||||
push_vectors_diff(
|
||||
remove_vectors_writer,
|
||||
prompts_writer,
|
||||
manual_vectors_writer,
|
||||
&mut key_buffer,
|
||||
delta,
|
||||
)?;
|
||||
}
|
||||
|
||||
unused_vectors_distribution.append(parsed_vectors);
|
||||
}
|
||||
|
||||
ManualEmbedderErrors::to_result(
|
||||
manual_errors,
|
||||
possible_embedding_mistakes,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
|
||||
let mut results = Vec::new();
|
||||
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt: _,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
action,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided,
|
||||
} in extractors
|
||||
{
|
||||
let remove_from_user_provided =
|
||||
if let ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided,
|
||||
}) = action
|
||||
{
|
||||
remove_from_user_provided
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
results.push(ExtractedVectorPoints {
|
||||
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
||||
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
||||
prompts: writer_into_reader(prompts_writer)?,
|
||||
embedder,
|
||||
embedder_name,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
})
|
||||
}
|
||||
|
||||
Ok((results, unused_vectors_distribution))
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)] // feel free to find efficient way to factor arguments
|
||||
fn extract_vector_document_diff(
|
||||
docid: DocumentId,
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
prompt: &Prompt,
|
||||
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
|
||||
(old, new): (VectorState, VectorState),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
document_id: impl Fn() -> Value,
|
||||
embedder_name: &str,
|
||||
embedder_is_manual: bool,
|
||||
manual_errors: &mut Option<ManualEmbedderErrors>,
|
||||
) -> Result<VectorStateDelta> {
|
||||
match (old.must_regenerate(), new.must_regenerate()) {
|
||||
(true, true) | (false, false) => {}
|
||||
(true, false) => {
|
||||
add_to_user_provided.insert(docid);
|
||||
}
|
||||
(false, true) => {
|
||||
remove_from_user_provided.insert(docid);
|
||||
}
|
||||
}
|
||||
|
||||
let delta = match (old, new) {
|
||||
// regardless of the previous state, if a document now contains inline _vectors, they must
|
||||
// be extracted manually
|
||||
(_old, VectorState::Inline(new)) => match new.into_array_of_vectors() {
|
||||
Some(add_vectors) => {
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
)));
|
||||
}
|
||||
|
||||
VectorStateDelta::NowManual(add_vectors)
|
||||
}
|
||||
None => VectorStateDelta::NoChange,
|
||||
},
|
||||
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
|
||||
// document changed
|
||||
(VectorState::Generated, VectorState::Generated) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id);
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt = Some(&prompt).map(|p| {
|
||||
p.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map)
|
||||
.unwrap_or_default()
|
||||
});
|
||||
let new_prompt =
|
||||
prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
if old_prompt.as_ref() != Some(&new_prompt) {
|
||||
let old_prompt = old_prompt.unwrap_or_default();
|
||||
tracing::trace!(
|
||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||
);
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
tracing::trace!("⏭️ Prompt unmodified, skipping");
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
// inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from
|
||||
// the previous version of the document.
|
||||
// Manual -> Generated is also not possible without an Inline to the right (which is handled above)
|
||||
// Generated -> Generated is handled above, so not possible
|
||||
// As a result, this code is unreachable
|
||||
(_not_generated, VectorState::Generated) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id);
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
// becomes autogenerated
|
||||
VectorStateDelta::NowGenerated(prompt.render_kvdeladd(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
new_fields_ids_map,
|
||||
)?)
|
||||
} else {
|
||||
// make sure the document is always removed from user provided on removal
|
||||
remove_from_user_provided.insert(docid);
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
// inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous
|
||||
// version of the document.
|
||||
// however the Rust type system cannot know that.
|
||||
(_manual, VectorState::Manual) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
// if the new version of documents has the vectors in the DB,
|
||||
// then they are user-provided and nothing possibly changed
|
||||
VectorStateDelta::NoChange
|
||||
} else {
|
||||
// make sure the document is always removed from user provided on removal
|
||||
remove_from_user_provided.insert(docid);
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(delta)
|
||||
}
|
||||
|
||||
fn regenerate_if_prompt_changed(
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
(old_prompt, new_prompt): (&Prompt, &Prompt),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
) -> Result<VectorStateDelta> {
|
||||
let old_prompt = old_prompt
|
||||
.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map)
|
||||
.unwrap_or(Default::default());
|
||||
let new_prompt = new_prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
|
||||
if new_prompt == old_prompt {
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
Ok(VectorStateDelta::NowGenerated(new_prompt))
|
||||
}
|
||||
|
||||
fn regenerate_prompt(
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
prompt: &Prompt,
|
||||
new_fields_ids_map: &FieldsIdsMapWithMetadata,
|
||||
) -> Result<VectorStateDelta> {
|
||||
let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
|
||||
Ok(VectorStateDelta::NowGenerated(prompt))
|
||||
}
|
||||
|
||||
/// We cannot compute the diff between both Del and Add vectors.
|
||||
/// We'll push every vector and compute the difference later in TypedChunk.
|
||||
fn push_vectors_diff(
|
||||
remove_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
prompts_writer: &mut Writer<BufWriter<File>>,
|
||||
manual_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
delta: VectorStateDelta,
|
||||
) -> Result<()> {
|
||||
let (must_remove, prompt, mut add_vectors) = delta.into_values();
|
||||
if must_remove {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
remove_vectors_writer.insert(&key_buffer, [])?;
|
||||
}
|
||||
if !prompt.is_empty() {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
prompts_writer.insert(&key_buffer, prompt.as_bytes())?;
|
||||
}
|
||||
|
||||
// We sort and dedup the vectors
|
||||
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||
|
||||
// insert vectors into the writer
|
||||
for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
|
||||
// Generate the key by extending the unique index to it.
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let index = u16::try_from(i).unwrap();
|
||||
key_buffer.extend_from_slice(&index.to_be_bytes());
|
||||
|
||||
// We insert only the Add part of the Obkv to inform
// that we only want to add all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compares two vectors by using the OrderedFloat helper.
fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
    a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
}
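
#[cfg(test)]
mod compare_vectors_tests {
    // Minimal sketch, not part of the upstream sources: `compare_vectors` gives a total,
    // lexicographic order over `f32` slices, including NaN, thanks to `OrderedFloat`.
    use std::cmp::Ordering;

    use super::compare_vectors;

    #[test]
    fn orders_vectors_lexicographically() {
        assert_eq!(compare_vectors(&[1.0, 2.0], &[1.0, 3.0]), Ordering::Less);
        // a strict prefix compares as smaller
        assert_eq!(compare_vectors(&[1.0], &[1.0, 0.0]), Ordering::Less);
        // NaN compares equal to itself under OrderedFloat
        assert_eq!(compare_vectors(&[f32::NAN], &[f32::NAN]), Ordering::Equal);
    }
}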
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
// docid, prompt
|
||||
prompt_reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedder: Arc<Embedder>,
|
||||
embedder_name: &str,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
request_threads: &ThreadPoolNoAbort,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
|
||||
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk
|
||||
|
||||
// docid, state with embedding
|
||||
let mut state_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut chunks = Vec::with_capacity(n_chunks);
|
||||
let mut current_chunk = Vec::with_capacity(n_vectors_per_chunk);
|
||||
let mut current_chunk_ids = Vec::with_capacity(n_vectors_per_chunk);
|
||||
let mut chunks_ids = Vec::with_capacity(n_chunks);
|
||||
let mut cursor = prompt_reader.into_cursor()?;
|
||||
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
// SAFETY: precondition, the grenad value was saved from a string
|
||||
let prompt = unsafe { std::str::from_utf8_unchecked(value) };
|
||||
if current_chunk.len() == current_chunk.capacity() {
|
||||
chunks.push(std::mem::replace(
|
||||
&mut current_chunk,
|
||||
Vec::with_capacity(n_vectors_per_chunk),
|
||||
));
|
||||
chunks_ids.push(std::mem::replace(
|
||||
&mut current_chunk_ids,
|
||||
Vec::with_capacity(n_vectors_per_chunk),
|
||||
));
|
||||
};
|
||||
current_chunk.push(prompt.to_owned());
|
||||
current_chunk_ids.push(docid);
|
||||
|
||||
if chunks.len() == chunks.capacity() {
|
||||
let chunked_embeds = embed_chunks(
|
||||
&embedder,
|
||||
std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)),
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
|
||||
for (docid, embeddings) in chunks_ids
|
||||
.iter()
|
||||
.flat_map(|docids| docids.iter())
|
||||
.zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter()))
|
||||
{
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?;
|
||||
}
|
||||
chunks_ids.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// send last chunk
|
||||
if !chunks.is_empty() {
|
||||
let chunked_embeds = embed_chunks(
|
||||
&embedder,
|
||||
std::mem::take(&mut chunks),
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
for (docid, embeddings) in chunks_ids
|
||||
.iter()
|
||||
.flat_map(|docids| docids.iter())
|
||||
.zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter()))
|
||||
{
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?;
|
||||
}
|
||||
}
|
||||
|
||||
if !current_chunk.is_empty() {
|
||||
let embeds = embed_chunks(
|
||||
&embedder,
|
||||
vec![std::mem::take(&mut current_chunk)],
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
|
||||
if let Some(embeds) = embeds.first() {
|
||||
for (docid, embeddings) in current_chunk_ids.iter().zip(embeds.iter()) {
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(state_writer)
|
||||
}
|
||||
|
||||
fn embed_chunks(
|
||||
embedder: &Embedder,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
embedder_name: &str,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
request_threads: &ThreadPoolNoAbort,
|
||||
) -> Result<Vec<Vec<Embedding>>> {
|
||||
match embedder.embed_chunks(text_chunks, request_threads) {
|
||||
Ok(chunks) => Ok(chunks),
|
||||
Err(error) => {
|
||||
if let FaultSource::Bug = error.fault {
|
||||
Err(crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(
|
||||
error.into(),
|
||||
)))
|
||||
} else {
|
||||
let mut msg =
|
||||
format!(r"While embedding documents for embedder `{embedder_name}`: {error}");
|
||||
|
||||
if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
|
||||
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||
}
|
||||
|
||||
let mut hint_count = 0;
|
||||
|
||||
for (vector_misspelling, count) in
|
||||
possible_embedding_mistakes.vector_mistakes().take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||
.embedder_mistakes(embedder_name, unused_vectors_distribution)
|
||||
.take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
if hint_count == 0 {
|
||||
if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
|
||||
msg += &format!(
|
||||
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,243 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use obkv::KvReaderU16;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::sorter_into_reader;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the word and the documents ids where this word appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted words and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
///
|
||||
/// The first returned reader is the one for normal word_docids, and the second one is for
|
||||
/// exact_word_docids
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_fid_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
true,
|
||||
);
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut del_words = BTreeSet::new();
|
||||
let mut add_words = BTreeSet::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let (fid_bytes, _) = try_split_array_at(fid_bytes)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
let fid = u16::from_be_bytes(fid_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (_pos, word) in KvReaderU16::from_slice(deletion).iter() {
|
||||
del_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (_pos, word) in KvReaderU16::from_slice(addition).iter() {
|
||||
add_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
words_into_sorter(
|
||||
document_id,
|
||||
fid,
|
||||
&mut key_buffer,
|
||||
&del_words,
|
||||
&add_words,
|
||||
&mut word_fid_docids_sorter,
|
||||
)?;
|
||||
|
||||
del_words.clear();
|
||||
add_words.clear();
|
||||
}
|
||||
|
||||
let mut word_fid_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut exact_word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
|
||||
let mut buffer = Vec::new();
|
||||
// NOTE: replacing sorters by bitmap merging is less efficient, so use sorters.
while let Some((key, value)) = iter.next()? {
// only keep the value if there is a change to apply in the DB.
|
||||
if !is_noop_del_add_obkv(KvReaderDelAdd::from_slice(value)) {
|
||||
word_fid_docids_writer.insert(key, value)?;
|
||||
}
|
||||
|
||||
let (w, fid) = StrBEU16Codec::bytes_decode(key)
|
||||
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
|
||||
// merge all deletions
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = obkv.get(DelAdd::Deletion) {
|
||||
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Deletion, value)?;
|
||||
if delete_from_exact {
|
||||
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
} else {
|
||||
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
// merge all additions
|
||||
if let Some(value) = obkv.get(DelAdd::Addition) {
|
||||
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Addition, value)?;
|
||||
if add_in_exact {
|
||||
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
} else {
|
||||
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
sorter_into_reader(word_docids_sorter, indexer)?,
|
||||
sorter_into_reader(exact_word_docids_sorter, indexer)?,
|
||||
writer_into_reader(word_fid_docids_writer)?,
|
||||
))
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn words_into_sorter(
|
||||
document_id: DocumentId,
|
||||
fid: FieldId,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_words: &BTreeSet<Vec<u8>>,
|
||||
add_words: &BTreeSet<Vec<u8>>,
|
||||
word_fid_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let word_bytes = match eob {
|
||||
Left(word_bytes) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
Right(word_bytes) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
Both(word_bytes, _) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
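// A minimal standalone sketch of the del/add classification done just above, using plain
// string words instead of the real BTreeSets, sorter, and obkv value writer; the
// `classify_words` helper and its string labels are illustrative assumptions, not crate APIs.
fn classify_words(del_words: &[&str], add_words: &[&str]) -> Vec<(String, &'static str)> {
    use itertools::merge_join_by;
    use itertools::EitherOrBoth::{Both, Left, Right};

    // both inputs must be sorted, exactly like the BTreeSets used by the extractor
    merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a))
        .map(|eob| match eob {
            // present only in the old version of the document: a pure deletion
            Left(w) => (w.to_string(), "deletion"),
            // present only in the new version: a pure addition
            Right(w) => (w.to_string(), "addition"),
            // present in both versions: the docid ends up on both sides of the DelAdd obkv
            Both(w, _) => (w.to_string(), "deletion+addition"),
        })
        .collect()
}
// classify_words(&["cat", "dog"], &["dog", "fox"])
// => [("cat", "deletion"), ("dog", "deletion+addition"), ("fox", "addition")]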
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn docids_into_writers<W>(
|
||||
word: &str,
|
||||
deletions: &RoaringBitmap,
|
||||
additions: &RoaringBitmap,
|
||||
writer: &mut grenad::Writer<W>,
|
||||
) -> Result<()>
|
||||
where
|
||||
W: std::io::Write,
|
||||
{
|
||||
if deletions == additions {
|
||||
// if the same value is deleted and added, do nothing.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Write each value in the same KvDelAdd before inserting it in the final writer.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
// deletions:
|
||||
if !deletions.is_empty() && !deletions.is_subset(additions) {
|
||||
obkv.insert(
|
||||
DelAdd::Deletion,
|
||||
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
)?;
|
||||
}
|
||||
// additions:
|
||||
if !additions.is_empty() {
|
||||
obkv.insert(
|
||||
DelAdd::Addition,
|
||||
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert everything in the same writer.
|
||||
writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -0,0 +1,261 @@
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{cmp, io};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// Extracts the best proximity between pairs of words and the document ids where each pair appears.
///
/// Returns a grenad reader with the list of extracted word pair proximities and
/// document ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
// early return if the data shouldn't be deleted nor created.
|
||||
if settings_diff.settings_update_only && !settings_diff.reindex_proximities() {
|
||||
let writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
return writer_into_reader(writer);
|
||||
}
|
||||
|
||||
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
||||
.map(|_| {
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / MAX_DISTANCE as usize),
|
||||
true,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut del_word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
let mut add_word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
let mut del_word_pair_proximity = BTreeMap::new();
|
||||
let mut add_word_pair_proximity = BTreeMap::new();
|
||||
let mut current_document_id = None;
|
||||
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
// if we change document, we fill the sorter
|
||||
if current_document_id.map_or(false, |id| id != document_id) {
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter");
|
||||
let _entered = span.enter();
|
||||
|
||||
document_word_positions_into_sorter(
|
||||
current_document_id.unwrap(),
|
||||
&del_word_pair_proximity,
|
||||
&add_word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorters,
|
||||
)?;
|
||||
del_word_pair_proximity.clear();
|
||||
add_word_pair_proximity.clear();
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
if !any_deletion {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// deletions
|
||||
if let Some(deletion) = KvReaderDelAdd::from_slice(value).get(DelAdd::Deletion) {
|
||||
for (position, word) in KvReaderU16::from_slice(deletion).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while del_word_positions.front().map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
}) {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut del_word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
let word = std::str::from_utf8(word)?;
|
||||
del_word_positions.push_back((word.to_string(), position));
|
||||
}
|
||||
|
||||
while !del_word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut del_word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
|| {
|
||||
if !any_addition {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// additions
|
||||
if let Some(addition) = KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
|
||||
for (position, word) in KvReaderU16::from_slice(addition).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while add_word_positions.front().map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
}) {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut add_word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
let word = std::str::from_utf8(word)?;
|
||||
add_word_positions.push_back((word.to_string(), position));
|
||||
}
|
||||
|
||||
while !add_word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut add_word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
del?;
|
||||
add?;
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "final_document_into_sorter");
|
||||
let _entered = span.enter();
|
||||
|
||||
document_word_positions_into_sorter(
|
||||
document_id,
|
||||
&del_word_pair_proximity,
|
||||
&add_word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorters,
|
||||
)?;
|
||||
}
|
||||
{
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "sorter_into_reader");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
for sorter in word_pair_proximity_docids_sorters {
|
||||
sorter.write_into_stream_writer(&mut writer)?;
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
||||
///
|
||||
/// This list is used by the engine to calculate the documents containing words that are
|
||||
/// close to each other.
|
||||
fn document_word_positions_into_sorter(
|
||||
document_id: DocumentId,
|
||||
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeDeladdCboRoaringBitmaps>],
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut key_buffer = Vec::new();
|
||||
for eob in
|
||||
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
|
||||
d.cmp(a)
|
||||
})
|
||||
{
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let ((w1, w2), prox) = match eob {
|
||||
Left(key_value) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
Right(key_value) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
Both(key_value, _) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.push(*prox);
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
|
||||
word_pair_proximity_docids_sorters[*prox as usize - 1]
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn word_positions_into_word_pair_proximity(
|
||||
word_positions: &mut VecDeque<(String, u16)>,
|
||||
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||
) -> Result<()> {
|
||||
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||
for (word, position) in word_positions.iter() {
|
||||
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||
word_pair_proximity
|
||||
.entry((head_word.clone(), word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
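// A compact standalone sketch of the sliding-window pairing implemented above, with a
// simplified symmetric `proximity` stand-in for `index_proximity` and a local MAX_WINDOW
// constant; both are illustrative assumptions, not the crate's real definitions.
fn sketch_pairs_with_min_proximity(words: &[(&str, u32)]) -> BTreeMap<(String, String), u8> {
    const MAX_WINDOW: u32 = 8; // assumed to mirror MAX_DISTANCE

    // assumed stand-in: plain absolute distance between token positions
    fn proximity(lhs: u32, rhs: u32) -> u32 {
        lhs.abs_diff(rhs)
    }

    // pair the head word with every word still in the window, keeping the minimum proximity
    fn drain_head(
        window: &mut VecDeque<(String, u32)>,
        pairs: &mut BTreeMap<(String, String), u8>,
        max: u32,
    ) {
        let (head, head_pos) = window.pop_front().unwrap();
        for (word, pos) in window.iter() {
            let prox = proximity(head_pos, *pos);
            if prox > 0 && prox < max {
                pairs
                    .entry((head.clone(), word.clone()))
                    .and_modify(|p| *p = (*p).min(prox as u8))
                    .or_insert(prox as u8);
            }
        }
    }

    let mut window: VecDeque<(String, u32)> = VecDeque::new();
    let mut pairs = BTreeMap::new();
    for &(word, position) in words {
        // drain the window until its head is considered close to the incoming word
        while window.front().map_or(false, |(_, p)| proximity(*p, position) >= MAX_WINDOW) {
            drain_head(&mut window, &mut pairs, MAX_WINDOW);
        }
        window.push_back((word.to_string(), position));
    }
    while !window.is_empty() {
        drain_head(&mut window, &mut pairs, MAX_WINDOW);
    }
    pairs
}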
|
||||
@@ -0,0 +1,138 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{bucketed_position, DocumentId, Result};
|
||||
|
||||
/// Extracts the word positions and the document ids where each word appears.
///
/// Returns a grenad reader with the list of extracted words at positions and
/// document ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_position_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||
let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||
let mut current_document_id: Option<u32> = None;
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||
|
||||
if current_document_id.map_or(false, |id| document_id != id) {
|
||||
words_position_into_sorter(
|
||||
current_document_id.unwrap(),
|
||||
&mut key_buffer,
|
||||
&del_word_positions,
|
||||
&add_word_positions,
|
||||
&mut word_position_docids_sorter,
|
||||
)?;
|
||||
del_word_positions.clear();
|
||||
add_word_positions.clear();
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (position, word_bytes) in KvReaderU16::from_slice(deletion).iter() {
|
||||
let position = bucketed_position(position);
|
||||
del_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (position, word_bytes) in KvReaderU16::from_slice(addition).iter() {
|
||||
let position = bucketed_position(position);
|
||||
add_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
words_position_into_sorter(
|
||||
document_id,
|
||||
&mut key_buffer,
|
||||
&del_word_positions,
|
||||
&add_word_positions,
|
||||
&mut word_position_docids_sorter,
|
||||
)?;
|
||||
}
|
||||
|
||||
// TODO remove noop DelAdd OBKV
|
||||
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
||||
|
||||
Ok(word_position_docids_reader)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn words_position_into_sorter(
|
||||
document_id: DocumentId,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
word_position_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
|
||||
{
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let (position, word_bytes) = match eob {
|
||||
Left(key) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
Right(key) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
Both(key, _) => {
|
||||
// both values need to be kept because they will be used in other extractors.
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||
word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
406
crates/milli/src/update/index_documents/extract/mod.rs
Normal file
406
crates/milli/src/update/index_documents/extract/mod.rs
Normal file
@@ -0,0 +1,406 @@
|
||||
mod extract_docid_word_positions;
|
||||
mod extract_facet_number_docids;
|
||||
mod extract_facet_string_docids;
|
||||
mod extract_fid_docid_facet_values;
|
||||
mod extract_fid_word_count_docids;
|
||||
mod extract_geo_points;
|
||||
mod extract_vector_points;
|
||||
mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod extract_word_position_docids;
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use rayon::prelude::*;
|
||||
|
||||
use self::extract_docid_word_positions::extract_docid_word_positions;
|
||||
use self::extract_facet_number_docids::extract_facet_number_docids;
|
||||
use self::extract_facet_string_docids::extract_facet_string_docids;
|
||||
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
|
||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_vector_points::{
|
||||
extract_embeddings, extract_vector_points, ExtractedVectorPoints,
|
||||
};
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::error::PossibleEmbeddingMistakes;
|
||||
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
|
||||
/// Extract data for each database from obkv documents in parallel.
/// Send data in grenad files over the provided Sender.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub(crate) fn data_from_obkv_documents(
|
||||
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
primary_key_id: FieldId,
|
||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||
) -> Result<()> {
|
||||
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
original_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|original_documents_chunk| {
|
||||
send_original_documents_data(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
embedders_configs.clone(),
|
||||
settings_diff.clone(),
|
||||
possible_embedding_mistakes.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
|| {
|
||||
flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
primary_key_id,
|
||||
settings_diff.clone(),
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
.map(|result| {
|
||||
if let Ok((
|
||||
ref docid_word_positions_chunk,
|
||||
(ref fid_docid_facet_numbers_chunk, ref fid_docid_facet_strings_chunk),
|
||||
)) = result
|
||||
{
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
);
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
),
|
||||
>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_docids,
|
||||
|(
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
TypedChunk::WordPositionDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
|
||||
>(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_number_docids,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
);
|
||||
|
||||
original_pipeline_result.and(flattened_pipeline_result)
|
||||
}
|
||||
|
||||
/// Spawn a new task to extract data for a specific DB using extract_fn.
|
||||
/// Generated grenad chunks are merged using the merge_fn.
|
||||
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
|
||||
/// and sent into lmdb_writer_sx.
|
||||
fn run_extraction_task<FE, FS, M>(
|
||||
chunk: grenad::Reader<CursorClonableMmap>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
extract_fn: FE,
|
||||
serialize_fn: FS,
|
||||
) where
|
||||
FE: Fn(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
GrenadParameters,
|
||||
&InnerIndexSettingsDiff,
|
||||
) -> Result<M>
|
||||
+ Sync
|
||||
+ Send
|
||||
+ 'static,
|
||||
FS: Fn(M) -> TypedChunk + Sync + Send + 'static,
|
||||
M: Send,
|
||||
{
|
||||
let current_span = tracing::Span::current();
|
||||
|
||||
rayon::spawn(move || {
|
||||
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks");
|
||||
let _entered = child_span.enter();
|
||||
|
||||
match extract_fn(chunk, indexer, &settings_diff) {
|
||||
Ok(chunk) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn request_threads() -> &'static ThreadPoolNoAbort {
|
||||
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
|
||||
|
||||
REQUEST_THREADS.get_or_init(|| {
|
||||
ThreadPoolNoAbortBuilder::new()
|
||||
.num_threads(crate::vector::REQUEST_PARALLELISM)
|
||||
.thread_name(|index| format!("embedding-request-{index}"))
|
||||
.build()
|
||||
.unwrap()
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
/// - documents
|
||||
fn send_original_documents_data(
|
||||
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||
) -> Result<()> {
|
||||
let original_documents_chunk =
|
||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
|
||||
let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
|
||||
// no point in indexing vectors without embedders
|
||||
&& (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
|
||||
|
||||
if index_vectors {
|
||||
let settings_diff = settings_diff.clone();
|
||||
let embedders_configs = embedders_configs.clone();
|
||||
|
||||
let original_documents_chunk = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
match extract_vector_points(
|
||||
original_documents_chunk.clone(),
|
||||
indexer,
|
||||
&embedders_configs,
|
||||
&settings_diff,
|
||||
&possible_embedding_mistakes,
|
||||
) {
|
||||
Ok((extracted_vectors, unused_vectors_distribution)) => {
|
||||
for ExtractedVectorPoints {
|
||||
manual_vectors,
|
||||
remove_vectors,
|
||||
prompts,
|
||||
embedder_name,
|
||||
embedder,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
} in extracted_vectors
|
||||
{
|
||||
let embeddings = match extract_embeddings(
|
||||
prompts,
|
||||
indexer,
|
||||
embedder.clone(),
|
||||
&embedder_name,
|
||||
&possible_embedding_mistakes,
|
||||
&unused_vectors_distribution,
|
||||
request_threads(),
|
||||
) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
None
|
||||
}
|
||||
};
|
||||
if !(remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
||||
{
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension: embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: create a custom internal error
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
/// - documents_ids
|
||||
/// - docid_word_positions
|
||||
/// - docid_fid_facet_numbers
|
||||
/// - docid_fid_facet_strings
|
||||
/// - docid_fid_facet_exists
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn send_and_extract_flattened_documents_data(
|
||||
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
primary_key_id: FieldId,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
||||
)> {
|
||||
let flattened_documents_chunk =
|
||||
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
|
||||
if settings_diff.run_geo_indexing() {
|
||||
let documents_chunk_cloned = flattened_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
let settings_diff = settings_diff.clone();
|
||||
rayon::spawn(move || {
|
||||
let result =
|
||||
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, &settings_diff);
|
||||
let _ = match result {
|
||||
Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
|
||||
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
rayon::join(
|
||||
|| {
|
||||
let docid_word_positions_chunk = extract_docid_word_positions(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
max_positions_per_attributes,
|
||||
)?;
|
||||
|
||||
// send docid_word_positions_chunk to DB writer
|
||||
let docid_word_positions_chunk =
|
||||
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
||||
|
||||
Ok(docid_word_positions_chunk)
|
||||
},
|
||||
|| {
|
||||
let ExtractedFacetValues {
|
||||
fid_docid_facet_numbers_chunk,
|
||||
fid_docid_facet_strings_chunk,
|
||||
fid_facet_is_null_docids_chunk,
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
fid_facet_exists_docids_chunk,
|
||||
} = extract_fid_docid_facet_values(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
)?;
|
||||
|
||||
// send fid_docid_facet_numbers_chunk to DB writer
|
||||
let fid_docid_facet_numbers_chunk =
|
||||
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
)));
|
||||
|
||||
// send fid_docid_facet_strings_chunk to DB writer
|
||||
let fid_docid_facet_strings_chunk =
|
||||
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
)));
|
||||
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk)));
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
)));
|
||||
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk)));
|
||||
|
||||
Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk))
|
||||
},
|
||||
);
|
||||
|
||||
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
|
||||
}
|
||||
@@ -0,0 +1,24 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use memmap2::Mmap;
|
||||
|
||||
/// Wrapper around an Mmap that allows grenad chunks to be virtually cloned
/// in a parallel process like indexing.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ClonableMmap {
|
||||
inner: Arc<Mmap>,
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for ClonableMmap {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.inner.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Mmap> for ClonableMmap {
|
||||
fn from(inner: Mmap) -> ClonableMmap {
|
||||
ClonableMmap { inner: Arc::new(inner) }
|
||||
}
|
||||
}
|
||||
|
||||
pub type CursorClonableMmap = std::io::Cursor<ClonableMmap>;
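// A small usage sketch, assuming a hypothetical `chunk.grenad` file on disk; it only
// illustrates that cloning is cheap because the underlying Mmap is shared behind an Arc.
fn share_chunk() -> std::io::Result<()> {
    let file = std::fs::File::open("chunk.grenad")?; // hypothetical path
    // Safety: the file must not be truncated or modified while it is mapped.
    let mmap = unsafe { memmap2::Mmap::map(&file)? };
    let chunk = ClonableMmap::from(mmap);
    let (first, second) = (chunk.clone(), chunk.clone()); // both share the same mapping
    assert_eq!(first.as_ref().len(), second.as_ref().len());
    Ok(())
}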
|
||||
@@ -0,0 +1,217 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter, Seek};
|
||||
|
||||
use grenad::{CompressionType, MergeFunction, Sorter};
|
||||
use heed::types::Bytes;
|
||||
|
||||
use super::ClonableMmap;
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::Result;
|
||||
|
||||
/// This is a reasonable value given the fact
/// that there is one grenad sorter per thread.
|
||||
const MAX_GRENAD_SORTER_USAGE: usize = 500 * 1024 * 1024; // 500 MiB
|
||||
|
||||
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
|
||||
|
||||
pub fn create_writer<R: io::Write>(
|
||||
typ: grenad::CompressionType,
|
||||
level: Option<u32>,
|
||||
file: R,
|
||||
) -> grenad::Writer<BufWriter<R>> {
|
||||
let mut builder = grenad::Writer::builder();
|
||||
builder.compression_type(typ);
|
||||
if let Some(level) = level {
|
||||
builder.compression_level(level);
|
||||
}
|
||||
builder.build(BufWriter::new(file))
|
||||
}
|
||||
|
||||
/// A helper function that creates a grenad sorter
|
||||
/// with the given parameters. The max memory is
|
||||
/// clamped to something reasonable.
|
||||
pub fn create_sorter<MF: MergeFunction>(
|
||||
sort_algorithm: grenad::SortAlgorithm,
|
||||
merge: MF,
|
||||
chunk_compression_type: grenad::CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
max_nb_chunks: Option<usize>,
|
||||
max_memory: Option<usize>,
|
||||
sort_in_parallel: bool,
|
||||
) -> grenad::Sorter<MF> {
|
||||
let mut builder = grenad::Sorter::builder(merge);
|
||||
builder.chunk_compression_type(chunk_compression_type);
|
||||
if let Some(level) = chunk_compression_level {
|
||||
builder.chunk_compression_level(level);
|
||||
}
|
||||
if let Some(nb_chunks) = max_nb_chunks {
|
||||
builder.max_nb_chunks(nb_chunks);
|
||||
}
|
||||
if let Some(memory) = max_memory {
|
||||
builder.dump_threshold(memory.min(MAX_GRENAD_SORTER_USAGE));
|
||||
builder.allow_realloc(false);
|
||||
}
|
||||
builder.sort_algorithm(sort_algorithm);
|
||||
builder.sort_in_parallel(sort_in_parallel);
|
||||
builder.build()
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
||||
pub fn sorter_into_reader<MF>(
|
||||
sorter: grenad::Sorter<MF>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<BufReader<File>>>
|
||||
where
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
sorter.write_into_stream_writer(&mut writer)?;
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
pub fn writer_into_reader(
|
||||
writer: grenad::Writer<BufWriter<File>>,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
|
||||
file.rewind()?;
|
||||
grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
|
||||
}
|
||||
|
||||
/// # Safety
|
||||
/// We use memory mapping inside. So, according to the Rust community, it's unsafe.
|
||||
pub unsafe fn as_cloneable_grenad(
|
||||
reader: &grenad::Reader<BufReader<File>>,
|
||||
) -> Result<grenad::Reader<CursorClonableMmap>> {
|
||||
let file = reader.get_ref().get_ref();
|
||||
let mmap = memmap2::Mmap::map(file)?;
|
||||
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
|
||||
let reader = grenad::Reader::new(cursor)?;
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct GrenadParameters {
|
||||
pub chunk_compression_type: CompressionType,
|
||||
pub chunk_compression_level: Option<u32>,
|
||||
pub max_memory: Option<usize>,
|
||||
pub max_nb_chunks: Option<usize>,
|
||||
}
|
||||
|
||||
impl Default for GrenadParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
max_memory: None,
|
||||
max_nb_chunks: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GrenadParameters {
|
||||
/// This function uses the number of threads in the current threadpool to compute the value.
///
/// This should be called inside of a rayon thread pool;
/// otherwise, it will use the global number of threads.
///
/// The max memory cannot exceed a given reasonable value.
|
||||
pub fn max_memory_by_thread(&self) -> Option<usize> {
|
||||
self.max_memory.map(|max_memory| {
|
||||
(max_memory / rayon::current_num_threads()).min(MAX_GRENAD_SORTER_USAGE)
|
||||
})
|
||||
}
|
||||
}
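// A small numeric sketch of the clamping above, assuming a 16 GiB indexing budget shared
// by a 16-thread rayon pool; the figures are illustrative only.
fn sketch_per_thread_budget() -> usize {
    let max_memory: usize = 16 * 1024 * 1024 * 1024; // 16 GiB for the whole pool
    let threads: usize = 16;
    // 16 GiB / 16 threads = 1 GiB per thread, clamped down to the 500 MiB sorter cap
    (max_memory / threads).min(MAX_GRENAD_SORTER_USAGE)
}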
|
||||
|
||||
/// Returns an iterator that outputs grenad readers of obkv documents
/// with a maximum size of approximately `documents_chunk_size`.
///
/// The grenad obkv entries are composed of an incremental document id, big-endian
/// encoded as the key, and an obkv object with a `u8` for the field as the key
/// and a simple UTF-8 encoded string as the value.
|
||||
pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
||||
reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
documents_chunk_size: usize,
|
||||
) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
|
||||
let mut continue_reading = true;
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
|
||||
let mut transposer = move || {
|
||||
if !continue_reading {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut current_chunk_size = 0u64;
|
||||
let mut obkv_documents = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
while let Some((document_id, obkv)) = cursor.move_on_next()? {
|
||||
if !obkv.is_empty() {
|
||||
obkv_documents.insert(document_id, obkv)?;
|
||||
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
|
||||
|
||||
if current_chunk_size >= documents_chunk_size as u64 {
|
||||
return writer_into_reader(obkv_documents).map(Some);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
continue_reading = false;
|
||||
writer_into_reader(obkv_documents).map(Some)
|
||||
};
|
||||
|
||||
Ok(std::iter::from_fn(move || transposer().transpose()))
|
||||
}
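// A hedged usage sketch: split a grenad file of obkv documents into ~4 MiB chunks and
// count them; `documents.grenad` is a hypothetical input file and the error type is
// boxed only to keep the example short.
fn count_document_chunks() -> std::result::Result<usize, Box<dyn std::error::Error>> {
    let file = std::fs::File::open("documents.grenad")?; // hypothetical path
    let reader = grenad::Reader::new(std::io::BufReader::new(file))?;

    let mut count = 0;
    for chunk in grenad_obkv_into_chunks(reader, GrenadParameters::default(), 4 * 1024 * 1024)? {
        let _chunk = chunk?; // each item is a grenad::Reader over one chunk of documents
        count += 1;
    }
    Ok(count)
}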
|
||||
|
||||
/// Writes the provided sorter into the database using the serialize_value function.
/// The merge_values function is used if an entry already exists in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
||||
pub fn write_sorter_into_database<K, V, FS, FM, MF>(
|
||||
sorter: Sorter<MF>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut heed::RwTxn<'_>,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut merger_iter = sorter.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = merger_iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = if index_is_empty {
|
||||
Some(serialize_value(value, &mut buffer)?)
|
||||
} else {
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
}
|
||||
};
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
None => {
|
||||
database.delete(wtxn, key)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
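// A hedged sketch of the two callbacks this helper expects, wiring a DelAdd sorter of
// CboRoaringBitmaps into a word docids database; the database handle and the sorter are
// assumed to already exist, and the merge helper is the one defined further below in
// this diff. The `sketch_` names are illustrative, not crate APIs.
use crate::update::index_documents::helpers::{
    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, MergeDeladdCboRoaringBitmaps,
};

// on an empty index there is nothing to merge: keep the addition side of the obkv as-is
fn sketch_keep_addition_side<'a>(value: &'a [u8], _buffer: &'a mut Vec<u8>) -> Result<&'a [u8]> {
    use crate::update::del_add::{DelAdd, KvReaderDelAdd};
    Ok(KvReaderDelAdd::from_slice(value).get(DelAdd::Addition).unwrap_or_default())
}

fn sketch_flush_word_docids(
    wtxn: &mut heed::RwTxn<'_>,
    database: heed::Database<heed::types::Str, crate::CboRoaringBitmapCodec>,
    sorter: Sorter<MergeDeladdCboRoaringBitmaps>,
    index_is_empty: bool,
) -> Result<()> {
    write_sorter_into_database(
        sorter,
        &database,
        wtxn,
        index_is_empty,
        sketch_keep_addition_side,
        // otherwise the DelAdd obkv is applied on top of the previously stored bitmap
        merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
    )
}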
|
||||
@@ -0,0 +1,314 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeSet;
|
||||
use std::io;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use either::Either;
|
||||
use grenad::MergeFunction;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::transform::Operation;
|
||||
use crate::Result;
|
||||
|
||||
pub type EitherObkvMerge =
|
||||
Either<ObkvsKeepLastAdditionMergeDeletions, ObkvsMergeAdditionsAndDeletions>;
|
||||
|
||||
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||
buffer.clear();
|
||||
buffer.reserve(bitmap.serialized_size());
|
||||
bitmap.serialize_into(buffer)
|
||||
}
|
||||
|
||||
pub struct MergeRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let merged = values
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.map(RoaringBitmap::deserialize_from)
|
||||
.map(StdResult::unwrap)
|
||||
.reduce(|a, b| a | b)
|
||||
.unwrap();
|
||||
let mut buffer = Vec::new();
|
||||
serialize_roaring_bitmap(&merged, &mut buffer)?;
|
||||
Ok(Cow::Owned(buffer))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct KeepFirst;
|
||||
|
||||
impl MergeFunction for KeepFirst {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(values[0].clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// Only the last value associated with an id is kept.
|
||||
pub struct KeepLatestObkv;
|
||||
|
||||
impl MergeFunction for KeepLatestObkv {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(obkvs.last().unwrap().clone())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn merge_two_del_add_obkvs(
|
||||
base: &obkv::KvReaderU16,
|
||||
update: &obkv::KvReaderU16,
|
||||
merge_additions: bool,
|
||||
buffer: &mut Vec<u8>,
|
||||
) {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
buffer.clear();
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
match eob {
|
||||
Left((k, v)) => {
|
||||
if merge_additions {
|
||||
writer.insert(k, v).unwrap()
|
||||
} else {
|
||||
// If merge_additions is false, recreate an obkv keeping the deletions only.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::from_slice(v);
|
||||
|
||||
if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
value_writer.finish().unwrap();
|
||||
writer.insert(k, &value_buffer).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
Right((k, v)) => writer.insert(k, v).unwrap(),
|
||||
Both((k, base), (_, update)) => {
|
||||
// merge deletions and additions.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::from_slice(base);
|
||||
let update_reader = KvReaderDelAdd::from_slice(update);
|
||||
|
||||
// keep newest deletion.
|
||||
if let Some(deletion) = update_reader
|
||||
.get(DelAdd::Deletion)
|
||||
.or_else(|| base_reader.get(DelAdd::Deletion))
|
||||
{
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
}
|
||||
|
||||
// keep base addition only if merge_additions is true.
|
||||
let base_addition =
|
||||
merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
|
||||
// keep newest addition.
|
||||
// TODO use or_else
|
||||
if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
|
||||
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||
}
|
||||
|
||||
value_writer.finish().unwrap();
|
||||
writer.insert(k, &value_buffer).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
fn inner_merge_del_add_obkvs<'a>(
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
merge_additions: bool,
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
// pop the newest operation from the list.
|
||||
let (newest, obkvs) = obkvs.split_last().unwrap();
|
||||
// keep the operation type for the returned value.
|
||||
let newest_operation_type = newest[0];
|
||||
|
||||
// treat the newest obkv as the starting point of the merge.
|
||||
let mut acc_operation_type = newest_operation_type;
|
||||
let mut acc = newest[1..].to_vec();
|
||||
let mut buffer = Vec::new();
|
||||
// reverse iter from the most recent to the oldest.
|
||||
for current in obkvs.iter().rev() {
|
||||
// if in the previous iteration there was a complete deletion,
|
||||
// stop the merge process.
|
||||
if acc_operation_type == Operation::Deletion as u8 {
|
||||
break;
|
||||
}
|
||||
|
||||
let newest = obkv::KvReader::from_slice(&acc);
|
||||
let oldest = obkv::KvReader::from_slice(¤t[1..]);
|
||||
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
|
||||
|
||||
// we want the result of the merge into our accumulator.
|
||||
std::mem::swap(&mut acc, &mut buffer);
|
||||
acc_operation_type = current[0];
|
||||
}
|
||||
|
||||
acc.insert(0, newest_operation_type);
|
||||
Ok(Cow::from(acc))
|
||||
}
|
||||
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ObkvsMergeAdditionsAndDeletions;
|
||||
|
||||
impl MergeFunction for ObkvsMergeAdditionsAndDeletions {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, true)
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ObkvsKeepLastAdditionMergeDeletions;
|
||||
|
||||
impl MergeFunction for ObkvsKeepLastAdditionMergeDeletions {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, false)
|
||||
}
|
||||
}
|
||||
|
||||
/// Does a union of all the CboRoaringBitmaps in the values.
|
||||
pub struct MergeCboRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeCboRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let mut vec = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
|
||||
Ok(Cow::from(vec))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Does a union of CboRoaringBitmaps on both sides of a DelAdd obkv
/// separately and outputs a new DelAdd with both unions.
|
||||
pub struct MergeDeladdCboRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeDeladdCboRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_bitmaps_bytes = Vec::new();
|
||||
let mut add_bitmaps_bytes = Vec::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
|
||||
del_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
|
||||
add_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let mut buffer = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
///
/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
/// the second one is the CboRoaringBitmap to merge into.
|
||||
pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
|
||||
deladd_obkv: &[u8],
|
||||
previous: &[u8],
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> Result<Option<&'a [u8]>> {
|
||||
Ok(CboRoaringBitmapCodec::merge_deladd_into(
|
||||
KvReaderDelAdd::from_slice(deladd_obkv),
|
||||
previous,
|
||||
buffer,
|
||||
)?)
|
||||
}
|
||||
|
||||
/// Does a union of BTreeSets on both sides of a DelAdd obkv
/// separately and outputs a new DelAdd with both unions.
|
||||
pub struct MergeDeladdBtreesetString;
|
||||
|
||||
impl MergeFunction for MergeDeladdBtreesetString {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_set = BTreeSet::new();
|
||||
let mut add_set = BTreeSet::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(bytes) = obkv.get(DelAdd::Deletion) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
del_set.insert(value);
|
||||
}
|
||||
}
|
||||
if let Some(bytes) = obkv.get(DelAdd::Addition) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
add_set.insert(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let del = serde_json::to_vec(&del_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
|
||||
let add = serde_json::to_vec(&add_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &add)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Used when trying to merge readers, but you don't actually care about the values.
|
||||
pub struct MergeIgnoreValues;
|
||||
|
||||
impl MergeFunction for MergeIgnoreValues {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(
|
||||
&self,
|
||||
_key: &[u8],
|
||||
_values: &[Cow<'a, [u8]>],
|
||||
) -> std::result::Result<Cow<'a, [u8]>, Self::Error> {
|
||||
Ok(Cow::Owned(Vec::new()))
|
||||
}
|
||||
}
|
||||
66
crates/milli/src/update/index_documents/helpers/mod.rs
Normal file
@@ -0,0 +1,66 @@
mod clonable_mmap;
mod grenad_helpers;
mod merge_functions;

use std::collections::HashSet;
use std::convert::{TryFrom, TryInto};

pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
use fst::{IntoStreamer, Streamer};
pub use grenad_helpers::*;
pub use merge_functions::*;

use crate::MAX_WORD_LENGTH;

pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
    key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()
}

/// Divides one slice into two at an index, returns `None` if mid is out of bounds.
pub fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
    if mid <= slice.len() {
        Some(slice.split_at(mid))
    } else {
        None
    }
}

/// Divides one slice into an array and the tail at an index,
/// returns `None` if `N` is out of bounds.
pub fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
where
    [T; N]: for<'a> TryFrom<&'a [T]>,
{
    let (head, tail) = try_split_at(slice, N)?;
    let head = head.try_into().ok()?;
    Some((head, tail))
}

/// Converts an fst Stream into a HashSet of byte strings.
pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>>
where
    I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
    S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
{
    let mut hashset = HashSet::new();
    let mut stream = stream.into_stream();
    while let Some(value) = stream.next() {
        hashset.insert(value.to_owned());
    }
    hashset
}

/// Converts an fst Stream into a Vec of Strings.
pub fn fst_stream_into_vec<'f, I, S>(stream: I) -> Vec<String>
where
    I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
    S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
{
    let mut strings = Vec::new();
    let mut stream = stream.into_stream();
    while let Some(word) = stream.next() {
        let s = std::str::from_utf8(word).unwrap();
        strings.push(s.to_owned());
    }
    strings
}
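// Illustrative sketch of how `try_split_array_at` is typically used by the
// indexing code: keys are often a 4-byte big-endian document id followed by
// a variable-length suffix. The helper name `split_docid_key` is hypothetical.
fn split_docid_key(key: &[u8]) -> Option<(u32, &[u8])> {
    let (docid_bytes, tail) = try_split_array_at::<u8, 4>(key)?;
    Some((u32::from_be_bytes(docid_bytes), tail))
}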
3453
crates/milli/src/update/index_documents/mod.rs
Normal file
File diff suppressed because it is too large
86
crates/milli/src/update/index_documents/parallel.rs
Normal file
@@ -0,0 +1,86 @@
use heed::types::Bytes;
use heed::{Database, RoTxn};
use obkv::KvReaderU16;
use roaring::RoaringBitmap;

use crate::{all_obkv_to_json, DocumentId, FieldsIdsMap, Object, ObkvCodec, Result, BEU32};

pub struct ImmutableObkvs<'t> {
    ids: RoaringBitmap,
    fields_ids_map: FieldsIdsMap,
    slices: Vec<&'t [u8]>,
}

impl<'t> ImmutableObkvs<'t> {
    /// Creates the structure by fetching all the OBKVs
    /// and keeping the transaction that makes the pointers valid.
    pub fn new(
        rtxn: &'t RoTxn,
        documents_database: Database<BEU32, ObkvCodec>,
        fields_ids_map: FieldsIdsMap,
        subset: RoaringBitmap,
    ) -> heed::Result<Self> {
        let mut slices = Vec::new();
        let documents_database = documents_database.remap_data_type::<Bytes>();
        for docid in &subset {
            let slice = documents_database.get(rtxn, &docid)?.unwrap();
            slices.push(slice);
        }

        Ok(ImmutableObkvs { ids: subset, fields_ids_map, slices })
    }

    /// Returns the OBKV identified by the given ID.
    pub fn obkv(&self, docid: DocumentId) -> heed::Result<Option<&'t KvReaderU16>> {
        match self
            .ids
            .rank(docid)
            .checked_sub(1)
            .and_then(|offset| self.slices.get(offset as usize))
        {
            Some(&bytes) => Ok(Some(bytes.into())),
            None => Ok(None),
        }
    }

    /// Returns the owned rhai::Map identified by the given ID.
    pub fn rhai_map(&self, docid: DocumentId) -> Result<Option<rhai::Map>> {
        let obkv = match self.obkv(docid) {
            Ok(Some(obkv)) => obkv,
            Ok(None) => return Ok(None),
            Err(e) => return Err(e.into()),
        };

        let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
        let map: Result<rhai::Map> = all_keys
            .iter()
            .copied()
            .flat_map(|id| obkv.get(id).map(|value| (id, value)))
            .map(|(id, value)| {
                let name = self.fields_ids_map.name(id).ok_or(
                    crate::error::FieldIdMapMissingEntry::FieldId {
                        field_id: id,
                        process: "all_obkv_to_rhaimap",
                    },
                )?;
                let value = serde_json::from_slice(value)
                    .map_err(crate::error::InternalError::SerdeJson)?;
                Ok((name.into(), value))
            })
            .collect();

        map.map(Some)
    }

    pub fn json_map(&self, docid: DocumentId) -> Result<Option<Object>> {
        let obkv = match self.obkv(docid) {
            Ok(Some(obkv)) => obkv,
            Ok(None) => return Ok(None),
            Err(e) => return Err(e.into()),
        };

        all_obkv_to_json(obkv, &self.fields_ids_map).map(Some)
    }
}

unsafe impl Sync for ImmutableObkvs<'_> {}
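// Illustrative sketch of the intended usage, assuming an open `Index` and a
// read transaction; the helper name `read_first_document` is hypothetical.
fn read_first_document(index: &Index, rtxn: &heed::RoTxn) -> Result<Option<Object>> {
    let fields_ids_map = index.fields_ids_map(rtxn)?;
    let subset = index.documents_ids(rtxn)?;
    let obkvs = ImmutableObkvs::new(rtxn, index.documents, fields_ids_map, subset.clone())?;
    match subset.min() {
        // The slices borrow from `rtxn`, so the view must not outlive it.
        Some(docid) => obkvs.json_map(docid),
        None => Ok(None),
    }
}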
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
[]
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
[2, ]
|
||||
@@ -0,0 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
benoit [2, ]
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,6 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
|
||||
2 [21, ]
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
2 0 2.2 1 [21, ]
|
||||
|
||||
@@ -0,0 +1,17 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ]
|
||||
1 0 aquarium 1 [5, ]
|
||||
1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ]
|
||||
1 0 cartoon 1 [2, 7, 15, 17, ]
|
||||
1 0 colorfulness 1 [13, ]
|
||||
1 0 design 1 [2, 18, ]
|
||||
1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ]
|
||||
1 0 geometry 1 [19, ]
|
||||
1 0 letter 1 [1, ]
|
||||
1 0 outdoor 1 [4, ]
|
||||
1 0 painting 1 [3, ]
|
||||
1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ]
|
||||
2 0 design 1 [21, ]
|
||||
|
||||
@@ -0,0 +1,38 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
|
||||
2 [21, ]
|
||||
36 [3, ]
|
||||
37 [4, ]
|
||||
38 [5, ]
|
||||
39 [6, ]
|
||||
40 [7, ]
|
||||
41 [8, ]
|
||||
42 [9, ]
|
||||
43 [10, ]
|
||||
44 [11, ]
|
||||
45 [12, ]
|
||||
46 [13, ]
|
||||
47 [14, ]
|
||||
5 [1, ]
|
||||
52 [15, ]
|
||||
57 [16, ]
|
||||
58 [17, ]
|
||||
68 [18, ]
|
||||
69 [19, ]
|
||||
7 [2, ]
|
||||
71 [21, ]
|
||||
abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
|
||||
aquarium [5, ]
|
||||
art [4, 5, 8, 9, 10, 12, 17, ]
|
||||
cartoon [2, 7, 15, 17, ]
|
||||
colorfulness [13, ]
|
||||
design [2, 18, 21, ]
|
||||
drawing [3, 4, 5, 8, 10, 11, 16, ]
|
||||
geometry [19, ]
|
||||
letter [1, ]
|
||||
outdoor [4, ]
|
||||
painting [3, ]
|
||||
pattern [2, 3, 9, 10, 13, 14, 16, ]
|
||||
|
||||
@@ -0,0 +1,25 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
1 1 36 [3, ]
|
||||
1 1 37 [4, ]
|
||||
1 1 38 [5, ]
|
||||
1 1 39 [6, ]
|
||||
1 1 40 [7, ]
|
||||
1 1 41 [8, ]
|
||||
1 1 42 [9, ]
|
||||
1 1 43 [10, ]
|
||||
1 1 44 [11, ]
|
||||
1 1 45 [12, ]
|
||||
1 1 46 [13, ]
|
||||
1 1 47 [14, ]
|
||||
1 1 5 [1, ]
|
||||
1 1 52 [15, ]
|
||||
1 1 57 [16, ]
|
||||
1 1 58 [17, ]
|
||||
1 1 68 [18, ]
|
||||
1 1 69 [19, ]
|
||||
1 1 7 [2, ]
|
||||
1 1 71 [21, ]
|
||||
1 2 2 [21, ]
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
3 0 48.9021 1 [19, ]
|
||||
3 0 49.9314 1 [17, ]
|
||||
3 0 50.1793 1 [15, ]
|
||||
3 0 50.2844 1 [14, ]
|
||||
3 0 50.3518 1 [13, ]
|
||||
3 0 50.4502 1 [12, ]
|
||||
3 0 50.6053 1 [8, ]
|
||||
3 0 50.6224 1 [3, ]
|
||||
3 0 50.6299 1 [0, ]
|
||||
3 0 50.6312 1 [2, ]
|
||||
3 0 50.6415 1 [1, ]
|
||||
3 0 50.7453 1 [7, ]
|
||||
3 0 50.8466 1 [10, ]
|
||||
3 0 51.0537 1 [9, ]
|
||||
4 0 2.271 1 [17, ]
|
||||
4 0 2.3708 1 [19, ]
|
||||
4 0 2.7637 1 [14, ]
|
||||
4 0 3.0569 1 [0, ]
|
||||
4 0 3.1106 1 [1, 2, ]
|
||||
4 0 3.1476 1 [3, ]
|
||||
4 0 3.2189 1 [15, ]
|
||||
4 0 3.2206 1 [7, ]
|
||||
4 0 3.3758 1 [8, ]
|
||||
4 0 3.5326 1 [13, ]
|
||||
4 0 3.6957 1 [9, ]
|
||||
4 0 3.9623 1 [12, ]
|
||||
4 0 4.337 1 [10, ]
|
||||
|
||||
@@ -0,0 +1,4 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
0 [1, ]
|
||||
1 [2, ]
|
||||
10 [1, ]
|
||||
12 [0, ]
|
||||
1344 [3, ]
|
||||
2 [0, ]
|
||||
23 [5, ]
|
||||
25 [2, ]
|
||||
3 [0, ]
|
||||
35 [5, ]
|
||||
4 [4, ]
|
||||
42 [0, 5, ]
|
||||
456 [1, ]
|
||||
5 [0, ]
|
||||
99 [2, ]
|
||||
adams [5, ]
|
||||
adventure [1, ]
|
||||
alice [2, ]
|
||||
and [0, 4, ]
|
||||
antoine [1, ]
|
||||
austin [0, ]
|
||||
blood [4, ]
|
||||
carroll [2, ]
|
||||
de [1, ]
|
||||
douglas [5, ]
|
||||
exupery [1, ]
|
||||
fantasy [2, 3, 4, ]
|
||||
galaxy [5, ]
|
||||
guide [5, ]
|
||||
half [4, ]
|
||||
harry [4, ]
|
||||
hitchhiker [5, ]
|
||||
hobbit [3, ]
|
||||
in [2, ]
|
||||
j [3, 4, ]
|
||||
jane [0, ]
|
||||
k [4, ]
|
||||
le [1, ]
|
||||
lewis [2, ]
|
||||
petit [1, ]
|
||||
potter [4, ]
|
||||
prejudice [0, ]
|
||||
pride [0, ]
|
||||
prince [1, 4, ]
|
||||
r [3, ]
|
||||
romance [0, ]
|
||||
rowling [4, ]
|
||||
s [5, ]
|
||||
saint [1, ]
|
||||
the [3, 4, 5, ]
|
||||
to [5, ]
|
||||
tolkien [3, ]
|
||||
wonderland [2, ]
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
---
|
||||
source: milli/src/update/index_documents/mod.rs
|
||||
---
|
||||
0 [1, ]
|
||||
1 [2, ]
|
||||
10 [1, ]
|
||||
12 [0, ]
|
||||
1344 [3, ]
|
||||
1813 [0, ]
|
||||
2 [0, ]
|
||||
23 [5, ]
|
||||
25 [2, ]
|
||||
3 [0, ]
|
||||
35 [5, ]
|
||||
4 [4, ]
|
||||
42 [0, 5, ]
|
||||
456 [1, ]
|
||||
5 [0, ]
|
||||
99 [2, ]
|
||||
adams [5, ]
|
||||
adventure [1, ]
|
||||
alice [2, ]
|
||||
and [0, 4, ]
|
||||
antoine [1, ]
|
||||
austen [0, ]
|
||||
blood [4, ]
|
||||
carroll [2, ]
|
||||
de [1, ]
|
||||
douglas [5, ]
|
||||
exupery [1, ]
|
||||
fantasy [2, 3, 4, ]
|
||||
galaxy [5, ]
|
||||
guide [5, ]
|
||||
half [4, ]
|
||||
harry [4, ]
|
||||
hitchhiker [5, ]
|
||||
hobbit [3, ]
|
||||
in [2, ]
|
||||
j [0, 3, 4, ]
|
||||
k [4, ]
|
||||
lewis [2, ]
|
||||
little [1, ]
|
||||
potter [4, ]
|
||||
prejudice [0, ]
|
||||
pride [0, ]
|
||||
prince [1, ]
|
||||
princess [4, ]
|
||||
r [3, ]
|
||||
romance [0, ]
|
||||
rowling [4, ]
|
||||
s [5, ]
|
||||
saint [1, ]
|
||||
the [1, 3, 4, 5, ]
|
||||
to [5, ]
|
||||
tolkien [3, ]
|
||||
wonderland [2, ]
|
||||
|
||||
1270
crates/milli/src/update/index_documents/transform.rs
Normal file
File diff suppressed because it is too large
852
crates/milli/src/update/index_documents/typed_chunk.rs
Normal file
@@ -0,0 +1,852 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use bytemuck::allocation::pod_collect_to_vec;
|
||||
use grenad::{MergeFunction, Merger, MergerBuilder};
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, RwTxn};
|
||||
use obkv::{KvReader, KvWriter};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
|
||||
CursorClonableMmap, KeepFirst, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
|
||||
MergeIgnoreValues,
|
||||
};
|
||||
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||
use crate::facet::FacetType;
|
||||
use crate::index::db_name::DOCUMENTS;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
use crate::update::index_documents::helpers::{
|
||||
as_cloneable_grenad, try_split_array_at, KeepLatestObkv,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::ArroyWrapper;
|
||||
use crate::{
|
||||
lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
|
||||
Result, SerializationError, U8StrStrCodec,
|
||||
};
|
||||
|
||||
/// This struct accumulates and groups the TypedChunks
/// and is able to give the biggest accumulated group to index them all together
/// with a merger.
|
||||
#[derive(Default)]
|
||||
pub(crate) struct ChunkAccumulator {
|
||||
inner: Vec<Vec<TypedChunk>>,
|
||||
}
|
||||
|
||||
impl ChunkAccumulator {
|
||||
pub fn pop_longest(&mut self) -> Option<Vec<TypedChunk>> {
|
||||
match self.inner.iter().max_by_key(|v| v.len()) {
|
||||
Some(left) => {
|
||||
let position = self.inner.iter().position(|right| left.len() == right.len());
|
||||
position.map(|p| self.inner.remove(p)).filter(|v| !v.is_empty())
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, chunk: TypedChunk) {
|
||||
match self
|
||||
.inner
|
||||
.iter()
|
||||
.position(|right| right.first().map_or(false, |right| chunk.mergeable_with(right)))
|
||||
{
|
||||
Some(position) => {
|
||||
let v = self.inner.get_mut(position).unwrap();
|
||||
v.push(chunk);
|
||||
}
|
||||
None => self.inner.push(vec![chunk]),
|
||||
}
|
||||
}
|
||||
}
|
||||
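// Illustrative sketch of the accumulator's call pattern: chunks of the same
// kind are grouped together and the largest group is drained first so that it
// can be merged and written in a single pass. The helper name is hypothetical.
fn drain_largest_groups(chunks: Vec<TypedChunk>) -> Vec<Vec<TypedChunk>> {
    let mut accumulator = ChunkAccumulator::default();
    for chunk in chunks {
        accumulator.insert(chunk);
    }
    let mut groups = Vec::new();
    while let Some(group) = accumulator.pop_longest() {
        groups.push(group);
    }
    groups
}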
|
||||
pub(crate) enum TypedChunk {
|
||||
FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
|
||||
Documents(grenad::Reader<CursorClonableMmap>),
|
||||
FieldIdWordCountDocids(grenad::Reader<BufReader<File>>),
|
||||
WordDocids {
|
||||
word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
exact_word_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
word_fid_docids_reader: grenad::Reader<BufReader<File>>,
|
||||
},
|
||||
WordPositionDocids(grenad::Reader<BufReader<File>>),
|
||||
WordPairProximityDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetStringDocids((grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)),
|
||||
FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
|
||||
FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
|
||||
GeoPoints(grenad::Reader<BufReader<File>>),
|
||||
VectorPoints {
|
||||
remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
embeddings: Option<grenad::Reader<BufReader<File>>>,
|
||||
expected_dimension: usize,
|
||||
manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
embedder_name: String,
|
||||
add_to_user_provided: RoaringBitmap,
|
||||
remove_from_user_provided: RoaringBitmap,
|
||||
},
|
||||
}
|
||||
|
||||
impl TypedChunk {
|
||||
fn mergeable_with(&self, other: &Self) -> bool {
|
||||
use TypedChunk::*;
|
||||
match (self, other) {
|
||||
(FieldIdDocidFacetStrings(_), FieldIdDocidFacetStrings(_))
|
||||
| (FieldIdDocidFacetNumbers(_), FieldIdDocidFacetNumbers(_))
|
||||
| (Documents(_), Documents(_))
|
||||
| (FieldIdWordCountDocids(_), FieldIdWordCountDocids(_))
|
||||
| (WordDocids { .. }, WordDocids { .. })
|
||||
| (WordPositionDocids(_), WordPositionDocids(_))
|
||||
| (WordPairProximityDocids(_), WordPairProximityDocids(_))
|
||||
| (FieldIdFacetStringDocids(_), FieldIdFacetStringDocids(_))
|
||||
| (FieldIdFacetNumberDocids(_), FieldIdFacetNumberDocids(_))
|
||||
| (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_))
|
||||
| (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_))
|
||||
| (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_))
|
||||
| (GeoPoints(_), GeoPoints(_)) => true,
|
||||
(
|
||||
VectorPoints { embedder_name: left, expected_dimension: left_dim, .. },
|
||||
VectorPoints { embedder_name: right, expected_dimension: right_dim, .. },
|
||||
) => left == right && left_dim == right_dim,
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Writes the typed chunks into the corresponding LMDB databases of the provided index.
/// Returns the new documents seen.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
pub(crate) fn write_typed_chunk_into_index(
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
index: &Index,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
typed_chunks: Vec<TypedChunk>,
|
||||
) -> Result<(RoaringBitmap, bool)> {
|
||||
let mut is_merged_database = false;
|
||||
match typed_chunks[0] {
|
||||
TypedChunk::Documents(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "documents");
|
||||
let _entered = span.enter();
|
||||
|
||||
let fields_ids_map = index.fields_ids_map(wtxn)?;
|
||||
let vectors_fid =
|
||||
fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let mut builder = MergerBuilder::new(KeepLatestObkv);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::Documents(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
let mut operations: Vec<DocumentOperation> = Default::default();
|
||||
|
||||
let mut docids = index.documents_ids(wtxn)?;
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
|
||||
let embedders: BTreeSet<_> = index
|
||||
.embedding_configs(wtxn)?
|
||||
.into_iter()
|
||||
.map(|IndexEmbeddingConfig { name, .. }| name)
|
||||
.collect();
|
||||
let mut vectors_buffer = Vec::new();
|
||||
while let Some((key, reader)) = iter.next()? {
|
||||
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||
let reader: &KvReader<FieldId> = reader.into();
|
||||
|
||||
let (document_id_bytes, external_id_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?;
|
||||
let docid = DocumentId::from_be_bytes(document_id_bytes);
|
||||
let external_id = std::str::from_utf8(external_id_bytes)?;
|
||||
|
||||
for (field_id, value) in reader.iter() {
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
let addition = if vectors_fid == Some(field_id) {
|
||||
'vectors: {
|
||||
vectors_buffer.clear();
|
||||
let Ok(mut vectors) =
|
||||
crate::vector::parsed_vectors::ParsedVectors::from_bytes(
|
||||
addition,
|
||||
)
|
||||
else {
|
||||
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
|
||||
break 'vectors Some(addition);
|
||||
};
|
||||
vectors.retain_not_embedded_vectors(&embedders);
|
||||
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
|
||||
if vectors.is_empty() {
|
||||
// skip writing empty `_vectors` map
|
||||
break 'vectors None;
|
||||
}
|
||||
|
||||
serde_json::to_writer(&mut vectors_buffer, &vectors)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
Some(vectors_buffer.as_slice())
|
||||
}
|
||||
} else {
|
||||
Some(addition)
|
||||
};
|
||||
|
||||
if let Some(addition) = addition {
|
||||
writer.insert(field_id, addition)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let db = index.documents.remap_data_type::<Bytes>();
|
||||
|
||||
if !writer.is_empty() {
|
||||
db.put(wtxn, &docid, &writer.into_inner().unwrap())?;
|
||||
operations.push(DocumentOperation {
|
||||
external_id: external_id.to_string(),
|
||||
internal_id: docid,
|
||||
kind: DocumentOperationKind::Create,
|
||||
});
|
||||
docids.insert(docid);
|
||||
} else {
|
||||
db.delete(wtxn, &docid)?;
|
||||
operations.push(DocumentOperation {
|
||||
external_id: external_id.to_string(),
|
||||
internal_id: docid,
|
||||
kind: DocumentOperationKind::Delete,
|
||||
});
|
||||
docids.remove(docid);
|
||||
}
|
||||
}
|
||||
let external_documents_docids = index.external_documents_ids();
|
||||
external_documents_docids.apply(wtxn, operations)?;
|
||||
index.put_documents_ids(wtxn, &docids)?;
|
||||
}
|
||||
TypedChunk::FieldIdWordCountDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.field_id_word_count_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordDocids { .. } => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut exact_word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut word_fid_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut fst_merger_builder = MergerBuilder::new(MergeIgnoreValues);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
} = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||
let clonable_exact_word_docids =
|
||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||
|
||||
word_docids_builder.push(word_docids_reader.into_cursor()?);
|
||||
exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
|
||||
word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_word_docids.into_cursor()?);
|
||||
fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
|
||||
}
|
||||
|
||||
let word_docids_merger = word_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
word_docids_merger,
|
||||
&index.word_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let exact_word_docids_merger = exact_word_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
exact_word_docids_merger,
|
||||
&index.exact_word_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
let word_fid_docids_merger = word_fid_docids_builder.build();
|
||||
write_entries_into_database(
|
||||
word_fid_docids_merger,
|
||||
&index.word_fid_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
|
||||
// create fst from word docids
|
||||
let fst_merger = fst_merger_builder.build();
|
||||
let fst = merge_word_docids_reader_into_fst(fst_merger)?;
|
||||
let db_fst = index.words_fst(wtxn)?;
|
||||
|
||||
// merge new fst with database fst
|
||||
let union_stream = fst.op().add(db_fst.stream()).union();
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
builder.extend_stream(union_stream)?;
|
||||
let fst = builder.into_set();
|
||||
index.put_words_fst(wtxn, &fst)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPositionDocids(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordPositionDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.word_position_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetNumberDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut data_size = 0;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
data_size += facet_id_number_docids.len();
|
||||
builder.push(facet_id_number_docids.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size);
|
||||
indexer.execute(wtxn)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetStringDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut facet_id_string_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut normalized_facet_id_string_builder =
|
||||
MergerBuilder::new(MergeDeladdBtreesetString);
|
||||
let mut data_size = 0;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetStringDocids((
|
||||
facet_id_string_docids,
|
||||
normalized_facet_id_string_docids,
|
||||
)) = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
data_size += facet_id_string_docids.len();
|
||||
facet_id_string_builder.push(facet_id_string_docids.into_cursor()?);
|
||||
normalized_facet_id_string_builder
|
||||
.push(normalized_facet_id_string_docids.into_cursor()?);
|
||||
}
|
||||
let facet_id_string_merger = facet_id_string_builder.build();
|
||||
let normalized_facet_id_string_merger = normalized_facet_id_string_builder.build();
|
||||
|
||||
let indexer = FacetsUpdate::new(
|
||||
index,
|
||||
FacetType::String,
|
||||
facet_id_string_merger,
|
||||
Some(normalized_facet_id_string_merger),
|
||||
data_size,
|
||||
);
|
||||
indexer.execute(wtxn)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetExistsDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_exists_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsNullDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_is_null_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetIsEmptyDocids(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.facet_id_is_empty_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::WordPairProximityDocids(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
if settings_diff.only_additional_fields.is_some() {
|
||||
write_proximity_entries_into_database_additional_searchables(
|
||||
merger,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
)?;
|
||||
} else {
|
||||
write_entries_into_database(
|
||||
merger,
|
||||
&index.word_pair_proximity_docids,
|
||||
wtxn,
|
||||
deladd_serialize_add_side,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
)?;
|
||||
}
|
||||
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetNumbers(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(KeepFirst);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let index_fid_docid_facet_numbers =
|
||||
index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let reader = KvReaderDelAdd::from_slice(value);
|
||||
if valid_lmdb_key(key) {
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
(None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?,
|
||||
(Some(_), None) => {
|
||||
index_fid_docid_facet_numbers.delete(wtxn, key)?;
|
||||
}
|
||||
(Some(_), Some(new)) => {
|
||||
index_fid_docid_facet_numbers.put(wtxn, key, new)?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
TypedChunk::FieldIdDocidFacetStrings(_) => {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(KeepFirst);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let index_fid_docid_facet_strings =
|
||||
index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let reader = KvReaderDelAdd::from_slice(value);
|
||||
if valid_lmdb_key(key) {
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
(None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?,
|
||||
(Some(_), None) => {
|
||||
index_fid_docid_facet_strings.delete(wtxn, key)?;
|
||||
}
|
||||
(Some(_), Some(new)) => {
|
||||
index_fid_docid_facet_strings.put(wtxn, key, new)?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
TypedChunk::GeoPoints(_) => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "geo_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(KeepFirst);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::GeoPoints(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
builder.push(chunk.into_cursor()?);
|
||||
}
|
||||
let merger = builder.build();
|
||||
|
||||
let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default();
|
||||
let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?;
|
||||
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
let deladd_obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = deladd_obkv.get(DelAdd::Deletion) {
|
||||
let geopoint = extract_geo_point(value, docid);
|
||||
rtree.remove(&geopoint);
|
||||
geo_faceted_docids.remove(docid);
|
||||
}
|
||||
if let Some(value) = deladd_obkv.get(DelAdd::Addition) {
|
||||
let geopoint = extract_geo_point(value, docid);
|
||||
rtree.insert(geopoint);
|
||||
geo_faceted_docids.insert(docid);
|
||||
}
|
||||
}
|
||||
index.put_geo_rtree(wtxn, &rtree)?;
|
||||
index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
|
||||
}
|
||||
TypedChunk::VectorPoints { .. } => {
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut remove_vectors_builder = MergerBuilder::new(KeepFirst);
|
||||
let mut manual_vectors_builder = MergerBuilder::new(KeepFirst);
|
||||
let mut embeddings_builder = MergerBuilder::new(KeepFirst);
|
||||
let mut add_to_user_provided = RoaringBitmap::new();
|
||||
let mut remove_from_user_provided = RoaringBitmap::new();
|
||||
let mut params = None;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
manual_vectors,
|
||||
embeddings,
|
||||
expected_dimension,
|
||||
embedder_name,
|
||||
add_to_user_provided: aud,
|
||||
remove_from_user_provided: rud,
|
||||
} = typed_chunk
|
||||
else {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
params = Some((expected_dimension, embedder_name));
|
||||
|
||||
remove_vectors_builder.push(remove_vectors.into_cursor()?);
|
||||
manual_vectors_builder.push(manual_vectors.into_cursor()?);
|
||||
if let Some(embeddings) = embeddings {
|
||||
embeddings_builder.push(embeddings.into_cursor()?);
|
||||
}
|
||||
add_to_user_provided |= aud;
|
||||
remove_from_user_provided |= rud;
|
||||
}
|
||||
|
||||
// typed_chunks always contains at least 1 chunk.
|
||||
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
|
||||
|
||||
let mut embedding_configs = index.embedding_configs(wtxn)?;
|
||||
let index_embedder_config = embedding_configs
|
||||
.iter_mut()
|
||||
.find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name)
|
||||
.unwrap();
|
||||
index_embedder_config.user_provided -= remove_from_user_provided;
|
||||
index_embedder_config.user_provided |= add_to_user_provided;
|
||||
|
||||
index.put_embedding_configs(wtxn, embedding_configs)?;
|
||||
|
||||
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
||||
)?;
|
||||
let binary_quantized = settings_diff
|
||||
.old
|
||||
.embedding_configs
|
||||
.get(&embedder_name)
|
||||
.map_or(false, |conf| conf.2);
|
||||
// FIXME: allow customizing distance
|
||||
let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized);
|
||||
|
||||
// remove all the vectors of the docids flagged for vector removal
|
||||
let merger = remove_vectors_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, _)) = iter.next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
writer.del_items(wtxn, expected_dimension, docid)?;
|
||||
}
|
||||
|
||||
// add generated embeddings
|
||||
let merger = embeddings_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
let data = pod_collect_to_vec(value);
|
||||
// it is a code error to have embeddings and not expected_dimension
|
||||
let embeddings = crate::vector::Embeddings::from_inner(data, expected_dimension)
|
||||
// code error if we somehow got the wrong dimension
|
||||
.unwrap();
|
||||
|
||||
if embeddings.embedding_count() > usize::from(u8::MAX) {
|
||||
let external_docid = if let Ok(Some(Ok(index))) = index
|
||||
.external_id_of(wtxn, std::iter::once(docid))
|
||||
.map(|it| it.into_iter().next())
|
||||
{
|
||||
index
|
||||
} else {
|
||||
format!("internal docid={docid}")
|
||||
};
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
external_docid,
|
||||
embeddings.embedding_count(),
|
||||
)));
|
||||
}
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
}
|
||||
|
||||
// perform the manual diff
|
||||
let merger = manual_vectors_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let (left, _index) = try_split_array_at(key).unwrap();
|
||||
let docid = DocumentId::from_be_bytes(left);
|
||||
|
||||
let vector_deladd_obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
|
||||
let vector: Vec<f32> = pod_collect_to_vec(value);
|
||||
|
||||
writer.del_item(wtxn, docid, &vector)?;
|
||||
}
|
||||
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
||||
let vector = pod_collect_to_vec(value);
|
||||
|
||||
// overflow was detected during vector extraction.
|
||||
writer.add_item(wtxn, docid, &vector)?;
|
||||
}
|
||||
}
|
||||
|
||||
tracing::debug!("Finished vector chunk for {}", embedder_name);
|
||||
}
|
||||
}
|
||||
|
||||
Ok((RoaringBitmap::new(), is_merged_database))
|
||||
}
|
||||
|
||||
/// Converts the latitude and longitude back to an xyz GeoPoint.
|
||||
fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
|
||||
let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
|
||||
let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
|
||||
let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
|
||||
let xyz_point = lat_lng_to_xyz(&point);
|
||||
GeoPoint::new(xyz_point, (docid, point))
|
||||
}
|
||||
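// Illustrative sketch of the byte layout `extract_geo_point` expects: two
// native-endian `f64` values, latitude then longitude. Hypothetical helper name.
fn encode_lat_lng_sketch(lat: f64, lng: f64) -> [u8; 16] {
    let mut buffer = [0u8; 16];
    buffer[..8].copy_from_slice(&lat.to_ne_bytes());
    buffer[8..].copy_from_slice(&lng.to_ne_bytes());
    buffer
}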
|
||||
fn merge_word_docids_reader_into_fst<MF>(
|
||||
merger: Merger<CursorClonableMmap, MF>,
|
||||
) -> Result<fst::Set<Vec<u8>>>
|
||||
where
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
||||
while let Some((k, _)) = iter.next()? {
|
||||
builder.insert(k)?;
|
||||
}
|
||||
|
||||
Ok(builder.into_set())
|
||||
}
|
||||
|
||||
/// Writes the provided entries into the database using the serialize_value function.
/// The merge_values function is used if an entry already exists in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn write_entries_into_database<R, K, V, FS, FM, MF>(
|
||||
merger: Merger<R, MF>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
};
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
None => {
|
||||
database.delete(wtxn, key)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Akin to the `write_entries_into_database` function but specialized
/// for the case when we only index additional searchable fields.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn write_proximity_entries_into_database_additional_searchables<R, MF>(
|
||||
merger: Merger<R, MF>,
|
||||
database: &heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
let (proximity_to_insert, word1, word2) =
|
||||
U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?;
|
||||
let data_to_insert = match KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
|
||||
Some(value) => {
|
||||
CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
|
||||
}
|
||||
None => continue,
|
||||
};
|
||||
|
||||
let mut data_to_remove = RoaringBitmap::new();
|
||||
for prox in 1..(MAX_DISTANCE as u8) {
|
||||
let key = (prox, word1, word2);
|
||||
let database_value = database.get(wtxn, &key)?.unwrap_or_default();
|
||||
let value = if prox == proximity_to_insert {
|
||||
// Proximity that should be changed.
|
||||
// Union values and remove lower proximity data
|
||||
(&database_value | &data_to_insert) - &data_to_remove
|
||||
} else {
|
||||
// Remove lower proximity data
|
||||
&database_value - &data_to_remove
|
||||
};
|
||||
|
||||
// add the current data in data_to_remove for the next proximities
|
||||
data_to_remove |= &value;
|
||||
|
||||
if database_value != value {
|
||||
database.put(wtxn, &key, &value)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
32
crates/milli/src/update/indexer_config.rs
Normal file
@@ -0,0 +1,32 @@
use grenad::CompressionType;

use crate::thread_pool_no_abort::ThreadPoolNoAbort;

#[derive(Debug)]
pub struct IndexerConfig {
    pub log_every_n: Option<usize>,
    pub max_nb_chunks: Option<usize>,
    pub documents_chunk_size: Option<usize>,
    pub max_memory: Option<usize>,
    pub chunk_compression_type: CompressionType,
    pub chunk_compression_level: Option<u32>,
    pub thread_pool: Option<ThreadPoolNoAbort>,
    pub max_positions_per_attributes: Option<u32>,
    pub skip_index_budget: bool,
}

impl Default for IndexerConfig {
    fn default() -> Self {
        Self {
            log_every_n: None,
            max_nb_chunks: None,
            documents_chunk_size: None,
            max_memory: None,
            chunk_compression_type: CompressionType::None,
            chunk_compression_level: None,
            thread_pool: None,
            max_positions_per_attributes: None,
            skip_index_budget: false,
        }
    }
}
26
crates/milli/src/update/mod.rs
Normal file
@@ -0,0 +1,26 @@
pub use self::available_ids::AvailableIds;
pub use self::clear_documents::ClearDocuments;
pub use self::concurrent_available_ids::ConcurrentAvailableIds;
pub use self::facet::bulk::FacetsUpdateBulk;
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
pub use self::index_documents::*;
pub use self::indexer_config::IndexerConfig;
pub use self::settings::{validate_embedding_settings, Setting, Settings};
pub use self::update_step::UpdateIndexingStep;
pub use self::word_prefix_docids::WordPrefixDocids;
pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids;
pub use self::words_prefixes_fst::WordsPrefixesFst;

mod available_ids;
mod clear_documents;
mod concurrent_available_ids;
pub(crate) mod del_add;
pub(crate) mod facet;
mod index_documents;
mod indexer_config;
pub mod new;
mod settings;
mod update_step;
mod word_prefix_docids;
mod words_prefix_integer_docids;
mod words_prefixes_fst;
437
crates/milli/src/update/new/channel.rs
Normal file
@@ -0,0 +1,437 @@
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
|
||||
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
|
||||
use grenad::Merger;
|
||||
use hashbrown::HashMap;
|
||||
use heed::types::Bytes;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::extract::FacetKind;
|
||||
use super::StdResult;
|
||||
use crate::index::main_key::DOCUMENTS_IDS_KEY;
|
||||
use crate::update::new::KvReaderFieldId;
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::vector::Embedding;
|
||||
use crate::{DocumentId, Index};
|
||||
|
||||
/// The capacity of the channel is currently expressed as a number of messages.
|
||||
pub fn extractor_writer_channel(cap: usize) -> (ExtractorSender, WriterReceiver) {
|
||||
let (sender, receiver) = crossbeam_channel::bounded(cap);
|
||||
(
|
||||
ExtractorSender {
|
||||
sender,
|
||||
send_count: Default::default(),
|
||||
writer_contentious_count: Default::default(),
|
||||
extractor_contentious_count: Default::default(),
|
||||
},
|
||||
WriterReceiver(receiver),
|
||||
)
|
||||
}
|
||||
|
||||
pub struct KeyValueEntry {
|
||||
pub key_length: usize,
|
||||
pub data: Box<[u8]>,
|
||||
}
|
||||
|
||||
impl KeyValueEntry {
|
||||
pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self {
|
||||
let mut data = Vec::with_capacity(key.len() + value.len());
|
||||
data.extend_from_slice(key);
|
||||
data.extend_from_slice(value);
|
||||
KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() }
|
||||
}
|
||||
|
||||
pub fn from_small_key_bitmap(key: &[u8], bitmap: RoaringBitmap) -> Self {
|
||||
let mut data = Vec::with_capacity(key.len() + bitmap.serialized_size());
|
||||
data.extend_from_slice(key);
|
||||
bitmap.serialize_into(&mut data).unwrap();
|
||||
KeyValueEntry { key_length: key.len(), data: data.into_boxed_slice() }
|
||||
}
|
||||
|
||||
pub fn key(&self) -> &[u8] {
|
||||
&self.data[..self.key_length]
|
||||
}
|
||||
|
||||
pub fn value(&self) -> &[u8] {
|
||||
&self.data[self.key_length..]
|
||||
}
|
||||
}
|
||||
|
||||
pub struct KeyEntry {
|
||||
data: Box<[u8]>,
|
||||
}
|
||||
|
||||
impl KeyEntry {
|
||||
pub fn from_key(key: &[u8]) -> Self {
|
||||
KeyEntry { data: key.to_vec().into_boxed_slice() }
|
||||
}
|
||||
|
||||
pub fn entry(&self) -> &[u8] {
|
||||
self.data.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
pub enum EntryOperation {
|
||||
Delete(KeyEntry),
|
||||
Write(KeyValueEntry),
|
||||
}
|
||||
|
||||
pub enum WriterOperation {
|
||||
DbOperation(DbOperation),
|
||||
ArroyOperation(ArroyOperation),
|
||||
}
|
||||
|
||||
pub enum ArroyOperation {
|
||||
/// TODO: call when deleting regular documents
|
||||
DeleteVectors {
|
||||
docid: DocumentId,
|
||||
},
|
||||
SetVectors {
|
||||
docid: DocumentId,
|
||||
embedder_id: u8,
|
||||
embeddings: Vec<Embedding>,
|
||||
},
|
||||
SetVector {
|
||||
docid: DocumentId,
|
||||
embedder_id: u8,
|
||||
embedding: Embedding,
|
||||
},
|
||||
Finish {
|
||||
user_provided: HashMap<String, RoaringBitmap>,
|
||||
},
|
||||
}
|
||||
|
||||
pub struct DbOperation {
|
||||
database: Database,
|
||||
entry: EntryOperation,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Database {
|
||||
Documents,
|
||||
ExternalDocumentsIds,
|
||||
ExactWordDocids,
|
||||
FidWordCountDocids,
|
||||
Main,
|
||||
WordDocids,
|
||||
WordFidDocids,
|
||||
WordPairProximityDocids,
|
||||
WordPositionDocids,
|
||||
FacetIdIsNullDocids,
|
||||
FacetIdIsEmptyDocids,
|
||||
FacetIdExistsDocids,
|
||||
FacetIdF64NumberDocids,
|
||||
FacetIdStringDocids,
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
|
||||
match self {
|
||||
Database::Documents => index.documents.remap_types(),
|
||||
Database::ExternalDocumentsIds => index.external_documents_ids.remap_types(),
|
||||
Database::ExactWordDocids => index.exact_word_docids.remap_types(),
|
||||
Database::Main => index.main.remap_types(),
|
||||
Database::WordDocids => index.word_docids.remap_types(),
|
||||
Database::WordFidDocids => index.word_fid_docids.remap_types(),
|
||||
Database::WordPositionDocids => index.word_position_docids.remap_types(),
|
||||
Database::FidWordCountDocids => index.field_id_word_count_docids.remap_types(),
|
||||
Database::WordPairProximityDocids => index.word_pair_proximity_docids.remap_types(),
|
||||
Database::FacetIdIsNullDocids => index.facet_id_is_null_docids.remap_types(),
|
||||
Database::FacetIdIsEmptyDocids => index.facet_id_is_empty_docids.remap_types(),
|
||||
Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(),
|
||||
Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(),
|
||||
Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FacetKind> for Database {
|
||||
fn from(value: FacetKind) -> Self {
|
||||
match value {
|
||||
FacetKind::Number => Database::FacetIdF64NumberDocids,
|
||||
FacetKind::String => Database::FacetIdStringDocids,
|
||||
FacetKind::Null => Database::FacetIdIsNullDocids,
|
||||
FacetKind::Empty => Database::FacetIdIsEmptyDocids,
|
||||
FacetKind::Exists => Database::FacetIdExistsDocids,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl DbOperation {
|
||||
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
|
||||
self.database.database(index)
|
||||
}
|
||||
|
||||
pub fn entry(self) -> EntryOperation {
|
||||
self.entry
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WriterReceiver(Receiver<WriterOperation>);
|
||||
|
||||
impl IntoIterator for WriterReceiver {
|
||||
type Item = WriterOperation;
|
||||
type IntoIter = IntoIter<Self::Item>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.0.into_iter()
|
||||
}
|
||||
}
|
||||
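// Illustrative sketch of the intended wiring between the extractor side and
// the writer side, with error handling and the vector branch elided. The
// helper name `drain_into_index` and the capacity value are hypothetical.
fn drain_into_index(
    index: &Index,
    wtxn: &mut heed::RwTxn,
    documents_ids: RoaringBitmap,
) -> crate::Result<()> {
    let (extractor, writer) = extractor_writer_channel(100);
    extractor.send_documents_ids(documents_ids).unwrap();
    // Dropping the sender closes the channel so the writer loop terminates.
    drop(extractor);
    for operation in writer {
        match operation {
            WriterOperation::DbOperation(op) => {
                let database = op.database(index);
                match op.entry() {
                    EntryOperation::Write(entry) => {
                        database.put(wtxn, entry.key(), entry.value())?
                    }
                    EntryOperation::Delete(entry) => {
                        database.delete(wtxn, entry.entry()).map(drop)?
                    }
                }
            }
            // Vector (arroy) operations are handled by a dedicated writer.
            WriterOperation::ArroyOperation(_) => {}
        }
    }
    Ok(())
}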
|
||||
pub struct ExtractorSender {
|
||||
sender: Sender<WriterOperation>,
|
||||
/// The number of messages we sent in total in the channel.
|
||||
send_count: AtomicUsize,
|
||||
/// The number of times we sent something in a channel that was full.
|
||||
writer_contentious_count: AtomicUsize,
|
||||
/// The number of times we sent something in a channel that was empty.
|
||||
extractor_contentious_count: AtomicUsize,
|
||||
}
|
||||
|
||||
impl Drop for ExtractorSender {
|
||||
fn drop(&mut self) {
|
||||
let send_count = *self.send_count.get_mut();
|
||||
let writer_contentious_count = *self.writer_contentious_count.get_mut();
|
||||
let extractor_contentious_count = *self.extractor_contentious_count.get_mut();
|
||||
eprintln!(
|
||||
"Extractor channel stats: {send_count} sends, \
|
||||
{writer_contentious_count} writer contentions ({}%), \
|
||||
{extractor_contentious_count} extractor contentions ({}%)",
|
||||
(writer_contentious_count as f32 / send_count as f32) * 100.0,
|
||||
(extractor_contentious_count as f32 / send_count as f32) * 100.0
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl ExtractorSender {
|
||||
pub fn docids<D: DatabaseType>(&self) -> WordDocidsSender<'_, D> {
|
||||
WordDocidsSender { sender: self, _marker: PhantomData }
|
||||
}
|
||||
|
||||
pub fn facet_docids(&self) -> FacetDocidsSender<'_> {
|
||||
FacetDocidsSender { sender: self }
|
||||
}
|
||||
|
||||
pub fn documents(&self) -> DocumentsSender<'_> {
|
||||
DocumentsSender(self)
|
||||
}
|
||||
|
||||
pub fn send_documents_ids(&self, documents_ids: RoaringBitmap) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_bitmap(
|
||||
DOCUMENTS_IDS_KEY.as_bytes(),
|
||||
documents_ids,
|
||||
));
|
||||
match self.send_db_operation(DbOperation { database: Database::Main, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
fn send_db_operation(&self, op: DbOperation) -> StdResult<(), SendError<()>> {
|
||||
if self.sender.is_full() {
|
||||
self.writer_contentious_count.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
if self.sender.is_empty() {
|
||||
self.extractor_contentious_count.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
|
||||
self.send_count.fetch_add(1, Ordering::SeqCst);
|
||||
match self.sender.send(WriterOperation::DbOperation(op)) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ExactWordDocids {}
|
||||
pub enum FidWordCountDocids {}
|
||||
pub enum WordDocids {}
|
||||
pub enum WordFidDocids {}
|
||||
pub enum WordPairProximityDocids {}
|
||||
pub enum WordPositionDocids {}
|
||||
|
||||
pub trait DatabaseType {
|
||||
const DATABASE: Database;
|
||||
}
|
||||
|
||||
impl DatabaseType for ExactWordDocids {
|
||||
const DATABASE: Database = Database::ExactWordDocids;
|
||||
}
|
||||
|
||||
impl DatabaseType for FidWordCountDocids {
|
||||
const DATABASE: Database = Database::FidWordCountDocids;
|
||||
}
|
||||
|
||||
impl DatabaseType for WordDocids {
|
||||
const DATABASE: Database = Database::WordDocids;
|
||||
}
|
||||
|
||||
impl DatabaseType for WordFidDocids {
|
||||
const DATABASE: Database = Database::WordFidDocids;
|
||||
}
|
||||
|
||||
impl DatabaseType for WordPairProximityDocids {
|
||||
const DATABASE: Database = Database::WordPairProximityDocids;
|
||||
}
|
||||
|
||||
impl DatabaseType for WordPositionDocids {
|
||||
const DATABASE: Database = Database::WordPositionDocids;
|
||||
}
|
||||
|
||||
pub trait DocidsSender {
|
||||
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>;
|
||||
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>;
|
||||
}
|
||||
|
||||
pub struct WordDocidsSender<'a, D> {
|
||||
sender: &'a ExtractorSender,
|
||||
_marker: PhantomData<D>,
|
||||
}
|
||||
|
||||
impl<D: DatabaseType> DocidsSender for WordDocidsSender<'_, D> {
|
||||
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
|
||||
match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
||||
match self.sender.send_db_operation(DbOperation { database: D::DATABASE, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetDocidsSender<'a> {
|
||||
sender: &'a ExtractorSender,
|
||||
}
|
||||
|
||||
impl DocidsSender for FacetDocidsSender<'_> {
|
||||
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let (facet_kind, key) = FacetKind::extract_from_key(key);
|
||||
let database = Database::from(facet_kind);
|
||||
// let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
|
||||
let entry = match facet_kind {
|
||||
// skip level group size
|
||||
FacetKind::String | FacetKind::Number => {
|
||||
// add facet group size
|
||||
let value = [&[1], value].concat();
|
||||
EntryOperation::Write(KeyValueEntry::from_small_key_value(key, &value))
|
||||
}
|
||||
_ => EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value)),
|
||||
};
|
||||
match self.sender.send_db_operation(DbOperation { database, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let (facet_kind, key) = FacetKind::extract_from_key(key);
|
||||
let database = Database::from(facet_kind);
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
||||
match self.sender.send_db_operation(DbOperation { database, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentsSender<'a>(&'a ExtractorSender);
|
||||
|
||||
impl DocumentsSender<'_> {
|
||||
/// TODO do that efficiently
|
||||
pub fn uncompressed(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
external_id: String,
|
||||
document: &KvReaderFieldId,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(
|
||||
&docid.to_be_bytes(),
|
||||
document.as_bytes(),
|
||||
));
|
||||
match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}?;
|
||||
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(
|
||||
external_id.as_bytes(),
|
||||
&docid.to_be_bytes(),
|
||||
));
|
||||
match self
|
||||
.0
|
||||
.send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry })
|
||||
{
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete(&self, docid: DocumentId, external_id: String) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes()));
|
||||
match self.0.send_db_operation(DbOperation { database: Database::Documents, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}?;
|
||||
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(external_id.as_bytes()));
|
||||
match self
|
||||
.0
|
||||
.send_db_operation(DbOperation { database: Database::ExternalDocumentsIds, entry })
|
||||
{
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct EmbeddingSender<'a>(&'a Sender<WriterOperation>);
|
||||
|
||||
impl EmbeddingSender<'_> {
|
||||
pub fn set_vectors(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
embedder_id: u8,
|
||||
embeddings: Vec<Embedding>,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
self.0
|
||||
.send(WriterOperation::ArroyOperation(ArroyOperation::SetVectors {
|
||||
docid,
|
||||
embedder_id,
|
||||
embeddings,
|
||||
}))
|
||||
.map_err(|_| SendError(()))
|
||||
}
|
||||
|
||||
pub fn set_vector(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
embedder_id: u8,
|
||||
embedding: Embedding,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
self.0
|
||||
.send(WriterOperation::ArroyOperation(ArroyOperation::SetVector {
|
||||
docid,
|
||||
embedder_id,
|
||||
embedding,
|
||||
}))
|
||||
.map_err(|_| SendError(()))
|
||||
}
|
||||
|
||||
/// Marks all embedders as "to be built"
|
||||
pub fn finish(
|
||||
self,
|
||||
user_provided: HashMap<String, RoaringBitmap>,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
self.0
|
||||
.send(WriterOperation::ArroyOperation(ArroyOperation::Finish { user_provided }))
|
||||
.map_err(|_| SendError(()))
|
||||
}
|
||||
}
|
||||
398
crates/milli/src/update/new/document.rs
Normal file
@@ -0,0 +1,398 @@
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
|
||||
use heed::RoTxn;
|
||||
use raw_collections::RawMap;
|
||||
use serde_json::value::RawValue;
|
||||
|
||||
use super::vector_document::VectorDocument;
|
||||
use super::{KvReaderFieldId, KvWriterFieldId};
|
||||
use crate::documents::FieldIdMapper;
|
||||
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
|
||||
use crate::{DocumentId, GlobalFieldsIdsMap, Index, InternalError, Result, UserError};
|
||||
|
||||
/// A view into a document that can represent either the current version from the DB,
|
||||
/// the update data from payload or other means, or the merged updated version.
|
||||
///
|
||||
/// The 'doc lifetime is meant to live long enough for the document to be handled by the extractors.
|
||||
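///
/// A minimal sketch of how an extractor could consume this trait; the
/// `print_fields` helper below is hypothetical and only illustrates the API:
///
/// ```ignore
/// fn print_fields<'doc>(doc: &impl Document<'doc>) -> crate::Result<()> {
///     for field in doc.iter_top_level_fields() {
///         let (name, raw) = field?;
///         println!("{name} = {}", raw.get());
///     }
///     Ok(())
/// }
/// ```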
pub trait Document<'doc> {
|
||||
/// Iterate over all **top-level** fields of the document, returning their name and raw JSON value.
|
||||
///
|
||||
/// - The returned values *may* contain nested fields.
|
||||
/// - The `_vectors` and `_geo` fields are **ignored**, meaning they are **not returned** by this method.
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>>;
|
||||
|
||||
fn len(&self) -> usize;
|
||||
|
||||
fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>>;
|
||||
|
||||
/// Returns the unparsed value of the `_vectors` field from the document data.
|
||||
///
|
||||
/// This field alone is insufficient to retrieve vectors, as they may be stored in a dedicated location in the database.
|
||||
/// Use a [`super::vector_document::VectorDocument`] to access the vector.
|
||||
///
|
||||
/// This method is meant as a convenience for implementors of [`super::vector_document::VectorDocument`].
|
||||
fn vectors_field(&self) -> Result<Option<&'doc RawValue>>;
|
||||
|
||||
/// Returns the unparsed value of the `_geo` field from the document data.
|
||||
///
|
||||
/// This field alone is insufficient to retrieve geo data, as they may be stored in a dedicated location in the database.
|
||||
/// Use a [`super::geo_document::GeoDocument`] to access the geo data.
|
||||
///
|
||||
/// This method is meant as a convenience for implementors of [`super::geo_document::GeoDocument`].
|
||||
fn geo_field(&self) -> Result<Option<&'doc RawValue>>;
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DocumentFromDb<'t, Mapper: FieldIdMapper>
|
||||
where
|
||||
Mapper: FieldIdMapper,
|
||||
{
|
||||
fields_ids_map: &'t Mapper,
|
||||
content: &'t KvReaderFieldId,
|
||||
}
|
||||
|
||||
impl<'t, Mapper: FieldIdMapper> Clone for DocumentFromDb<'t, Mapper> {
|
||||
#[inline]
|
||||
fn clone(&self) -> Self {
|
||||
*self
|
||||
}
|
||||
}
|
||||
impl<'t, Mapper: FieldIdMapper> Copy for DocumentFromDb<'t, Mapper> {}
|
||||
|
||||
impl<'t, Mapper: FieldIdMapper> Document<'t> for DocumentFromDb<'t, Mapper> {
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'t str, &'t RawValue)>> {
|
||||
let mut it = self.content.iter();
|
||||
|
||||
std::iter::from_fn(move || {
|
||||
let (fid, value) = it.next()?;
|
||||
|
||||
let res = (|| loop {
|
||||
let name = self.fields_ids_map.name(fid).ok_or(
|
||||
InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId {
|
||||
field_id: fid,
|
||||
process: "getting current document",
|
||||
}),
|
||||
)?;
|
||||
|
||||
if name == RESERVED_VECTORS_FIELD_NAME || name == "_geo" {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value =
|
||||
serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?;
|
||||
|
||||
return Ok((name, value));
|
||||
})();
|
||||
|
||||
Some(res)
|
||||
})
|
||||
}
|
||||
|
||||
fn vectors_field(&self) -> Result<Option<&'t RawValue>> {
|
||||
self.field(RESERVED_VECTORS_FIELD_NAME)
|
||||
}
|
||||
|
||||
fn geo_field(&self) -> Result<Option<&'t RawValue>> {
|
||||
self.field("_geo")
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.content.iter().count()
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'t RawValue>> {
|
||||
self.field(k)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> {
|
||||
pub fn new(
|
||||
docid: DocumentId,
|
||||
rtxn: &'t RoTxn,
|
||||
index: &'t Index,
|
||||
db_fields_ids_map: &'t Mapper,
|
||||
) -> Result<Option<Self>> {
|
||||
index.documents.get(rtxn, &docid).map_err(crate::Error::from).map(|reader| {
|
||||
reader.map(|reader| Self { fields_ids_map: db_fields_ids_map, content: reader })
|
||||
})
|
||||
}
|
||||
|
||||
pub fn field(&self, name: &str) -> Result<Option<&'t RawValue>> {
|
||||
let Some(fid) = self.fields_ids_map.id(name) else {
|
||||
return Ok(None);
|
||||
};
|
||||
let Some(value) = self.content.get(fid) else { return Ok(None) };
|
||||
Ok(Some(serde_json::from_slice(value).map_err(InternalError::SerdeJson)?))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct DocumentFromVersions<'a, 'doc> {
|
||||
versions: &'a Versions<'doc>,
|
||||
}
|
||||
|
||||
impl<'a, 'doc> DocumentFromVersions<'a, 'doc> {
|
||||
pub fn new(versions: &'a Versions<'doc>) -> Self {
|
||||
Self { versions }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'doc> Document<'doc> for DocumentFromVersions<'a, 'doc> {
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>> {
|
||||
self.versions.iter_top_level_fields().map(Ok)
|
||||
}
|
||||
|
||||
fn vectors_field(&self) -> Result<Option<&'doc RawValue>> {
|
||||
Ok(self.versions.vectors_field())
|
||||
}
|
||||
|
||||
fn geo_field(&self) -> Result<Option<&'doc RawValue>> {
|
||||
Ok(self.versions.geo_field())
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.versions.len()
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>> {
|
||||
Ok(self.versions.top_level_field(k))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct MergedDocument<'a, 'doc, 't, Mapper: FieldIdMapper> {
|
||||
new_doc: DocumentFromVersions<'a, 'doc>,
|
||||
db: Option<DocumentFromDb<'t, Mapper>>,
|
||||
}
|
||||
|
||||
impl<'a, 'doc, 't, Mapper: FieldIdMapper> MergedDocument<'a, 'doc, 't, Mapper> {
|
||||
pub fn with_db(
|
||||
docid: DocumentId,
|
||||
rtxn: &'t RoTxn,
|
||||
index: &'t Index,
|
||||
db_fields_ids_map: &'t Mapper,
|
||||
new_doc: DocumentFromVersions<'a, 'doc>,
|
||||
) -> Result<Self> {
|
||||
let db = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map)?;
|
||||
Ok(Self { new_doc, db })
|
||||
}
|
||||
|
||||
pub fn without_db(new_doc: DocumentFromVersions<'a, 'doc>) -> Self {
|
||||
Self { new_doc, db: None }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'d, 'doc: 'd, 't: 'd, Mapper: FieldIdMapper> Document<'d>
|
||||
for MergedDocument<'d, 'doc, 't, Mapper>
|
||||
{
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'d str, &'d RawValue)>> {
|
||||
let mut new_doc_it = self.new_doc.iter_top_level_fields();
|
||||
let mut db_it = self.db.iter().flat_map(|db| db.iter_top_level_fields());
|
||||
let mut seen_fields = BTreeSet::new();
|
||||
|
||||
std::iter::from_fn(move || {
|
||||
if let Some(next) = new_doc_it.next() {
|
||||
if let Ok((name, _)) = next {
|
||||
seen_fields.insert(name);
|
||||
}
|
||||
return Some(next);
|
||||
}
|
||||
loop {
|
||||
match db_it.next()? {
|
||||
Ok((name, value)) => {
|
||||
if seen_fields.contains(name) {
|
||||
continue;
|
||||
}
|
||||
return Some(Ok((name, value)));
|
||||
}
|
||||
Err(err) => return Some(Err(err)),
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn vectors_field(&self) -> Result<Option<&'d RawValue>> {
|
||||
if let Some(vectors) = self.new_doc.vectors_field()? {
|
||||
return Ok(Some(vectors));
|
||||
}
|
||||
|
||||
let Some(db) = self.db else { return Ok(None) };
|
||||
|
||||
db.vectors_field()
|
||||
}
|
||||
|
||||
fn geo_field(&self) -> Result<Option<&'d RawValue>> {
|
||||
if let Some(geo) = self.new_doc.geo_field()? {
|
||||
return Ok(Some(geo));
|
||||
}
|
||||
|
||||
let Some(db) = self.db else { return Ok(None) };
|
||||
|
||||
db.geo_field()
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
self.iter_top_level_fields().count()
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'d RawValue>> {
|
||||
if let Some(f) = self.new_doc.top_level_field(k)? {
|
||||
return Ok(Some(f));
|
||||
}
|
||||
if let Some(db) = self.db {
|
||||
return db.field(k);
|
||||
}
|
||||
Ok(None)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'doc, D> Document<'doc> for &D
|
||||
where
|
||||
D: Document<'doc>,
|
||||
{
|
||||
fn iter_top_level_fields(&self) -> impl Iterator<Item = Result<(&'doc str, &'doc RawValue)>> {
|
||||
D::iter_top_level_fields(self)
|
||||
}
|
||||
|
||||
fn vectors_field(&self) -> Result<Option<&'doc RawValue>> {
|
||||
D::vectors_field(self)
|
||||
}
|
||||
|
||||
fn geo_field(&self) -> Result<Option<&'doc RawValue>> {
|
||||
D::geo_field(self)
|
||||
}
|
||||
|
||||
fn len(&self) -> usize {
|
||||
D::len(self)
|
||||
}
|
||||
|
||||
fn top_level_field(&self, k: &str) -> Result<Option<&'doc RawValue>> {
|
||||
D::top_level_field(self, k)
|
||||
}
|
||||
}
|
||||
|
||||
/// Turn this document into an obkv, whose fields are indexed by the provided `FieldIdMapper`.
|
||||
///
|
||||
/// The produced obkv is suitable for storing into the documents DB, meaning:
|
||||
///
|
||||
/// - It contains the `_vectors` entries that do not correspond to a configured embedder
|
||||
/// - It contains all the top-level fields of the document, with their raw JSON value as value.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - If the document contains a top-level field that is not present in `fields_ids_map`.
|
||||
///
|
||||
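/// A minimal usage sketch; the surrounding variable names are assumptions:
///
/// ```ignore
/// let mut buffer = Vec::new();
/// let obkv = write_to_obkv(&merged_document, None, &mut global_fields_ids_map, &mut buffer)?;
/// // `obkv` borrows `buffer` and can be written to the documents database as-is.
/// ```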
pub fn write_to_obkv<'s, 'a, 'map>(
|
||||
document: &'s impl Document<'s>,
|
||||
vector_document: Option<&'s impl VectorDocument<'s>>,
|
||||
fields_ids_map: &'a mut GlobalFieldsIdsMap<'map>,
|
||||
mut document_buffer: &'a mut Vec<u8>,
|
||||
) -> Result<&'a KvReaderFieldId>
|
||||
where
|
||||
's: 'a,
|
||||
{
|
||||
// will be used in 'inject_vectors
|
||||
let vectors_value: Box<RawValue>;
|
||||
|
||||
document_buffer.clear();
|
||||
let mut unordered_field_buffer = Vec::new();
|
||||
unordered_field_buffer.clear();
|
||||
|
||||
let mut writer = KvWriterFieldId::new(&mut document_buffer);
|
||||
|
||||
for res in document.iter_top_level_fields() {
|
||||
let (field_name, value) = res?;
|
||||
let field_id =
|
||||
fields_ids_map.id_or_insert(field_name).ok_or(UserError::AttributeLimitReached)?;
|
||||
unordered_field_buffer.push((field_id, value));
|
||||
}
|
||||
|
||||
'inject_vectors: {
|
||||
let Some(vector_document) = vector_document else { break 'inject_vectors };
|
||||
|
||||
let vectors_fid = fields_ids_map
|
||||
.id_or_insert(RESERVED_VECTORS_FIELD_NAME)
|
||||
.ok_or(UserError::AttributeLimitReached)?;
|
||||
|
||||
let mut vectors = BTreeMap::new();
|
||||
for res in vector_document.iter_vectors() {
|
||||
let (name, entry) = res?;
|
||||
if entry.has_configured_embedder {
|
||||
continue; // we don't write vectors with configured embedder in documents
|
||||
}
|
||||
vectors.insert(
|
||||
name,
|
||||
serde_json::json!({
|
||||
"regenerate": entry.regenerate,
|
||||
// TODO: consider optimizing the shape of embedders here to store an array of f32 rather than a JSON object
|
||||
"embeddings": entry.embeddings,
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
vectors_value = serde_json::value::to_raw_value(&vectors).unwrap();
|
||||
unordered_field_buffer.push((vectors_fid, &vectors_value));
|
||||
}
|
||||
|
||||
unordered_field_buffer.sort_by_key(|(fid, _)| *fid);
|
||||
for (fid, value) in unordered_field_buffer.iter() {
|
||||
writer.insert(*fid, value.get().as_bytes()).unwrap();
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
Ok(KvReaderFieldId::from_slice(document_buffer))
|
||||
}
|
||||
|
||||
pub type Entry<'doc> = (&'doc str, &'doc RawValue);
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Versions<'doc> {
|
||||
data: RawMap<'doc>,
|
||||
}
|
||||
|
||||
impl<'doc> Versions<'doc> {
|
||||
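/// Merges several raw versions of the same document into a single view;
/// when a field appears in more than one version, the later version wins.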
pub fn multiple(
|
||||
mut versions: impl Iterator<Item = Result<RawMap<'doc>>>,
|
||||
) -> Result<Option<Self>> {
|
||||
let Some(data) = versions.next() else { return Ok(None) };
|
||||
let mut data = data?;
|
||||
for future_version in versions {
|
||||
let future_version = future_version?;
|
||||
for (field, value) in future_version {
|
||||
data.insert(field, value);
|
||||
}
|
||||
}
|
||||
Ok(Some(Self::single(data)))
|
||||
}
|
||||
|
||||
pub fn single(version: RawMap<'doc>) -> Self {
|
||||
Self { data: version }
|
||||
}
|
||||
|
||||
pub fn iter_top_level_fields(&self) -> impl Iterator<Item = (&'doc str, &'doc RawValue)> + '_ {
|
||||
self.data.iter().filter(|(k, _)| *k != RESERVED_VECTORS_FIELD_NAME && *k != "_geo")
|
||||
}
|
||||
|
||||
pub fn vectors_field(&self) -> Option<&'doc RawValue> {
|
||||
self.data.get(RESERVED_VECTORS_FIELD_NAME)
|
||||
}
|
||||
|
||||
pub fn geo_field(&self) -> Option<&'doc RawValue> {
|
||||
self.data.get("_geo")
|
||||
}
|
||||
|
||||
pub fn len(&self) -> usize {
|
||||
self.data.len()
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.data.is_empty()
|
||||
}
|
||||
pub fn top_level_field(&self, k: &str) -> Option<&'doc RawValue> {
|
||||
self.data.get(k)
|
||||
}
|
||||
}
|
||||
191
crates/milli/src/update/new/document_change.rs
Normal file
@@ -0,0 +1,191 @@
|
||||
use bumpalo::Bump;
|
||||
use heed::RoTxn;
|
||||
|
||||
use super::document::{DocumentFromDb, DocumentFromVersions, MergedDocument, Versions};
|
||||
use super::vector_document::{
|
||||
MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions,
|
||||
};
|
||||
use crate::documents::FieldIdMapper;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::{DocumentId, Index, Result};
|
||||
|
||||
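/// A change applied to a single document: the document is deleted, an existing
/// document is updated, or a new document is inserted.
///
/// A minimal dispatch sketch, assuming a `change: DocumentChange` in scope:
///
/// ```ignore
/// match change {
///     DocumentChange::Deletion(deletion) => println!("delete {}", deletion.docid()),
///     DocumentChange::Update(update) => println!("update {}", update.docid()),
///     DocumentChange::Insertion(insertion) => println!("insert {}", insertion.docid()),
/// }
/// ```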
pub enum DocumentChange<'doc> {
|
||||
Deletion(Deletion<'doc>),
|
||||
Update(Update<'doc>),
|
||||
Insertion(Insertion<'doc>),
|
||||
}
|
||||
|
||||
pub struct Deletion<'doc> {
|
||||
docid: DocumentId,
|
||||
external_document_id: &'doc str,
|
||||
}
|
||||
|
||||
pub struct Update<'doc> {
|
||||
docid: DocumentId,
|
||||
external_document_id: &'doc str,
|
||||
new: Versions<'doc>,
|
||||
has_deletion: bool,
|
||||
}
|
||||
|
||||
pub struct Insertion<'doc> {
|
||||
docid: DocumentId,
|
||||
external_document_id: &'doc str,
|
||||
new: Versions<'doc>,
|
||||
}
|
||||
|
||||
impl<'doc> DocumentChange<'doc> {
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
match &self {
|
||||
Self::Deletion(inner) => inner.docid(),
|
||||
Self::Update(inner) => inner.docid(),
|
||||
Self::Insertion(inner) => inner.docid(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn external_docid(&self) -> &'doc str {
|
||||
match self {
|
||||
DocumentChange::Deletion(deletion) => deletion.external_document_id(),
|
||||
DocumentChange::Update(update) => update.external_document_id(),
|
||||
DocumentChange::Insertion(insertion) => insertion.external_document_id(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'doc> Deletion<'doc> {
|
||||
pub fn create(docid: DocumentId, external_document_id: &'doc str) -> Self {
|
||||
Self { docid, external_document_id }
|
||||
}
|
||||
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
self.docid
|
||||
}
|
||||
|
||||
pub fn external_document_id(&self) -> &'doc str {
|
||||
self.external_document_id
|
||||
}
|
||||
|
||||
pub fn current<'a, Mapper: FieldIdMapper>(
|
||||
&self,
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
mapper: &'a Mapper,
|
||||
) -> Result<DocumentFromDb<'a, Mapper>> {
|
||||
Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
|
||||
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
||||
)?)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'doc> Insertion<'doc> {
|
||||
pub fn create(docid: DocumentId, external_document_id: &'doc str, new: Versions<'doc>) -> Self {
|
||||
Insertion { docid, external_document_id, new }
|
||||
}
|
||||
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
self.docid
|
||||
}
|
||||
|
||||
pub fn external_document_id(&self) -> &'doc str {
|
||||
self.external_document_id
|
||||
}
|
||||
pub fn inserted(&self) -> DocumentFromVersions<'_, 'doc> {
|
||||
DocumentFromVersions::new(&self.new)
|
||||
}
|
||||
|
||||
pub fn inserted_vectors(
|
||||
&self,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
|
||||
VectorDocumentFromVersions::new(&self.new, doc_alloc, embedders)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'doc> Update<'doc> {
|
||||
pub fn create(
|
||||
docid: DocumentId,
|
||||
external_document_id: &'doc str,
|
||||
new: Versions<'doc>,
|
||||
has_deletion: bool,
|
||||
) -> Self {
|
||||
Update { docid, new, external_document_id, has_deletion }
|
||||
}
|
||||
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
self.docid
|
||||
}
|
||||
|
||||
pub fn external_document_id(&self) -> &'doc str {
|
||||
self.external_document_id
|
||||
}
|
||||
pub fn current<'a, Mapper: FieldIdMapper>(
|
||||
&self,
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
mapper: &'a Mapper,
|
||||
) -> Result<DocumentFromDb<'a, Mapper>> {
|
||||
Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper)?.ok_or(
|
||||
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
||||
)?)
|
||||
}
|
||||
|
||||
pub fn current_vectors<'a, Mapper: FieldIdMapper>(
|
||||
&self,
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
mapper: &'a Mapper,
|
||||
doc_alloc: &'a Bump,
|
||||
) -> Result<VectorDocumentFromDb<'a>> {
|
||||
Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or(
|
||||
crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid },
|
||||
)?)
|
||||
}
|
||||
|
||||
pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> {
|
||||
DocumentFromVersions::new(&self.new)
|
||||
}
|
||||
|
||||
pub fn merged<'t, Mapper: FieldIdMapper>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn,
|
||||
index: &'t Index,
|
||||
mapper: &'t Mapper,
|
||||
) -> Result<MergedDocument<'_, 'doc, 't, Mapper>> {
|
||||
if self.has_deletion {
|
||||
Ok(MergedDocument::without_db(DocumentFromVersions::new(&self.new)))
|
||||
} else {
|
||||
MergedDocument::with_db(
|
||||
self.docid,
|
||||
rtxn,
|
||||
index,
|
||||
mapper,
|
||||
DocumentFromVersions::new(&self.new),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn updated_vectors(
|
||||
&self,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
) -> Result<Option<VectorDocumentFromVersions<'doc>>> {
|
||||
VectorDocumentFromVersions::new(&self.new, doc_alloc, embedders)
|
||||
}
|
||||
|
||||
pub fn merged_vectors<Mapper: FieldIdMapper>(
|
||||
&self,
|
||||
rtxn: &'doc RoTxn,
|
||||
index: &'doc Index,
|
||||
mapper: &'doc Mapper,
|
||||
doc_alloc: &'doc Bump,
|
||||
embedders: &'doc EmbeddingConfigs,
|
||||
) -> Result<Option<MergedVectorDocument<'doc>>> {
|
||||
if self.has_deletion {
|
||||
MergedVectorDocument::without_db(&self.new, doc_alloc, embedders)
|
||||
} else {
|
||||
MergedVectorDocument::with_db(
|
||||
self.docid, index, rtxn, mapper, &self.new, doc_alloc, embedders,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
729
crates/milli/src/update/new/extract/cache.rs
Normal file
@@ -0,0 +1,729 @@
|
||||
//! # How the Merge Algorithm works
|
||||
//!
|
||||
//! Each extractor creates #Threads caches and balances the entries
|
||||
//! based on the hash of the keys. To do that we can use the
|
||||
//! hashbrown::hash_map::RawEntryBuilderMut::from_key_hashed_nocheck.
|
||||
//! This way we can compute the hash on our own, decide on the cache to
|
||||
//! target, and insert it into the right HashMap.
|
||||
//!
|
||||
//! #Thread -> caches
|
||||
//! t1 -> [t1c1, t1c2, t1c3]
|
||||
//! t2 -> [t2c1, t2c2, t2c3]
|
||||
//! t3 -> [t3c1, t3c2, t3c3]
|
||||
//!
|
||||
//! When the extractors are done filling the caches, we want to merge
|
||||
//! the content of all the caches. We do a transpose and each thread is
|
||||
//! assigned the associated cache. By doing that we know that every key
|
||||
//! is put in a known cache and will collide with keys in the other
|
||||
//! caches of the other threads.
|
||||
//!
|
||||
//! #Thread -> caches
|
||||
//! t1 -> [t1c1, t2c1, t3c1]
|
||||
//! t2 -> [t1c2, t2c2, t3c2]
|
||||
//! t3 -> [t1c3, t2c3, t3c3]
|
||||
//!
|
||||
//! When we encounter a miss in the other caches we must still try
|
||||
//! to find it in the spilled entries. This is the reason why we use
|
||||
//! a grenad sorter/reader so that we can seek "efficiently" for a key.
|
||||
//!
|
||||
//! ## More Detailed Algorithm
|
||||
//!
|
||||
//! Each sub-cache has an in-memory HashMap and some spilled
|
||||
//! lexicographically ordered entries on disk (grenad). We first iterate
|
||||
//! over the spilled entries of all the caches at once by using a merge
|
||||
//! join algorithm. This algorithm will merge the entries by using its
|
||||
//! merge function.
|
||||
//!
|
||||
//! Every time a merged entry is emitted by the merge join algorithm we also
|
||||
//! fetch the value from the other in-memory caches (HashMaps) to finish
|
||||
//! the merge. Every time we retrieve an entry from the in-memory caches
|
||||
//! we mark it with a tombstone for later.
|
||||
//!
|
||||
//! Once we are done with the spilled entries we iterate over the in-memory
|
||||
//! HashMaps. We iterate over the first one, retrieve the content from the
|
||||
//! other ones and mark them with a tombstone again. We also make sure
|
||||
//! to ignore the dead (tombstoned) ones.
|
||||
//!
|
||||
//! ## Memory Control
|
||||
//!
|
||||
//! We can detect that there is no more memory available when the
|
||||
//! bump allocator reaches a threshold. When this is the case we
|
||||
//! freeze the cache. There is one bump allocator per thread and the
|
||||
//! memory must be well balanced as we manage one type of extraction
|
||||
//! at a time with well-balanced documents.
|
||||
//!
|
||||
//! It means that the unknown new keys added to the
|
||||
//! cache are directly spilled to disk: basically a key followed by a
|
||||
//! del/add bitmap. For the known keys we can keep modifying them in
|
||||
//! the materialized version in the cache: update the del/add bitmaps.
|
||||
//!
|
||||
//! For now we can use a grenad sorter for spilling even though I think
|
||||
//! it's not the most efficient way (too many files open, sorting entries).
|
||||
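//!
//! A minimal sketch of the bucket assignment described above; it mirrors the
//! `compute_bucket_from_hash` helper defined further down in this module:
//!
//! ```ignore
//! use std::hash::BuildHasher;
//! use rustc_hash::FxBuildHasher;
//!
//! let buckets = 3; // e.g. one bucket per thread
//! let hash = FxBuildHasher.hash_one(b"some key".as_slice());
//! let bucket = hash as usize % buckets; // every thread targets the same bucket for this key
//! ```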
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::binary_heap::PeekMut;
|
||||
use std::collections::BinaryHeap;
|
||||
use std::fs::File;
|
||||
use std::hash::BuildHasher;
|
||||
use std::io::BufReader;
|
||||
use std::{io, iter, mem};
|
||||
|
||||
use bumpalo::Bump;
|
||||
use grenad::ReaderCursor;
|
||||
use hashbrown::hash_map::RawEntryMut;
|
||||
use hashbrown::HashMap;
|
||||
use raw_collections::bbbul::{BitPacker, BitPacker4x};
|
||||
use raw_collections::map::FrozenMap;
|
||||
use raw_collections::{Bbbul, FrozenBbbul};
|
||||
use roaring::RoaringBitmap;
|
||||
use rustc_hash::FxBuildHasher;
|
||||
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::new::indexer::document_changes::MostlySend;
|
||||
use crate::update::new::KvReaderDelAdd;
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::{CboRoaringBitmapCodec, Result};
|
||||
|
||||
/// A cache that stores bytes keys associated to CboDelAddRoaringBitmaps.
|
||||
///
|
||||
/// Internally balances the content over `N` buckets for future merging.
|
||||
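///
/// A minimal usage sketch; the bucket count and memory limit below are arbitrary:
///
/// ```ignore
/// let alloc = bumpalo::Bump::new();
/// let mut caches = BalancedCaches::new_in(4, Some(100 * 1024 * 1024), &alloc);
/// caches.insert_add_u32(b"some key", 42)?;
/// caches.insert_del_u32(b"some key", 7)?;
/// let frozen = caches.freeze()?; // one FrozenCache per bucket
/// ```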
pub struct BalancedCaches<'extractor> {
|
||||
hasher: FxBuildHasher,
|
||||
alloc: &'extractor Bump,
|
||||
max_memory: Option<usize>,
|
||||
caches: InnerCaches<'extractor>,
|
||||
}
|
||||
|
||||
enum InnerCaches<'extractor> {
|
||||
Normal(NormalCaches<'extractor>),
|
||||
Spilling(SpillingCaches<'extractor>),
|
||||
}
|
||||
|
||||
impl<'extractor> BalancedCaches<'extractor> {
|
||||
pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
|
||||
Self {
|
||||
hasher: FxBuildHasher,
|
||||
max_memory,
|
||||
caches: InnerCaches::Normal(NormalCaches {
|
||||
caches: iter::repeat_with(|| HashMap::with_hasher_in(FxBuildHasher, alloc))
|
||||
.take(buckets)
|
||||
.collect(),
|
||||
}),
|
||||
alloc,
|
||||
}
|
||||
}
|
||||
|
||||
fn buckets(&self) -> usize {
|
||||
match &self.caches {
|
||||
InnerCaches::Normal(caches) => caches.caches.len(),
|
||||
InnerCaches::Spilling(caches) => caches.caches.len(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_del_u32(&mut self, key: &[u8], n: u32) -> Result<()> {
|
||||
if self.max_memory.map_or(false, |mm| self.alloc.allocated_bytes() >= mm) {
|
||||
self.start_spilling()?;
|
||||
}
|
||||
|
||||
let buckets = self.buckets();
|
||||
match &mut self.caches {
|
||||
InnerCaches::Normal(normal) => {
|
||||
normal.insert_del_u32(&self.hasher, self.alloc, buckets, key, n);
|
||||
Ok(())
|
||||
}
|
||||
InnerCaches::Spilling(spilling) => {
|
||||
spilling.insert_del_u32(&self.hasher, self.alloc, buckets, key, n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_add_u32(&mut self, key: &[u8], n: u32) -> Result<()> {
|
||||
if self.max_memory.map_or(false, |mm| self.alloc.allocated_bytes() >= mm) {
|
||||
self.start_spilling()?;
|
||||
}
|
||||
|
||||
let buckets = self.buckets();
|
||||
match &mut self.caches {
|
||||
InnerCaches::Normal(normal) => {
|
||||
normal.insert_add_u32(&self.hasher, self.alloc, buckets, key, n);
|
||||
Ok(())
|
||||
}
|
||||
InnerCaches::Spilling(spilling) => {
|
||||
spilling.insert_add_u32(&self.hasher, self.alloc, buckets, key, n)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Make sure the cache is no longer allocating data
|
||||
/// and writes every new and unknown entry to disk.
|
||||
fn start_spilling(&mut self) -> Result<()> {
|
||||
let BalancedCaches { hasher: _, alloc, max_memory: _, caches } = self;
|
||||
|
||||
if let InnerCaches::Normal(normal_caches) = caches {
|
||||
eprintln!(
|
||||
"We are spilling after we allocated {} bytes on thread #{}",
|
||||
alloc.allocated_bytes(),
|
||||
rayon::current_thread_index().unwrap_or(0)
|
||||
);
|
||||
|
||||
let allocated: usize = normal_caches.caches.iter().map(|m| m.allocation_size()).sum();
|
||||
eprintln!("The last allocated HashMap took {allocated} bytes");
|
||||
|
||||
let dummy = NormalCaches { caches: Vec::new() };
|
||||
let NormalCaches { caches: cache_maps } = mem::replace(normal_caches, dummy);
|
||||
*caches = InnerCaches::Spilling(SpillingCaches::from_cache_maps(cache_maps));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn freeze(&mut self) -> Result<Vec<FrozenCache<'_, 'extractor>>> {
|
||||
match &mut self.caches {
|
||||
InnerCaches::Normal(NormalCaches { caches }) => caches
|
||||
.iter_mut()
|
||||
.enumerate()
|
||||
.map(|(bucket, map)| {
|
||||
// safety: we are transmuting the Bbbul into a FrozenBbbul
|
||||
// that are the same size.
|
||||
let map = unsafe {
|
||||
std::mem::transmute::<
|
||||
&mut HashMap<
|
||||
&[u8],
|
||||
DelAddBbbul<BitPacker4x>, // from this
|
||||
FxBuildHasher,
|
||||
&Bump,
|
||||
>,
|
||||
&mut HashMap<
|
||||
&[u8],
|
||||
FrozenDelAddBbbul<BitPacker4x>, // to that
|
||||
FxBuildHasher,
|
||||
&Bump,
|
||||
>,
|
||||
>(map)
|
||||
};
|
||||
Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled: Vec::new() })
|
||||
})
|
||||
.collect(),
|
||||
InnerCaches::Spilling(SpillingCaches { caches, spilled_entries, .. }) => caches
|
||||
.iter_mut()
|
||||
.zip(mem::take(spilled_entries))
|
||||
.enumerate()
|
||||
.map(|(bucket, (map, sorter))| {
|
||||
let spilled = sorter
|
||||
.into_reader_cursors()?
|
||||
.into_iter()
|
||||
.map(ReaderCursor::into_inner)
|
||||
.map(BufReader::new)
|
||||
.map(|bufreader| grenad::Reader::new(bufreader).map_err(Into::into))
|
||||
.collect::<Result<_>>()?;
|
||||
// safety: we are transmuting the Bbbul into a FrozenBbbul
|
||||
// that are the same size.
|
||||
let map = unsafe {
|
||||
std::mem::transmute::<
|
||||
&mut HashMap<
|
||||
&[u8],
|
||||
DelAddBbbul<BitPacker4x>, // from this
|
||||
FxBuildHasher,
|
||||
&Bump,
|
||||
>,
|
||||
&mut HashMap<
|
||||
&[u8],
|
||||
FrozenDelAddBbbul<BitPacker4x>, // to that
|
||||
FxBuildHasher,
|
||||
&Bump,
|
||||
>,
|
||||
>(map)
|
||||
};
|
||||
Ok(FrozenCache { bucket, cache: FrozenMap::new(map), spilled })
|
||||
})
|
||||
.collect(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
unsafe impl MostlySend for BalancedCaches<'_> {}
|
||||
|
||||
struct NormalCaches<'extractor> {
|
||||
caches: Vec<
|
||||
HashMap<
|
||||
&'extractor [u8],
|
||||
DelAddBbbul<'extractor, BitPacker4x>,
|
||||
FxBuildHasher,
|
||||
&'extractor Bump,
|
||||
>,
|
||||
>,
|
||||
}
|
||||
|
||||
impl<'extractor> NormalCaches<'extractor> {
|
||||
pub fn insert_del_u32(
|
||||
&mut self,
|
||||
hasher: &FxBuildHasher,
|
||||
alloc: &'extractor Bump,
|
||||
buckets: usize,
|
||||
key: &[u8],
|
||||
n: u32,
|
||||
) {
|
||||
let hash = hasher.hash_one(key);
|
||||
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||
|
||||
match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
|
||||
RawEntryMut::Occupied(mut entry) => {
|
||||
entry.get_mut().del.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n);
|
||||
}
|
||||
RawEntryMut::Vacant(entry) => {
|
||||
entry.insert_hashed_nocheck(
|
||||
hash,
|
||||
alloc.alloc_slice_copy(key),
|
||||
DelAddBbbul::new_del_u32_in(n, alloc),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_add_u32(
|
||||
&mut self,
|
||||
hasher: &FxBuildHasher,
|
||||
alloc: &'extractor Bump,
|
||||
buckets: usize,
|
||||
key: &[u8],
|
||||
n: u32,
|
||||
) {
|
||||
let hash = hasher.hash_one(key);
|
||||
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||
match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
|
||||
RawEntryMut::Occupied(mut entry) => {
|
||||
entry.get_mut().add.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n);
|
||||
}
|
||||
RawEntryMut::Vacant(entry) => {
|
||||
entry.insert_hashed_nocheck(
|
||||
hash,
|
||||
alloc.alloc_slice_copy(key),
|
||||
DelAddBbbul::new_add_u32_in(n, alloc),
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct SpillingCaches<'extractor> {
|
||||
caches: Vec<
|
||||
HashMap<
|
||||
&'extractor [u8],
|
||||
DelAddBbbul<'extractor, BitPacker4x>,
|
||||
FxBuildHasher,
|
||||
&'extractor Bump,
|
||||
>,
|
||||
>,
|
||||
spilled_entries: Vec<grenad::Sorter<MergeDeladdCboRoaringBitmaps>>,
|
||||
deladd_buffer: Vec<u8>,
|
||||
cbo_buffer: Vec<u8>,
|
||||
}
|
||||
|
||||
impl<'extractor> SpillingCaches<'extractor> {
|
||||
fn from_cache_maps(
|
||||
caches: Vec<
|
||||
HashMap<
|
||||
&'extractor [u8],
|
||||
DelAddBbbul<'extractor, BitPacker4x>,
|
||||
FxBuildHasher,
|
||||
&'extractor Bump,
|
||||
>,
|
||||
>,
|
||||
) -> SpillingCaches<'extractor> {
|
||||
SpillingCaches {
|
||||
spilled_entries: iter::repeat_with(|| {
|
||||
let mut builder = grenad::SorterBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
builder.dump_threshold(0);
|
||||
builder.allow_realloc(false);
|
||||
builder.build()
|
||||
})
|
||||
.take(caches.len())
|
||||
.collect(),
|
||||
caches,
|
||||
deladd_buffer: Vec::new(),
|
||||
cbo_buffer: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_del_u32(
|
||||
&mut self,
|
||||
hasher: &FxBuildHasher,
|
||||
alloc: &'extractor Bump,
|
||||
buckets: usize,
|
||||
key: &[u8],
|
||||
n: u32,
|
||||
) -> Result<()> {
|
||||
let hash = hasher.hash_one(key);
|
||||
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||
match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
|
||||
RawEntryMut::Occupied(mut entry) => {
|
||||
entry.get_mut().del.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n);
|
||||
Ok(())
|
||||
}
|
||||
RawEntryMut::Vacant(_entry) => spill_entry_to_sorter(
|
||||
&mut self.spilled_entries[bucket],
|
||||
&mut self.deladd_buffer,
|
||||
&mut self.cbo_buffer,
|
||||
key,
|
||||
DelAddRoaringBitmap::new_del_u32(n),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_add_u32(
|
||||
&mut self,
|
||||
hasher: &FxBuildHasher,
|
||||
alloc: &'extractor Bump,
|
||||
buckets: usize,
|
||||
key: &[u8],
|
||||
n: u32,
|
||||
) -> Result<()> {
|
||||
let hash = hasher.hash_one(key);
|
||||
let bucket = compute_bucket_from_hash(buckets, hash);
|
||||
match self.caches[bucket].raw_entry_mut().from_hash(hash, |&k| k == key) {
|
||||
RawEntryMut::Occupied(mut entry) => {
|
||||
entry.get_mut().add.get_or_insert_with(|| Bbbul::new_in(alloc)).insert(n);
|
||||
Ok(())
|
||||
}
|
||||
RawEntryMut::Vacant(_entry) => spill_entry_to_sorter(
|
||||
&mut self.spilled_entries[bucket],
|
||||
&mut self.deladd_buffer,
|
||||
&mut self.cbo_buffer,
|
||||
key,
|
||||
DelAddRoaringBitmap::new_add_u32(n),
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn compute_bucket_from_hash(buckets: usize, hash: u64) -> usize {
|
||||
hash as usize % buckets
|
||||
}
|
||||
|
||||
fn spill_entry_to_sorter(
|
||||
spilled_entries: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
|
||||
deladd_buffer: &mut Vec<u8>,
|
||||
cbo_buffer: &mut Vec<u8>,
|
||||
key: &[u8],
|
||||
deladd: DelAddRoaringBitmap,
|
||||
) -> Result<()> {
|
||||
deladd_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(deladd_buffer);
|
||||
|
||||
match deladd {
|
||||
DelAddRoaringBitmap { del: Some(del), add: None } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: None, add: Some(add) } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: Some(del), add: Some(add) } => {
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&del, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, &cbo_buffer)?;
|
||||
|
||||
cbo_buffer.clear();
|
||||
CboRoaringBitmapCodec::serialize_into(&add, cbo_buffer);
|
||||
value_writer.insert(DelAdd::Addition, &cbo_buffer)?;
|
||||
}
|
||||
DelAddRoaringBitmap { del: None, add: None } => return Ok(()),
|
||||
}
|
||||
|
||||
let bytes = value_writer.into_inner().unwrap();
|
||||
spilled_entries.insert(key, bytes).map_err(Into::into)
|
||||
}
|
||||
|
||||
pub struct FrozenCache<'a, 'extractor> {
|
||||
bucket: usize,
|
||||
cache: FrozenMap<
|
||||
'a,
|
||||
'extractor,
|
||||
&'extractor [u8],
|
||||
FrozenDelAddBbbul<'extractor, BitPacker4x>,
|
||||
FxBuildHasher,
|
||||
>,
|
||||
spilled: Vec<grenad::Reader<BufReader<File>>>,
|
||||
}
|
||||
|
||||
pub fn transpose_and_freeze_caches<'a, 'extractor>(
|
||||
caches: &'a mut [BalancedCaches<'extractor>],
|
||||
) -> Result<Vec<Vec<FrozenCache<'a, 'extractor>>>> {
|
||||
let width = caches.first().map(BalancedCaches::buckets).unwrap_or(0);
|
||||
let mut bucket_caches: Vec<_> = iter::repeat_with(Vec::new).take(width).collect();
|
||||
|
||||
for thread_cache in caches {
|
||||
for frozen in thread_cache.freeze()? {
|
||||
bucket_caches[frozen.bucket].push(frozen);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(bucket_caches)
|
||||
}
|
||||
|
||||
/// Merges the caches, which must all be associated with the same bucket.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// - If the bucket IDs in these frozen caches are not exactly the same.
|
||||
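///
/// A minimal usage sketch, assuming `caches` holds the per-thread `BalancedCaches`:
///
/// ```ignore
/// for bucket in transpose_and_freeze_caches(&mut caches)? {
///     merge_caches(bucket, |key, deladd| {
///         // `key` is the raw entry key, `deladd` the merged del/add bitmaps.
///         Ok(())
///     })?;
/// }
/// ```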
pub fn merge_caches<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()>
|
||||
where
|
||||
F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>,
|
||||
{
|
||||
let mut maps = Vec::new();
|
||||
let mut readers = Vec::new();
|
||||
let mut current_bucket = None;
|
||||
for FrozenCache { bucket, cache, ref mut spilled } in frozen {
|
||||
assert_eq!(*current_bucket.get_or_insert(bucket), bucket);
|
||||
maps.push(cache);
|
||||
readers.append(spilled);
|
||||
}
|
||||
|
||||
// First manage the spilled entries by looking into the HashMaps,
|
||||
// merge them and mark them as dummy.
|
||||
let mut heap = BinaryHeap::new();
|
||||
for (source_index, source) in readers.into_iter().enumerate() {
|
||||
let mut cursor = source.into_cursor()?;
|
||||
if cursor.move_on_next()?.is_some() {
|
||||
heap.push(Entry { cursor, source_index });
|
||||
}
|
||||
}
|
||||
|
||||
loop {
|
||||
let mut first_entry = match heap.pop() {
|
||||
Some(entry) => entry,
|
||||
None => break,
|
||||
};
|
||||
|
||||
let (first_key, first_value) = match first_entry.cursor.current() {
|
||||
Some((key, value)) => (key, value),
|
||||
None => break,
|
||||
};
|
||||
|
||||
let mut output = DelAddRoaringBitmap::from_bytes(first_value)?;
|
||||
while let Some(mut entry) = heap.peek_mut() {
|
||||
if let Some((key, value)) = entry.cursor.current() {
|
||||
if first_key == key {
|
||||
let new = DelAddRoaringBitmap::from_bytes(value)?;
|
||||
output = output.merge(new);
|
||||
// When we are done with the current value of this entry, move
|
||||
// it forward and let the heap reorganize itself (on drop).
|
||||
if entry.cursor.move_on_next()?.is_none() {
|
||||
PeekMut::pop(entry);
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Once we have merged all of the spilled bitmaps we must also
|
||||
// fetch the entries from the non-spilled entries (the HashMaps).
|
||||
for (map_index, map) in maps.iter_mut().enumerate() {
|
||||
if first_entry.source_index != map_index {
|
||||
if let Some(new) = map.get_mut(first_key) {
|
||||
output.append_and_clear_bbbul(new);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We send the merged entry outside.
|
||||
(f)(first_key, output)?;
|
||||
|
||||
// Don't forget to put the first entry back into the heap.
|
||||
if first_entry.cursor.move_on_next()?.is_some() {
|
||||
heap.push(first_entry)
|
||||
}
|
||||
}
|
||||
|
||||
// Then manage the content of the HashMap entries that weren't taken (mem::take).
|
||||
while let Some(mut map) = maps.pop() {
|
||||
for (key, bbbul) in map.iter_mut() {
|
||||
let mut output = DelAddRoaringBitmap::empty();
|
||||
output.append_and_clear_bbbul(bbbul);
|
||||
|
||||
// Make sure we don't try to work with entries already managed by the spilled
|
||||
if !bbbul.is_empty() {
|
||||
for rhs in maps.iter_mut() {
|
||||
if let Some(new) = rhs.get_mut(key) {
|
||||
output.append_and_clear_bbbul(new);
|
||||
}
|
||||
}
|
||||
|
||||
// We send the merged entry outside.
|
||||
(f)(key, output)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
struct Entry<R> {
|
||||
cursor: ReaderCursor<R>,
|
||||
source_index: usize,
|
||||
}
|
||||
|
||||
impl<R> Ord for Entry<R> {
|
||||
fn cmp(&self, other: &Entry<R>) -> Ordering {
|
||||
let skey = self.cursor.current().map(|(k, _)| k);
|
||||
let okey = other.cursor.current().map(|(k, _)| k);
|
||||
skey.cmp(&okey).then(self.source_index.cmp(&other.source_index)).reverse()
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> Eq for Entry<R> {}
|
||||
|
||||
impl<R> PartialEq for Entry<R> {
|
||||
fn eq(&self, other: &Entry<R>) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<R> PartialOrd for Entry<R> {
|
||||
fn partial_cmp(&self, other: &Entry<R>) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DelAddBbbul<'bump, B> {
|
||||
pub del: Option<Bbbul<'bump, B>>,
|
||||
pub add: Option<Bbbul<'bump, B>>,
|
||||
}
|
||||
|
||||
impl<'bump, B: BitPacker> DelAddBbbul<'bump, B> {
|
||||
pub fn insert_del_u32_in(&mut self, n: u32, bump: &'bump Bump) {
|
||||
self.del.get_or_insert_with(|| Bbbul::new_in(bump)).insert(n);
|
||||
}
|
||||
|
||||
pub fn insert_add_u32_in(&mut self, n: u32, bump: &'bump Bump) {
|
||||
self.add.get_or_insert_with(|| Bbbul::new_in(bump)).insert(n);
|
||||
}
|
||||
|
||||
pub fn new_del_u32_in(n: u32, bump: &'bump Bump) -> Self {
|
||||
let mut bbbul = Bbbul::new_in(bump);
|
||||
bbbul.insert(n);
|
||||
DelAddBbbul { del: Some(bbbul), add: None }
|
||||
}
|
||||
|
||||
pub fn new_add_u32_in(n: u32, bump: &'bump Bump) -> Self {
|
||||
let mut bbbul = Bbbul::new_in(bump);
|
||||
bbbul.insert(n);
|
||||
DelAddBbbul { del: None, add: Some(bbbul) }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FrozenDelAddBbbul<'bump, B> {
|
||||
pub del: Option<FrozenBbbul<'bump, B>>,
|
||||
pub add: Option<FrozenBbbul<'bump, B>>,
|
||||
}
|
||||
|
||||
impl<'bump, B> FrozenDelAddBbbul<'bump, B> {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.del.is_none() && self.add.is_none()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct DelAddRoaringBitmap {
|
||||
pub del: Option<RoaringBitmap>,
|
||||
pub add: Option<RoaringBitmap>,
|
||||
}
|
||||
|
||||
impl DelAddRoaringBitmap {
|
||||
fn from_bytes(bytes: &[u8]) -> io::Result<DelAddRoaringBitmap> {
|
||||
let reader = KvReaderDelAdd::from_slice(bytes);
|
||||
|
||||
let del = match reader.get(DelAdd::Deletion) {
|
||||
Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
|
||||
None => None,
|
||||
};
|
||||
|
||||
let add = match reader.get(DelAdd::Addition) {
|
||||
Some(bytes) => CboRoaringBitmapCodec::deserialize_from(bytes).map(Some)?,
|
||||
None => None,
|
||||
};
|
||||
|
||||
Ok(DelAddRoaringBitmap { del, add })
|
||||
}
|
||||
|
||||
pub fn empty() -> DelAddRoaringBitmap {
|
||||
DelAddRoaringBitmap { del: None, add: None }
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
let DelAddRoaringBitmap { del, add } = self;
|
||||
del.is_none() && add.is_none()
|
||||
}
|
||||
|
||||
pub fn insert_del_u32(&mut self, n: u32) {
|
||||
self.del.get_or_insert_with(RoaringBitmap::new).insert(n);
|
||||
}
|
||||
|
||||
pub fn insert_add_u32(&mut self, n: u32) {
|
||||
self.add.get_or_insert_with(RoaringBitmap::new).insert(n);
|
||||
}
|
||||
|
||||
pub fn new_del_u32(n: u32) -> Self {
|
||||
DelAddRoaringBitmap { del: Some(RoaringBitmap::from([n])), add: None }
|
||||
}
|
||||
|
||||
pub fn new_add_u32(n: u32) -> Self {
|
||||
DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) }
|
||||
}
|
||||
|
||||
pub fn append_and_clear_bbbul<B: BitPacker>(&mut self, bbbul: &mut FrozenDelAddBbbul<'_, B>) {
|
||||
let FrozenDelAddBbbul { del, add } = bbbul;
|
||||
|
||||
if let Some(ref mut bbbul) = del.take() {
|
||||
let del = self.del.get_or_insert_with(RoaringBitmap::new);
|
||||
let mut iter = bbbul.iter_and_clear();
|
||||
while let Some(block) = iter.next_block() {
|
||||
del.append(block.iter().copied());
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(ref mut bbbul) = add.take() {
|
||||
let add = self.add.get_or_insert_with(RoaringBitmap::new);
|
||||
let mut iter = bbbul.iter_and_clear();
|
||||
while let Some(block) = iter.next_block() {
|
||||
add.append(block.iter().copied());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn merge(self, rhs: DelAddRoaringBitmap) -> DelAddRoaringBitmap {
|
||||
let DelAddRoaringBitmap { del, add } = self;
|
||||
let DelAddRoaringBitmap { del: ndel, add: nadd } = rhs;
|
||||
|
||||
let del = match (del, ndel) {
|
||||
(None, None) => None,
|
||||
(None, Some(del)) | (Some(del), None) => Some(del),
|
||||
(Some(del), Some(ndel)) => Some(del | ndel),
|
||||
};
|
||||
|
||||
let add = match (add, nadd) {
|
||||
(None, None) => None,
|
||||
(None, Some(add)) | (Some(add), None) => Some(add),
|
||||
(Some(add), Some(nadd)) => Some(add | nadd),
|
||||
};
|
||||
|
||||
DelAddRoaringBitmap { del, add }
|
||||
}
|
||||
|
||||
pub fn apply_to(&self, documents_ids: &mut RoaringBitmap) {
|
||||
let DelAddRoaringBitmap { del, add } = self;
|
||||
|
||||
if let Some(del) = del {
|
||||
*documents_ids -= del;
|
||||
}
|
||||
|
||||
if let Some(add) = add {
|
||||
*documents_ids |= add;
|
||||
}
|
||||
}
|
||||
}
|
||||
73
crates/milli/src/update/new/extract/documents.rs
Normal file
@@ -0,0 +1,73 @@
|
||||
use std::cell::RefCell;
|
||||
|
||||
use bumpalo::Bump;
|
||||
|
||||
use super::DelAddRoaringBitmap;
|
||||
use crate::update::new::channel::DocumentsSender;
|
||||
use crate::update::new::document::write_to_obkv;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
DocumentChangeContext, Extractor, FullySend, RefCellExt as _,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::Result;
|
||||
|
||||
pub struct DocumentsExtractor<'a> {
|
||||
documents_sender: &'a DocumentsSender<'a>,
|
||||
}
|
||||
|
||||
impl<'a> DocumentsExtractor<'a> {
|
||||
pub fn new(documents_sender: &'a DocumentsSender<'a>) -> Self {
|
||||
Self { documents_sender }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a> {
|
||||
type Data = FullySend<RefCell<DelAddRoaringBitmap>>;
|
||||
|
||||
fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(FullySend(RefCell::new(DelAddRoaringBitmap::empty())))
|
||||
}
|
||||
|
||||
fn process(
|
||||
&self,
|
||||
change: DocumentChange,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
let mut document_buffer = Vec::new();
|
||||
let mut delta_documents_ids = context.data.0.borrow_mut_or_yield();
|
||||
|
||||
let new_fields_ids_map = context.new_fields_ids_map.borrow_or_yield();
|
||||
let new_fields_ids_map = &*new_fields_ids_map;
|
||||
let new_fields_ids_map = new_fields_ids_map.local_map();
|
||||
|
||||
let external_docid = change.external_docid().to_owned();
|
||||
|
||||
// TODO: we need to create a function that collects and compresses documents.
|
||||
match change {
|
||||
DocumentChange::Deletion(deletion) => {
|
||||
let docid = deletion.docid();
|
||||
self.documents_sender.delete(docid, external_docid).unwrap();
|
||||
delta_documents_ids.insert_del_u32(docid);
|
||||
}
|
||||
// TODO: replace None with Some(vector) when implemented
|
||||
DocumentChange::Update(update) => {
|
||||
let docid = update.docid();
|
||||
let content =
|
||||
update.new(&context.txn, context.index, &context.db_fields_ids_map)?;
|
||||
let content =
|
||||
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
|
||||
self.documents_sender.uncompressed(docid, external_docid, content).unwrap();
|
||||
}
|
||||
DocumentChange::Insertion(insertion) => {
|
||||
let docid = insertion.docid();
|
||||
let content = insertion.inserted();
|
||||
let content =
|
||||
write_to_obkv(&content, None, new_fields_ids_map, &mut document_buffer)?;
|
||||
self.documents_sender.uncompressed(docid, external_docid, content).unwrap();
|
||||
delta_documents_ids.insert_add_u32(docid);
|
||||
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
271
crates/milli/src/update/new/extract/faceted/extract_facets.rs
Normal file
@@ -0,0 +1,271 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashSet;
|
||||
use std::ops::DerefMut as _;
|
||||
|
||||
use bumpalo::Bump;
|
||||
use heed::RoTxn;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::super::cache::BalancedCaches;
|
||||
use super::facet_document::extract_document_facets;
|
||||
use super::FacetKind;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::new::extract::DocidsExtractor;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext,
|
||||
Progress, RefCellExt, ThreadLocal,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
pub struct FacetedExtractorData<'a> {
|
||||
attributes_to_extract: &'a [&'a str],
|
||||
grenad_parameters: GrenadParameters,
|
||||
buckets: usize,
|
||||
}
|
||||
|
||||
impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> {
|
||||
type Data = RefCell<BalancedCaches<'extractor>>;
|
||||
|
||||
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(BalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.grenad_parameters.max_memory,
|
||||
extractor_alloc,
|
||||
)))
|
||||
}
|
||||
|
||||
fn process<'doc>(
|
||||
&self,
|
||||
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
for change in changes {
|
||||
let change = change?;
|
||||
FacetedDocidsExtractor::extract_document_change(
|
||||
context,
|
||||
self.attributes_to_extract,
|
||||
change,
|
||||
)?
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetedDocidsExtractor;
|
||||
|
||||
impl FacetedDocidsExtractor {
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<BalancedCaches>>,
|
||||
attributes_to_extract: &[&str],
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
let index = &context.index;
|
||||
let rtxn = &context.txn;
|
||||
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
|
||||
let mut cached_sorter = context.data.borrow_mut_or_yield();
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map.deref_mut(),
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
&context.doc_alloc,
|
||||
cached_sorter.deref_mut(),
|
||||
BalancedCaches::insert_del_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
),
|
||||
DocumentChange::Update(inner) => {
|
||||
extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map.deref_mut(),
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
&context.doc_alloc,
|
||||
cached_sorter.deref_mut(),
|
||||
BalancedCaches::insert_del_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.merged(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map.deref_mut(),
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
&context.doc_alloc,
|
||||
cached_sorter.deref_mut(),
|
||||
BalancedCaches::insert_add_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)
|
||||
}
|
||||
DocumentChange::Insertion(inner) => extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.inserted(),
|
||||
new_fields_ids_map.deref_mut(),
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
&context.doc_alloc,
|
||||
cached_sorter.deref_mut(),
|
||||
BalancedCaches::insert_add_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn facet_fn_with_options<'extractor>(
|
||||
doc_alloc: &Bump,
|
||||
cached_sorter: &mut BalancedCaches<'extractor>,
|
||||
cache_fn: impl Fn(&mut BalancedCaches<'extractor>, &[u8], u32) -> Result<()>,
|
||||
docid: DocumentId,
|
||||
fid: FieldId,
|
||||
value: &Value,
|
||||
) -> Result<()> {
|
||||
let mut buffer = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||
// Exists
|
||||
// key: fid
|
||||
buffer.push(FacetKind::Exists as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)?;
|
||||
|
||||
match value {
|
||||
// Number
|
||||
// key: fid - level - orderedf64 - originalf64
|
||||
Value::Number(number) => {
|
||||
if let Some((n, ordered)) =
|
||||
number.as_f64().and_then(|n| f64_into_bytes(n).map(|ordered| (n, ordered)))
|
||||
{
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Number as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(0); // level 0
|
||||
buffer.extend_from_slice(&ordered);
|
||||
buffer.extend_from_slice(&n.to_be_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
// String
|
||||
// key: fid - level - truncated_string
|
||||
Value::String(s) => {
|
||||
let normalized = crate::normalize_facet(s);
|
||||
let truncated = truncate_str(&normalized);
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::String as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(0); // level 0
|
||||
buffer.extend_from_slice(truncated.as_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)
|
||||
}
|
||||
// Null
|
||||
// key: fid
|
||||
Value::Null => {
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Null as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)
|
||||
}
|
||||
// Empty
|
||||
// key: fid
|
||||
Value::Array(a) if a.is_empty() => {
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Empty as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)
|
||||
}
|
||||
Value::Object(o) if o.is_empty() => {
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Empty as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
cache_fn(cached_sorter, &buffer, docid)
|
||||
}
|
||||
// Otherwise, do nothing
|
||||
// TODO: What about Value::Bool?
|
||||
_ => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
|
||||
index.user_defined_faceted_fields(rtxn)
|
||||
}
|
||||
}
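// Illustrative sketch (added for this review, not part of the original diff) of the
// level-0 number key layout built in `facet_fn_with_options` above: kind byte,
// big-endian field id, level byte, an order-preserving f64 encoding, then the raw
// f64 bytes. `ordered_f64` below is an assumed stand-in for `f64_into_bytes`.
#[cfg(test)]
mod number_key_layout_illustration {
    fn ordered_f64(n: f64) -> [u8; 8] {
        let bits = n.to_bits();
        // flip bits so that lexicographic byte order matches numeric order
        let ordered = if bits >> 63 == 0 { bits ^ (1u64 << 63) } else { !bits };
        ordered.to_be_bytes()
    }

    fn number_facet_key(fid: u16, n: f64) -> Vec<u8> {
        let mut key = Vec::new();
        key.push(0); // FacetKind::Number
        key.extend_from_slice(&fid.to_be_bytes());
        key.push(0); // level 0
        key.extend_from_slice(&ordered_f64(n));
        key.extend_from_slice(&n.to_be_bytes());
        key
    }

    #[test]
    fn keys_sort_like_numbers() {
        // keys for increasing numbers compare in increasing byte order
        assert!(number_facet_key(1, -2.5) < number_facet_key(1, 3.0));
        assert!(number_facet_key(1, 3.0) < number_facet_key(1, 10.0));
    }
}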
|
||||
|
||||
/// Truncates a string to the biggest valid LMDB key size.
|
||||
fn truncate_str(s: &str) -> &str {
|
||||
let index = s
|
||||
.char_indices()
|
||||
.map(|(idx, _)| idx)
|
||||
.chain(std::iter::once(s.len()))
|
||||
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
|
||||
.last();
|
||||
|
||||
&s[..index.unwrap_or(0)]
|
||||
}
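// Illustration (added for this review, not part of the original diff): the cut index
// computed above is always a char boundary at most MAX_FACET_VALUE_LENGTH bytes in,
// so truncation never splits a multi-byte character.
#[cfg(test)]
mod truncate_str_illustration {
    use super::truncate_str;

    #[test]
    fn never_splits_a_char() {
        // a long string made only of 2-byte characters
        let s = "é".repeat(1_000);
        let t = truncate_str(&s);
        // slicing would have panicked if the cut were not on a char boundary
        assert!(s.starts_with(t));
        assert!(t.len() <= s.len());
    }
}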
|
||||
|
||||
impl DocidsExtractor for FacetedDocidsExtractor {
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_steps: u16,
|
||||
total_steps: u16,
|
||||
step_name: &'static str,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
let index = indexing_context.index;
|
||||
let rtxn = index.read_txn()?;
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
||||
let attributes_to_extract: Vec<_> =
|
||||
attributes_to_extract.iter().map(|s| s.as_ref()).collect();
|
||||
let datastore = ThreadLocal::new();
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
|
||||
let extractor = FacetedExtractorData {
|
||||
attributes_to_extract: &attributes_to_extract,
|
||||
grenad_parameters,
|
||||
buckets: rayon::current_num_threads(),
|
||||
};
|
||||
extract(
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
&datastore,
|
||||
finished_steps,
|
||||
total_steps,
|
||||
step_name,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(datastore.into_iter().map(RefCell::into_inner).collect())
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,45 @@
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::update::new::document::Document;
|
||||
use crate::update::new::extract::perm_json_p;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError};
|
||||
|
||||
pub fn extract_document_facets<'doc>(
|
||||
attributes_to_extract: &[&str],
|
||||
document: impl Document<'doc>,
|
||||
field_id_map: &mut GlobalFieldsIdsMap,
|
||||
facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
for res in document.iter_top_level_fields() {
|
||||
let (field_name, value) = res?;
|
||||
|
||||
let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {
|
||||
Some(field_id) => facet_fn(field_id, value),
|
||||
None => Err(UserError::AttributeLimitReached.into()),
|
||||
};
|
||||
|
||||
// if the current field is faceted or contains a faceted attribute
|
||||
if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
|
||||
// parse json.
|
||||
match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
|
||||
&object,
|
||||
Some(attributes_to_extract),
|
||||
&[], // skip no attributes
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
|
||||
&array,
|
||||
Some(attributes_to_extract),
|
||||
&[], // skip no attributes
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
value => tokenize_field(field_name, &value)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
34
crates/milli/src/update/new/extract/faceted/mod.rs
Normal file
@@ -0,0 +1,34 @@
|
||||
mod extract_facets;
|
||||
mod facet_document;
|
||||
|
||||
pub use extract_facets::FacetedDocidsExtractor;
|
||||
|
||||
#[repr(u8)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum FacetKind {
|
||||
Number = 0,
|
||||
String = 1,
|
||||
Null = 2,
|
||||
Empty = 3,
|
||||
Exists,
|
||||
}
|
||||
|
||||
impl From<u8> for FacetKind {
|
||||
fn from(value: u8) -> Self {
|
||||
match value {
|
||||
0 => Self::Number,
|
||||
1 => Self::String,
|
||||
2 => Self::Null,
|
||||
3 => Self::Empty,
|
||||
4 => Self::Exists,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FacetKind {
|
||||
pub fn extract_from_key(key: &[u8]) -> (FacetKind, &[u8]) {
|
||||
debug_assert!(key.len() > 3);
|
||||
(FacetKind::from(key[0]), &key[1..])
|
||||
}
|
||||
}
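// Small round-trip illustration (added for this review, not part of the original
// diff): the `From<u8>` impl above mirrors the discriminants assigned to `FacetKind`.
#[cfg(test)]
mod facet_kind_illustration {
    use super::FacetKind;

    #[test]
    fn u8_round_trip() {
        let kinds =
            [FacetKind::Number, FacetKind::String, FacetKind::Null, FacetKind::Empty, FacetKind::Exists];
        for kind in kinds {
            // `FacetKind` is `Copy`, so casting to `u8` is enough to compare variants
            assert_eq!(FacetKind::from(kind as u8) as u8, kind as u8);
        }
    }
}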
|
||||
146
crates/milli/src/update/new/extract/mod.rs
Normal file
@@ -0,0 +1,146 @@
|
||||
mod cache;
|
||||
mod documents;
|
||||
mod faceted;
|
||||
mod searchable;
|
||||
mod vectors;
|
||||
|
||||
use bumpalo::Bump;
|
||||
pub use cache::{merge_caches, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap};
|
||||
pub use documents::*;
|
||||
pub use faceted::*;
|
||||
pub use searchable::*;
|
||||
pub use vectors::EmbeddingExtractor;
|
||||
|
||||
use super::indexer::document_changes::{
|
||||
DocumentChanges, FullySend, IndexingContext, Progress, ThreadLocal,
|
||||
};
|
||||
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::Result;
|
||||
|
||||
pub trait DocidsExtractor {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_steps: u16,
|
||||
total_steps: u16,
|
||||
step_name: &'static str,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync;
|
||||
}
|
||||
|
||||
/// TODO: move this into the permissive json pointer module
|
||||
pub mod perm_json_p {
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
use crate::Result;
|
||||
const SPLIT_SYMBOL: char = '.';
|
||||
|
||||
/// Returns `true` if the `selector` matches the `key`.
|
||||
///
|
||||
/// ```text
|
||||
/// Example:
|
||||
/// `animaux` match `animaux`
|
||||
/// `animaux.chien` match `animaux`
|
||||
/// `animaux.chien` match `animaux`
|
||||
/// `animaux.chien.nom` match `animaux`
|
||||
/// `animaux.chien.nom` match `animaux.chien`
|
||||
/// -----------------------------------------
|
||||
/// `animaux` doesn't match `animaux.chien`
|
||||
/// `animaux.` doesn't match `animaux`
|
||||
/// `animaux.ch` doesn't match `animaux.chien`
|
||||
/// `animau` doesn't match `animaux`
|
||||
/// ```
|
||||
pub fn contained_in(selector: &str, key: &str) -> bool {
|
||||
selector.starts_with(key)
|
||||
&& selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
|
||||
}
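// Illustrative check (added for this review, not part of the original diff) of the
// matching rules documented above, reusing the examples from the doc comment.
#[cfg(test)]
mod contained_in_illustration {
    use super::contained_in;

    #[test]
    fn doc_examples() {
        assert!(contained_in("animaux", "animaux"));
        assert!(contained_in("animaux.chien.nom", "animaux"));
        assert!(contained_in("animaux.chien.nom", "animaux.chien"));
        assert!(!contained_in("animaux", "animaux.chien"));
        assert!(!contained_in("animaux.ch", "animaux.chien"));
        assert!(!contained_in("animau", "animaux"));
    }
}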
|
||||
|
||||
pub fn seek_leaf_values_in_object(
|
||||
value: &Map<String, Value>,
|
||||
selectors: Option<&[&str]>,
|
||||
skip_selectors: &[&str],
|
||||
base_key: &str,
|
||||
seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
if value.is_empty() {
|
||||
seeker(base_key, &Value::Object(Map::with_capacity(0)))?;
|
||||
}
|
||||
|
||||
for (key, value) in value.iter() {
|
||||
let base_key = if base_key.is_empty() {
|
||||
key.to_string()
|
||||
} else {
|
||||
format!("{}{}{}", base_key, SPLIT_SYMBOL, key)
|
||||
};
|
||||
|
||||
// here, if the user only specified `doggo`, we need to iterate over all the fields of `doggo`,
|
||||
// so we check contained_in in both directions
|
||||
let should_continue = select_field(&base_key, selectors, skip_selectors);
|
||||
if should_continue {
|
||||
match value {
|
||||
Value::Object(object) => seek_leaf_values_in_object(
|
||||
object,
|
||||
selectors,
|
||||
skip_selectors,
|
||||
&base_key,
|
||||
seeker,
|
||||
),
|
||||
Value::Array(array) => seek_leaf_values_in_array(
|
||||
array,
|
||||
selectors,
|
||||
skip_selectors,
|
||||
&base_key,
|
||||
seeker,
|
||||
),
|
||||
value => seeker(&base_key, value),
|
||||
}?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn seek_leaf_values_in_array(
|
||||
values: &[Value],
|
||||
selectors: Option<&[&str]>,
|
||||
skip_selectors: &[&str],
|
||||
base_key: &str,
|
||||
seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
if values.is_empty() {
|
||||
seeker(base_key, &Value::Array(vec![]))?;
|
||||
}
|
||||
|
||||
for value in values {
|
||||
match value {
|
||||
Value::Object(object) => {
|
||||
seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker)
|
||||
}
|
||||
Value::Array(array) => {
|
||||
seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker)
|
||||
}
|
||||
value => seeker(base_key, value),
|
||||
}?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn select_field(
|
||||
field_name: &str,
|
||||
selectors: Option<&[&str]>,
|
||||
skip_selectors: &[&str],
|
||||
) -> bool {
|
||||
selectors.map_or(true, |selectors| {
|
||||
selectors.iter().any(|selector| {
|
||||
contained_in(selector, field_name) || contained_in(field_name, selector)
|
||||
})
|
||||
}) && !skip_selectors.iter().any(|skip_selector| {
|
||||
contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,400 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::HashMap;
|
||||
use std::mem::size_of;
|
||||
use std::ops::DerefMut as _;
|
||||
|
||||
use bumpalo::collections::vec::Vec as BumpVec;
|
||||
use bumpalo::Bump;
|
||||
use heed::RoTxn;
|
||||
|
||||
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||
use crate::update::new::extract::cache::BalancedCaches;
|
||||
use crate::update::new::extract::perm_json_p::contained_in;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
for_each_document_change, DocumentChangeContext, DocumentChanges, Extractor, FullySend,
|
||||
IndexingContext, MostlySend, RefCellExt, ThreadLocal,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::{bucketed_position, DocumentId, FieldId, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
pub struct WordDocidsBalancedCaches<'extractor> {
|
||||
word_fid_docids: BalancedCaches<'extractor>,
|
||||
word_docids: BalancedCaches<'extractor>,
|
||||
exact_word_docids: BalancedCaches<'extractor>,
|
||||
word_position_docids: BalancedCaches<'extractor>,
|
||||
fid_word_count_docids: BalancedCaches<'extractor>,
|
||||
fid_word_count: HashMap<FieldId, (usize, usize)>,
|
||||
current_docid: Option<DocumentId>,
|
||||
}
|
||||
|
||||
unsafe impl<'extractor> MostlySend for WordDocidsBalancedCaches<'extractor> {}
|
||||
|
||||
impl<'extractor> WordDocidsBalancedCaches<'extractor> {
|
||||
/// TODO Make sure to give the same max_memory to all of them, without splitting it
|
||||
pub fn new_in(buckets: usize, max_memory: Option<usize>, alloc: &'extractor Bump) -> Self {
|
||||
Self {
|
||||
word_fid_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
|
||||
word_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
|
||||
exact_word_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
|
||||
word_position_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
|
||||
fid_word_count_docids: BalancedCaches::new_in(buckets, max_memory, alloc),
|
||||
fid_word_count: HashMap::new(),
|
||||
current_docid: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_add_u32(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
position: u16,
|
||||
word: &str,
|
||||
exact: bool,
|
||||
docid: u32,
|
||||
bump: &Bump,
|
||||
) -> Result<()> {
|
||||
let word_bytes = word.as_bytes();
|
||||
if exact {
|
||||
self.exact_word_docids.insert_add_u32(word_bytes, docid)?;
|
||||
} else {
|
||||
self.word_docids.insert_add_u32(word_bytes, docid)?;
|
||||
}
|
||||
|
||||
let buffer_size = word_bytes.len() + 1 + size_of::<FieldId>();
|
||||
let mut buffer = BumpVec::with_capacity_in(buffer_size, bump);
|
||||
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
self.word_fid_docids.insert_add_u32(&buffer, docid)?;
|
||||
|
||||
let position = bucketed_position(position);
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&position.to_be_bytes());
|
||||
self.word_position_docids.insert_add_u32(&buffer, docid)?;
|
||||
|
||||
if self.current_docid.map_or(false, |id| docid != id) {
|
||||
self.flush_fid_word_count(&mut buffer)?;
|
||||
}
|
||||
|
||||
self.fid_word_count
|
||||
.entry(field_id)
|
||||
.and_modify(|(_current_count, new_count)| *new_count += 1)
|
||||
.or_insert((0, 1));
|
||||
self.current_docid = Some(docid);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_del_u32(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
position: u16,
|
||||
word: &str,
|
||||
exact: bool,
|
||||
docid: u32,
|
||||
bump: &Bump,
|
||||
) -> Result<()> {
|
||||
let word_bytes = word.as_bytes();
|
||||
if exact {
|
||||
self.exact_word_docids.insert_del_u32(word_bytes, docid)?;
|
||||
} else {
|
||||
self.word_docids.insert_del_u32(word_bytes, docid)?;
|
||||
}
|
||||
|
||||
let buffer_size = word_bytes.len() + 1 + size_of::<FieldId>();
|
||||
let mut buffer = BumpVec::with_capacity_in(buffer_size, bump);
|
||||
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
self.word_fid_docids.insert_del_u32(&buffer, docid)?;
|
||||
|
||||
let position = bucketed_position(position);
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word_bytes);
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&position.to_be_bytes());
|
||||
self.word_position_docids.insert_del_u32(&buffer, docid)?;
|
||||
|
||||
if self.current_docid.map_or(false, |id| docid != id) {
|
||||
self.flush_fid_word_count(&mut buffer)?;
|
||||
}
|
||||
|
||||
self.fid_word_count
|
||||
.entry(field_id)
|
||||
.and_modify(|(current_count, _new_count)| *current_count += 1)
|
||||
.or_insert((1, 0));
|
||||
|
||||
self.current_docid = Some(docid);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush_fid_word_count(&mut self, buffer: &mut BumpVec<u8>) -> Result<()> {
|
||||
for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
|
||||
if current_count != new_count {
|
||||
if current_count <= MAX_COUNTED_WORDS {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(current_count as u8);
|
||||
self.fid_word_count_docids
|
||||
.insert_del_u32(buffer, self.current_docid.unwrap())?;
|
||||
}
|
||||
if new_count <= MAX_COUNTED_WORDS {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(new_count as u8);
|
||||
self.fid_word_count_docids
|
||||
.insert_add_u32(buffer, self.current_docid.unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordDocidsCaches<'extractor> {
|
||||
pub word_docids: Vec<BalancedCaches<'extractor>>,
|
||||
pub word_fid_docids: Vec<BalancedCaches<'extractor>>,
|
||||
pub exact_word_docids: Vec<BalancedCaches<'extractor>>,
|
||||
pub word_position_docids: Vec<BalancedCaches<'extractor>>,
|
||||
pub fid_word_count_docids: Vec<BalancedCaches<'extractor>>,
|
||||
}
|
||||
|
||||
impl<'extractor> WordDocidsCaches<'extractor> {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
word_docids: Vec::new(),
|
||||
word_fid_docids: Vec::new(),
|
||||
exact_word_docids: Vec::new(),
|
||||
word_position_docids: Vec::new(),
|
||||
fid_word_count_docids: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn push(&mut self, other: WordDocidsBalancedCaches<'extractor>) -> Result<()> {
|
||||
let WordDocidsBalancedCaches {
|
||||
word_docids,
|
||||
word_fid_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
fid_word_count_docids,
|
||||
fid_word_count: _,
|
||||
current_docid: _,
|
||||
} = other;
|
||||
|
||||
self.word_docids.push(word_docids);
|
||||
self.word_fid_docids.push(word_fid_docids);
|
||||
self.exact_word_docids.push(exact_word_docids);
|
||||
self.word_position_docids.push(word_position_docids);
|
||||
self.fid_word_count_docids.push(fid_word_count_docids);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordDocidsExtractorData<'a> {
|
||||
tokenizer: &'a DocumentTokenizer<'a>,
|
||||
grenad_parameters: GrenadParameters,
|
||||
buckets: usize,
|
||||
}
|
||||
|
||||
impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
|
||||
type Data = RefCell<Option<WordDocidsBalancedCaches<'extractor>>>;
|
||||
|
||||
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(Some(WordDocidsBalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.grenad_parameters.max_memory,
|
||||
extractor_alloc,
|
||||
))))
|
||||
}
|
||||
|
||||
fn process(
|
||||
&self,
|
||||
change: DocumentChange,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
WordDocidsExtractors::extract_document_change(context, self.tokenizer, change)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordDocidsExtractors;
|
||||
|
||||
impl WordDocidsExtractors {
|
||||
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
) -> Result<WordDocidsCaches<'extractor>> {
|
||||
let index = indexing_context.index;
|
||||
let rtxn = index.read_txn()?;
|
||||
|
||||
let stop_words = index.stop_words(&rtxn)?;
|
||||
let allowed_separators = index.allowed_separators(&rtxn)?;
|
||||
let allowed_separators: Option<Vec<_>> =
|
||||
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary = index.dictionary(&rtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let builder = tokenizer_builder(
|
||||
stop_words.as_ref(),
|
||||
allowed_separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
);
|
||||
let tokenizer = builder.into_tokenizer();
|
||||
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
||||
let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
|
||||
let localized_attributes_rules =
|
||||
index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tokenizer,
|
||||
attribute_to_extract: attributes_to_extract.as_deref(),
|
||||
attribute_to_skip: attributes_to_skip.as_slice(),
|
||||
localized_attributes_rules: &localized_attributes_rules,
|
||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
let datastore = ThreadLocal::new();
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
|
||||
let extractor = WordDocidsExtractorData {
|
||||
tokenizer: &document_tokenizer,
|
||||
grenad_parameters,
|
||||
buckets: rayon::current_num_threads(),
|
||||
};
|
||||
|
||||
for_each_document_change(
|
||||
document_changes,
|
||||
&extractor,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
&datastore,
|
||||
)?;
|
||||
}
|
||||
|
||||
let mut merger = WordDocidsCaches::new();
|
||||
for cache in datastore.into_iter().flat_map(RefCell::into_inner) {
|
||||
merger.push(cache)?;
|
||||
}
|
||||
|
||||
Ok(merger)
|
||||
}
|
||||
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<Option<WordDocidsBalancedCaches>>>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
let index = &context.index;
|
||||
let rtxn = &context.txn;
|
||||
let mut cached_sorter_ref = context.data.borrow_mut_or_yield();
|
||||
let cached_sorter = cached_sorter_ref.as_mut().unwrap();
|
||||
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
|
||||
let new_fields_ids_map = new_fields_ids_map.deref_mut();
|
||||
let doc_alloc = &context.doc_alloc;
|
||||
|
||||
let exact_attributes = index.exact_attributes(rtxn)?;
|
||||
let is_exact_attribute =
|
||||
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter.insert_del_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Update(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter.insert_del_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter.insert_add_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.new(rtxn, index, context.db_fields_ids_map)?,
|
||||
new_fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Insertion(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter.insert_add_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
doc_alloc,
|
||||
)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.new(),
|
||||
new_fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
let buffer_size = size_of::<FieldId>();
|
||||
let mut buffer = BumpVec::with_capacity_in(buffer_size, &context.doc_alloc);
|
||||
cached_sorter.flush_fid_word_count(&mut buffer)
|
||||
}
|
||||
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,178 @@
|
||||
use std::cell::RefCell;
|
||||
use std::collections::VecDeque;
|
||||
use std::rc::Rc;
|
||||
|
||||
use heed::RoTxn;
|
||||
|
||||
use super::tokenize_document::DocumentTokenizer;
|
||||
use super::SearchableExtractor;
|
||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||
use crate::update::new::document::Document;
|
||||
use crate::update::new::extract::cache::BalancedCaches;
|
||||
use crate::update::new::indexer::document_changes::{DocumentChangeContext, RefCellExt};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
|
||||
|
||||
pub struct WordPairProximityDocidsExtractor;
|
||||
|
||||
impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
// This method extracts the word pairs in proximity from both the current and the
|
||||
// updated versions of the document, and stores the corresponding del/add docids
|
||||
// under `(proximity, word1, word2)` keys.
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<BalancedCaches>>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
let doc_alloc = &context.doc_alloc;
|
||||
|
||||
let index = context.index;
|
||||
let rtxn = &context.txn;
|
||||
|
||||
let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||
let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||
let mut add_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
|
||||
|
||||
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
|
||||
let new_fields_ids_map = &mut *new_fields_ids_map;
|
||||
|
||||
let mut cached_sorter = context.data.borrow_mut_or_yield();
|
||||
let cached_sorter = &mut *cached_sorter;
|
||||
|
||||
// this is a VecDeque and will stay small, so it can remain on the heap for now
|
||||
let mut word_positions: VecDeque<(Rc<str>, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
|
||||
let docid = document_change.docid();
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let document = inner.current(rtxn, index, context.db_fields_ids_map)?;
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
new_fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
del_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Update(inner) => {
|
||||
let document = inner.current(rtxn, index, context.db_fields_ids_map)?;
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
new_fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
del_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
let document = inner.merged(rtxn, index, context.db_fields_ids_map)?;
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
new_fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
add_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Insertion(inner) => {
|
||||
let document = inner.inserted();
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
new_fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
add_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
del_word_pair_proximity.sort_unstable();
|
||||
del_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
|
||||
for ((w1, w2), prox) in del_word_pair_proximity.iter() {
|
||||
let key = build_key(*prox, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_del_u32(key, docid)?;
|
||||
}
|
||||
|
||||
add_word_pair_proximity.sort_unstable();
|
||||
add_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
|
||||
for ((w1, w2), prox) in add_word_pair_proximity.iter() {
|
||||
let key = build_key(*prox, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_add_u32(key, docid)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn build_key<'a>(
|
||||
prox: u8,
|
||||
w1: &str,
|
||||
w2: &str,
|
||||
key_buffer: &'a mut bumpalo::collections::Vec<u8>,
|
||||
) -> &'a [u8] {
|
||||
key_buffer.clear();
|
||||
key_buffer.push(prox);
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
key_buffer.as_slice()
|
||||
}
|
||||
|
||||
fn word_positions_into_word_pair_proximity(
|
||||
word_positions: &mut VecDeque<(Rc<str>, u16)>,
|
||||
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
|
||||
) {
|
||||
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||
for (word, position) in word_positions.iter() {
|
||||
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||
word_pair_proximity((head_word.clone(), word.clone()), prox);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn process_document_tokens<'doc>(
|
||||
document: impl Document<'doc>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
word_positions: &mut VecDeque<(Rc<str>, u16)>,
|
||||
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
|
||||
) -> Result<()> {
|
||||
let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while word_positions
|
||||
.front()
|
||||
.map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
|
||||
{
|
||||
word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
word_positions.push_back((Rc::from(word), pos));
|
||||
Ok(())
|
||||
};
|
||||
document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
|
||||
|
||||
while !word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
163
crates/milli/src/update/new/extract/searchable/mod.rs
Normal file
@@ -0,0 +1,163 @@
|
||||
mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod tokenize_document;
|
||||
|
||||
use std::cell::RefCell;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use bumpalo::Bump;
|
||||
pub use extract_word_docids::{WordDocidsCaches, WordDocidsExtractors};
|
||||
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
|
||||
use heed::RoTxn;
|
||||
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||
|
||||
use super::cache::BalancedCaches;
|
||||
use super::DocidsExtractor;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, FullySend, IndexingContext,
|
||||
Progress, ThreadLocal,
|
||||
};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
|
||||
pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
|
||||
tokenizer: &'a DocumentTokenizer<'a>,
|
||||
grenad_parameters: GrenadParameters,
|
||||
buckets: usize,
|
||||
_ex: PhantomData<EX>,
|
||||
}
|
||||
|
||||
impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
|
||||
for SearchableExtractorData<'a, EX>
|
||||
{
|
||||
type Data = RefCell<BalancedCaches<'extractor>>;
|
||||
|
||||
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
|
||||
Ok(RefCell::new(BalancedCaches::new_in(
|
||||
self.buckets,
|
||||
self.grenad_parameters.max_memory,
|
||||
extractor_alloc,
|
||||
)))
|
||||
}
|
||||
|
||||
fn process<'doc>(
|
||||
&self,
|
||||
changes: impl Iterator<Item = Result<DocumentChange<'doc>>>,
|
||||
context: &DocumentChangeContext<Self::Data>,
|
||||
) -> Result<()> {
|
||||
for change in changes {
|
||||
let change = change?;
|
||||
EX::extract_document_change(context, self.tokenizer, change)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub trait SearchableExtractor: Sized + Sync {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_steps: u16,
|
||||
total_steps: u16,
|
||||
step_name: &'static str,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
let rtxn = indexing_context.index.read_txn()?;
|
||||
let stop_words = indexing_context.index.stop_words(&rtxn)?;
|
||||
let allowed_separators = indexing_context.index.allowed_separators(&rtxn)?;
|
||||
let allowed_separators: Option<Vec<_>> =
|
||||
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary = indexing_context.index.dictionary(&rtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let builder = tokenizer_builder(
|
||||
stop_words.as_ref(),
|
||||
allowed_separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
);
|
||||
let tokenizer = builder.into_tokenizer();
|
||||
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, indexing_context.index)?;
|
||||
let attributes_to_skip = Self::attributes_to_skip(&rtxn, indexing_context.index)?;
|
||||
let localized_attributes_rules =
|
||||
indexing_context.index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tokenizer,
|
||||
attribute_to_extract: attributes_to_extract.as_deref(),
|
||||
attribute_to_skip: attributes_to_skip.as_slice(),
|
||||
localized_attributes_rules: &localized_attributes_rules,
|
||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
let extractor_data: SearchableExtractorData<Self> = SearchableExtractorData {
|
||||
tokenizer: &document_tokenizer,
|
||||
grenad_parameters,
|
||||
buckets: rayon::current_num_threads(),
|
||||
_ex: PhantomData,
|
||||
};
|
||||
|
||||
let datastore = ThreadLocal::new();
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
extract(
|
||||
document_changes,
|
||||
&extractor_data,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
&datastore,
|
||||
finished_steps,
|
||||
total_steps,
|
||||
step_name,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(datastore.into_iter().map(RefCell::into_inner).collect())
|
||||
}
|
||||
|
||||
fn extract_document_change(
|
||||
context: &DocumentChangeContext<RefCell<BalancedCaches>>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()>;
|
||||
|
||||
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
|
||||
-> Result<Option<Vec<&'a str>>>;
|
||||
|
||||
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
|
||||
}
|
||||
|
||||
impl<T: SearchableExtractor> DocidsExtractor for T {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
finished_steps: u16,
|
||||
total_steps: u16,
|
||||
step_name: &'static str,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
Self::run_extraction(
|
||||
grenad_parameters,
|
||||
document_changes,
|
||||
indexing_context,
|
||||
extractor_allocs,
|
||||
finished_steps,
|
||||
total_steps,
|
||||
step_name,
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,275 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::update::new::document::Document;
|
||||
use crate::update::new::extract::perm_json_p::{
|
||||
seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
|
||||
};
|
||||
use crate::{
|
||||
FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
|
||||
MAX_WORD_LENGTH,
|
||||
};
|
||||
|
||||
pub struct DocumentTokenizer<'a> {
|
||||
pub tokenizer: &'a Tokenizer<'a>,
|
||||
pub attribute_to_extract: Option<&'a [&'a str]>,
|
||||
pub attribute_to_skip: &'a [&'a str],
|
||||
pub localized_attributes_rules: &'a [LocalizedAttributesRule],
|
||||
pub max_positions_per_attributes: u32,
|
||||
}
|
||||
|
||||
impl<'a> DocumentTokenizer<'a> {
|
||||
pub fn tokenize_document<'doc>(
|
||||
&self,
|
||||
document: impl Document<'doc>,
|
||||
field_id_map: &mut GlobalFieldsIdsMap,
|
||||
token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
let mut field_position = HashMap::new();
|
||||
|
||||
for entry in document.iter_top_level_fields() {
|
||||
let (field_name, value) = entry?;
|
||||
|
||||
let mut tokenize_field = |name: &str, value: &Value| {
|
||||
let Some(field_id) = field_id_map.id_or_insert(name) else {
|
||||
return Err(UserError::AttributeLimitReached.into());
|
||||
};
|
||||
|
||||
let position = field_position
|
||||
.entry(field_id)
|
||||
.and_modify(|counter| *counter += MAX_DISTANCE)
|
||||
.or_insert(0);
|
||||
if *position >= self.max_positions_per_attributes {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
match value {
|
||||
Value::Number(n) => {
|
||||
let token = n.to_string();
|
||||
if let Ok(position) = (*position).try_into() {
|
||||
token_fn(name, field_id, position, token.as_str())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Value::String(text) => {
|
||||
// create an iterator of tokens with their positions.
|
||||
let locales = self
|
||||
.localized_attributes_rules
|
||||
.iter()
|
||||
.find(|rule| rule.match_str(field_name))
|
||||
.map(|rule| rule.locales());
|
||||
let tokens = process_tokens(
|
||||
*position,
|
||||
self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
|
||||
)
|
||||
.take_while(|(p, _)| *p < self.max_positions_per_attributes);
|
||||
|
||||
for (index, token) in tokens {
|
||||
// keep a word only if it is not empty and fits in an LMDB key.
|
||||
let token = token.lemma().trim();
|
||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||
*position = index;
|
||||
if let Ok(position) = (*position).try_into() {
|
||||
token_fn(name, field_id, position, token)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
_ => Ok(()),
|
||||
}
|
||||
};
|
||||
|
||||
// if the current field is searchable or contains a searchable attribute
|
||||
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) {
|
||||
// parse json.
|
||||
match serde_json::to_value(value).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(object) => seek_leaf_values_in_object(
|
||||
&object,
|
||||
self.attribute_to_extract,
|
||||
self.attribute_to_skip,
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
Value::Array(array) => seek_leaf_values_in_array(
|
||||
&array,
|
||||
self.attribute_to_extract,
|
||||
self.attribute_to_skip,
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
value => tokenize_field(field_name, &value)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Takes an iterator over tokens and computes their relative positions depending on separator kinds:
|
||||
/// if it's a `Hard` separator we add an additional relative proximity of MAX_DISTANCE between words,
|
||||
/// else we keep the standard proximity of 1 between words.
|
||||
fn process_tokens<'a>(
|
||||
start_offset: u32,
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (u32, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator())
|
||||
.scan((start_offset, None), |(offset, prev_kind), mut token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => MAX_DISTANCE,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => token.kind = TokenKind::Unknown,
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
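// Illustrative sketch (added for this review, not part of the original diff) of the
// position rule documented above, using a stand-in token type instead of charabia:
// a word following a hard separator jumps by MAX_DISTANCE, otherwise it advances by 1.
#[cfg(test)]
mod position_rule_illustration {
    use crate::proximity::MAX_DISTANCE;

    enum Tok<'a> {
        Word(&'a str),
        SoftSep,
        HardSep,
    }

    fn positions<'a>(tokens: &[Tok<'a>]) -> Vec<(u32, &'a str)> {
        let mut out = Vec::new();
        let mut offset = 0;
        let mut prev_was_hard = false;
        let mut seen_word = false;
        for token in tokens {
            match token {
                Tok::Word(word) => {
                    if seen_word {
                        offset += if prev_was_hard { MAX_DISTANCE } else { 1 };
                    }
                    out.push((offset, *word));
                    seen_word = true;
                    prev_was_hard = false;
                }
                Tok::HardSep => prev_was_hard = true,
                // a soft separator never downgrades a pending hard separator
                Tok::SoftSep => {}
            }
        }
        out
    }

    #[test]
    fn hard_separators_bump_positions() {
        use Tok::*;
        let tokens = [Word("the"), SoftSep, Word("quick"), HardSep, Word("fox")];
        // "quick" sits 1 after "the"; "fox" jumps by MAX_DISTANCE because of the hard separator
        assert_eq!(positions(&tokens), vec![(0, "the"), (1, "quick"), (1 + MAX_DISTANCE, "fox")]);
    }
}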
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
pub fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<&'a [u8]>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
) -> TokenizerBuilder<'a, &'a [u8]> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
tokenizer_builder.words_dict(dictionary);
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
|
||||
tokenizer_builder
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use bumpalo::Bump;
|
||||
use charabia::TokenizerBuilder;
|
||||
use meili_snap::snapshot;
|
||||
use raw_collections::RawMap;
|
||||
use serde_json::json;
|
||||
use serde_json::value::RawValue;
|
||||
|
||||
use super::*;
|
||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||
use crate::update::new::document::{DocumentFromVersions, Versions};
|
||||
use crate::FieldsIdsMap;
|
||||
|
||||
#[test]
|
||||
fn test_tokenize_document() {
|
||||
let mut fields_ids_map = FieldsIdsMap::new();
|
||||
|
||||
let document = json!({
|
||||
"doggo": { "name": "doggo",
|
||||
"age": 10,},
|
||||
"catto": {
|
||||
"catto": {
|
||||
"name": "pesti",
|
||||
"age": 23,
|
||||
}
|
||||
},
|
||||
"doggo.name": ["doggo", "catto"],
|
||||
"not-me": "UNSEARCHABLE",
|
||||
"me-nether": {"nope": "unsearchable"}
|
||||
});
|
||||
|
||||
let _field_1_id = fields_ids_map.insert("doggo").unwrap();
|
||||
let _field_2_id = fields_ids_map.insert("catto").unwrap();
|
||||
let _field_3_id = fields_ids_map.insert("doggo.name").unwrap();
|
||||
let _field_4_id = fields_ids_map.insert("not-me").unwrap();
|
||||
let _field_5_id = fields_ids_map.insert("me-nether").unwrap();
|
||||
|
||||
let mut tb = TokenizerBuilder::default();
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tb.build(),
|
||||
attribute_to_extract: None,
|
||||
attribute_to_skip: &["not-me", "me-nether.nope"],
|
||||
localized_attributes_rules: &[],
|
||||
max_positions_per_attributes: 1000,
|
||||
};
|
||||
|
||||
let fields_ids_map = FieldIdMapWithMetadata::new(
|
||||
fields_ids_map,
|
||||
MetadataBuilder::new(Default::default(), Default::default(), Default::default(), None),
|
||||
);
|
||||
|
||||
let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map);
|
||||
let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
|
||||
|
||||
let mut words = std::collections::BTreeMap::new();
|
||||
|
||||
let document = document.to_string();
|
||||
|
||||
let bump = Bump::new();
|
||||
let document: &RawValue = serde_json::from_str(&document).unwrap();
|
||||
let document = RawMap::from_raw_value(document, &bump).unwrap();
|
||||
|
||||
let document = Versions::single(document);
|
||||
let document = DocumentFromVersions::new(&document);
|
||||
|
||||
document_tokenizer
|
||||
.tokenize_document(
|
||||
document,
|
||||
&mut global_fields_ids_map,
|
||||
&mut |_fname, fid, pos, word| {
|
||||
words.insert([fid, pos], word.to_string());
|
||||
Ok(())
|
||||
},
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
snapshot!(format!("{:#?}", words), @r###"
|
||||
{
|
||||
[
|
||||
2,
|
||||
0,
|
||||
]: "doggo",
|
||||
[
|
||||
2,
|
||||
8,
|
||||
]: "doggo",
|
||||
[
|
||||
2,
|
||||
16,
|
||||
]: "catto",
|
||||
[
|
||||
3,
|
||||
0,
|
||||
]: "10",
|
||||
[
|
||||
4,
|
||||
0,
|
||||
]: "pesti",
|
||||
[
|
||||
5,
|
||||
0,
|
||||
]: "23",
|
||||
}
|
||||
"###);
|
||||
}
|
||||
}
|
||||
432
crates/milli/src/update/new/extract/vectors/mod.rs
Normal file
@@ -0,0 +1,432 @@
|
||||
use std::cell::RefCell;
|
||||
|
||||
use bumpalo::collections::Vec as BVec;
|
||||
use bumpalo::Bump;
|
||||
use hashbrown::HashMap;
|
||||
|
||||
use super::cache::DelAddRoaringBitmap;
|
||||
use crate::error::FaultSource;
|
||||
use crate::prompt::Prompt;
|
||||
use crate::update::new::channel::EmbeddingSender;
|
||||
use crate::update::new::indexer::document_changes::{Extractor, FullySend};
|
||||
use crate::update::new::vector_document::VectorDocument;
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::vector::error::{
|
||||
EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistributionBump,
|
||||
};
|
||||
use crate::vector::{Embedder, Embedding, EmbeddingConfigs};
|
||||
use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError};
|
||||
|
||||
pub struct EmbeddingExtractor<'a> {
|
||||
embedders: &'a EmbeddingConfigs,
|
||||
sender: &'a EmbeddingSender<'a>,
|
||||
possible_embedding_mistakes: PossibleEmbeddingMistakes,
|
||||
threads: &'a ThreadPoolNoAbort,
|
||||
}
|
||||
|
||||
impl<'a> EmbeddingExtractor<'a> {
|
||||
pub fn new(
|
||||
embedders: &'a EmbeddingConfigs,
|
||||
sender: &'a EmbeddingSender<'a>,
|
||||
field_distribution: &'a FieldDistribution,
|
||||
threads: &'a ThreadPoolNoAbort,
|
||||
) -> Self {
|
||||
let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution);
|
||||
Self { embedders, sender, threads, possible_embedding_mistakes }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
||||
type Data = FullySend<RefCell<HashMap<String, DelAddRoaringBitmap>>>;
|
||||
|
||||
fn init_data<'doc>(
|
||||
&'doc self,
|
||||
_extractor_alloc: raw_collections::alloc::RefBump<'extractor>,
|
||||
) -> crate::Result<Self::Data> {
|
||||
// TODO: use the extractor_alloc in the hashbrown once you merge the branch where it is no longer a RefBump
|
||||
Ok(FullySend(Default::default()))
|
||||
}
|
||||
|
||||
fn process<'doc>(
|
||||
&'doc self,
|
||||
changes: impl Iterator<Item = crate::Result<DocumentChange<'doc>>>,
|
||||
context: &'doc crate::update::new::indexer::document_changes::DocumentChangeContext<
|
||||
Self::Data,
|
||||
>,
|
||||
) -> crate::Result<()> {
|
||||
let embedders = self.embedders.inner_as_ref();
|
||||
let mut unused_vectors_distribution =
|
||||
UnusedVectorsDistributionBump::new_in(&context.doc_alloc);
|
||||
|
||||
let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc);
|
||||
for (embedder_name, (embedder, prompt, _is_quantized)) in embedders {
|
||||
let embedder_id =
|
||||
context.index.embedder_category_id.get(&context.txn, embedder_name)?.ok_or_else(
|
||||
|| InternalError::DatabaseMissingEntry {
|
||||
db_name: "embedder_category_id",
|
||||
key: None,
|
||||
},
|
||||
)?;
|
||||
all_chunks.push(Chunks::new(
|
||||
embedder,
|
||||
embedder_id,
|
||||
embedder_name,
|
||||
prompt,
|
||||
&context.data.0,
|
||||
&self.possible_embedding_mistakes,
|
||||
self.threads,
|
||||
self.sender,
|
||||
&context.doc_alloc,
|
||||
))
|
||||
}
|
||||
|
||||
for change in changes {
|
||||
let change = change?;
|
||||
match change {
|
||||
DocumentChange::Deletion(_deletion) => {
|
||||
// handled by document sender
|
||||
}
|
||||
DocumentChange::Update(update) => {
|
||||
let old_vectors = update.current_vectors(
|
||||
&context.txn,
|
||||
context.index,
|
||||
context.db_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
let new_vectors = update.updated_vectors(&context.doc_alloc, self.embedders)?;
|
||||
|
||||
if let Some(new_vectors) = &new_vectors {
|
||||
unused_vectors_distribution.append(new_vectors);
|
||||
}
|
||||
|
||||
for chunks in &mut all_chunks {
|
||||
let embedder_name = chunks.embedder_name();
|
||||
let prompt = chunks.prompt();
|
||||
|
||||
let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap();
|
||||
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
|
||||
new_vectors.vectors_for_key(embedder_name).transpose()
|
||||
}) {
|
||||
let new_vectors = new_vectors?;
|
||||
match (old_vectors.regenerate, new_vectors.regenerate) {
|
||||
(true, true) | (false, false) => todo!(),
|
||||
_ => {
|
||||
chunks.set_regenerate(update.docid(), new_vectors.regenerate);
|
||||
}
|
||||
}
|
||||
// do we have set embeddings?
|
||||
if let Some(embeddings) = new_vectors.embeddings {
|
||||
chunks.set_vectors(
|
||||
update.docid(),
|
||||
embeddings
|
||||
.into_vec(&context.doc_alloc, embedder_name)
|
||||
.map_err(|error| UserError::InvalidVectorsEmbedderConf {
|
||||
document_id: update.external_document_id().to_string(),
|
||||
error,
|
||||
})?,
|
||||
);
|
||||
} else if new_vectors.regenerate {
|
||||
let old_rendered = prompt.render_document(
|
||||
update.current(
|
||||
&context.txn,
|
||||
context.index,
|
||||
context.db_fields_ids_map,
|
||||
)?,
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
let new_rendered = prompt.render_document(
|
||||
update.merged(
|
||||
&context.txn,
|
||||
context.index,
|
||||
context.db_fields_ids_map,
|
||||
)?,
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
if new_rendered != old_rendered {
|
||||
chunks.set_autogenerated(
|
||||
update.docid(),
|
||||
new_rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
} else if old_vectors.regenerate {
|
||||
let old_rendered = prompt.render_document(
|
||||
update.current(
|
||||
&context.txn,
|
||||
context.index,
|
||||
context.db_fields_ids_map,
|
||||
)?,
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
let new_rendered = prompt.render_document(
|
||||
update.merged(
|
||||
&context.txn,
|
||||
context.index,
|
||||
context.db_fields_ids_map,
|
||||
)?,
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
if new_rendered != old_rendered {
|
||||
chunks.set_autogenerated(
|
||||
update.docid(),
|
||||
new_rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
DocumentChange::Insertion(insertion) => {
|
||||
let new_vectors =
|
||||
insertion.inserted_vectors(&context.doc_alloc, self.embedders)?;
|
||||
if let Some(new_vectors) = &new_vectors {
|
||||
unused_vectors_distribution.append(new_vectors);
|
||||
}
|
||||
|
||||
for chunks in &mut all_chunks {
|
||||
let embedder_name = chunks.embedder_name();
|
||||
let prompt = chunks.prompt();
|
||||
// if no inserted vectors, then regenerate: true + no embeddings => autogenerate
|
||||
if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| {
|
||||
new_vectors.vectors_for_key(embedder_name).transpose()
|
||||
}) {
|
||||
let new_vectors = new_vectors?;
|
||||
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
|
||||
if let Some(embeddings) = new_vectors.embeddings {
|
||||
chunks.set_vectors(
|
||||
insertion.docid(),
|
||||
embeddings
|
||||
.into_vec(&context.doc_alloc, embedder_name)
|
||||
.map_err(|error| UserError::InvalidVectorsEmbedderConf {
|
||||
document_id: insertion
|
||||
.external_document_id()
|
||||
.to_string(),
|
||||
error,
|
||||
})?,
|
||||
);
|
||||
} else if new_vectors.regenerate {
|
||||
let rendered = prompt.render_document(
|
||||
insertion.inserted(),
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
chunks.set_autogenerated(
|
||||
insertion.docid(),
|
||||
rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
}
|
||||
} else {
|
||||
let rendered = prompt.render_document(
|
||||
insertion.inserted(),
|
||||
context.new_fields_ids_map,
|
||||
&context.doc_alloc,
|
||||
)?;
|
||||
chunks.set_autogenerated(
|
||||
insertion.docid(),
|
||||
rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for chunk in all_chunks {
|
||||
chunk.drain(&unused_vectors_distribution)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
// **Warning**: the destructor of this struct is not normally run, make sure that all its fields:
|
||||
// 1. don't have side effects tied to their destructors
|
||||
// 2. if allocated, are allocated inside of the bumpalo
|
||||
//
|
||||
// Currently this is the case as:
|
||||
// 1. the BVecs are allocated inside the bumpalo
|
||||
// 2. All other fields are either trivial (u8) or references.
|
struct Chunks<'a> {
    texts: BVec<'a, &'a str>,
    ids: BVec<'a, DocumentId>,

    embedder: &'a Embedder,
    embedder_id: u8,
    embedder_name: &'a str,
    prompt: &'a Prompt,
    possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
    user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
    threads: &'a ThreadPoolNoAbort,
    sender: &'a EmbeddingSender<'a>,
}

impl<'a> Chunks<'a> {
    #[allow(clippy::too_many_arguments)]
    pub fn new(
        embedder: &'a Embedder,
        embedder_id: u8,
        embedder_name: &'a str,
        prompt: &'a Prompt,
        user_provided: &'a RefCell<HashMap<String, DelAddRoaringBitmap>>,
        possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
        threads: &'a ThreadPoolNoAbort,
        sender: &'a EmbeddingSender<'a>,
        doc_alloc: &'a Bump,
    ) -> Self {
        let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
        let texts = BVec::with_capacity_in(capacity, doc_alloc);
        let ids = BVec::with_capacity_in(capacity, doc_alloc);
        Self {
            texts,
            ids,
            embedder,
            prompt,
            possible_embedding_mistakes,
            threads,
            sender,
            embedder_id,
            embedder_name,
            user_provided,
        }
    }

    pub fn set_autogenerated(
        &mut self,
        docid: DocumentId,
        rendered: &'a str,
        unused_vectors_distribution: &UnusedVectorsDistributionBump,
    ) -> Result<()> {
        if self.texts.len() < self.texts.capacity() {
            self.texts.push(rendered);
            self.ids.push(docid);
            return Ok(());
        }

        Self::embed_chunks(
            &mut self.texts,
            &mut self.ids,
            self.embedder,
            self.embedder_id,
            self.embedder_name,
            self.possible_embedding_mistakes,
            unused_vectors_distribution,
            self.threads,
            self.sender,
        )
    }

    pub fn drain(
        mut self,
        unused_vectors_distribution: &UnusedVectorsDistributionBump,
    ) -> Result<()> {
        let res = Self::embed_chunks(
            &mut self.texts,
            &mut self.ids,
            self.embedder,
            self.embedder_id,
            self.embedder_name,
            self.possible_embedding_mistakes,
            unused_vectors_distribution,
            self.threads,
            self.sender,
        );
        // optimization: don't run bvec dtors as they only contain bumpalo allocated stuff
        std::mem::forget(self);
        res
    }

    pub fn embed_chunks(
        texts: &mut BVec<'a, &'a str>,
        ids: &mut BVec<'a, DocumentId>,
        embedder: &Embedder,
        embedder_id: u8,
        embedder_name: &str,
        possible_embedding_mistakes: &PossibleEmbeddingMistakes,
        unused_vectors_distribution: &UnusedVectorsDistributionBump,
        threads: &ThreadPoolNoAbort,
        sender: &EmbeddingSender<'a>,
    ) -> Result<()> {
        let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) {
            Ok(embeddings) => {
                for (docid, embedding) in ids.into_iter().zip(embeddings) {
                    sender.set_vector(*docid, embedder_id, embedding).unwrap();
                }
                Ok(())
            }
            Err(error) => {
                if let FaultSource::Bug = error.fault {
                    Err(crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(
                        error.into(),
                    )))
                } else {
                    let mut msg = format!(
                        r"While embedding documents for embedder `{embedder_name}`: {error}"
                    );

                    if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
                        msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
                    }

                    let mut hint_count = 0;

                    for (vector_misspelling, count) in
                        possible_embedding_mistakes.vector_mistakes().take(2)
                    {
                        msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
                        hint_count += 1;
                    }

                    for (embedder_misspelling, count) in possible_embedding_mistakes
                        .embedder_mistakes_bump(embedder_name, unused_vectors_distribution)
                        .take(2)
                    {
                        msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
                        hint_count += 1;
                    }

                    if hint_count == 0 {
                        if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
                            msg += &format!(
                                "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
                            );
                        }
                    }

                    Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
                }
            }
        };
        texts.clear();
        ids.clear();
        res
    }

    pub fn prompt(&self) -> &'a Prompt {
        self.prompt
    }

    pub fn embedder_name(&self) -> &'a str {
        self.embedder_name
    }

    fn set_regenerate(&self, docid: DocumentId, regenerate: bool) {
        let mut user_provided = self.user_provided.borrow_mut();
        let user_provided = user_provided.entry_ref(self.embedder_name).or_default();
        if regenerate {
            // regenerate == !user_provided
            user_provided.del.get_or_insert(Default::default()).insert(docid);
        } else {
            user_provided.add.get_or_insert(Default::default()).insert(docid);
        }
    }

    fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
        self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
    }
}
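Aside: the `Chunks` type above buffers prompts up to an embedder-provided capacity, flushes a full batch through `embed_chunks`, and uses `std::mem::forget` in `drain` because its buffers live in the bumpalo arena. The following minimal sketch shows the same buffer-until-full-then-flush pattern; the `Batcher` type and its use of plain `std` vectors instead of `BVec`/`Bump` are hypothetical and not part of the diff.

// Hypothetical standalone sketch of the buffering pattern; not part of the meilisearch diff above.
struct Batcher {
    texts: Vec<String>,
    ids: Vec<u32>,
    capacity: usize,
}

impl Batcher {
    fn new(capacity: usize) -> Self {
        Self { texts: Vec::with_capacity(capacity), ids: Vec::with_capacity(capacity), capacity }
    }

    /// Buffer one item; flush the whole batch only once the buffer is full,
    /// mirroring the role of `Chunks::set_autogenerated`.
    fn push(&mut self, id: u32, text: &str, flush: &mut impl FnMut(&[u32], &[String])) {
        if self.texts.len() == self.capacity {
            flush(&self.ids, &self.texts);
            self.ids.clear();
            self.texts.clear();
        }
        self.ids.push(id);
        self.texts.push(text.to_owned());
    }

    /// Flush whatever is left, mirroring the role of `Chunks::drain`.
    fn drain(self, flush: &mut impl FnMut(&[u32], &[String])) {
        if !self.ids.is_empty() {
            flush(&self.ids, &self.texts);
        }
    }
}

fn main() {
    let mut embedded = Vec::new();
    let mut flush = |ids: &[u32], texts: &[String]| {
        // A real implementation would call the embedder here and send the
        // resulting vectors to the writer channel.
        embedded.extend(ids.iter().copied().zip(texts.iter().cloned()));
    };

    let mut batcher = Batcher::new(2);
    for (id, text) in [(0, "a"), (1, "b"), (2, "c")] {
        batcher.push(id, text, &mut flush);
    }
    batcher.drain(&mut flush);
    assert_eq!(embedded.len(), 3);
}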
264
crates/milli/src/update/new/facet_search_builder.rs
Normal file
@@ -0,0 +1,264 @@
use std::collections::{BTreeSet, HashMap};

use charabia::normalizer::NormalizerOption;
use charabia::{Language, Normalize, StrDetection, Token};
use grenad::Sorter;
use heed::types::{Bytes, SerdeJson};
use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn};

use super::extract::FacetKind;
use super::fst_merger_builder::FstMergerBuilder;
use super::KvReaderDelAdd;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::StrRefCodec;
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::{create_sorter, MergeDeladdBtreesetString};
use crate::{
    BEU16StrCodec, FieldId, GlobalFieldsIdsMap, Index, LocalizedAttributesRule, Result,
    MAX_FACET_VALUE_LENGTH,
};

pub struct FacetSearchBuilder<'indexer> {
    registered_facets: HashMap<FieldId, usize>,
    normalized_facet_string_docids_sorter: Sorter<MergeDeladdBtreesetString>,
    global_fields_ids_map: GlobalFieldsIdsMap<'indexer>,
    localized_attributes_rules: Vec<LocalizedAttributesRule>,
    // Buffered data below
    buffer: Vec<u8>,
    localized_field_ids: HashMap<FieldId, Option<Vec<Language>>>,
}

impl<'indexer> FacetSearchBuilder<'indexer> {
    pub fn new(
        global_fields_ids_map: GlobalFieldsIdsMap<'indexer>,
        localized_attributes_rules: Vec<LocalizedAttributesRule>,
    ) -> Self {
        let registered_facets = HashMap::new();
        let normalized_facet_string_docids_sorter = create_sorter(
            grenad::SortAlgorithm::Stable,
            MergeDeladdBtreesetString,
            grenad::CompressionType::None,
            None,
            None,
            Some(0),
            true,
        );

        Self {
            registered_facets,
            normalized_facet_string_docids_sorter,
            buffer: Vec::new(),
            global_fields_ids_map,
            localized_attributes_rules,
            localized_field_ids: HashMap::new(),
        }
    }

    fn extract_key_data<'k>(&self, key: &'k [u8]) -> Result<Option<FacetGroupKey<&'k str>>> {
        match FacetKind::from(key[0]) {
            // Only strings are searchable
            FacetKind::String => Ok(Some(
                FacetGroupKeyCodec::<StrRefCodec>::bytes_decode(&key[1..])
                    .map_err(heed::Error::Encoding)?,
            )),
            _ => Ok(None),
        }
    }

    pub fn register_from_key(&mut self, deladd: DelAdd, facet_key: &[u8]) -> Result<()> {
        let Some(FacetGroupKey { field_id, level: _level, left_bound }) =
            self.extract_key_data(facet_key)?
        else {
            return Ok(());
        };

        if deladd == DelAdd::Addition {
            self.registered_facets.entry(field_id).and_modify(|count| *count += 1).or_insert(1);
        }

        let locales = self.locales(field_id);
        let hyper_normalized_value = normalize_facet_string(left_bound, locales);

        let set = BTreeSet::from_iter(std::iter::once(left_bound));

        // as the facet string is the same, we can put the deletion and addition in the same obkv.
        self.buffer.clear();
        let mut obkv = KvWriterDelAdd::new(&mut self.buffer);
        let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
        obkv.insert(deladd, val)?;
        obkv.finish()?;

        let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
        let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
        self.normalized_facet_string_docids_sorter.insert(key_bytes, &self.buffer)?;

        Ok(())
    }

    fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> {
        if !self.localized_field_ids.contains_key(&field_id) {
            let Some(field_name) = self.global_fields_ids_map.name(field_id) else {
                unreachable!("Field id {} not found in the global fields ids map", field_id);
            };

            let locales = self
                .localized_attributes_rules
                .iter()
                .find(|rule| rule.match_str(field_name))
                .map(|rule| rule.locales.clone());

            self.localized_field_ids.insert(field_id, locales);
        }

        self.localized_field_ids.get(&field_id).unwrap().as_deref()
    }

    #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_fst")]
    pub fn merge_and_write(self, index: &Index, wtxn: &mut RwTxn, rtxn: &RoTxn) -> Result<()> {
        let reader = self.normalized_facet_string_docids_sorter.into_reader_cursors()?;
        let mut builder = grenad::MergerBuilder::new(MergeDeladdBtreesetString);
        builder.extend(reader);

        let database = index.facet_id_normalized_string_strings.remap_types::<Bytes, Bytes>();

        let mut merger_iter = builder.build().into_stream_merger_iter()?;
        let mut current_field_id = None;
        let mut fst;
        let mut fst_merger_builder: Option<FstMergerBuilder> = None;
        while let Some((key, deladd)) = merger_iter.next()? {
            let (field_id, normalized_facet_string) =
                BEU16StrCodec::bytes_decode(key).map_err(heed::Error::Encoding)?;

            if current_field_id != Some(field_id) {
                if let Some(fst_merger_builder) = fst_merger_builder {
                    let mmap = fst_merger_builder.build(&mut callback)?;
                    index
                        .facet_id_string_fst
                        .remap_data_type::<Bytes>()
                        .put(wtxn, &field_id, &mmap)?;
                }

                fst = index.facet_id_string_fst.get(rtxn, &field_id)?;
                fst_merger_builder = Some(FstMergerBuilder::new(fst.as_ref())?);
                current_field_id = Some(field_id);
            }

            let previous = database.get(rtxn, key)?;
            let deladd: &KvReaderDelAdd = deladd.into();
            let del = deladd.get(DelAdd::Deletion);
            let add = deladd.get(DelAdd::Addition);

            match merge_btreesets(previous, del, add)? {
                Operation::Write(value) => {
                    match fst_merger_builder.as_mut() {
                        Some(fst_merger_builder) => {
                            fst_merger_builder.register(
                                DelAdd::Addition,
                                normalized_facet_string.as_bytes(),
                                &mut callback,
                            )?;
                        }
                        None => unreachable!(),
                    }
                    let key = (field_id, normalized_facet_string);
                    let key_bytes =
                        BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
                    database.put(wtxn, &key_bytes, &value)?;
                }
                Operation::Delete => {
                    match fst_merger_builder.as_mut() {
                        Some(fst_merger_builder) => {
                            fst_merger_builder.register(
                                DelAdd::Deletion,
                                normalized_facet_string.as_bytes(),
                                &mut callback,
                            )?;
                        }
                        None => unreachable!(),
                    }
                    let key = (field_id, normalized_facet_string);
                    let key_bytes =
                        BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
                    database.delete(wtxn, &key_bytes)?;
                }
                Operation::Ignore => (),
            }
        }

        if let (Some(field_id), Some(fst_merger_builder)) = (current_field_id, fst_merger_builder) {
            let mmap = fst_merger_builder.build(&mut callback)?;
            index.facet_id_string_fst.remap_data_type::<Bytes>().put(wtxn, &field_id, &mmap)?;
        }

        Ok(())
    }
}

fn callback(_bytes: &[u8], _deladd: DelAdd, _is_modified: bool) -> Result<()> {
    Ok(())
}

fn merge_btreesets(
    current: Option<&[u8]>,
    del: Option<&[u8]>,
    add: Option<&[u8]>,
) -> Result<Operation> {
    let mut result: BTreeSet<String> = match current {
        Some(current) => SerdeJson::bytes_decode(current).map_err(heed::Error::Encoding)?,
        None => BTreeSet::new(),
    };
    if let Some(del) = del {
        let del: BTreeSet<String> = SerdeJson::bytes_decode(del).map_err(heed::Error::Encoding)?;
        result = result.difference(&del).cloned().collect();
    }
    if let Some(add) = add {
        let add: BTreeSet<String> = SerdeJson::bytes_decode(add).map_err(heed::Error::Encoding)?;
        result.extend(add);
    }

    /// TODO remove allocation
    let result = SerdeJson::bytes_encode(&result).map_err(heed::Error::Encoding)?.into_owned();
    if Some(result.as_ref()) == current {
        Ok(Operation::Ignore)
    } else if result.is_empty() {
        Ok(Operation::Delete)
    } else {
        Ok(Operation::Write(result))
    }
}

/// Normalizes the facet string and truncates it to the max length.
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
    let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
    let mut detection = StrDetection::new(facet_string, locales);

    let script = detection.script();
    // Detect the language of the facet string only if several locales are explicitly provided.
    let language = match locales {
        Some(&[language]) => Some(language),
        Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
        _ => None,
    };

    let token = Token {
        lemma: std::borrow::Cow::Borrowed(facet_string),
        script,
        language,
        ..Default::default()
    };

    // truncate the facet string to the max length
    token
        .normalize(&options)
        .lemma
        .char_indices()
        .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
        .map(|(_, c)| c)
        .collect()
}

enum Operation {
    Write(Vec<u8>),
    Delete,
    Ignore,
}
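Aside: `merge_btreesets` above applies the deletion set, then the addition set, to the previously stored set, and only rewrites the database entry when the result actually changes. Below is a minimal in-memory sketch of that del/add merge semantics; the `merge_sets`/`MergeOutcome` names are hypothetical, there is no heed encoding involved, and the block is not part of the diff.

// Hypothetical standalone sketch of the del/add set merge; not part of the meilisearch diff above.
use std::collections::BTreeSet;

/// Outcome of merging a del/add pair into the previously stored set
/// (plays the role of the `Operation` enum above).
#[derive(Debug, PartialEq)]
enum MergeOutcome {
    Write(BTreeSet<String>),
    Delete,
    Ignore,
}

fn merge_sets(
    current: Option<&BTreeSet<String>>,
    del: Option<&BTreeSet<String>>,
    add: Option<&BTreeSet<String>>,
) -> MergeOutcome {
    let mut result = current.cloned().unwrap_or_default();
    if let Some(del) = del {
        // Remove every deleted value first...
        result = result.difference(del).cloned().collect();
    }
    if let Some(add) = add {
        // ...then apply the additions.
        result.extend(add.iter().cloned());
    }

    if Some(&result) == current {
        MergeOutcome::Ignore
    } else if result.is_empty() {
        MergeOutcome::Delete
    } else {
        MergeOutcome::Write(result)
    }
}

fn main() {
    let current: BTreeSet<String> = ["Blue".to_string(), "blue".to_string()].into();
    let del: BTreeSet<String> = ["Blue".to_string()].into();
    let add: BTreeSet<String> = ["BLUE".to_string()].into();

    match merge_sets(Some(&current), Some(&del), Some(&add)) {
        MergeOutcome::Write(set) => assert_eq!(set.len(), 2), // "BLUE" and "blue"
        other => panic!("unexpected outcome: {other:?}"),
    }
}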
155
crates/milli/src/update/new/fst_merger_builder.rs
Normal file
@@ -0,0 +1,155 @@
use std::{fs::File, io::BufWriter};

use fst::{Set, SetBuilder, Streamer};
use memmap2::Mmap;
use tempfile::tempfile;

use crate::{update::del_add::DelAdd, InternalError, Result};

pub struct FstMergerBuilder<'a> {
    stream: Option<fst::set::Stream<'a>>,
    fst_builder: SetBuilder<BufWriter<File>>,
    last: Option<Vec<u8>>,
    inserted_words: usize,
}

impl<'a> FstMergerBuilder<'a> {
    pub fn new<D: AsRef<[u8]>>(fst: Option<&'a Set<D>>) -> Result<Self> {
        Ok(Self {
            stream: fst.map(|fst| fst.stream()),
            fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?,
            last: None,
            inserted_words: 0,
        })
    }

    pub fn register(
        &mut self,
        deladd: DelAdd,
        right: &[u8],
        insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
    ) -> Result<()> {
        if let Some(left) = self.last.take() {
            let (left_inserted, right_inserted) =
                self.compare_and_insert(deladd, left.as_slice(), right, insertion_callback)?;

            // left was not inserted, so we keep it for the next iteration
            if !left_inserted {
                self.last = Some(left);
            }

            // right was inserted, so we can stop
            if right_inserted {
                return Ok(());
            }
        }

        if let Some(mut stream) = self.stream.take() {
            while let Some(left) = stream.next() {
                let (left_inserted, right_inserted) =
                    self.compare_and_insert(deladd, left, right, insertion_callback)?;

                // left was not inserted, so we keep it for the next iteration
                if !left_inserted {
                    self.last = Some(left.to_vec());
                }

                // right was inserted, so we can stop
                if right_inserted {
                    self.stream = Some(stream);
                    return Ok(());
                }
            }
        }

        // If we reach this point, it means that the stream is empty
        // and we need to insert the incoming word
        self.insert(right, deladd, true, insertion_callback)?;

        Ok(())
    }

    fn compare_and_insert(
        &mut self,
        deladd: DelAdd,
        left: &[u8],
        right: &[u8],
        insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
    ) -> Result<(bool, bool)> {
        let mut left_inserted = false;
        let mut right_inserted = false;
        match left.cmp(right) {
            std::cmp::Ordering::Less => {
                // We need to insert the last word from the current fst
                self.insert(left, DelAdd::Addition, false, insertion_callback)?;

                left_inserted = true;
            }
            std::cmp::Ordering::Equal => {
                self.insert(right, deladd, true, insertion_callback)?;

                left_inserted = true;
                right_inserted = true;
            }
            std::cmp::Ordering::Greater => {
                self.insert(right, deladd, true, insertion_callback)?;

                right_inserted = true;
            }
        }

        Ok((left_inserted, right_inserted))
    }

    fn insert(
        &mut self,
        bytes: &[u8],
        deladd: DelAdd,
        is_modified: bool,
        insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
    ) -> Result<()> {
        // Addition: We insert the word
        // Deletion: We delete the word by not inserting it
        if deladd == DelAdd::Addition {
            self.inserted_words += 1;
            self.fst_builder.insert(bytes)?;
        }

        insertion_callback(bytes, deladd, is_modified)?;

        Ok(())
    }

    fn drain_stream(
        &mut self,
        insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
    ) -> Result<()> {
        if let Some(last) = self.last.take() {
            self.insert(last.as_slice(), DelAdd::Addition, false, insertion_callback)?;
        }

        if let Some(mut stream) = self.stream.take() {
            while let Some(current) = stream.next() {
                self.insert(current, DelAdd::Addition, false, insertion_callback)?;
            }
        }

        Ok(())
    }

    pub fn build(
        mut self,
        insertion_callback: &mut impl FnMut(&[u8], DelAdd, bool) -> Result<()>,
    ) -> Result<Mmap> {
        self.drain_stream(insertion_callback)?;

        let fst_file = self
            .fst_builder
            .into_inner()?
            .into_inner()
            .map_err(|_| InternalError::IndexingMergingKeys { process: "building-fst" })?;
        let fst_mmap = unsafe { Mmap::map(&fst_file)? };

        Ok(fst_mmap)
    }
}
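Aside: `FstMergerBuilder` performs a two-way merge between the lexicographically sorted stream of the existing FST and incoming sorted keys, keeping additions and skipping deletions. The sketch below shows the same merge over plain sorted byte slices; the names are hypothetical, the `fst` crate is not used, and the block is not part of the diff.

// Hypothetical standalone sketch of the sorted two-way merge; not part of the meilisearch diff above.

/// Whether an incoming key is an addition or a deletion (stands in for `DelAdd`).
#[derive(Clone, Copy, PartialEq)]
enum Edit {
    Addition,
    Deletion,
}

/// Merge an existing sorted key list with sorted incoming edits:
/// untouched existing keys are kept, additions are inserted in order,
/// and deletions drop the matching existing key.
fn merge_sorted(existing: &[&[u8]], incoming: &[(Edit, &[u8])]) -> Vec<Vec<u8>> {
    let mut out = Vec::new();
    let mut left = existing.iter().peekable();

    for &(edit, right) in incoming {
        // Flush every existing key that sorts before the incoming one.
        while let Some(&&l) = left.peek() {
            if l < right {
                out.push(l.to_vec());
                left.next();
            } else {
                break;
            }
        }
        // An equal existing key is replaced by the incoming decision.
        if left.peek().map_or(false, |&&l| l == right) {
            left.next();
        }
        if edit == Edit::Addition {
            out.push(right.to_vec());
        }
    }

    // Drain whatever is left of the existing keys, as `drain_stream` does.
    out.extend(left.map(|l| l.to_vec()));
    out
}

fn main() {
    let existing: Vec<&[u8]> = vec![&b"apple"[..], &b"blue"[..], &b"cherry"[..]];
    let incoming: Vec<(Edit, &[u8])> =
        vec![(Edit::Deletion, &b"blue"[..]), (Edit::Addition, &b"mango"[..])];

    let merged = merge_sorted(&existing, &incoming);
    let merged: Vec<&[u8]> = merged.iter().map(|v| v.as_slice()).collect();
    assert_eq!(merged, vec![&b"apple"[..], &b"cherry"[..], &b"mango"[..]]);
}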
Some files were not shown because too many files have changed in this diff