mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-19 13:00:46 +00:00
Compare commits
28 Commits
tmp-use-we
...
diff-index
Author | SHA1 | Date | |
---|---|---|---|
02a40645e2 | |||
066221fd2b | |||
b8fed737ef | |||
c63ff5298b | |||
d50408d670 | |||
e0dc413521 | |||
0f6a0b1ab8 | |||
061f490204 | |||
5c43ff72c1 | |||
c445e9daec | |||
178a9802fa | |||
7d546b9c22 | |||
c829feb40b | |||
b88fd7994c | |||
096d7705c7 | |||
e8f8730467 | |||
26ef0b3a07 | |||
20394fda04 | |||
27161bcd05 | |||
04fd44b5e2 | |||
9078e60024 | |||
8fb96b8274 | |||
50ba751244 | |||
f36c36e368 | |||
c2dcd66d32 | |||
d4594306d3 | |||
93d0680903 | |||
01101d55ac |
1
Cargo.lock
generated
1
Cargo.lock
generated
@ -2704,6 +2704,7 @@ dependencies = [
|
|||||||
"logging_timer",
|
"logging_timer",
|
||||||
"maplit",
|
"maplit",
|
||||||
"md5",
|
"md5",
|
||||||
|
"meili-snap",
|
||||||
"memmap2",
|
"memmap2",
|
||||||
"mimalloc",
|
"mimalloc",
|
||||||
"obkv",
|
"obkv",
|
||||||
|
@ -79,6 +79,7 @@ big_s = "1.0.2"
|
|||||||
insta = "1.29.0"
|
insta = "1.29.0"
|
||||||
maplit = "1.0.2"
|
maplit = "1.0.2"
|
||||||
md5 = "0.7.0"
|
md5 = "0.7.0"
|
||||||
|
meili-snap = { path = "../meili-snap" }
|
||||||
rand = { version = "0.8.5", features = ["small_rng"] }
|
rand = { version = "0.8.5", features = ["small_rng"] }
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
|
@ -60,12 +60,16 @@ impl CboRoaringBitmapCodec {
|
|||||||
/// if the merged values length is under the threshold, values are directly
|
/// if the merged values length is under the threshold, values are directly
|
||||||
/// serialized in the buffer else a RoaringBitmap is created from the
|
/// serialized in the buffer else a RoaringBitmap is created from the
|
||||||
/// values and is serialized in the buffer.
|
/// values and is serialized in the buffer.
|
||||||
pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
|
pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
|
||||||
|
where
|
||||||
|
I: IntoIterator<Item = A>,
|
||||||
|
A: AsRef<[u8]>,
|
||||||
|
{
|
||||||
let mut roaring = RoaringBitmap::new();
|
let mut roaring = RoaringBitmap::new();
|
||||||
let mut vec = Vec::new();
|
let mut vec = Vec::new();
|
||||||
|
|
||||||
for bytes in slices {
|
for bytes in slices {
|
||||||
if bytes.len() <= THRESHOLD * size_of::<u32>() {
|
if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
|
||||||
let mut reader = bytes.as_ref();
|
let mut reader = bytes.as_ref();
|
||||||
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
|
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
|
||||||
vec.push(integer);
|
vec.push(integer);
|
||||||
@ -85,7 +89,7 @@ impl CboRoaringBitmapCodec {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// We can unwrap safely because the vector is sorted upper.
|
// We can unwrap safely because the vector is sorted upper.
|
||||||
let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
|
let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
|
||||||
roaring.serialize_into(buffer)?;
|
roaring.serialize_into(buffer)?;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -119,16 +119,16 @@ pub struct Index {
|
|||||||
pub(crate) main: PolyDatabase,
|
pub(crate) main: PolyDatabase,
|
||||||
|
|
||||||
/// A word and all the documents ids containing the word.
|
/// A word and all the documents ids containing the word.
|
||||||
pub word_docids: Database<Str, RoaringBitmapCodec>,
|
pub word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
|
/// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
|
||||||
pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
|
pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
/// A prefix of word and all the documents ids containing this prefix.
|
/// A prefix of word and all the documents ids containing this prefix.
|
||||||
pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
|
/// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
|
||||||
pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
/// Maps the proximity between a pair of words with all the docids where this relation appears.
|
||||||
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||||
|
@ -11,9 +11,7 @@ use super::interner::Interned;
|
|||||||
use super::Word;
|
use super::Word;
|
||||||
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
|
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
|
||||||
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
|
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
|
||||||
use crate::{
|
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext};
|
||||||
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
|
|
||||||
};
|
|
||||||
|
|
||||||
/// A cache storing pointers to values in the LMDB databases.
|
/// A cache storing pointers to values in the LMDB databases.
|
||||||
///
|
///
|
||||||
@ -168,7 +166,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
merge_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||||
self.txn,
|
self.txn,
|
||||||
word,
|
word,
|
||||||
self.word_interner.get(word).as_str(),
|
self.word_interner.get(word).as_str(),
|
||||||
@ -182,7 +180,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
word: Interned<String>,
|
word: Interned<String>,
|
||||||
) -> Result<Option<RoaringBitmap>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||||
self.txn,
|
self.txn,
|
||||||
word,
|
word,
|
||||||
self.word_interner.get(word).as_str(),
|
self.word_interner.get(word).as_str(),
|
||||||
@ -230,7 +228,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
merge_cbo_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||||
self.txn,
|
self.txn,
|
||||||
prefix,
|
prefix,
|
||||||
self.word_interner.get(prefix).as_str(),
|
self.word_interner.get(prefix).as_str(),
|
||||||
@ -244,7 +242,7 @@ impl<'ctx> SearchContext<'ctx> {
|
|||||||
&mut self,
|
&mut self,
|
||||||
prefix: Interned<String>,
|
prefix: Interned<String>,
|
||||||
) -> Result<Option<RoaringBitmap>> {
|
) -> Result<Option<RoaringBitmap>> {
|
||||||
DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
|
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
|
||||||
self.txn,
|
self.txn,
|
||||||
prefix,
|
prefix,
|
||||||
self.word_interner.get(prefix).as_str(),
|
self.word_interner.get(prefix).as_str(),
|
||||||
|
@ -13,6 +13,7 @@ This module tests the `sort` ranking rule:
|
|||||||
|
|
||||||
use big_s::S;
|
use big_s::S;
|
||||||
use maplit::hashset;
|
use maplit::hashset;
|
||||||
|
use meili_snap::insta;
|
||||||
|
|
||||||
use crate::index::tests::TempIndex;
|
use crate::index::tests::TempIndex;
|
||||||
use crate::search::new::tests::collect_field_values;
|
use crate::search::new::tests::collect_field_values;
|
||||||
|
104
milli/src/update/del_add.rs
Normal file
104
milli/src/update/del_add.rs
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
use obkv::Key;
|
||||||
|
|
||||||
|
/// An obkv writer whose keys are [`DelAdd`] variants.
pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
|
||||||
|
/// An obkv reader whose keys are [`DelAdd`] variants.
pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;
|
||||||
|
|
||||||
|
/// DelAdd tags a value as either the old value to delete from the database
/// or the new value to add to the database.
///
/// It is used as a key in an OBKV that is serialized in grenad files.
#[repr(u8)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum DelAdd {
    /// The old value, to delete from the database.
    Deletion = 0,
    /// The new value, to add to the database.
    Addition = 1,
}
|
||||||
|
|
||||||
|
impl Key for DelAdd {
    const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
    type BYTES = [u8; Self::BYTES_SIZE];

    /// Encodes the variant as the bytes of its `u8` discriminant.
    fn to_be_bytes(&self) -> Self::BYTES {
        let discriminant = *self as u8;
        discriminant.to_be_bytes()
    }

    /// Decodes a variant from the bytes of its `u8` discriminant.
    ///
    /// # Panics
    ///
    /// Panics if the decoded discriminant is neither `0` nor `1`.
    fn from_be_bytes(array: Self::BYTES) -> Self {
        let discriminant = u8::from_be_bytes(array);
        if discriminant == Self::Deletion as u8 {
            return Self::Deletion;
        }
        if discriminant == Self::Addition as u8 {
            return Self::Addition;
        }
        unreachable!("DelAdd has only 2 variants, unknown variant: {}", discriminant)
    }
}
|
||||||
|
|
||||||
|
/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
|
||||||
|
///
|
||||||
|
/// if deletion is `true`, the value will be inserted behind a DelAdd::Deletion key.
|
||||||
|
/// if addition is `true`, the value will be inserted behind a DelAdd::Addition key.
|
||||||
|
/// if both deletion and addition are `true, the value will be inserted in both keys.
|
||||||
|
pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
|
||||||
|
reader: obkv::KvReader<K>,
|
||||||
|
deletion: bool,
|
||||||
|
addition: bool,
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
let mut writer = obkv::KvWriter::new(buffer);
|
||||||
|
let mut value_buffer = Vec::new();
|
||||||
|
for (key, value) in reader.iter() {
|
||||||
|
value_buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
if deletion {
|
||||||
|
value_writer.insert(DelAdd::Deletion, value)?;
|
||||||
|
}
|
||||||
|
if addition {
|
||||||
|
value_writer.insert(DelAdd::Addition, value)?;
|
||||||
|
}
|
||||||
|
value_writer.finish()?;
|
||||||
|
writer.insert(key, &value_buffer)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
|
||||||
|
///
|
||||||
|
/// putting each deletion obkv's keys under an DelAdd::Deletion
|
||||||
|
/// and putting each addition obkv's keys under an DelAdd::Addition
|
||||||
|
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
|
||||||
|
deletion: obkv::KvReader<K>,
|
||||||
|
addition: obkv::KvReader<K>,
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
) -> Result<(), std::io::Error> {
|
||||||
|
use itertools::merge_join_by;
|
||||||
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
|
|
||||||
|
let mut writer = obkv::KvWriter::new(buffer);
|
||||||
|
let mut value_buffer = Vec::new();
|
||||||
|
|
||||||
|
for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||||
|
value_buffer.clear();
|
||||||
|
match eob {
|
||||||
|
Left((k, v)) => {
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
value_writer.insert(DelAdd::Deletion, v).unwrap();
|
||||||
|
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||||
|
}
|
||||||
|
Right((k, v)) => {
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
value_writer.insert(DelAdd::Addition, v).unwrap();
|
||||||
|
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||||
|
}
|
||||||
|
Both((k, deletion), (_, addition)) => {
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||||
|
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||||
|
writer.insert(k, value_writer.into_inner()?).unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
|
||||||
|
del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
|
||||||
|
}
|
@ -16,9 +16,7 @@ use crate::facet::FacetType;
|
|||||||
use crate::heed_codec::facet::FieldDocIdFacetCodec;
|
use crate::heed_codec::facet::FieldDocIdFacetCodec;
|
||||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||||
use crate::index::Hnsw;
|
use crate::index::Hnsw;
|
||||||
use crate::{
|
use crate::{ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, BEU32};
|
||||||
ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct DeleteDocuments<'t, 'u, 'i> {
|
pub struct DeleteDocuments<'t, 'u, 'i> {
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
@ -495,7 +493,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
|
|
||||||
fn remove_from_word_prefix_docids(
|
fn remove_from_word_prefix_docids(
|
||||||
txn: &mut heed::RwTxn,
|
txn: &mut heed::RwTxn,
|
||||||
db: &Database<Str, RoaringBitmapCodec>,
|
db: &Database<Str, CboRoaringBitmapCodec>,
|
||||||
to_remove: &RoaringBitmap,
|
to_remove: &RoaringBitmap,
|
||||||
) -> Result<fst::Set<Vec<u8>>> {
|
) -> Result<fst::Set<Vec<u8>>> {
|
||||||
let mut prefixes_to_delete = fst::SetBuilder::memory();
|
let mut prefixes_to_delete = fst::SetBuilder::memory();
|
||||||
@ -523,7 +521,7 @@ fn remove_from_word_prefix_docids(
|
|||||||
|
|
||||||
fn remove_from_word_docids(
|
fn remove_from_word_docids(
|
||||||
txn: &mut heed::RwTxn,
|
txn: &mut heed::RwTxn,
|
||||||
db: &heed::Database<Str, RoaringBitmapCodec>,
|
db: &heed::Database<Str, CboRoaringBitmapCodec>,
|
||||||
to_remove: &RoaringBitmap,
|
to_remove: &RoaringBitmap,
|
||||||
words_to_keep: &mut BTreeSet<String>,
|
words_to_keep: &mut BTreeSet<String>,
|
||||||
words_to_remove: &mut BTreeSet<String>,
|
words_to_remove: &mut BTreeSet<String>,
|
||||||
|
@ -132,6 +132,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
|||||||
self.db.delete_range(wtxn, &range).map(drop)?;
|
self.db.delete_range(wtxn, &range).map(drop)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO the new_data is a Reader<Obkv<Key, Obkv<DelAdd, RoaringBitmap>>>
|
||||||
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
|
fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
|
||||||
let new_data = match self.new_data.take() {
|
let new_data = match self.new_data.take() {
|
||||||
Some(x) => x,
|
Some(x) => x,
|
||||||
|
@ -114,6 +114,7 @@ pub struct FacetsUpdate<'i> {
|
|||||||
min_level_size: u8,
|
min_level_size: u8,
|
||||||
}
|
}
|
||||||
impl<'i> FacetsUpdate<'i> {
|
impl<'i> FacetsUpdate<'i> {
|
||||||
|
// TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
|
||||||
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
|
pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
|
||||||
let database = match facet_type {
|
let database = match facet_type {
|
||||||
FacetType::String => index
|
FacetType::String => index
|
||||||
|
@ -4,18 +4,16 @@ use std::fs::File;
|
|||||||
use std::{io, mem, str};
|
use std::{io, mem, str};
|
||||||
|
|
||||||
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||||
use obkv::KvReader;
|
use obkv::{KvReader, KvWriterU16};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
|
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
|
||||||
use crate::error::{InternalError, SerializationError};
|
use crate::error::{InternalError, SerializationError};
|
||||||
use crate::update::index_documents::MergeFn;
|
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||||
use crate::{
|
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||||
absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
|
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
|
||||||
|
|
||||||
/// Extracts the word and positions where this word appear and
|
/// Extracts the word and positions where this word appear and
|
||||||
/// prefixes it by the document id.
|
/// prefixes it by the document id.
|
||||||
@ -38,18 +36,153 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
|
// initialize destination values.
|
||||||
let mut documents_ids = RoaringBitmap::new();
|
let mut documents_ids = RoaringBitmap::new();
|
||||||
let mut script_language_docids = HashMap::new();
|
let mut script_language_docids = HashMap::new();
|
||||||
let mut docid_word_positions_sorter = create_sorter(
|
let mut docid_word_positions_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Stable,
|
grenad::SortAlgorithm::Stable,
|
||||||
concat_u32s_array,
|
keep_latest_obkv,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut buffers = Buffers::default();
|
// initialize buffers.
|
||||||
|
let mut del_buffers = Buffers::default();
|
||||||
|
let mut add_buffers = Buffers::default();
|
||||||
|
let mut key_buffer = Vec::new();
|
||||||
|
let mut value_buffer = Vec::new();
|
||||||
|
|
||||||
|
// initialize tokenizer.
|
||||||
|
let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None);
|
||||||
|
let tokenizer = builder.build();
|
||||||
|
|
||||||
|
// iterate over documents.
|
||||||
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
let document_id = key
|
||||||
|
.try_into()
|
||||||
|
.map(u32::from_be_bytes)
|
||||||
|
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||||
|
let obkv = KvReader::<FieldId>::new(value);
|
||||||
|
|
||||||
|
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||||
|
if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
documents_ids.push(document_id);
|
||||||
|
|
||||||
|
// Update key buffer prefix.
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||||
|
|
||||||
|
// Tokenize deletions and additions in 2 different threads.
|
||||||
|
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||||
|
|| {
|
||||||
|
// deletions
|
||||||
|
lang_safe_tokens_from_document(
|
||||||
|
&obkv,
|
||||||
|
searchable_fields,
|
||||||
|
&tokenizer,
|
||||||
|
stop_words,
|
||||||
|
allowed_separators,
|
||||||
|
dictionary,
|
||||||
|
max_positions_per_attributes,
|
||||||
|
DelAdd::Deletion,
|
||||||
|
&mut del_buffers,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
|| {
|
||||||
|
// additions
|
||||||
|
lang_safe_tokens_from_document(
|
||||||
|
&obkv,
|
||||||
|
searchable_fields,
|
||||||
|
&tokenizer,
|
||||||
|
stop_words,
|
||||||
|
allowed_separators,
|
||||||
|
dictionary,
|
||||||
|
max_positions_per_attributes,
|
||||||
|
DelAdd::Addition,
|
||||||
|
&mut add_buffers,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
let (del_obkv, del_script_language_word_count) = del?;
|
||||||
|
let (add_obkv, add_script_language_word_count) = add?;
|
||||||
|
|
||||||
|
// merge deletions and additions.
|
||||||
|
value_buffer.clear();
|
||||||
|
del_add_from_two_obkvs(
|
||||||
|
KvReader::<FieldId>::new(del_obkv),
|
||||||
|
KvReader::<FieldId>::new(add_obkv),
|
||||||
|
&mut value_buffer,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
// write them into the sorter.
|
||||||
|
let obkv = KvReader::<FieldId>::new(value);
|
||||||
|
for (field_id, value) in obkv.iter() {
|
||||||
|
key_buffer.truncate(mem::size_of::<u32>());
|
||||||
|
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// update script_language_docids deletions.
|
||||||
|
for (script, languages_frequency) in del_script_language_word_count {
|
||||||
|
for (language, _) in languages_frequency {
|
||||||
|
let entry = script_language_docids
|
||||||
|
.entry((script, language))
|
||||||
|
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
||||||
|
entry.0.push(document_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// update script_language_docids additions.
|
||||||
|
for (script, languages_frequency) in add_script_language_word_count {
|
||||||
|
for (language, _) in languages_frequency {
|
||||||
|
let entry = script_language_docids
|
||||||
|
.entry((script, language))
|
||||||
|
.or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
|
||||||
|
entry.1.push(document_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||||
|
.map(|reader| (documents_ids, reader, script_language_docids))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Check if any searchable fields of a document changed.
|
||||||
|
fn searchable_fields_changed(
|
||||||
|
obkv: &KvReader<FieldId>,
|
||||||
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
|
) -> bool {
|
||||||
|
for (field_id, field_bytes) in obkv.iter() {
|
||||||
|
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||||
|
let del_add = KvReaderDelAdd::new(field_bytes);
|
||||||
|
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||||
|
// if both fields are None, check the next field.
|
||||||
|
(None, None) => (),
|
||||||
|
// if both contains a value and values are the same, check the next field.
|
||||||
|
(Some(del), Some(add)) if del == add => (),
|
||||||
|
// otherwise the fields are different, return true.
|
||||||
|
_otherwise => return true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Factorize tokenizer building.
|
||||||
|
fn tokenizer_builder<'a>(
|
||||||
|
stop_words: Option<&'a fst::Set<&[u8]>>,
|
||||||
|
allowed_separators: Option<&'a [&str]>,
|
||||||
|
dictionary: Option<&'a [&str]>,
|
||||||
|
script_language: Option<&'a HashMap<Script, Vec<Language>>>,
|
||||||
|
) -> TokenizerBuilder<'a, &'a [u8]> {
|
||||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||||
if let Some(stop_words) = stop_words {
|
if let Some(stop_words) = stop_words {
|
||||||
tokenizer_builder.stop_words(stop_words);
|
tokenizer_builder.stop_words(stop_words);
|
||||||
@ -60,130 +193,144 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
if let Some(separators) = allowed_separators {
|
if let Some(separators) = allowed_separators {
|
||||||
tokenizer_builder.separators(separators);
|
tokenizer_builder.separators(separators);
|
||||||
}
|
}
|
||||||
let tokenizer = tokenizer_builder.build();
|
|
||||||
|
|
||||||
let mut cursor = obkv_documents.into_cursor()?;
|
if let Some(script_language) = script_language {
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
tokenizer_builder.allow_list(&script_language);
|
||||||
let document_id = key
|
}
|
||||||
.try_into()
|
|
||||||
.map(u32::from_be_bytes)
|
|
||||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
|
||||||
let obkv = KvReader::<FieldId>::new(value);
|
|
||||||
|
|
||||||
documents_ids.push(document_id);
|
tokenizer_builder
|
||||||
buffers.key_buffer.clear();
|
}
|
||||||
buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
|
||||||
|
|
||||||
let mut script_language_word_count = HashMap::new();
|
/// Extract words maped with their positions of a document,
|
||||||
|
/// ensuring no Language detection mistakes was made.
|
||||||
|
fn lang_safe_tokens_from_document<'a>(
|
||||||
|
obkv: &KvReader<FieldId>,
|
||||||
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
|
tokenizer: &Tokenizer,
|
||||||
|
stop_words: Option<&fst::Set<&[u8]>>,
|
||||||
|
allowed_separators: Option<&[&str]>,
|
||||||
|
dictionary: Option<&[&str]>,
|
||||||
|
max_positions_per_attributes: u32,
|
||||||
|
del_add: DelAdd,
|
||||||
|
buffers: &'a mut Buffers,
|
||||||
|
) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
|
||||||
|
let mut script_language_word_count = HashMap::new();
|
||||||
|
|
||||||
extract_tokens_from_document(
|
tokens_from_document(
|
||||||
&obkv,
|
&obkv,
|
||||||
searchable_fields,
|
searchable_fields,
|
||||||
&tokenizer,
|
&tokenizer,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
&mut buffers,
|
del_add,
|
||||||
&mut script_language_word_count,
|
buffers,
|
||||||
&mut docid_word_positions_sorter,
|
&mut script_language_word_count,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// if we detect a potetial mistake in the language detection,
|
// if we detect a potetial mistake in the language detection,
|
||||||
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
|
// we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
|
||||||
// context: https://github.com/meilisearch/meilisearch/issues/3565
|
// context: https://github.com/meilisearch/meilisearch/issues/3565
|
||||||
if script_language_word_count
|
if script_language_word_count
|
||||||
.values()
|
.values()
|
||||||
.map(Vec::as_slice)
|
.map(Vec::as_slice)
|
||||||
.any(potential_language_detection_error)
|
.any(potential_language_detection_error)
|
||||||
{
|
{
|
||||||
// build an allow list with the most frequent detected languages in the document.
|
// build an allow list with the most frequent detected languages in the document.
|
||||||
let script_language: HashMap<_, _> =
|
let script_language: HashMap<_, _> =
|
||||||
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
|
script_language_word_count.iter().filter_map(most_frequent_languages).collect();
|
||||||
|
|
||||||
// if the allow list is empty, meaning that no Language is considered frequent,
|
// if the allow list is empty, meaning that no Language is considered frequent,
|
||||||
// then we don't rerun the extraction.
|
// then we don't rerun the extraction.
|
||||||
if !script_language.is_empty() {
|
if !script_language.is_empty() {
|
||||||
// build a new temporary tokenizer including the allow list.
|
// build a new temporary tokenizer including the allow list.
|
||||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
let mut builder = tokenizer_builder(
|
||||||
if let Some(stop_words) = stop_words {
|
stop_words,
|
||||||
tokenizer_builder.stop_words(stop_words);
|
dictionary,
|
||||||
}
|
allowed_separators,
|
||||||
tokenizer_builder.allow_list(&script_language);
|
Some(&script_language),
|
||||||
let tokenizer = tokenizer_builder.build();
|
);
|
||||||
|
let tokenizer = builder.build();
|
||||||
|
|
||||||
script_language_word_count.clear();
|
script_language_word_count.clear();
|
||||||
|
|
||||||
// rerun the extraction.
|
// rerun the extraction.
|
||||||
extract_tokens_from_document(
|
tokens_from_document(
|
||||||
&obkv,
|
&obkv,
|
||||||
searchable_fields,
|
searchable_fields,
|
||||||
&tokenizer,
|
&tokenizer,
|
||||||
max_positions_per_attributes,
|
max_positions_per_attributes,
|
||||||
&mut buffers,
|
del_add,
|
||||||
&mut script_language_word_count,
|
buffers,
|
||||||
&mut docid_word_positions_sorter,
|
&mut script_language_word_count,
|
||||||
)?;
|
)?;
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (script, languages_frequency) in script_language_word_count {
|
|
||||||
for (language, _) in languages_frequency {
|
|
||||||
let entry = script_language_docids
|
|
||||||
.entry((script, language))
|
|
||||||
.or_insert_with(RoaringBitmap::new);
|
|
||||||
entry.push(document_id);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
Ok((&buffers.obkv_buffer, script_language_word_count))
|
||||||
.map(|reader| (documents_ids, reader, script_language_docids))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn extract_tokens_from_document(
|
/// Extract words mapped with their positions of a document.
|
||||||
|
fn tokens_from_document<'a>(
|
||||||
obkv: &KvReader<FieldId>,
|
obkv: &KvReader<FieldId>,
|
||||||
searchable_fields: &Option<HashSet<FieldId>>,
|
searchable_fields: &Option<HashSet<FieldId>>,
|
||||||
tokenizer: &Tokenizer,
|
tokenizer: &Tokenizer,
|
||||||
max_positions_per_attributes: u32,
|
max_positions_per_attributes: u32,
|
||||||
buffers: &mut Buffers,
|
del_add: DelAdd,
|
||||||
|
buffers: &'a mut Buffers,
|
||||||
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
|
script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
|
||||||
docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
|
) -> Result<&'a [u8]> {
|
||||||
) -> Result<()> {
|
buffers.obkv_buffer.clear();
|
||||||
|
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||||
for (field_id, field_bytes) in obkv.iter() {
|
for (field_id, field_bytes) in obkv.iter() {
|
||||||
|
// if field is searchable.
|
||||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||||
let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
// extract deletion or addition only.
|
||||||
buffers.field_buffer.clear();
|
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
|
||||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
// parse json.
|
||||||
let tokens = process_tokens(tokenizer.tokenize(field))
|
let value =
|
||||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||||
|
|
||||||
for (index, token) in tokens {
|
// prepare writing destination.
|
||||||
// if a language has been detected for the token, we update the counter.
|
buffers.obkv_positions_buffer.clear();
|
||||||
if let Some(language) = token.language {
|
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
|
||||||
let script = token.script;
|
|
||||||
let entry =
|
// convert json into a unique string.
|
||||||
script_language_word_count.entry(script).or_insert_with(Vec::new);
|
buffers.field_buffer.clear();
|
||||||
match entry.iter_mut().find(|(l, _)| *l == language) {
|
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||||
Some((_, n)) => *n += 1,
|
// create an iterator of tokens with their positions.
|
||||||
None => entry.push((language, 1)),
|
let tokens = process_tokens(tokenizer.tokenize(field))
|
||||||
|
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||||
|
|
||||||
|
for (index, token) in tokens {
|
||||||
|
// if a language has been detected for the token, we update the counter.
|
||||||
|
if let Some(language) = token.language {
|
||||||
|
let script = token.script;
|
||||||
|
let entry =
|
||||||
|
script_language_word_count.entry(script).or_insert_with(Vec::new);
|
||||||
|
match entry.iter_mut().find(|(l, _)| *l == language) {
|
||||||
|
Some((_, n)) => *n += 1,
|
||||||
|
None => entry.push((language, 1)),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// keep a word only if it is not empty and fits in an LMDB key.
|
||||||
|
let token = token.lemma().trim();
|
||||||
|
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||||
|
let position: u16 = index
|
||||||
|
.try_into()
|
||||||
|
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||||
|
writer.insert(position, token.as_bytes())?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let token = token.lemma().trim();
|
|
||||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
|
||||||
buffers.key_buffer.truncate(mem::size_of::<u32>());
|
|
||||||
buffers.key_buffer.extend_from_slice(token.as_bytes());
|
|
||||||
|
|
||||||
let position: u16 = index
|
// write positions into document.
|
||||||
.try_into()
|
let positions = writer.into_inner()?;
|
||||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
document_writer.insert(field_id, positions)?;
|
||||||
let position = absolute_from_relative_position(field_id, position);
|
|
||||||
docid_word_positions_sorter
|
|
||||||
.insert(&buffers.key_buffer, position.to_ne_bytes())?;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Transform a JSON value into a string that can be indexed.
|
/// Transform a JSON value into a string that can be indexed.
|
||||||
@ -286,10 +433,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)
|
|||||||
|
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
struct Buffers {
|
struct Buffers {
|
||||||
// the key buffer is the concatenation of the internal document id with the field id.
|
|
||||||
// The buffer has to be completely cleared between documents,
|
|
||||||
// and the field id part must be cleared between each field.
|
|
||||||
key_buffer: Vec<u8>,
|
|
||||||
// the field buffer used for each field's deserialization; it must be cleared between each field.
|
// the field buffer used for each field's deserialization; it must be cleared between each field.
|
||||||
field_buffer: String,
|
field_buffer: String,
|
||||||
|
// buffer used to store the value data containing an obkv.
|
||||||
|
obkv_buffer: Vec<u8>,
|
||||||
|
// buffer used to store the value data containing an obkv of tokens with their positions.
|
||||||
|
obkv_positions_buffer: Vec<u8>,
|
||||||
}
|
}
|
||||||
|
@ -4,11 +4,12 @@ use std::io;
|
|||||||
use heed::{BytesDecode, BytesEncode};
|
use heed::{BytesDecode, BytesEncode};
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
|
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
|
||||||
};
|
};
|
||||||
use crate::heed_codec::facet::{
|
use crate::heed_codec::facet::{
|
||||||
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
||||||
};
|
};
|
||||||
|
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
/// Extracts the facet number and the document ids where this facet number appears.
|
/// Extracts the facet number and the document ids where this facet number appears.
|
||||||
@ -17,7 +18,7 @@ use crate::Result;
|
|||||||
/// documents ids from the given chunk of docid facet number positions.
|
/// documents ids from the given chunk of docid facet number positions.
|
||||||
#[logging_timer::time]
|
#[logging_timer::time]
|
||||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||||
docid_fid_facet_number: grenad::Reader<R>,
|
fid_docid_facet_number: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
@ -26,21 +27,30 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let mut facet_number_docids_sorter = create_sorter(
|
let mut facet_number_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut cursor = docid_fid_facet_number.into_cursor()?;
|
let mut buffer = Vec::new();
|
||||||
while let Some((key_bytes, _)) = cursor.move_on_next()? {
|
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
||||||
|
while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
|
||||||
let (field_id, document_id, number) =
|
let (field_id, document_id, number) =
|
||||||
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
|
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
|
||||||
|
|
||||||
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
|
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
|
||||||
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
|
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
|
||||||
facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
|
|
||||||
|
buffer.clear();
|
||||||
|
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||||
|
for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
|
||||||
|
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||||
|
}
|
||||||
|
obkv.finish()?;
|
||||||
|
|
||||||
|
facet_number_docids_sorter.insert(key_bytes, &buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(facet_number_docids_sorter, indexer)
|
sorter_into_reader(facet_number_docids_sorter, indexer)
|
||||||
|
@ -1,13 +1,14 @@
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::{io, str};
|
||||||
|
|
||||||
use heed::BytesEncode;
|
use heed::BytesEncode;
|
||||||
|
|
||||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||||
use crate::heed_codec::StrRefCodec;
|
use crate::heed_codec::StrRefCodec;
|
||||||
use crate::update::index_documents::merge_cbo_roaring_bitmaps;
|
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
|
||||||
|
use crate::{FieldId, Result};
|
||||||
|
|
||||||
/// Extracts the facet string and the document ids where this facet string appears.
|
/// Extracts the facet string and the document ids where this facet string appears.
|
||||||
///
|
///
|
||||||
@ -24,15 +25,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let mut facet_string_docids_sorter = create_sorter(
|
let mut facet_string_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Stable,
|
grenad::SortAlgorithm::Stable,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||||
while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
|
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||||
|
|
||||||
@ -40,21 +42,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
|||||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?;
|
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||||
|
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||||
let normalised_truncated_value: String;
|
|
||||||
if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
|
|
||||||
normalised_truncated_value = normalised_value
|
|
||||||
.char_indices()
|
|
||||||
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
|
||||||
.map(|(_, c)| c)
|
|
||||||
.collect();
|
|
||||||
normalised_value = normalised_truncated_value.as_str();
|
|
||||||
}
|
|
||||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
|
|
||||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||||
// document id is encoded in native-endian because of the CBO roaring bitmap codec
|
|
||||||
facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
|
buffer.clear();
|
||||||
|
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||||
|
for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
|
||||||
|
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||||
|
}
|
||||||
|
obkv.finish()?;
|
||||||
|
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
sorter_into_reader(facet_string_docids_sorter, indexer)
|
sorter_into_reader(facet_string_docids_sorter, indexer)
|
||||||
|
@ -1,24 +1,36 @@
|
|||||||
|
use std::borrow::Cow;
|
||||||
use std::collections::{BTreeMap, HashSet};
|
use std::collections::{BTreeMap, HashSet};
|
||||||
use std::convert::TryInto;
|
use std::convert::TryInto;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::mem::size_of;
|
use std::mem::size_of;
|
||||||
|
use std::result::Result as StdResult;
|
||||||
|
|
||||||
|
use grenad::Sorter;
|
||||||
use heed::zerocopy::AsBytes;
|
use heed::zerocopy::AsBytes;
|
||||||
use heed::BytesEncode;
|
use heed::BytesEncode;
|
||||||
|
use itertools::EitherOrBoth;
|
||||||
|
use ordered_float::OrderedFloat;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::{from_slice, Value};
|
use serde_json::{from_slice, Value};
|
||||||
|
use FilterableValues::{Empty, Null, Values};
|
||||||
|
|
||||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||||
use crate::error::InternalError;
|
use crate::error::InternalError;
|
||||||
use crate::facet::value_encoding::f64_into_bytes;
|
use crate::facet::value_encoding::f64_into_bytes;
|
||||||
|
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
|
use crate::{
|
||||||
|
CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// The length of the elements that are always in the buffer when inserting new values.
|
||||||
|
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
|
||||||
|
|
||||||
/// The extracted facet values stored in grenad files by type.
|
/// The extracted facet values stored in grenad files by type.
|
||||||
pub struct ExtractedFacetValues {
|
pub struct ExtractedFacetValues {
|
||||||
pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
|
pub fid_docid_facet_numbers_chunk: grenad::Reader<File>,
|
||||||
pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
|
pub fid_docid_facet_strings_chunk: grenad::Reader<File>,
|
||||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
|
pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
|
||||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
|
pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
|
||||||
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
|
pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
|
||||||
@ -58,71 +70,150 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
max_memory.map(|m| m / 2),
|
max_memory.map(|m| m / 2),
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
// Each tuple holds the Del and Add sides of a bitmap
|
||||||
let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
|
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||||
|
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||||
|
|
||||||
|
// We create two buffers to avoid mutable borrow conflicts between the two closures.
|
||||||
|
let mut numbers_key_buffer = Vec::new();
|
||||||
|
let mut strings_key_buffer = Vec::new();
|
||||||
|
|
||||||
let mut key_buffer = Vec::new();
|
|
||||||
let mut cursor = obkv_documents.into_cursor()?;
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||||
let obkv = obkv::KvReader::new(value);
|
let obkv = obkv::KvReader::new(value);
|
||||||
|
|
||||||
for (field_id, field_bytes) in obkv.iter() {
|
for (field_id, field_bytes) in obkv.iter() {
|
||||||
if faceted_fields.contains(&field_id) {
|
if faceted_fields.contains(&field_id) {
|
||||||
key_buffer.clear();
|
numbers_key_buffer.clear();
|
||||||
|
strings_key_buffer.clear();
|
||||||
|
|
||||||
// Set key to the field_id
|
// Set key to the field_id
|
||||||
// Note: this encoding is consistent with FieldIdCodec
|
// Note: this encoding is consistent with FieldIdCodec
|
||||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
|
||||||
// Here, we know already that the document must be added to the “field id exists” database
|
|
||||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||||
let document = BEU32::from(document).get();
|
let document = BEU32::from(document).get();
|
||||||
|
|
||||||
facet_exists_docids.entry(field_id).or_default().insert(document);
|
|
||||||
|
|
||||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||||
key_buffer.extend_from_slice(docid_bytes);
|
numbers_key_buffer.extend_from_slice(docid_bytes);
|
||||||
|
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||||
|
|
||||||
let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
let del_add_obkv = obkv::KvReader::new(field_bytes);
|
||||||
|
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
|
||||||
|
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
let add_value = match del_add_obkv.get(DelAdd::Addition) {
|
||||||
|
Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
|
||||||
|
None => None,
|
||||||
|
};
|
||||||
|
|
||||||
match extract_facet_values(
|
// We insert the document id on the Del and the Add side if the field exists.
|
||||||
&value,
|
let (ref mut del_exists, ref mut add_exists) =
|
||||||
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
|
facet_exists_docids.entry(field_id).or_default();
|
||||||
) {
|
let (ref mut del_is_null, ref mut add_is_null) =
|
||||||
FilterableValues::Null => {
|
facet_is_null_docids.entry(field_id).or_default();
|
||||||
facet_is_null_docids.entry(field_id).or_default().insert(document);
|
let (ref mut del_is_empty, ref mut add_is_empty) =
|
||||||
}
|
facet_is_empty_docids.entry(field_id).or_default();
|
||||||
FilterableValues::Empty => {
|
|
||||||
facet_is_empty_docids.entry(field_id).or_default().insert(document);
|
|
||||||
}
|
|
||||||
FilterableValues::Values { numbers, strings } => {
|
|
||||||
// insert facet numbers in sorter
|
|
||||||
for number in numbers {
|
|
||||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
|
||||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
|
||||||
key_buffer.extend_from_slice(&value_bytes);
|
|
||||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
|
||||||
|
|
||||||
fid_docid_facet_numbers_sorter
|
if del_value.is_some() {
|
||||||
.insert(&key_buffer, ().as_bytes())?;
|
del_exists.insert(document);
|
||||||
}
|
}
|
||||||
|
if add_value.is_some() {
|
||||||
|
add_exists.insert(document);
|
||||||
|
}
|
||||||
|
|
||||||
|
let geo_support =
|
||||||
|
geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||||
|
let del_filterable_values =
|
||||||
|
del_value.map(|value| extract_facet_values(&value, geo_support));
|
||||||
|
let add_filterable_values =
|
||||||
|
add_value.map(|value| extract_facet_values(&value, geo_support));
|
||||||
|
|
||||||
|
// Those closures are just here to simplify things a bit.
|
||||||
|
let mut insert_numbers_diff = |del_numbers, add_numbers| {
|
||||||
|
insert_numbers_diff(
|
||||||
|
&mut fid_docid_facet_numbers_sorter,
|
||||||
|
&mut numbers_key_buffer,
|
||||||
|
del_numbers,
|
||||||
|
add_numbers,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
let mut insert_strings_diff = |del_strings, add_strings| {
|
||||||
|
insert_strings_diff(
|
||||||
|
&mut fid_docid_facet_strings_sorter,
|
||||||
|
&mut strings_key_buffer,
|
||||||
|
del_strings,
|
||||||
|
add_strings,
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
match (del_filterable_values, add_filterable_values) {
|
||||||
|
(None, None) => (),
|
||||||
|
(Some(del_filterable_values), None) => match del_filterable_values {
|
||||||
|
Null => {
|
||||||
|
del_is_null.insert(document);
|
||||||
}
|
}
|
||||||
|
Empty => {
|
||||||
// insert normalized and original facet string in sorter
|
del_is_empty.insert(document);
|
||||||
for (normalized, original) in
|
}
|
||||||
strings.into_iter().filter(|(n, _)| !n.is_empty())
|
Values { numbers, strings } => {
|
||||||
{
|
insert_numbers_diff(numbers, vec![])?;
|
||||||
let normalized_truncated_value: String = normalized
|
insert_strings_diff(strings, vec![])?;
|
||||||
.char_indices()
|
}
|
||||||
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
},
|
||||||
.map(|(_, c)| c)
|
(None, Some(add_filterable_values)) => match add_filterable_values {
|
||||||
.collect();
|
Null => {
|
||||||
|
add_is_null.insert(document);
|
||||||
key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
|
}
|
||||||
key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
|
Empty => {
|
||||||
fid_docid_facet_strings_sorter
|
add_is_empty.insert(document);
|
||||||
.insert(&key_buffer, original.as_bytes())?;
|
}
|
||||||
|
Values { numbers, strings } => {
|
||||||
|
insert_numbers_diff(vec![], numbers)?;
|
||||||
|
insert_strings_diff(vec![], strings)?;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
(Some(del_filterable_values), Some(add_filterable_values)) => {
|
||||||
|
match (del_filterable_values, add_filterable_values) {
|
||||||
|
(Null, Null) | (Empty, Empty) => (),
|
||||||
|
(Null, Empty) => {
|
||||||
|
del_is_null.insert(document);
|
||||||
|
add_is_empty.insert(document);
|
||||||
|
}
|
||||||
|
(Empty, Null) => {
|
||||||
|
del_is_empty.insert(document);
|
||||||
|
add_is_null.insert(document);
|
||||||
|
}
|
||||||
|
(Null, Values { numbers, strings }) => {
|
||||||
|
insert_numbers_diff(vec![], numbers)?;
|
||||||
|
insert_strings_diff(vec![], strings)?;
|
||||||
|
del_is_null.insert(document);
|
||||||
|
}
|
||||||
|
(Empty, Values { numbers, strings }) => {
|
||||||
|
insert_numbers_diff(vec![], numbers)?;
|
||||||
|
insert_strings_diff(vec![], strings)?;
|
||||||
|
del_is_empty.insert(document);
|
||||||
|
}
|
||||||
|
(Values { numbers, strings }, Null) => {
|
||||||
|
add_is_null.insert(document);
|
||||||
|
insert_numbers_diff(numbers, vec![])?;
|
||||||
|
insert_strings_diff(strings, vec![])?;
|
||||||
|
}
|
||||||
|
(Values { numbers, strings }, Empty) => {
|
||||||
|
add_is_empty.insert(document);
|
||||||
|
insert_numbers_diff(numbers, vec![])?;
|
||||||
|
insert_strings_diff(strings, vec![])?;
|
||||||
|
}
|
||||||
|
(
|
||||||
|
Values { numbers: del_numbers, strings: del_strings },
|
||||||
|
Values { numbers: add_numbers, strings: add_strings },
|
||||||
|
) => {
|
||||||
|
insert_numbers_diff(del_numbers, add_numbers)?;
|
||||||
|
insert_strings_diff(del_strings, add_strings)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -130,14 +221,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
let mut facet_exists_docids_writer = create_writer(
|
let mut facet_exists_docids_writer = create_writer(
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
for (fid, bitmap) in facet_exists_docids.into_iter() {
|
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
|
||||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||||
facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||||
}
|
}
|
||||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||||
|
|
||||||
@ -146,9 +238,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
for (fid, bitmap) in facet_is_null_docids.into_iter() {
|
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
|
||||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||||
}
|
}
|
||||||
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
||||||
|
|
||||||
@ -157,21 +249,156 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
|||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
tempfile::tempfile()?,
|
tempfile::tempfile()?,
|
||||||
);
|
);
|
||||||
for (fid, bitmap) in facet_is_empty_docids.into_iter() {
|
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
|
||||||
let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
|
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||||
}
|
}
|
||||||
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
||||||
|
|
||||||
Ok(ExtractedFacetValues {
|
Ok(ExtractedFacetValues {
|
||||||
docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||||
docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||||
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
||||||
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
||||||
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
|
||||||
|
fn deladd_obkv_cbo_roaring_bitmaps(
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
del_bitmap: &RoaringBitmap,
|
||||||
|
add_bitmap: &RoaringBitmap,
|
||||||
|
) -> io::Result<()> {
|
||||||
|
buffer.clear();
|
||||||
|
let mut obkv = KvWriterDelAdd::new(buffer);
|
||||||
|
let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
|
||||||
|
let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
|
||||||
|
obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
|
||||||
|
obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
|
||||||
|
obkv.finish()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Truncates a string to the biggest valid LMDB key size.
|
||||||
|
fn truncate_string(s: String) -> String {
|
||||||
|
s.char_indices()
|
||||||
|
.take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
|
||||||
|
.map(|(_, c)| c)
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Computes the diff between both Del and Add numbers and
|
||||||
|
/// only inserts the parts that differ in the sorter.
|
||||||
|
fn insert_numbers_diff<MF>(
|
||||||
|
fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
|
||||||
|
key_buffer: &mut Vec<u8>,
|
||||||
|
mut del_numbers: Vec<f64>,
|
||||||
|
mut add_numbers: Vec<f64>,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
|
||||||
|
{
|
||||||
|
// We sort and dedup the float numbers
|
||||||
|
del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||||
|
add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||||
|
del_numbers.dedup_by_key(|f| OrderedFloat(*f));
|
||||||
|
add_numbers.dedup_by_key(|f| OrderedFloat(*f));
|
||||||
|
|
||||||
|
let merged_numbers_iter = itertools::merge_join_by(
|
||||||
|
del_numbers.into_iter().map(OrderedFloat),
|
||||||
|
add_numbers.into_iter().map(OrderedFloat),
|
||||||
|
|del, add| del.cmp(add),
|
||||||
|
);
|
||||||
|
|
||||||
|
// insert facet numbers in sorter
|
||||||
|
for eob in merged_numbers_iter {
|
||||||
|
key_buffer.truncate(TRUNCATE_SIZE);
|
||||||
|
match eob {
|
||||||
|
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||||
|
EitherOrBoth::Left(OrderedFloat(number)) => {
|
||||||
|
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||||
|
key_buffer.extend_from_slice(&value_bytes);
|
||||||
|
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||||
|
|
||||||
|
// We insert only the Del part of the Obkv to inform
|
||||||
|
// that we only want to remove all those numbers.
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Deletion, ().as_bytes())?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right(OrderedFloat(number)) => {
|
||||||
|
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||||
|
key_buffer.extend_from_slice(&value_bytes);
|
||||||
|
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||||
|
|
||||||
|
// We insert only the Add part of the Obkv to inform
|
||||||
|
// that we only want to add all those numbers.
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Addition, ().as_bytes())?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Computes the diff between both Del and Add strings and
|
||||||
|
/// only inserts the parts that differ in the sorter.
|
||||||
|
fn insert_strings_diff<MF>(
|
||||||
|
fid_docid_facet_strings_sorter: &mut Sorter<MF>,
|
||||||
|
key_buffer: &mut Vec<u8>,
|
||||||
|
mut del_strings: Vec<(String, String)>,
|
||||||
|
mut add_strings: Vec<(String, String)>,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
|
||||||
|
{
|
||||||
|
// We sort and dedup the normalized and original strings
|
||||||
|
del_strings.sort_unstable();
|
||||||
|
add_strings.sort_unstable();
|
||||||
|
del_strings.dedup();
|
||||||
|
add_strings.dedup();
|
||||||
|
|
||||||
|
let merged_strings_iter = itertools::merge_join_by(
|
||||||
|
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||||
|
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||||
|
|del, add| del.cmp(add),
|
||||||
|
);
|
||||||
|
|
||||||
|
// insert normalized and original facet string in sorter
|
||||||
|
for eob in merged_strings_iter {
|
||||||
|
key_buffer.truncate(TRUNCATE_SIZE);
|
||||||
|
match eob {
|
||||||
|
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||||
|
EitherOrBoth::Left((normalized, original)) => {
|
||||||
|
let truncated = truncate_string(normalized);
|
||||||
|
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||||
|
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Deletion, original)?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
EitherOrBoth::Right((normalized, original)) => {
|
||||||
|
let truncated = truncate_string(normalized);
|
||||||
|
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||||
|
|
||||||
|
let mut obkv = KvWriterDelAdd::memory();
|
||||||
|
obkv.insert(DelAdd::Addition, original)?;
|
||||||
|
let bytes = obkv.into_inner()?;
|
||||||
|
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
/// Represent what a document field contains.
|
/// Represent what a document field contains.
|
||||||
enum FilterableValues {
|
enum FilterableValues {
|
||||||
/// Corresponds to the JSON `null` value.
|
/// Corresponds to the JSON `null` value.
|
||||||
@ -182,6 +409,7 @@ enum FilterableValues {
|
|||||||
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
|
Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extracts the facet values of a JSON field.
|
||||||
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
|
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
|
||||||
fn inner_extract_facet_values(
|
fn inner_extract_facet_values(
|
||||||
value: &Value,
|
value: &Value,
|
||||||
|
@ -1,16 +1,17 @@
|
|||||||
use std::collections::HashMap;
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
use grenad::Sorter;
|
use obkv::KvReaderU16;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||||
try_split_array_at, GrenadParameters, MergeFn,
|
GrenadParameters,
|
||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
|
use crate::Result;
|
||||||
|
|
||||||
|
const MAX_COUNTED_WORDS: usize = 30;
|
||||||
|
|
||||||
/// Extracts the field id word count and the documents ids where
|
/// Extracts the field id word count and the documents ids where
|
||||||
/// this field id with this amount of words appear.
|
/// this field id with this amount of words appear.
|
||||||
@ -35,63 +36,21 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
|||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
// This map is assumed to not consume a lot of memory.
|
let mut key_buffer = Vec::new();
|
||||||
let mut document_fid_wordcount = HashMap::new();
|
|
||||||
let mut current_document_id = None;
|
|
||||||
|
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
let (document_id_bytes, _word_bytes) = try_split_array_at(key)
|
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
let curr_document_id = *current_document_id.get_or_insert(document_id);
|
let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count();
|
||||||
if curr_document_id != document_id {
|
if word_count <= MAX_COUNTED_WORDS {
|
||||||
drain_document_fid_wordcount_into_sorter(
|
|
||||||
&mut fid_word_count_docids_sorter,
|
|
||||||
&mut document_fid_wordcount,
|
|
||||||
curr_document_id,
|
|
||||||
)?;
|
|
||||||
current_document_id = Some(document_id);
|
|
||||||
}
|
|
||||||
|
|
||||||
for position in read_u32_ne_bytes(value) {
|
|
||||||
let (field_id, _) = relative_from_absolute_position(position);
|
|
||||||
|
|
||||||
let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
|
|
||||||
*value += 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some(document_id) = current_document_id {
|
|
||||||
// We must make sure that don't lose the current document field id
|
|
||||||
// word count map if we break because we reached the end of the chunk.
|
|
||||||
drain_document_fid_wordcount_into_sorter(
|
|
||||||
&mut fid_word_count_docids_sorter,
|
|
||||||
&mut document_fid_wordcount,
|
|
||||||
document_id,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
sorter_into_reader(fid_word_count_docids_sorter, indexer)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn drain_document_fid_wordcount_into_sorter(
|
|
||||||
fid_word_count_docids_sorter: &mut Sorter<MergeFn>,
|
|
||||||
document_fid_wordcount: &mut HashMap<FieldId, u32>,
|
|
||||||
document_id: DocumentId,
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut key_buffer = Vec::new();
|
|
||||||
|
|
||||||
for (fid, count) in document_fid_wordcount.drain() {
|
|
||||||
if count <= 30 {
|
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
key_buffer.extend_from_slice(fid_bytes);
|
||||||
key_buffer.push(count as u8);
|
key_buffer.push(word_count as u8);
|
||||||
|
|
||||||
fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
sorter_into_reader(fid_word_count_docids_sorter, indexer)
|
||||||
}
|
}
|
||||||
|
@ -1,18 +1,20 @@
|
|||||||
use std::collections::HashSet;
|
use std::collections::{BTreeSet, HashSet};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::iter::FromIterator;
|
|
||||||
|
|
||||||
use roaring::RoaringBitmap;
|
use heed::BytesDecode;
|
||||||
|
use obkv::KvReaderU16;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
|
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
|
||||||
try_split_array_at, GrenadParameters,
|
try_split_array_at, writer_into_reader, GrenadParameters,
|
||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
|
use crate::heed_codec::StrBEU16Codec;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::update::index_documents::helpers::read_u32_ne_bytes;
|
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::{relative_from_absolute_position, FieldId, Result};
|
use crate::update::MergeFn;
|
||||||
|
use crate::{DocumentId, FieldId, Result};
|
||||||
|
|
||||||
/// Extracts the word and the documents ids where this word appear.
|
/// Extracts the word and the documents ids where this word appear.
|
||||||
///
|
///
|
||||||
@ -26,65 +28,148 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
|||||||
docid_word_positions: grenad::Reader<R>,
|
docid_word_positions: grenad::Reader<R>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
exact_attributes: &HashSet<FieldId>,
|
exact_attributes: &HashSet<FieldId>,
|
||||||
) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
|
) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
|
||||||
puffin::profile_function!();
|
puffin::profile_function!();
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_docids_sorter = create_sorter(
|
let mut word_fid_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory.map(|x| x / 2),
|
max_memory.map(|x| x / 3),
|
||||||
|
);
|
||||||
|
let mut key_buffer = Vec::new();
|
||||||
|
let mut del_words = BTreeSet::new();
|
||||||
|
let mut add_words = BTreeSet::new();
|
||||||
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
|
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||||
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
|
let (fid_bytes, _) = try_split_array_at(fid_bytes)
|
||||||
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
|
let fid = u16::from_be_bytes(fid_bytes);
|
||||||
|
|
||||||
|
let del_add_reader = KvReaderDelAdd::new(&value);
|
||||||
|
// extract all unique words to remove.
|
||||||
|
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||||
|
for (_pos, word) in KvReaderU16::new(&deletion).iter() {
|
||||||
|
del_words.insert(word.to_vec());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// extract all unique additional words.
|
||||||
|
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||||
|
for (_pos, word) in KvReaderU16::new(&addition).iter() {
|
||||||
|
add_words.insert(word.to_vec());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
words_into_sorter(
|
||||||
|
document_id,
|
||||||
|
fid,
|
||||||
|
&mut key_buffer,
|
||||||
|
&del_words,
|
||||||
|
&add_words,
|
||||||
|
&mut word_fid_docids_sorter,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
del_words.clear();
|
||||||
|
add_words.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut word_docids_sorter = create_sorter(
|
||||||
|
grenad::SortAlgorithm::Unstable,
|
||||||
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory.map(|x| x / 3),
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut exact_word_docids_sorter = create_sorter(
|
let mut exact_word_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory.map(|x| x / 2),
|
max_memory.map(|x| x / 3),
|
||||||
);
|
);
|
||||||
|
|
||||||
let mut value_buffer = Vec::new();
|
let mut word_fid_docids_writer = create_writer(
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
indexer.chunk_compression_type,
|
||||||
while let Some((key, positions)) = cursor.move_on_next()? {
|
indexer.chunk_compression_level,
|
||||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
tempfile::tempfile()?,
|
||||||
|
);
|
||||||
|
|
||||||
|
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
|
||||||
|
// TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
|
||||||
|
while let Some((key, value)) = iter.next()? {
|
||||||
|
// only keep the value if there is a change to apply in the DB.
|
||||||
|
if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
|
||||||
|
word_fid_docids_writer.insert(key, value)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let (word, fid) = StrBEU16Codec::bytes_decode(key)
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
|
||||||
|
|
||||||
let bitmap = RoaringBitmap::from_iter(Some(document_id));
|
// every word contained in an attribute set to exact must be pushed in the exact_words list.
|
||||||
serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
|
if exact_attributes.contains(&fid) {
|
||||||
|
exact_word_docids_sorter.insert(word.as_bytes(), &value)?;
|
||||||
// If there are no exact attributes, we do not need to iterate over positions.
|
|
||||||
if exact_attributes.is_empty() {
|
|
||||||
word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
|
||||||
} else {
|
} else {
|
||||||
let mut added_to_exact = false;
|
word_docids_sorter.insert(word.as_bytes(), &value)?;
|
||||||
let mut added_to_word_docids = false;
|
|
||||||
for position in read_u32_ne_bytes(positions) {
|
|
||||||
// as soon as we know that this word had been to both readers, we don't need to
|
|
||||||
// iterate over the positions.
|
|
||||||
if added_to_exact && added_to_word_docids {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
let (fid, _) = relative_from_absolute_position(position);
|
|
||||||
if exact_attributes.contains(&fid) && !added_to_exact {
|
|
||||||
exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
|
||||||
added_to_exact = true;
|
|
||||||
} else if !added_to_word_docids {
|
|
||||||
word_docids_sorter.insert(word_bytes, &value_buffer)?;
|
|
||||||
added_to_word_docids = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
sorter_into_reader(word_docids_sorter, indexer)?,
|
sorter_into_reader(word_docids_sorter, indexer)?,
|
||||||
sorter_into_reader(exact_word_docids_sorter, indexer)?,
|
sorter_into_reader(exact_word_docids_sorter, indexer)?,
|
||||||
|
writer_into_reader(word_fid_docids_writer)?,
|
||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn words_into_sorter(
|
||||||
|
document_id: DocumentId,
|
||||||
|
fid: FieldId,
|
||||||
|
key_buffer: &mut Vec<u8>,
|
||||||
|
del_words: &BTreeSet<Vec<u8>>,
|
||||||
|
add_words: &BTreeSet<Vec<u8>>,
|
||||||
|
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||||
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
|
use itertools::merge_join_by;
|
||||||
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
|
||||||
|
buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||||
|
let word_bytes = match eob {
|
||||||
|
Left(word_bytes) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
word_bytes
|
||||||
|
}
|
||||||
|
Right(word_bytes) => {
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
word_bytes
|
||||||
|
}
|
||||||
|
Both(word_bytes, _) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
word_bytes
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(&word_bytes);
|
||||||
|
key_buffer.push(0);
|
||||||
|
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||||
|
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
@ -1,51 +0,0 @@
|
|||||||
use std::fs::File;
|
|
||||||
use std::io;
|
|
||||||
|
|
||||||
use super::helpers::{
|
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
|
||||||
try_split_array_at, GrenadParameters,
|
|
||||||
};
|
|
||||||
use crate::error::SerializationError;
|
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
|
||||||
use crate::{relative_from_absolute_position, DocumentId, Result};
|
|
||||||
|
|
||||||
/// Extracts the word, field id, and the documents ids where this word appear at this field id.
|
|
||||||
#[logging_timer::time]
|
|
||||||
pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
|
|
||||||
docid_word_positions: grenad::Reader<R>,
|
|
||||||
indexer: GrenadParameters,
|
|
||||||
) -> Result<grenad::Reader<File>> {
|
|
||||||
puffin::profile_function!();
|
|
||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
|
||||||
|
|
||||||
let mut word_fid_docids_sorter = create_sorter(
|
|
||||||
grenad::SortAlgorithm::Unstable,
|
|
||||||
merge_cbo_roaring_bitmaps,
|
|
||||||
indexer.chunk_compression_type,
|
|
||||||
indexer.chunk_compression_level,
|
|
||||||
indexer.max_nb_chunks,
|
|
||||||
max_memory,
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut key_buffer = Vec::new();
|
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
|
||||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
|
||||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
|
||||||
|
|
||||||
for position in read_u32_ne_bytes(value) {
|
|
||||||
key_buffer.clear();
|
|
||||||
key_buffer.extend_from_slice(word_bytes);
|
|
||||||
key_buffer.push(0);
|
|
||||||
let (fid, _) = relative_from_absolute_position(position);
|
|
||||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
|
||||||
word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;
|
|
||||||
|
|
||||||
Ok(word_fid_docids_reader)
|
|
||||||
}
|
|
@ -1,15 +1,17 @@
|
|||||||
use std::cmp::Ordering;
|
use std::collections::{BTreeMap, VecDeque};
|
||||||
use std::collections::{BinaryHeap, HashMap};
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::{cmp, io, mem, str, vec};
|
use std::{cmp, io};
|
||||||
|
|
||||||
|
use obkv::KvReaderU16;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
|
||||||
try_split_array_at, GrenadParameters, MergeFn,
|
writer_into_reader, GrenadParameters, MergeFn,
|
||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::proximity::{positions_proximity, MAX_DISTANCE};
|
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||||
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::{DocumentId, Result};
|
use crate::{DocumentId, Result};
|
||||||
|
|
||||||
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
||||||
@ -25,58 +27,138 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let max_memory = indexer.max_memory_by_thread();
|
let max_memory = indexer.max_memory_by_thread();
|
||||||
|
|
||||||
let mut word_pair_proximity_docids_sorter = create_sorter(
|
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
||||||
grenad::SortAlgorithm::Unstable,
|
.into_iter()
|
||||||
merge_cbo_roaring_bitmaps,
|
.map(|_| {
|
||||||
indexer.chunk_compression_type,
|
create_sorter(
|
||||||
indexer.chunk_compression_level,
|
grenad::SortAlgorithm::Unstable,
|
||||||
indexer.max_nb_chunks,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
max_memory.map(|m| m / 2),
|
indexer.chunk_compression_type,
|
||||||
);
|
indexer.chunk_compression_level,
|
||||||
|
indexer.max_nb_chunks,
|
||||||
|
max_memory.map(|m| m / MAX_DISTANCE as usize),
|
||||||
|
)
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
// This map is assumed to not consume a lot of memory.
|
let mut del_word_positions: VecDeque<(String, u16)> =
|
||||||
let mut document_word_positions_heap = BinaryHeap::new();
|
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||||
|
let mut add_word_positions: VecDeque<(String, u16)> =
|
||||||
|
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||||
|
let mut del_word_pair_proximity = BTreeMap::new();
|
||||||
|
let mut add_word_pair_proximity = BTreeMap::new();
|
||||||
let mut current_document_id = None;
|
let mut current_document_id = None;
|
||||||
|
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||||
let word = str::from_utf8(word_bytes)?;
|
|
||||||
|
|
||||||
let curr_document_id = *current_document_id.get_or_insert(document_id);
|
// if we change document, we fill the sorter
|
||||||
if curr_document_id != document_id {
|
if current_document_id.map_or(false, |id| id != document_id) {
|
||||||
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
|
puffin::profile_scope!("Document into sorter");
|
||||||
|
|
||||||
document_word_positions_into_sorter(
|
document_word_positions_into_sorter(
|
||||||
curr_document_id,
|
current_document_id.unwrap(),
|
||||||
document_word_positions_heap,
|
&del_word_pair_proximity,
|
||||||
&mut word_pair_proximity_docids_sorter,
|
&add_word_pair_proximity,
|
||||||
|
&mut word_pair_proximity_docids_sorters,
|
||||||
)?;
|
)?;
|
||||||
current_document_id = Some(document_id);
|
del_word_pair_proximity.clear();
|
||||||
|
add_word_pair_proximity.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
let word = word.to_string();
|
current_document_id = Some(document_id);
|
||||||
let mut positions: Vec<_> = read_u32_ne_bytes(value).collect();
|
|
||||||
positions.sort_unstable();
|
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||||
let mut iter = positions.into_iter();
|
|| {
|
||||||
if let Some(position) = iter.next() {
|
// deletions
|
||||||
document_word_positions_heap.push(PeekedWordPosition { word, position, iter });
|
if let Some(deletion) = KvReaderDelAdd::new(&value).get(DelAdd::Deletion) {
|
||||||
}
|
for (position, word) in KvReaderU16::new(deletion).iter() {
|
||||||
|
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||||
|
while del_word_positions.get(0).map_or(false, |(_w, p)| {
|
||||||
|
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||||
|
}) {
|
||||||
|
word_positions_into_word_pair_proximity(
|
||||||
|
&mut del_word_positions,
|
||||||
|
&mut del_word_pair_proximity,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// insert the new word.
|
||||||
|
let word = std::str::from_utf8(word)?;
|
||||||
|
del_word_positions.push_back((word.to_string(), position));
|
||||||
|
}
|
||||||
|
|
||||||
|
while !del_word_positions.is_empty() {
|
||||||
|
word_positions_into_word_pair_proximity(
|
||||||
|
&mut del_word_positions,
|
||||||
|
&mut del_word_pair_proximity,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
|| {
|
||||||
|
// additions
|
||||||
|
if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) {
|
||||||
|
for (position, word) in KvReaderU16::new(addition).iter() {
|
||||||
|
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||||
|
while add_word_positions.get(0).map_or(false, |(_w, p)| {
|
||||||
|
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||||
|
}) {
|
||||||
|
word_positions_into_word_pair_proximity(
|
||||||
|
&mut add_word_positions,
|
||||||
|
&mut add_word_pair_proximity,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// insert the new word.
|
||||||
|
let word = std::str::from_utf8(word)?;
|
||||||
|
add_word_positions.push_back((word.to_string(), position));
|
||||||
|
}
|
||||||
|
|
||||||
|
while !add_word_positions.is_empty() {
|
||||||
|
word_positions_into_word_pair_proximity(
|
||||||
|
&mut add_word_positions,
|
||||||
|
&mut add_word_pair_proximity,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
|
del?;
|
||||||
|
add?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Some(document_id) = current_document_id {
|
if let Some(document_id) = current_document_id {
|
||||||
// We must make sure that don't lose the current document field id
|
puffin::profile_scope!("Final document into sorter");
|
||||||
// word count map if we break because we reached the end of the chunk.
|
|
||||||
let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
|
|
||||||
document_word_positions_into_sorter(
|
document_word_positions_into_sorter(
|
||||||
document_id,
|
document_id,
|
||||||
document_word_positions_heap,
|
&del_word_pair_proximity,
|
||||||
&mut word_pair_proximity_docids_sorter,
|
&add_word_pair_proximity,
|
||||||
|
&mut word_pair_proximity_docids_sorters,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
puffin::profile_scope!("sorter_into_reader");
|
||||||
|
let mut writer = create_writer(
|
||||||
|
indexer.chunk_compression_type,
|
||||||
|
indexer.chunk_compression_level,
|
||||||
|
tempfile::tempfile()?,
|
||||||
|
);
|
||||||
|
|
||||||
sorter_into_reader(word_pair_proximity_docids_sorter, indexer)
|
for sorter in word_pair_proximity_docids_sorters {
|
||||||
|
sorter.write_into_stream_writer(&mut writer)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
writer_into_reader(writer)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
||||||
@ -85,96 +167,66 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
|||||||
/// close to each other.
|
/// close to each other.
|
||||||
fn document_word_positions_into_sorter(
|
fn document_word_positions_into_sorter(
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>,
|
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||||
word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||||
|
word_pair_proximity_docids_sorters: &mut Vec<grenad::Sorter<MergeFn>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let mut word_pair_proximity = HashMap::new();
|
use itertools::merge_join_by;
|
||||||
let mut ordered_peeked_word_positions = Vec::new();
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
while !word_positions_heap.is_empty() {
|
|
||||||
while let Some(peeked_word_position) = word_positions_heap.pop() {
|
|
||||||
ordered_peeked_word_positions.push(peeked_word_position);
|
|
||||||
if ordered_peeked_word_positions.len() == 7 {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if let Some((head, tail)) = ordered_peeked_word_positions.split_first() {
|
|
||||||
for PeekedWordPosition { word, position, .. } in tail {
|
|
||||||
let prox = positions_proximity(head.position, *position);
|
|
||||||
if prox > 0 && prox < MAX_DISTANCE {
|
|
||||||
word_pair_proximity
|
|
||||||
.entry((head.word.clone(), word.clone()))
|
|
||||||
.and_modify(|p| {
|
|
||||||
*p = cmp::min(*p, prox);
|
|
||||||
})
|
|
||||||
.or_insert(prox);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Push the tail in the heap.
|
|
||||||
let tail_iter = ordered_peeked_word_positions.drain(1..);
|
|
||||||
word_positions_heap.extend(tail_iter);
|
|
||||||
|
|
||||||
// Advance the head and push it in the heap.
|
|
||||||
if let Some(mut head) = ordered_peeked_word_positions.pop() {
|
|
||||||
if let Some(next_position) = head.iter.next() {
|
|
||||||
let prox = positions_proximity(head.position, next_position);
|
|
||||||
|
|
||||||
if prox > 0 && prox < MAX_DISTANCE {
|
|
||||||
word_pair_proximity
|
|
||||||
.entry((head.word.clone(), head.word.clone()))
|
|
||||||
.and_modify(|p| {
|
|
||||||
*p = cmp::min(*p, prox);
|
|
||||||
})
|
|
||||||
.or_insert(prox);
|
|
||||||
}
|
|
||||||
|
|
||||||
word_positions_heap.push(PeekedWordPosition {
|
|
||||||
word: head.word,
|
|
||||||
position: next_position,
|
|
||||||
iter: head.iter,
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
let mut key_buffer = Vec::new();
|
let mut key_buffer = Vec::new();
|
||||||
for ((w1, w2), prox) in word_pair_proximity {
|
for eob in
|
||||||
|
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
|
||||||
|
d.cmp(a)
|
||||||
|
})
|
||||||
|
{
|
||||||
|
buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||||
|
let ((w1, w2), prox) = match eob {
|
||||||
|
Left(key_value) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key_value
|
||||||
|
}
|
||||||
|
Right(key_value) => {
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key_value
|
||||||
|
}
|
||||||
|
Both(key_value, _) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key_value
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.push(prox as u8);
|
key_buffer.push(*prox as u8);
|
||||||
key_buffer.extend_from_slice(w1.as_bytes());
|
key_buffer.extend_from_slice(w1.as_bytes());
|
||||||
key_buffer.push(0);
|
key_buffer.push(0);
|
||||||
key_buffer.extend_from_slice(w2.as_bytes());
|
key_buffer.extend_from_slice(w2.as_bytes());
|
||||||
|
|
||||||
word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
word_pair_proximity_docids_sorters[*prox as usize - 1]
|
||||||
|
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
struct PeekedWordPosition<I> {
|
fn word_positions_into_word_pair_proximity(
|
||||||
word: String,
|
word_positions: &mut VecDeque<(String, u16)>,
|
||||||
position: u32,
|
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||||
iter: I,
|
) -> Result<()> {
|
||||||
}
|
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||||
|
for (word, position) in word_positions.iter() {
|
||||||
impl<I> Ord for PeekedWordPosition<I> {
|
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||||
self.position.cmp(&other.position).reverse()
|
word_pair_proximity
|
||||||
}
|
.entry((head_word.clone(), word.clone()))
|
||||||
}
|
.and_modify(|p| {
|
||||||
|
*p = cmp::min(*p, prox);
|
||||||
impl<I> PartialOrd for PeekedWordPosition<I> {
|
})
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
.or_insert(prox);
|
||||||
Some(self.cmp(other))
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<I> Eq for PeekedWordPosition<I> {}
|
|
||||||
|
|
||||||
impl<I> PartialEq for PeekedWordPosition<I> {
|
|
||||||
fn eq(&self, other: &Self) -> bool {
|
|
||||||
self.position == other.position
|
|
||||||
}
|
}
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
@ -1,13 +1,18 @@
|
|||||||
|
use std::collections::BTreeSet;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io;
|
use std::io;
|
||||||
|
|
||||||
|
use obkv::KvReaderU16;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
|
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||||
try_split_array_at, GrenadParameters,
|
GrenadParameters,
|
||||||
};
|
};
|
||||||
use crate::error::SerializationError;
|
use crate::error::SerializationError;
|
||||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||||
use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
|
use crate::update::MergeFn;
|
||||||
|
use crate::{bucketed_position, DocumentId, Result};
|
||||||
|
|
||||||
/// Extracts the word positions and the documents ids where this word appear.
|
/// Extracts the word positions and the documents ids where this word appear.
|
||||||
///
|
///
|
||||||
@ -24,32 +29,110 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
|||||||
|
|
||||||
let mut word_position_docids_sorter = create_sorter(
|
let mut word_position_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_cbo_roaring_bitmaps,
|
merge_deladd_cbo_roaring_bitmaps,
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
indexer.max_nb_chunks,
|
indexer.max_nb_chunks,
|
||||||
max_memory,
|
max_memory,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||||
|
let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||||
|
let mut current_document_id: Option<u32> = None;
|
||||||
let mut key_buffer = Vec::new();
|
let mut key_buffer = Vec::new();
|
||||||
let mut cursor = docid_word_positions.into_cursor()?;
|
let mut cursor = docid_word_positions.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
let (document_id_bytes, word_bytes) = try_split_array_at(key)
|
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||||
|
|
||||||
for position in read_u32_ne_bytes(value) {
|
if current_document_id.map_or(false, |id| document_id != id) {
|
||||||
key_buffer.clear();
|
words_position_into_sorter(
|
||||||
key_buffer.extend_from_slice(word_bytes);
|
current_document_id.unwrap(),
|
||||||
key_buffer.push(0);
|
&mut key_buffer,
|
||||||
let (_, position) = relative_from_absolute_position(position);
|
&del_word_positions,
|
||||||
let position = bucketed_position(position);
|
&add_word_positions,
|
||||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
&mut word_position_docids_sorter,
|
||||||
word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
|
)?;
|
||||||
|
del_word_positions.clear();
|
||||||
|
add_word_positions.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
current_document_id = Some(document_id);
|
||||||
|
|
||||||
|
let del_add_reader = KvReaderDelAdd::new(&value);
|
||||||
|
// extract all unique words to remove.
|
||||||
|
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||||
|
for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
|
||||||
|
let position = bucketed_position(position);
|
||||||
|
del_word_positions.insert((position, word_bytes.to_vec()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// extract all unique additional words.
|
||||||
|
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||||
|
for (position, word_bytes) in KvReaderU16::new(addition).iter() {
|
||||||
|
let position = bucketed_position(position);
|
||||||
|
add_word_positions.insert((position, word_bytes.to_vec()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if let Some(document_id) = current_document_id {
|
||||||
|
words_position_into_sorter(
|
||||||
|
document_id,
|
||||||
|
&mut key_buffer,
|
||||||
|
&del_word_positions,
|
||||||
|
&add_word_positions,
|
||||||
|
&mut word_position_docids_sorter,
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO remove noop DelAdd OBKV
|
||||||
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
||||||
|
|
||||||
Ok(word_position_docids_reader)
|
Ok(word_position_docids_reader)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn words_position_into_sorter(
|
||||||
|
document_id: DocumentId,
|
||||||
|
key_buffer: &mut Vec<u8>,
|
||||||
|
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||||
|
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||||
|
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||||
|
) -> Result<()> {
|
||||||
|
puffin::profile_function!();
|
||||||
|
|
||||||
|
use itertools::merge_join_by;
|
||||||
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
|
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
|
||||||
|
{
|
||||||
|
buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||||
|
let (position, word_bytes) = match eob {
|
||||||
|
Left(key) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key
|
||||||
|
}
|
||||||
|
Right(key) => {
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key
|
||||||
|
}
|
||||||
|
Both(key, _) => {
|
||||||
|
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||||
|
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||||
|
key
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
key_buffer.clear();
|
||||||
|
key_buffer.extend_from_slice(word_bytes);
|
||||||
|
key_buffer.push(0);
|
||||||
|
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||||
|
word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
@ -6,7 +6,6 @@ mod extract_fid_word_count_docids;
|
|||||||
mod extract_geo_points;
|
mod extract_geo_points;
|
||||||
mod extract_vector_points;
|
mod extract_vector_points;
|
||||||
mod extract_word_docids;
|
mod extract_word_docids;
|
||||||
mod extract_word_fid_docids;
|
|
||||||
mod extract_word_pair_proximity_docids;
|
mod extract_word_pair_proximity_docids;
|
||||||
mod extract_word_position_docids;
|
mod extract_word_position_docids;
|
||||||
|
|
||||||
@ -25,12 +24,11 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
|||||||
use self::extract_geo_points::extract_geo_points;
|
use self::extract_geo_points::extract_geo_points;
|
||||||
use self::extract_vector_points::extract_vector_points;
|
use self::extract_vector_points::extract_vector_points;
|
||||||
use self::extract_word_docids::extract_word_docids;
|
use self::extract_word_docids::extract_word_docids;
|
||||||
use self::extract_word_fid_docids::extract_word_fid_docids;
|
|
||||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||||
use self::extract_word_position_docids::extract_word_position_docids;
|
use self::extract_word_position_docids::extract_word_position_docids;
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
|
as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
|
||||||
GrenadParameters, MergeFn, MergeableReader,
|
MergeableReader,
|
||||||
};
|
};
|
||||||
use super::{helpers, TypedChunk};
|
use super::{helpers, TypedChunk};
|
||||||
use crate::{FieldId, Result};
|
use crate::{FieldId, Result};
|
||||||
@ -93,9 +91,9 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
let (
|
let (
|
||||||
docid_word_positions_chunks,
|
docid_word_positions_chunks,
|
||||||
(
|
(
|
||||||
docid_fid_facet_numbers_chunks,
|
fid_docid_facet_numbers_chunks,
|
||||||
(
|
(
|
||||||
docid_fid_facet_strings_chunks,
|
fid_docid_facet_strings_chunks,
|
||||||
(
|
(
|
||||||
facet_is_null_docids_chunks,
|
facet_is_null_docids_chunks,
|
||||||
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
|
(facet_is_empty_docids_chunks, facet_exists_docids_chunks),
|
||||||
@ -172,15 +170,22 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"field-id-wordcount-docids",
|
"field-id-wordcount-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
|
spawn_extraction_task::<
|
||||||
|
_,
|
||||||
|
_,
|
||||||
|
Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)>,
|
||||||
|
>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
|
||||||
merge_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
|(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
|
|(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
|
||||||
word_docids_reader,
|
TypedChunk::WordDocids {
|
||||||
exact_word_docids_reader,
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
|
}
|
||||||
},
|
},
|
||||||
"word-docids",
|
"word-docids",
|
||||||
);
|
);
|
||||||
@ -194,18 +199,9 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
TypedChunk::WordPositionDocids,
|
TypedChunk::WordPositionDocids,
|
||||||
"word-position-docids",
|
"word-position-docids",
|
||||||
);
|
);
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
|
||||||
docid_word_positions_chunks,
|
|
||||||
indexer,
|
|
||||||
lmdb_writer_sx.clone(),
|
|
||||||
extract_word_fid_docids,
|
|
||||||
merge_cbo_roaring_bitmaps,
|
|
||||||
TypedChunk::WordFidDocids,
|
|
||||||
"word-fid-docids",
|
|
||||||
);
|
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_fid_facet_strings_chunks,
|
fid_docid_facet_strings_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
extract_facet_string_docids,
|
extract_facet_string_docids,
|
||||||
@ -215,7 +211,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_fid_facet_numbers_chunks,
|
fid_docid_facet_numbers_chunks,
|
||||||
indexer,
|
indexer,
|
||||||
lmdb_writer_sx,
|
lmdb_writer_sx,
|
||||||
extract_facet_number_docids,
|
extract_facet_number_docids,
|
||||||
@ -348,7 +344,7 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
|
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||||
rayon::join(
|
rayon::join(
|
||||||
|| {
|
|| {
|
||||||
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
|
let (documents_ids, docid_word_positions_chunk, script_language_pair) =
|
||||||
@ -376,8 +372,8 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
},
|
},
|
||||||
|| {
|
|| {
|
||||||
let ExtractedFacetValues {
|
let ExtractedFacetValues {
|
||||||
docid_fid_facet_numbers_chunk,
|
fid_docid_facet_numbers_chunk,
|
||||||
docid_fid_facet_strings_chunk,
|
fid_docid_facet_strings_chunk,
|
||||||
fid_facet_is_null_docids_chunk,
|
fid_facet_is_null_docids_chunk,
|
||||||
fid_facet_is_empty_docids_chunk,
|
fid_facet_is_empty_docids_chunk,
|
||||||
fid_facet_exists_docids_chunk,
|
fid_facet_exists_docids_chunk,
|
||||||
@ -388,26 +384,26 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
geo_fields_ids,
|
geo_fields_ids,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// send docid_fid_facet_numbers_chunk to DB writer
|
// send fid_docid_facet_numbers_chunk to DB writer
|
||||||
let docid_fid_facet_numbers_chunk =
|
let fid_docid_facet_numbers_chunk =
|
||||||
unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? };
|
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
|
||||||
|
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
||||||
docid_fid_facet_numbers_chunk.clone(),
|
fid_docid_facet_numbers_chunk.clone(),
|
||||||
)));
|
)));
|
||||||
|
|
||||||
// send docid_fid_facet_strings_chunk to DB writer
|
// send fid_docid_facet_strings_chunk to DB writer
|
||||||
let docid_fid_facet_strings_chunk =
|
let fid_docid_facet_strings_chunk =
|
||||||
unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? };
|
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
|
||||||
|
|
||||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
||||||
docid_fid_facet_strings_chunk.clone(),
|
fid_docid_facet_strings_chunk.clone(),
|
||||||
)));
|
)));
|
||||||
|
|
||||||
Ok((
|
Ok((
|
||||||
docid_fid_facet_numbers_chunk,
|
fid_docid_facet_numbers_chunk,
|
||||||
(
|
(
|
||||||
docid_fid_facet_strings_chunk,
|
fid_docid_facet_strings_chunk,
|
||||||
(
|
(
|
||||||
fid_facet_is_null_docids_chunk,
|
fid_facet_is_null_docids_chunk,
|
||||||
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
|
(fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
|
||||||
@ -417,5 +413,5 @@ fn send_and_extract_flattened_documents_data(
|
|||||||
},
|
},
|
||||||
);
|
);
|
||||||
|
|
||||||
Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?))
|
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
|
||||||
}
|
}
|
||||||
|
@ -54,6 +54,7 @@ pub fn sorter_into_reader(
|
|||||||
sorter: grenad::Sorter<MergeFn>,
|
sorter: grenad::Sorter<MergeFn>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
) -> Result<grenad::Reader<File>> {
|
) -> Result<grenad::Reader<File>> {
|
||||||
|
puffin::profile_function!();
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(
|
||||||
indexer.chunk_compression_type,
|
indexer.chunk_compression_type,
|
||||||
indexer.chunk_compression_level,
|
indexer.chunk_compression_level,
|
||||||
@ -113,6 +114,22 @@ impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> {
|
||||||
|
type Output = (grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>);
|
||||||
|
|
||||||
|
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||||
|
let mut m1 = MergerBuilder::new(merge_fn);
|
||||||
|
let mut m2 = MergerBuilder::new(merge_fn);
|
||||||
|
let mut m3 = MergerBuilder::new(merge_fn);
|
||||||
|
for (r1, r2, r3) in self.into_iter() {
|
||||||
|
m1.push(r1)?;
|
||||||
|
m2.push(r2)?;
|
||||||
|
m3.push(r3)?;
|
||||||
|
}
|
||||||
|
Ok((m1.finish(params)?, m2.finish(params)?, m3.finish(params)?))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
|
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
|
||||||
|
|
||||||
impl<R: io::Read + io::Seek> MergerBuilder<R> {
|
impl<R: io::Read + io::Seek> MergerBuilder<R> {
|
||||||
|
@ -6,11 +6,13 @@ use std::result::Result as StdResult;
|
|||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||||
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::update::index_documents::transform::Operation;
|
use crate::update::index_documents::transform::Operation;
|
||||||
use crate::Result;
|
use crate::Result;
|
||||||
|
|
||||||
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
|
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
|
||||||
|
|
||||||
|
#[allow(unused)]
|
||||||
pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||||
if values.len() == 1 {
|
if values.len() == 1 {
|
||||||
Ok(values[0].clone())
|
Ok(values[0].clone())
|
||||||
@ -75,57 +77,123 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<
|
|||||||
Ok(obkvs.last().unwrap().clone())
|
Ok(obkvs.last().unwrap().clone())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
|
pub fn merge_two_del_add_obkvs(
|
||||||
|
base: obkv::KvReaderU16,
|
||||||
|
update: obkv::KvReaderU16,
|
||||||
|
merge_additions: bool,
|
||||||
|
buffer: &mut Vec<u8>,
|
||||||
|
) {
|
||||||
use itertools::merge_join_by;
|
use itertools::merge_join_by;
|
||||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||||
|
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
|
|
||||||
let mut writer = obkv::KvWriter::new(buffer);
|
let mut writer = obkv::KvWriter::new(buffer);
|
||||||
|
let mut value_buffer = Vec::new();
|
||||||
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||||
match eob {
|
match eob {
|
||||||
Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(),
|
Left((k, v)) => {
|
||||||
|
if merge_additions {
|
||||||
|
writer.insert(k, v).unwrap()
|
||||||
|
} else {
|
||||||
|
// If merge_additions is false, recreate an obkv keeping the deletions only.
|
||||||
|
value_buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
let base_reader = KvReaderDelAdd::new(v);
|
||||||
|
|
||||||
|
if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
|
||||||
|
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||||
|
value_writer.finish().unwrap();
|
||||||
|
writer.insert(k, &value_buffer).unwrap()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Right((k, v)) => writer.insert(k, v).unwrap(),
|
||||||
|
Both((k, base), (_, update)) => {
|
||||||
|
// merge deletions and additions.
|
||||||
|
value_buffer.clear();
|
||||||
|
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||||
|
let base_reader = KvReaderDelAdd::new(base);
|
||||||
|
let update_reader = KvReaderDelAdd::new(update);
|
||||||
|
|
||||||
|
// keep newest deletion.
|
||||||
|
if let Some(deletion) = update_reader
|
||||||
|
.get(DelAdd::Deletion)
|
||||||
|
.or_else(|| base_reader.get(DelAdd::Deletion))
|
||||||
|
{
|
||||||
|
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
// keep base addition only if merge_additions is true.
|
||||||
|
let base_addition =
|
||||||
|
merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
|
||||||
|
// keep newest addition.
|
||||||
|
// TODO use or_else
|
||||||
|
if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
|
||||||
|
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
value_writer.finish().unwrap();
|
||||||
|
writer.insert(k, &value_buffer).unwrap()
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
writer.finish().unwrap();
|
writer.finish().unwrap();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Merge all the obks in the order we see them.
|
/// Merge all the obkvs from the newest to the oldest.
|
||||||
pub fn merge_obkvs_and_operations<'a>(
|
fn inner_merge_del_add_obkvs<'a>(
|
||||||
|
obkvs: &[Cow<'a, [u8]>],
|
||||||
|
merge_additions: bool,
|
||||||
|
) -> Result<Cow<'a, [u8]>> {
|
||||||
|
// pop the newest operation from the list.
|
||||||
|
let (newest, obkvs) = obkvs.split_last().unwrap();
|
||||||
|
// keep the operation type for the returned value.
|
||||||
|
let newest_operation_type = newest[0];
|
||||||
|
|
||||||
|
// treat the newest obkv as the starting point of the merge.
|
||||||
|
let mut acc_operation_type = newest_operation_type;
|
||||||
|
let mut acc = newest[1..].to_vec();
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
// reverse iter from the most recent to the oldest.
|
||||||
|
for current in obkvs.into_iter().rev() {
|
||||||
|
// if in the previous iteration there was a complete deletion,
|
||||||
|
// stop the merge process.
|
||||||
|
if acc_operation_type == Operation::Deletion as u8 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
let newest = obkv::KvReader::new(&acc);
|
||||||
|
let oldest = obkv::KvReader::new(¤t[1..]);
|
||||||
|
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
|
||||||
|
|
||||||
|
// we want the result of the merge into our accumulator.
|
||||||
|
std::mem::swap(&mut acc, &mut buffer);
|
||||||
|
acc_operation_type = current[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
acc.insert(0, newest_operation_type);
|
||||||
|
Ok(Cow::from(acc))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Merge all the obkvs from the newest to the oldest.
|
||||||
|
pub fn obkvs_merge_additions_and_deletions<'a>(
|
||||||
_key: &[u8],
|
_key: &[u8],
|
||||||
obkvs: &[Cow<'a, [u8]>],
|
obkvs: &[Cow<'a, [u8]>],
|
||||||
) -> Result<Cow<'a, [u8]>> {
|
) -> Result<Cow<'a, [u8]>> {
|
||||||
// [add, add, delete, add, add]
|
inner_merge_del_add_obkvs(obkvs, true)
|
||||||
// we can ignore everything that happened before the last delete.
|
|
||||||
let starting_position =
|
|
||||||
obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0);
|
|
||||||
|
|
||||||
// [add, add, delete]
|
|
||||||
// if the last operation was a deletion then we simply return the deletion
|
|
||||||
if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8
|
|
||||||
{
|
|
||||||
return Ok(obkvs[obkvs.len() - 1].clone());
|
|
||||||
}
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
|
|
||||||
// (add, add, delete) [add, add]
|
|
||||||
// in the other case, no deletion will be encountered during the merge
|
|
||||||
let mut ret =
|
|
||||||
obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| {
|
|
||||||
let first = obkv::KvReader::new(&acc);
|
|
||||||
let second = obkv::KvReader::new(¤t[1..]);
|
|
||||||
merge_two_obkvs(first, second, &mut buffer);
|
|
||||||
|
|
||||||
// we want the result of the merge into our accumulator
|
|
||||||
std::mem::swap(&mut acc, &mut buffer);
|
|
||||||
acc
|
|
||||||
});
|
|
||||||
|
|
||||||
ret.insert(0, Operation::Addition as u8);
|
|
||||||
Ok(Cow::from(ret))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
|
||||||
|
pub fn obkvs_keep_last_addition_merge_deletions<'a>(
|
||||||
|
_key: &[u8],
|
||||||
|
obkvs: &[Cow<'a, [u8]>],
|
||||||
|
) -> Result<Cow<'a, [u8]>> {
|
||||||
|
inner_merge_del_add_obkvs(obkvs, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Do a union of all the CboRoaringBitmaps in the values.
|
||||||
pub fn merge_cbo_roaring_bitmaps<'a>(
|
pub fn merge_cbo_roaring_bitmaps<'a>(
|
||||||
_key: &[u8],
|
_key: &[u8],
|
||||||
values: &[Cow<'a, [u8]>],
|
values: &[Cow<'a, [u8]>],
|
||||||
@ -138,3 +206,36 @@ pub fn merge_cbo_roaring_bitmaps<'a>(
|
|||||||
Ok(Cow::from(vec))
|
Ok(Cow::from(vec))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
|
||||||
|
/// separately and outputs a new DelAdd with both unions.
|
||||||
|
pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
|
||||||
|
_key: &[u8],
|
||||||
|
values: &[Cow<'a, [u8]>],
|
||||||
|
) -> Result<Cow<'a, [u8]>> {
|
||||||
|
if values.len() == 1 {
|
||||||
|
Ok(values[0].clone())
|
||||||
|
} else {
|
||||||
|
// Retrieve the bitmaps from both sides
|
||||||
|
let mut del_bitmaps_bytes = Vec::new();
|
||||||
|
let mut add_bitmaps_bytes = Vec::new();
|
||||||
|
for value in values {
|
||||||
|
let obkv = KvReaderDelAdd::new(value);
|
||||||
|
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
|
||||||
|
del_bitmaps_bytes.push(bitmap_bytes);
|
||||||
|
}
|
||||||
|
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
|
||||||
|
add_bitmaps_bytes.push(bitmap_bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||||
|
let mut buffer = Vec::new();
|
||||||
|
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
|
||||||
|
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
|
||||||
|
buffer.clear();
|
||||||
|
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
|
||||||
|
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
|
||||||
|
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
@ -14,7 +14,8 @@ pub use grenad_helpers::{
|
|||||||
};
|
};
|
||||||
pub use merge_functions::{
|
pub use merge_functions::{
|
||||||
concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
|
concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
|
||||||
merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
|
merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps,
|
||||||
|
obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions,
|
||||||
serialize_roaring_bitmap, MergeFn,
|
serialize_roaring_bitmap, MergeFn,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -44,6 +45,7 @@ where
|
|||||||
Some((head, tail))
|
Some((head, tail))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(unused)]
|
||||||
pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
|
pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
|
||||||
bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
|
bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
|
||||||
}
|
}
|
||||||
|
@ -38,7 +38,7 @@ use crate::update::{
|
|||||||
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
|
self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
|
||||||
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
||||||
};
|
};
|
||||||
use crate::{Index, Result, RoaringBitmapCodec};
|
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||||
|
|
||||||
static MERGED_DATABASE_COUNT: usize = 7;
|
static MERGED_DATABASE_COUNT: usize = 7;
|
||||||
static PREFIX_DATABASE_COUNT: usize = 5;
|
static PREFIX_DATABASE_COUNT: usize = 5;
|
||||||
@ -406,13 +406,23 @@ where
|
|||||||
}
|
}
|
||||||
|
|
||||||
let typed_chunk = match result? {
|
let typed_chunk = match result? {
|
||||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
|
TypedChunk::WordDocids {
|
||||||
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
|
} => {
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
||||||
word_docids = Some(cloneable_chunk);
|
word_docids = Some(cloneable_chunk);
|
||||||
let cloneable_chunk =
|
let cloneable_chunk =
|
||||||
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
|
||||||
exact_word_docids = Some(cloneable_chunk);
|
exact_word_docids = Some(cloneable_chunk);
|
||||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
|
let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
|
||||||
|
word_fid_docids = Some(cloneable_chunk);
|
||||||
|
TypedChunk::WordDocids {
|
||||||
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
TypedChunk::WordPairProximityDocids(chunk) => {
|
TypedChunk::WordPairProximityDocids(chunk) => {
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
||||||
@ -424,11 +434,6 @@ where
|
|||||||
word_position_docids = Some(cloneable_chunk);
|
word_position_docids = Some(cloneable_chunk);
|
||||||
TypedChunk::WordPositionDocids(chunk)
|
TypedChunk::WordPositionDocids(chunk)
|
||||||
}
|
}
|
||||||
TypedChunk::WordFidDocids(chunk) => {
|
|
||||||
let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
|
|
||||||
word_fid_docids = Some(cloneable_chunk);
|
|
||||||
TypedChunk::WordFidDocids(chunk)
|
|
||||||
}
|
|
||||||
otherwise => otherwise,
|
otherwise => otherwise,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -470,13 +475,14 @@ where
|
|||||||
let all_documents_ids = index_documents_ids | new_documents_ids;
|
let all_documents_ids = index_documents_ids | new_documents_ids;
|
||||||
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
|
self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
|
||||||
|
|
||||||
self.execute_prefix_databases(
|
// TODO: reactivate prefix DB with diff-indexing
|
||||||
word_docids,
|
// self.execute_prefix_databases(
|
||||||
exact_word_docids,
|
// word_docids,
|
||||||
word_pair_proximity_docids,
|
// exact_word_docids,
|
||||||
word_position_docids,
|
// word_pair_proximity_docids,
|
||||||
word_fid_docids,
|
// word_position_docids,
|
||||||
)?;
|
// word_fid_docids,
|
||||||
|
// )?;
|
||||||
|
|
||||||
Ok(all_documents_ids.len())
|
Ok(all_documents_ids.len())
|
||||||
}
|
}
|
||||||
@ -690,8 +696,8 @@ where
|
|||||||
fn execute_word_prefix_docids(
|
fn execute_word_prefix_docids(
|
||||||
txn: &mut heed::RwTxn,
|
txn: &mut heed::RwTxn,
|
||||||
reader: grenad::Reader<Cursor<ClonableMmap>>,
|
reader: grenad::Reader<Cursor<ClonableMmap>>,
|
||||||
word_docids_db: Database<Str, RoaringBitmapCodec>,
|
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||||
word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
|
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||||
indexer_config: &IndexerConfig,
|
indexer_config: &IndexerConfig,
|
||||||
new_prefix_fst_words: &[String],
|
new_prefix_fst_words: &[String],
|
||||||
common_prefix_fst_words: &[&[String]],
|
common_prefix_fst_words: &[&[String]],
|
||||||
|
@ -7,18 +7,20 @@ use std::io::{Read, Seek};
|
|||||||
use fxhash::FxHashMap;
|
use fxhash::FxHashMap;
|
||||||
use heed::RoTxn;
|
use heed::RoTxn;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
use obkv::{KvReader, KvWriter};
|
use obkv::{KvReader, KvReaderU16, KvWriter};
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use smartstring::SmartString;
|
use smartstring::SmartString;
|
||||||
|
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn,
|
create_sorter, create_writer, obkvs_keep_last_addition_merge_deletions,
|
||||||
|
obkvs_merge_additions_and_deletions, MergeFn,
|
||||||
};
|
};
|
||||||
use super::{IndexDocumentsMethod, IndexerConfig};
|
use super::{IndexDocumentsMethod, IndexerConfig};
|
||||||
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
||||||
use crate::error::{Error, InternalError, UserError};
|
use crate::error::{Error, InternalError, UserError};
|
||||||
use crate::index::{db_name, main_key};
|
use crate::index::{db_name, main_key};
|
||||||
|
use crate::update::del_add::into_del_add_obkv;
|
||||||
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
|
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
|
||||||
use crate::{
|
use crate::{
|
||||||
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
|
FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
|
||||||
@ -106,8 +108,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
// We must choose the appropriate merge function for when two or more documents
|
// We must choose the appropriate merge function for when two or more documents
|
||||||
// with the same user id must be merged or fully replaced in the same batch.
|
// with the same user id must be merged or fully replaced in the same batch.
|
||||||
let merge_function = match index_documents_method {
|
let merge_function = match index_documents_method {
|
||||||
IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv,
|
IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions,
|
||||||
IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations,
|
IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions,
|
||||||
};
|
};
|
||||||
|
|
||||||
// We initialize the sorter with the user indexing settings.
|
// We initialize the sorter with the user indexing settings.
|
||||||
@ -223,19 +225,21 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
|
let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
|
||||||
Entry::Occupied(entry) => *entry.get() as u32,
|
Entry::Occupied(entry) => *entry.get() as u32,
|
||||||
Entry::Vacant(entry) => {
|
Entry::Vacant(entry) => {
|
||||||
// If the document was already in the db we mark it as a replaced document.
|
let docid = match external_documents_ids.get(entry.key()) {
|
||||||
// It'll be deleted later.
|
Some(docid) => {
|
||||||
if let Some(docid) = external_documents_ids.get(entry.key()) {
|
// If it was already in the list of replaced documents it means it was deleted
|
||||||
// If it was already in the list of replaced documents it means it was deleted
|
// by the remove_document method. We should starts as if it never existed.
|
||||||
// by the remove_document method. We should starts as if it never existed.
|
if self.replaced_documents_ids.insert(docid) {
|
||||||
if self.replaced_documents_ids.insert(docid) {
|
original_docid = Some(docid);
|
||||||
original_docid = Some(docid);
|
}
|
||||||
|
|
||||||
|
docid
|
||||||
}
|
}
|
||||||
}
|
None => self
|
||||||
let docid = self
|
.available_documents_ids
|
||||||
.available_documents_ids
|
.next()
|
||||||
.next()
|
.ok_or(UserError::DocumentLimitReached)?,
|
||||||
.ok_or(UserError::DocumentLimitReached)?;
|
};
|
||||||
entry.insert(docid as u64);
|
entry.insert(docid as u64);
|
||||||
docid
|
docid
|
||||||
}
|
}
|
||||||
@ -263,16 +267,28 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
skip_insertion = true;
|
skip_insertion = true;
|
||||||
} else {
|
} else {
|
||||||
// we associate the base document with the new key, everything will get merged later.
|
// we associate the base document with the new key, everything will get merged later.
|
||||||
|
let keep_original_version =
|
||||||
|
self.index_documents_method == IndexDocumentsMethod::UpdateDocuments;
|
||||||
document_sorter_buffer.clear();
|
document_sorter_buffer.clear();
|
||||||
document_sorter_buffer.push(Operation::Addition as u8);
|
document_sorter_buffer.push(Operation::Addition as u8);
|
||||||
document_sorter_buffer.extend_from_slice(base_obkv);
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(base_obkv),
|
||||||
|
true,
|
||||||
|
keep_original_version,
|
||||||
|
&mut document_sorter_buffer,
|
||||||
|
)?;
|
||||||
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
||||||
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
|
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
|
||||||
Some(flattened_obkv) => {
|
Some(flattened_obkv) => {
|
||||||
// we recreate our buffer with the flattened documents
|
// we recreate our buffer with the flattened documents
|
||||||
document_sorter_buffer.clear();
|
document_sorter_buffer.clear();
|
||||||
document_sorter_buffer.push(Operation::Addition as u8);
|
document_sorter_buffer.push(Operation::Addition as u8);
|
||||||
document_sorter_buffer.extend_from_slice(&flattened_obkv);
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&flattened_obkv),
|
||||||
|
true,
|
||||||
|
keep_original_version,
|
||||||
|
&mut document_sorter_buffer,
|
||||||
|
)?;
|
||||||
self.flattened_sorter
|
self.flattened_sorter
|
||||||
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
|
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
|
||||||
}
|
}
|
||||||
@ -288,7 +304,12 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
|
|
||||||
document_sorter_buffer.clear();
|
document_sorter_buffer.clear();
|
||||||
document_sorter_buffer.push(Operation::Addition as u8);
|
document_sorter_buffer.push(Operation::Addition as u8);
|
||||||
document_sorter_buffer.extend_from_slice(&obkv_buffer);
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&obkv_buffer),
|
||||||
|
false,
|
||||||
|
true,
|
||||||
|
&mut document_sorter_buffer,
|
||||||
|
)?;
|
||||||
// We use the extracted/generated user id as the key for this document.
|
// We use the extracted/generated user id as the key for this document.
|
||||||
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
||||||
|
|
||||||
@ -296,7 +317,12 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
Some(flattened_obkv) => {
|
Some(flattened_obkv) => {
|
||||||
document_sorter_buffer.clear();
|
document_sorter_buffer.clear();
|
||||||
document_sorter_buffer.push(Operation::Addition as u8);
|
document_sorter_buffer.push(Operation::Addition as u8);
|
||||||
document_sorter_buffer.extend_from_slice(&flattened_obkv);
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&flattened_obkv),
|
||||||
|
false,
|
||||||
|
true,
|
||||||
|
&mut document_sorter_buffer,
|
||||||
|
)?;
|
||||||
self.flattened_sorter
|
self.flattened_sorter
|
||||||
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
|
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
|
||||||
}
|
}
|
||||||
@ -354,19 +380,25 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
|
||||||
|
|
||||||
let mut documents_deleted = 0;
|
let mut documents_deleted = 0;
|
||||||
|
let mut document_sorter_buffer = Vec::new();
|
||||||
for to_remove in to_remove {
|
for to_remove in to_remove {
|
||||||
if should_abort() {
|
if should_abort() {
|
||||||
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
return Err(Error::InternalError(InternalError::AbortedIndexation));
|
||||||
}
|
}
|
||||||
|
|
||||||
match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
|
// Check if the document has been added in the current indexing process.
|
||||||
|
let deleted_from_current = match self
|
||||||
|
.new_external_documents_ids_builder
|
||||||
|
.entry((*to_remove).into())
|
||||||
|
{
|
||||||
// if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
|
// if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
|
||||||
Entry::Occupied(entry) => {
|
Entry::Occupied(entry) => {
|
||||||
let doc_id = *entry.get() as u32;
|
let doc_id = *entry.get() as u32;
|
||||||
self.original_sorter
|
document_sorter_buffer.clear();
|
||||||
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
|
document_sorter_buffer.push(Operation::Deletion as u8);
|
||||||
self.flattened_sorter
|
obkv::KvWriterU16::new(&mut document_sorter_buffer).finish().unwrap();
|
||||||
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
|
self.original_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
|
||||||
|
self.flattened_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
|
||||||
|
|
||||||
// we must NOT update the list of replaced_documents_ids
|
// we must NOT update the list of replaced_documents_ids
|
||||||
// Either:
|
// Either:
|
||||||
@ -375,21 +407,69 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
// we're removing it there is nothing to do.
|
// we're removing it there is nothing to do.
|
||||||
self.new_documents_ids.remove(doc_id);
|
self.new_documents_ids.remove(doc_id);
|
||||||
entry.remove_entry();
|
entry.remove_entry();
|
||||||
|
true
|
||||||
}
|
}
|
||||||
Entry::Vacant(entry) => {
|
Entry::Vacant(_) => false,
|
||||||
// If the document was already in the db we mark it as a `to_delete` document.
|
|
||||||
// It'll be deleted later. We don't need to push anything to the sorters.
|
|
||||||
if let Some(docid) = external_documents_ids.get(entry.key()) {
|
|
||||||
self.replaced_documents_ids.insert(docid);
|
|
||||||
} else {
|
|
||||||
// if the document is nowehere to be found, there is nothing to do and we must NOT
|
|
||||||
// increment the count of documents_deleted
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
documents_deleted += 1;
|
// If the document was already in the db we mark it as a `to_delete` document.
|
||||||
|
// Then we push the document in sorters in deletion mode.
|
||||||
|
let deleted_from_db = match external_documents_ids.get(&to_remove) {
|
||||||
|
Some(docid) => {
|
||||||
|
self.replaced_documents_ids.insert(docid);
|
||||||
|
|
||||||
|
// fetch the obkv document
|
||||||
|
let original_key = BEU32::new(docid);
|
||||||
|
let base_obkv = self
|
||||||
|
.index
|
||||||
|
.documents
|
||||||
|
.remap_data_type::<heed::types::ByteSlice>()
|
||||||
|
.get(wtxn, &original_key)?
|
||||||
|
.ok_or(InternalError::DatabaseMissingEntry {
|
||||||
|
db_name: db_name::DOCUMENTS,
|
||||||
|
key: None,
|
||||||
|
})?;
|
||||||
|
|
||||||
|
// push it as to delete in the original_sorter
|
||||||
|
document_sorter_buffer.clear();
|
||||||
|
document_sorter_buffer.push(Operation::Deletion as u8);
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(base_obkv),
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
&mut document_sorter_buffer,
|
||||||
|
)?;
|
||||||
|
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
||||||
|
|
||||||
|
// flatten it and push it as to delete in the flattened_sorter
|
||||||
|
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
|
||||||
|
Some(flattened_obkv) => {
|
||||||
|
// we recreate our buffer with the flattened documents
|
||||||
|
document_sorter_buffer.clear();
|
||||||
|
document_sorter_buffer.push(Operation::Deletion as u8);
|
||||||
|
into_del_add_obkv(
|
||||||
|
KvReaderU16::new(&flattened_obkv),
|
||||||
|
true,
|
||||||
|
false,
|
||||||
|
&mut document_sorter_buffer,
|
||||||
|
)?;
|
||||||
|
self.flattened_sorter
|
||||||
|
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
|
||||||
|
}
|
||||||
|
None => self
|
||||||
|
.flattened_sorter
|
||||||
|
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
|
||||||
|
}
|
||||||
|
|
||||||
|
true
|
||||||
|
}
|
||||||
|
None => false,
|
||||||
|
};
|
||||||
|
|
||||||
|
// increase counter only if the document existed somewhere before.
|
||||||
|
if deleted_from_current || deleted_from_db {
|
||||||
|
documents_deleted += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(documents_deleted)
|
Ok(documents_deleted)
|
||||||
@ -589,9 +669,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
let mut documents_count = 0;
|
let mut documents_count = 0;
|
||||||
|
|
||||||
while let Some((key, val)) = iter.next()? {
|
while let Some((key, val)) = iter.next()? {
|
||||||
if val[0] == Operation::Deletion as u8 {
|
// skip first byte corresponding to the operation type (Deletion or Addition).
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let val = &val[1..];
|
let val = &val[1..];
|
||||||
|
|
||||||
// send a callback to show at which step we are
|
// send a callback to show at which step we are
|
||||||
@ -631,9 +709,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
// We get rids of the `Operation` byte and skip the deleted documents as well.
|
// We get rids of the `Operation` byte and skip the deleted documents as well.
|
||||||
let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
|
let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
|
||||||
while let Some((key, val)) = iter.next()? {
|
while let Some((key, val)) = iter.next()? {
|
||||||
if val[0] == Operation::Deletion as u8 {
|
// skip first byte corresponding to the operation type (Deletion or Addition).
|
||||||
continue;
|
|
||||||
}
|
|
||||||
let val = &val[1..];
|
let val = &val[1..];
|
||||||
writer.insert(key, val)?;
|
writer.insert(key, val)?;
|
||||||
}
|
}
|
||||||
@ -711,6 +787,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
);
|
);
|
||||||
|
|
||||||
let mut obkv_buffer = Vec::new();
|
let mut obkv_buffer = Vec::new();
|
||||||
|
let mut document_sorter_buffer = Vec::new();
|
||||||
for result in self.index.all_documents(wtxn)? {
|
for result in self.index.all_documents(wtxn)? {
|
||||||
let (docid, obkv) = result?;
|
let (docid, obkv) = result?;
|
||||||
|
|
||||||
@ -725,7 +802,9 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let buffer = obkv_writer.into_inner()?;
|
let buffer = obkv_writer.into_inner()?;
|
||||||
original_writer.insert(docid.to_be_bytes(), &buffer)?;
|
document_sorter_buffer.clear();
|
||||||
|
into_del_add_obkv(KvReaderU16::new(buffer), false, true, &mut document_sorter_buffer)?;
|
||||||
|
original_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
||||||
|
|
||||||
// Once we have the document. We're going to flatten it
|
// Once we have the document. We're going to flatten it
|
||||||
// and insert it in the flattened sorter.
|
// and insert it in the flattened sorter.
|
||||||
@ -760,7 +839,9 @@ impl<'a, 'i> Transform<'a, 'i> {
|
|||||||
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
|
||||||
writer.insert(fid, &value)?;
|
writer.insert(fid, &value)?;
|
||||||
}
|
}
|
||||||
flattened_writer.insert(docid.to_be_bytes(), &buffer)?;
|
document_sorter_buffer.clear();
|
||||||
|
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut document_sorter_buffer)?;
|
||||||
|
flattened_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Once we have written all the documents, we extract
|
// Once we have written all the documents, we extract
|
||||||
@ -824,38 +905,86 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn merge_obkvs() {
|
fn merge_obkvs() {
|
||||||
let mut doc_0 = Vec::new();
|
let mut additive_doc_0 = Vec::new();
|
||||||
let mut kv_writer = KvWriter::new(&mut doc_0);
|
let mut deletive_doc_0 = Vec::new();
|
||||||
|
let mut del_add_doc_0 = Vec::new();
|
||||||
|
let mut kv_writer = KvWriter::memory();
|
||||||
kv_writer.insert(0_u8, [0]).unwrap();
|
kv_writer.insert(0_u8, [0]).unwrap();
|
||||||
kv_writer.finish().unwrap();
|
let buffer = kv_writer.into_inner().unwrap();
|
||||||
doc_0.insert(0, Operation::Addition as u8);
|
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0).unwrap();
|
||||||
|
additive_doc_0.insert(0, Operation::Addition as u8);
|
||||||
|
into_del_add_obkv(KvReaderU16::new(&buffer), true, false, &mut deletive_doc_0).unwrap();
|
||||||
|
deletive_doc_0.insert(0, Operation::Deletion as u8);
|
||||||
|
into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut del_add_doc_0).unwrap();
|
||||||
|
del_add_doc_0.insert(0, Operation::Addition as u8);
|
||||||
|
|
||||||
let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap();
|
let mut additive_doc_1 = Vec::new();
|
||||||
assert_eq!(*ret, doc_0);
|
let mut kv_writer = KvWriter::memory();
|
||||||
|
kv_writer.insert(1_u8, [1]).unwrap();
|
||||||
|
let buffer = kv_writer.into_inner().unwrap();
|
||||||
|
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_1).unwrap();
|
||||||
|
additive_doc_1.insert(0, Operation::Addition as u8);
|
||||||
|
|
||||||
let ret = merge_obkvs_and_operations(
|
let mut additive_doc_0_1 = Vec::new();
|
||||||
|
let mut kv_writer = KvWriter::memory();
|
||||||
|
kv_writer.insert(0_u8, [0]).unwrap();
|
||||||
|
kv_writer.insert(1_u8, [1]).unwrap();
|
||||||
|
let buffer = kv_writer.into_inner().unwrap();
|
||||||
|
into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0_1).unwrap();
|
||||||
|
additive_doc_0_1.insert(0, Operation::Addition as u8);
|
||||||
|
|
||||||
|
let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(*ret, additive_doc_0);
|
||||||
|
|
||||||
|
let ret = obkvs_merge_additions_and_deletions(
|
||||||
&[],
|
&[],
|
||||||
&[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())],
|
&[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(*ret, doc_0);
|
assert_eq!(*ret, del_add_doc_0);
|
||||||
|
|
||||||
let ret = merge_obkvs_and_operations(
|
let ret = obkvs_merge_additions_and_deletions(
|
||||||
&[],
|
&[],
|
||||||
&[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())],
|
&[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())],
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(*ret, [Operation::Deletion as u8]);
|
assert_eq!(*ret, deletive_doc_0);
|
||||||
|
|
||||||
let ret = merge_obkvs_and_operations(
|
let ret = obkvs_merge_additions_and_deletions(
|
||||||
&[],
|
&[],
|
||||||
&[
|
&[
|
||||||
Cow::from([Operation::Addition as u8, 1].as_slice()),
|
Cow::from(additive_doc_1.as_slice()),
|
||||||
Cow::from([Operation::Deletion as u8].as_slice()),
|
Cow::from(deletive_doc_0.as_slice()),
|
||||||
Cow::from(doc_0.as_slice()),
|
Cow::from(additive_doc_0.as_slice()),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(*ret, doc_0);
|
assert_eq!(*ret, del_add_doc_0);
|
||||||
|
|
||||||
|
let ret = obkvs_merge_additions_and_deletions(
|
||||||
|
&[],
|
||||||
|
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(*ret, additive_doc_0_1);
|
||||||
|
|
||||||
|
let ret = obkvs_keep_last_addition_merge_deletions(
|
||||||
|
&[],
|
||||||
|
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(*ret, additive_doc_0);
|
||||||
|
|
||||||
|
let ret = obkvs_keep_last_addition_merge_deletions(
|
||||||
|
&[],
|
||||||
|
&[
|
||||||
|
Cow::from(deletive_doc_0.as_slice()),
|
||||||
|
Cow::from(additive_doc_1.as_slice()),
|
||||||
|
Cow::from(additive_doc_0.as_slice()),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(*ret, del_add_doc_0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -32,9 +32,9 @@ pub(crate) enum TypedChunk {
|
|||||||
WordDocids {
|
WordDocids {
|
||||||
word_docids_reader: grenad::Reader<File>,
|
word_docids_reader: grenad::Reader<File>,
|
||||||
exact_word_docids_reader: grenad::Reader<File>,
|
exact_word_docids_reader: grenad::Reader<File>,
|
||||||
|
word_fid_docids_reader: grenad::Reader<File>,
|
||||||
},
|
},
|
||||||
WordPositionDocids(grenad::Reader<File>),
|
WordPositionDocids(grenad::Reader<File>),
|
||||||
WordFidDocids(grenad::Reader<File>),
|
|
||||||
WordPairProximityDocids(grenad::Reader<File>),
|
WordPairProximityDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetStringDocids(grenad::Reader<File>),
|
FieldIdFacetStringDocids(grenad::Reader<File>),
|
||||||
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
FieldIdFacetNumberDocids(grenad::Reader<File>),
|
||||||
@ -43,7 +43,7 @@ pub(crate) enum TypedChunk {
|
|||||||
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
|
FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
|
||||||
GeoPoints(grenad::Reader<File>),
|
GeoPoints(grenad::Reader<File>),
|
||||||
VectorPoints(grenad::Reader<File>),
|
VectorPoints(grenad::Reader<File>),
|
||||||
ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
|
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TypedChunk {
|
impl TypedChunk {
|
||||||
@ -64,17 +64,19 @@ impl TypedChunk {
|
|||||||
TypedChunk::NewDocumentsIds(grenad) => {
|
TypedChunk::NewDocumentsIds(grenad) => {
|
||||||
format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
|
format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!(
|
TypedChunk::WordDocids {
|
||||||
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}",
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
|
} => format!(
|
||||||
|
"WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}",
|
||||||
word_docids_reader.len(),
|
word_docids_reader.len(),
|
||||||
exact_word_docids_reader.len()
|
exact_word_docids_reader.len(),
|
||||||
|
word_fid_docids_reader.len()
|
||||||
),
|
),
|
||||||
TypedChunk::WordPositionDocids(grenad) => {
|
TypedChunk::WordPositionDocids(grenad) => {
|
||||||
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
|
format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::WordFidDocids(grenad) => {
|
|
||||||
format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len())
|
|
||||||
}
|
|
||||||
TypedChunk::WordPairProximityDocids(grenad) => {
|
TypedChunk::WordPairProximityDocids(grenad) => {
|
||||||
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
|
format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
@ -99,8 +101,8 @@ impl TypedChunk {
|
|||||||
TypedChunk::VectorPoints(grenad) => {
|
TypedChunk::VectorPoints(grenad) => {
|
||||||
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
|
||||||
}
|
}
|
||||||
TypedChunk::ScriptLanguageDocids(grenad) => {
|
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||||
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len())
|
format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -138,7 +140,11 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
TypedChunk::NewDocumentsIds(documents_ids) => {
|
TypedChunk::NewDocumentsIds(documents_ids) => {
|
||||||
return Ok((documents_ids, is_merged_database))
|
return Ok((documents_ids, is_merged_database))
|
||||||
}
|
}
|
||||||
TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
|
TypedChunk::WordDocids {
|
||||||
|
word_docids_reader,
|
||||||
|
exact_word_docids_reader,
|
||||||
|
word_fid_docids_reader,
|
||||||
|
} => {
|
||||||
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
|
||||||
append_entries_into_database(
|
append_entries_into_database(
|
||||||
word_docids_iter.clone(),
|
word_docids_iter.clone(),
|
||||||
@ -146,7 +152,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
|value, _buffer| Ok(value),
|
|value, _buffer| Ok(value),
|
||||||
merge_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
|
||||||
@ -156,7 +162,17 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
wtxn,
|
wtxn,
|
||||||
index_is_empty,
|
index_is_empty,
|
||||||
|value, _buffer| Ok(value),
|
|value, _buffer| Ok(value),
|
||||||
merge_roaring_bitmaps,
|
merge_cbo_roaring_bitmaps,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
|
||||||
|
append_entries_into_database(
|
||||||
|
word_fid_docids_iter,
|
||||||
|
&index.word_fid_docids,
|
||||||
|
wtxn,
|
||||||
|
index_is_empty,
|
||||||
|
|value, _buffer| Ok(value),
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
// create fst from word docids
|
// create fst from word docids
|
||||||
@ -182,17 +198,6 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
)?;
|
)?;
|
||||||
is_merged_database = true;
|
is_merged_database = true;
|
||||||
}
|
}
|
||||||
TypedChunk::WordFidDocids(word_fid_docids_iter) => {
|
|
||||||
append_entries_into_database(
|
|
||||||
word_fid_docids_iter,
|
|
||||||
&index.word_fid_docids,
|
|
||||||
wtxn,
|
|
||||||
index_is_empty,
|
|
||||||
|value, _buffer| Ok(value),
|
|
||||||
merge_cbo_roaring_bitmaps,
|
|
||||||
)?;
|
|
||||||
is_merged_database = true;
|
|
||||||
}
|
|
||||||
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
|
TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
|
||||||
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
|
let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
|
||||||
indexer.execute(wtxn)?;
|
indexer.execute(wtxn)?;
|
||||||
@ -339,22 +344,25 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
|
log::debug!("There are {} entries in the HNSW so far", hnsw_length);
|
||||||
index.put_vector_hnsw(wtxn, &new_hnsw)?;
|
index.put_vector_hnsw(wtxn, &new_hnsw)?;
|
||||||
}
|
}
|
||||||
TypedChunk::ScriptLanguageDocids(hash_pair) => {
|
TypedChunk::ScriptLanguageDocids(sl_map) => {
|
||||||
let mut buffer = Vec::new();
|
for (key, (deletion, addition)) in sl_map {
|
||||||
for (key, value) in hash_pair {
|
let mut db_key_exists = false;
|
||||||
buffer.clear();
|
|
||||||
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
let final_value = match index.script_language_docids.get(wtxn, &key)? {
|
||||||
Some(db_values) => {
|
Some(db_values) => {
|
||||||
let mut db_value_buffer = Vec::new();
|
db_key_exists = true;
|
||||||
serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
|
(db_values - deletion) | addition
|
||||||
let mut new_value_buffer = Vec::new();
|
|
||||||
serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
|
|
||||||
merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
|
|
||||||
RoaringBitmap::deserialize_from(&buffer[..])?
|
|
||||||
}
|
}
|
||||||
None => value,
|
None => addition,
|
||||||
};
|
};
|
||||||
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
|
||||||
|
if final_value.is_empty() {
|
||||||
|
// If the database entry exists, delete it.
|
||||||
|
if db_key_exists == true {
|
||||||
|
index.script_language_docids.delete(wtxn, &key)?;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
index.script_language_docids.put(wtxn, &key, &final_value)?;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -379,13 +387,6 @@ fn merge_word_docids_reader_into_fst(
|
|||||||
Ok(builder.into_set())
|
Ok(builder.into_set())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
|
|
||||||
let new_value = RoaringBitmap::deserialize_from(new_value)?;
|
|
||||||
let db_value = RoaringBitmap::deserialize_from(db_value)?;
|
|
||||||
let value = new_value | db_value;
|
|
||||||
Ok(serialize_roaring_bitmap(&value, buffer)?)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn merge_cbo_roaring_bitmaps(
|
fn merge_cbo_roaring_bitmaps(
|
||||||
new_value: &[u8],
|
new_value: &[u8],
|
||||||
db_value: &[u8],
|
db_value: &[u8],
|
||||||
@ -455,6 +456,7 @@ where
|
|||||||
R: io::Read + io::Seek,
|
R: io::Read + io::Seek,
|
||||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||||
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
|
||||||
|
K: for<'a> heed::BytesDecode<'a>,
|
||||||
{
|
{
|
||||||
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
puffin::profile_function!(format!("number of entries: {}", data.len()));
|
||||||
|
|
||||||
@ -475,6 +477,12 @@ where
|
|||||||
let mut cursor = data.into_cursor()?;
|
let mut cursor = data.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
if valid_lmdb_key(key) {
|
if valid_lmdb_key(key) {
|
||||||
|
debug_assert!(
|
||||||
|
K::bytes_decode(&key).is_some(),
|
||||||
|
"Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
|
||||||
|
key.len(),
|
||||||
|
&key
|
||||||
|
);
|
||||||
buffer.clear();
|
buffer.clear();
|
||||||
let value = serialize_value(value, &mut buffer)?;
|
let value = serialize_value(value, &mut buffer)?;
|
||||||
unsafe { database.append(key, value)? };
|
unsafe { database.append(key, value)? };
|
||||||
|
@ -21,6 +21,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst;
|
|||||||
|
|
||||||
mod available_documents_ids;
|
mod available_documents_ids;
|
||||||
mod clear_documents;
|
mod clear_documents;
|
||||||
|
pub(crate) mod del_add;
|
||||||
mod delete_documents;
|
mod delete_documents;
|
||||||
pub(crate) mod facet;
|
pub(crate) mod facet;
|
||||||
mod index_documents;
|
mod index_documents;
|
||||||
|
@ -5,15 +5,15 @@ use heed::types::{ByteSlice, Str};
|
|||||||
use heed::Database;
|
use heed::Database;
|
||||||
|
|
||||||
use crate::update::index_documents::{
|
use crate::update::index_documents::{
|
||||||
create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
|
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
|
||||||
CursorClonableMmap, MergeFn,
|
CursorClonableMmap, MergeFn,
|
||||||
};
|
};
|
||||||
use crate::{Result, RoaringBitmapCodec};
|
use crate::{CboRoaringBitmapCodec, Result};
|
||||||
|
|
||||||
pub struct WordPrefixDocids<'t, 'u, 'i> {
|
pub struct WordPrefixDocids<'t, 'u, 'i> {
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
word_docids: Database<Str, RoaringBitmapCodec>,
|
word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||||
word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||||
pub(crate) chunk_compression_type: CompressionType,
|
pub(crate) chunk_compression_type: CompressionType,
|
||||||
pub(crate) chunk_compression_level: Option<u32>,
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
pub(crate) max_nb_chunks: Option<usize>,
|
pub(crate) max_nb_chunks: Option<usize>,
|
||||||
@ -23,8 +23,8 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
|
|||||||
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
word_docids: Database<Str, RoaringBitmapCodec>,
|
word_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||||
word_prefix_docids: Database<Str, RoaringBitmapCodec>,
|
word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
|
||||||
) -> WordPrefixDocids<'t, 'u, 'i> {
|
) -> WordPrefixDocids<'t, 'u, 'i> {
|
||||||
WordPrefixDocids {
|
WordPrefixDocids {
|
||||||
wtxn,
|
wtxn,
|
||||||
@ -40,6 +40,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
|||||||
#[logging_timer::time("WordPrefixDocids::{}")]
|
#[logging_timer::time("WordPrefixDocids::{}")]
|
||||||
pub fn execute(
|
pub fn execute(
|
||||||
self,
|
self,
|
||||||
|
// TODO grenad::Reader<onkv::Reader<Word, obkv::Reader<DelAdd, CboRoaringBitmap>>>
|
||||||
mut new_word_docids_iter: grenad::ReaderCursor<CursorClonableMmap>,
|
mut new_word_docids_iter: grenad::ReaderCursor<CursorClonableMmap>,
|
||||||
new_prefix_fst_words: &[String],
|
new_prefix_fst_words: &[String],
|
||||||
common_prefix_fst_words: &[&[String]],
|
common_prefix_fst_words: &[&[String]],
|
||||||
@ -51,7 +52,8 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
|||||||
// and write into it at the same time, therefore we write into another file.
|
// and write into it at the same time, therefore we write into another file.
|
||||||
let mut prefix_docids_sorter = create_sorter(
|
let mut prefix_docids_sorter = create_sorter(
|
||||||
grenad::SortAlgorithm::Unstable,
|
grenad::SortAlgorithm::Unstable,
|
||||||
merge_roaring_bitmaps,
|
// TODO change to merge_deladd_cbo_roaring_bitmaps
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
self.chunk_compression_type,
|
self.chunk_compression_type,
|
||||||
self.chunk_compression_level,
|
self.chunk_compression_level,
|
||||||
self.max_nb_chunks,
|
self.max_nb_chunks,
|
||||||
@ -96,6 +98,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
|||||||
let prefix = std::str::from_utf8(prefix.as_bytes())?;
|
let prefix = std::str::from_utf8(prefix.as_bytes())?;
|
||||||
for result in db.prefix_iter(self.wtxn, prefix)? {
|
for result in db.prefix_iter(self.wtxn, prefix)? {
|
||||||
let (_word, data) = result?;
|
let (_word, data) = result?;
|
||||||
|
// TODO fake a DelAdd -> Add(`data`)
|
||||||
prefix_docids_sorter.insert(prefix, data)?;
|
prefix_docids_sorter.insert(prefix, data)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -111,11 +114,14 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
|||||||
drop(iter);
|
drop(iter);
|
||||||
|
|
||||||
// We finally write the word prefix docids into the LMDB database.
|
// We finally write the word prefix docids into the LMDB database.
|
||||||
|
// TODO introduce a new function that is similar to `append_entries_into_database`
|
||||||
|
// and accepts the `merge_deladd_cbo_roaring_bitmaps` function
|
||||||
sorter_into_lmdb_database(
|
sorter_into_lmdb_database(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
*self.word_prefix_docids.as_polymorph(),
|
*self.word_prefix_docids.as_polymorph(),
|
||||||
prefix_docids_sorter,
|
prefix_docids_sorter,
|
||||||
merge_roaring_bitmaps,
|
// TODO change to `merge_deladd_cbo_roaring_bitmaps`
|
||||||
|
merge_cbo_roaring_bitmaps,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -127,6 +133,7 @@ fn write_prefixes_in_sorter(
|
|||||||
sorter: &mut grenad::Sorter<MergeFn>,
|
sorter: &mut grenad::Sorter<MergeFn>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
for (key, data_slices) in prefixes.drain() {
|
for (key, data_slices) in prefixes.drain() {
|
||||||
|
// TODO merge keys before inserting them in the sorter
|
||||||
for data in data_slices {
|
for data in data_slices {
|
||||||
if valid_lmdb_key(&key) {
|
if valid_lmdb_key(&key) {
|
||||||
sorter.insert(&key, data)?;
|
sorter.insert(&key, data)?;
|
||||||
|
Reference in New Issue
Block a user