mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-29 01:31:00 +00:00
Merge branch 'main' into indexer-edition-2024
This commit is contained in:
@ -0,0 +1,24 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use memmap2::Mmap;
|
||||
|
||||
/// Wrapper around Mmap allowing to virtually clone grenad-chunks
|
||||
/// in a parallel process like the indexing.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ClonableMmap {
|
||||
inner: Arc<Mmap>,
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for ClonableMmap {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
self.inner.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Mmap> for ClonableMmap {
|
||||
fn from(inner: Mmap) -> ClonableMmap {
|
||||
ClonableMmap { inner: Arc::new(inner) }
|
||||
}
|
||||
}
|
||||
|
||||
/// An in-memory cursor reading from a [`ClonableMmap`].
pub type CursorClonableMmap = std::io::Cursor<ClonableMmap>;
|
@ -0,0 +1,217 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter, Seek};
|
||||
|
||||
use grenad::{CompressionType, MergeFunction, Sorter};
|
||||
use heed::types::Bytes;
|
||||
|
||||
use super::ClonableMmap;
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::Result;
|
||||
|
||||
/// Upper bound, in bytes, on the memory a single grenad sorter may use.
///
/// This is something reasonable given the fact that there is one grenad
/// sorter by thread.
const MAX_GRENAD_SORTER_USAGE: usize = 500 * (1 << 20); // 500 MiB
|
||||
|
||||
/// An in-memory cursor reading from a [`ClonableMmap`].
pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
|
||||
|
||||
/// Builds a grenad writer that writes into `file` through a [`BufWriter`].
///
/// `typ` selects the compression type; `level` is only forwarded to the
/// builder when it is `Some`.
pub fn create_writer<R: io::Write>(
    typ: grenad::CompressionType,
    level: Option<u32>,
    file: R,
) -> grenad::Writer<BufWriter<R>> {
    let mut writer_builder = grenad::Writer::builder();
    writer_builder.compression_type(typ);
    if let Some(compression_level) = level {
        writer_builder.compression_level(compression_level);
    }

    let buffered_file = BufWriter::new(file);
    writer_builder.build(buffered_file)
}
|
||||
|
||||
/// A helper function that creates a grenad sorter
|
||||
/// with the given parameters. The max memory is
|
||||
/// clamped to something reasonable.
|
||||
pub fn create_sorter<MF: MergeFunction>(
|
||||
sort_algorithm: grenad::SortAlgorithm,
|
||||
merge: MF,
|
||||
chunk_compression_type: grenad::CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
max_nb_chunks: Option<usize>,
|
||||
max_memory: Option<usize>,
|
||||
sort_in_parallel: bool,
|
||||
) -> grenad::Sorter<MF> {
|
||||
let mut builder = grenad::Sorter::builder(merge);
|
||||
builder.chunk_compression_type(chunk_compression_type);
|
||||
if let Some(level) = chunk_compression_level {
|
||||
builder.chunk_compression_level(level);
|
||||
}
|
||||
if let Some(nb_chunks) = max_nb_chunks {
|
||||
builder.max_nb_chunks(nb_chunks);
|
||||
}
|
||||
if let Some(memory) = max_memory {
|
||||
builder.dump_threshold(memory.min(MAX_GRENAD_SORTER_USAGE));
|
||||
builder.allow_realloc(false);
|
||||
}
|
||||
builder.sort_algorithm(sort_algorithm);
|
||||
builder.sort_in_parallel(sort_in_parallel);
|
||||
builder.build()
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
||||
pub fn sorter_into_reader<MF>(
|
||||
sorter: grenad::Sorter<MF>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<BufReader<File>>>
|
||||
where
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
sorter.write_into_stream_writer(&mut writer)?;
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
pub fn writer_into_reader(
|
||||
writer: grenad::Writer<BufWriter<File>>,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
|
||||
file.rewind()?;
|
||||
grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
|
||||
}
|
||||
|
||||
/// # Safety
|
||||
/// We use memory mapping inside. So, according to the Rust community, it's unsafe.
|
||||
pub unsafe fn as_cloneable_grenad(
|
||||
reader: &grenad::Reader<BufReader<File>>,
|
||||
) -> Result<grenad::Reader<CursorClonableMmap>> {
|
||||
let file = reader.get_ref().get_ref();
|
||||
let mmap = memmap2::Mmap::map(file)?;
|
||||
let cursor = io::Cursor::new(ClonableMmap::from(mmap));
|
||||
let reader = grenad::Reader::new(cursor)?;
|
||||
Ok(reader)
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct GrenadParameters {
|
||||
pub chunk_compression_type: CompressionType,
|
||||
pub chunk_compression_level: Option<u32>,
|
||||
pub max_memory: Option<usize>,
|
||||
pub max_nb_chunks: Option<usize>,
|
||||
}
|
||||
|
||||
impl Default for GrenadParameters {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
chunk_compression_type: CompressionType::None,
|
||||
chunk_compression_level: None,
|
||||
max_memory: None,
|
||||
max_nb_chunks: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl GrenadParameters {
|
||||
/// This function use the number of threads in the current threadpool to compute the value.
|
||||
///
|
||||
/// This should be called inside of a rayon thread pool,
|
||||
/// otherwise, it will take the global number of threads.
|
||||
///
|
||||
/// The max memory cannot exceed a given reasonable value.
|
||||
pub fn max_memory_by_thread(&self) -> Option<usize> {
|
||||
self.max_memory.map(|max_memory| {
|
||||
(max_memory / rayon::current_num_threads()).min(MAX_GRENAD_SORTER_USAGE)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator that outputs grenad readers of obkv documents
|
||||
/// with a maximum size of approximately `documents_chunks_size`.
|
||||
///
|
||||
/// The grenad obkv entries are composed of an incremental document id big-endian
|
||||
/// encoded as the key and an obkv object with an `u8` for the field as the key
|
||||
/// and a simple UTF-8 encoded string as the value.
|
||||
pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
||||
reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
documents_chunk_size: usize,
|
||||
) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
|
||||
let mut continue_reading = true;
|
||||
let mut cursor = reader.into_cursor()?;
|
||||
|
||||
let mut transposer = move || {
|
||||
if !continue_reading {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
let mut current_chunk_size = 0u64;
|
||||
let mut obkv_documents = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
while let Some((document_id, obkv)) = cursor.move_on_next()? {
|
||||
if !obkv.is_empty() {
|
||||
obkv_documents.insert(document_id, obkv)?;
|
||||
current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
|
||||
|
||||
if current_chunk_size >= documents_chunk_size as u64 {
|
||||
return writer_into_reader(obkv_documents).map(Some);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
continue_reading = false;
|
||||
writer_into_reader(obkv_documents).map(Some)
|
||||
};
|
||||
|
||||
Ok(std::iter::from_fn(move || transposer().transpose()))
|
||||
}
|
||||
|
||||
/// Write provided sorter in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
||||
pub fn write_sorter_into_database<K, V, FS, FM, MF>(
|
||||
sorter: Sorter<MF>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut heed::RwTxn<'_>,
|
||||
index_is_empty: bool,
|
||||
serialize_value: FS,
|
||||
merge_values: FM,
|
||||
) -> Result<()>
|
||||
where
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
|
||||
let mut merger_iter = sorter.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = merger_iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
buffer.clear();
|
||||
let value = if index_is_empty {
|
||||
Some(serialize_value(value, &mut buffer)?)
|
||||
} else {
|
||||
match database.get(wtxn, key)? {
|
||||
Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
|
||||
None => Some(serialize_value(value, &mut buffer)?),
|
||||
}
|
||||
};
|
||||
match value {
|
||||
Some(value) => database.put(wtxn, key, value)?,
|
||||
None => {
|
||||
database.delete(wtxn, key)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
@ -0,0 +1,314 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeSet;
|
||||
use std::io;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use either::Either;
|
||||
use grenad::MergeFunction;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::transform::Operation;
|
||||
use crate::Result;
|
||||
|
||||
pub type EitherObkvMerge =
|
||||
Either<ObkvsKeepLastAdditionMergeDeletions, ObkvsMergeAdditionsAndDeletions>;
|
||||
|
||||
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||
buffer.clear();
|
||||
buffer.reserve(bitmap.serialized_size());
|
||||
bitmap.serialize_into(buffer)
|
||||
}
|
||||
|
||||
pub struct MergeRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let merged = values
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.map(RoaringBitmap::deserialize_from)
|
||||
.map(StdResult::unwrap)
|
||||
.reduce(|a, b| a | b)
|
||||
.unwrap();
|
||||
let mut buffer = Vec::new();
|
||||
serialize_roaring_bitmap(&merged, &mut buffer)?;
|
||||
Ok(Cow::Owned(buffer))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct KeepFirst;
|
||||
|
||||
impl MergeFunction for KeepFirst {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(values[0].clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// Only the last value associated with an id is kept.
|
||||
pub struct KeepLatestObkv;
|
||||
|
||||
impl MergeFunction for KeepLatestObkv {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(obkvs.last().unwrap().clone())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn merge_two_del_add_obkvs(
|
||||
base: &obkv::KvReaderU16,
|
||||
update: &obkv::KvReaderU16,
|
||||
merge_additions: bool,
|
||||
buffer: &mut Vec<u8>,
|
||||
) {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
buffer.clear();
|
||||
|
||||
let mut writer = obkv::KvWriter::new(buffer);
|
||||
let mut value_buffer = Vec::new();
|
||||
for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
|
||||
match eob {
|
||||
Left((k, v)) => {
|
||||
if merge_additions {
|
||||
writer.insert(k, v).unwrap()
|
||||
} else {
|
||||
// If merge_additions is false, recreate an obkv keeping the deletions only.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::from_slice(v);
|
||||
|
||||
if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
value_writer.finish().unwrap();
|
||||
writer.insert(k, &value_buffer).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
Right((k, v)) => writer.insert(k, v).unwrap(),
|
||||
Both((k, base), (_, update)) => {
|
||||
// merge deletions and additions.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::from_slice(base);
|
||||
let update_reader = KvReaderDelAdd::from_slice(update);
|
||||
|
||||
// keep newest deletion.
|
||||
if let Some(deletion) = update_reader
|
||||
.get(DelAdd::Deletion)
|
||||
.or_else(|| base_reader.get(DelAdd::Deletion))
|
||||
{
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
}
|
||||
|
||||
// keep base addition only if merge_additions is true.
|
||||
let base_addition =
|
||||
merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
|
||||
// keep newest addition.
|
||||
// TODO use or_else
|
||||
if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
|
||||
value_writer.insert(DelAdd::Addition, addition).unwrap();
|
||||
}
|
||||
|
||||
value_writer.finish().unwrap();
|
||||
writer.insert(k, &value_buffer).unwrap()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer.finish().unwrap();
|
||||
}
|
||||
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
fn inner_merge_del_add_obkvs<'a>(
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
merge_additions: bool,
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
// pop the newest operation from the list.
|
||||
let (newest, obkvs) = obkvs.split_last().unwrap();
|
||||
// keep the operation type for the returned value.
|
||||
let newest_operation_type = newest[0];
|
||||
|
||||
// treat the newest obkv as the starting point of the merge.
|
||||
let mut acc_operation_type = newest_operation_type;
|
||||
let mut acc = newest[1..].to_vec();
|
||||
let mut buffer = Vec::new();
|
||||
// reverse iter from the most recent to the oldest.
|
||||
for current in obkvs.iter().rev() {
|
||||
// if in the previous iteration there was a complete deletion,
|
||||
// stop the merge process.
|
||||
if acc_operation_type == Operation::Deletion as u8 {
|
||||
break;
|
||||
}
|
||||
|
||||
let newest = obkv::KvReader::from_slice(&acc);
|
||||
let oldest = obkv::KvReader::from_slice(¤t[1..]);
|
||||
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
|
||||
|
||||
// we want the result of the merge into our accumulator.
|
||||
std::mem::swap(&mut acc, &mut buffer);
|
||||
acc_operation_type = current[0];
|
||||
}
|
||||
|
||||
acc.insert(0, newest_operation_type);
|
||||
Ok(Cow::from(acc))
|
||||
}
|
||||
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ObkvsMergeAdditionsAndDeletions;
|
||||
|
||||
impl MergeFunction for ObkvsMergeAdditionsAndDeletions {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, true)
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ObkvsKeepLastAdditionMergeDeletions;
|
||||
|
||||
impl MergeFunction for ObkvsKeepLastAdditionMergeDeletions {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, false)
|
||||
}
|
||||
}
|
||||
|
||||
/// Do a union of all the CboRoaringBitmaps in the values.
|
||||
pub struct MergeCboRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeCboRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let mut vec = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
|
||||
Ok(Cow::from(vec))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
|
||||
/// separately and outputs a new DelAdd with both unions.
|
||||
pub struct MergeDeladdCboRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeDeladdCboRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_bitmaps_bytes = Vec::new();
|
||||
let mut add_bitmaps_bytes = Vec::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
|
||||
del_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
|
||||
add_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let mut buffer = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A function that merges a DelAdd of bitmao into an already existing bitmap.
|
||||
///
|
||||
/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
|
||||
/// the second one is the CboRoaringBitmap to merge into.
|
||||
pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
|
||||
deladd_obkv: &[u8],
|
||||
previous: &[u8],
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> Result<Option<&'a [u8]>> {
|
||||
Ok(CboRoaringBitmapCodec::merge_deladd_into(
|
||||
KvReaderDelAdd::from_slice(deladd_obkv),
|
||||
previous,
|
||||
buffer,
|
||||
)?)
|
||||
}
|
||||
|
||||
/// Do a union of BtreeSet on both sides of a DelAdd obkv
|
||||
/// separately and outputs a new DelAdd with both unions.
|
||||
pub struct MergeDeladdBtreesetString;
|
||||
|
||||
impl MergeFunction for MergeDeladdBtreesetString {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_set = BTreeSet::new();
|
||||
let mut add_set = BTreeSet::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(bytes) = obkv.get(DelAdd::Deletion) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
del_set.insert(value);
|
||||
}
|
||||
}
|
||||
if let Some(bytes) = obkv.get(DelAdd::Addition) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
add_set.insert(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let del = serde_json::to_vec(&del_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
|
||||
let add = serde_json::to_vec(&add_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &add)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Used when trying to merge readers, but you don't actually care about the values.
|
||||
pub struct MergeIgnoreValues;
|
||||
|
||||
impl MergeFunction for MergeIgnoreValues {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(
|
||||
&self,
|
||||
_key: &[u8],
|
||||
_values: &[Cow<'a, [u8]>],
|
||||
) -> std::result::Result<Cow<'a, [u8]>, Self::Error> {
|
||||
Ok(Cow::Owned(Vec::new()))
|
||||
}
|
||||
}
|
66
crates/milli/src/update/index_documents/helpers/mod.rs
Normal file
66
crates/milli/src/update/index_documents/helpers/mod.rs
Normal file
@ -0,0 +1,66 @@
|
||||
mod clonable_mmap;
|
||||
mod grenad_helpers;
|
||||
mod merge_functions;
|
||||
|
||||
use std::collections::HashSet;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
|
||||
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
pub use grenad_helpers::*;
|
||||
pub use merge_functions::*;
|
||||
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
|
||||
pub fn valid_lmdb_key(key: impl AsRef<[u8]>) -> bool {
|
||||
key.as_ref().len() <= MAX_WORD_LENGTH * 2 && !key.as_ref().is_empty()
|
||||
}
|
||||
|
||||
/// Divides one slice into two at an index, returns `None` if mid is out of bounds.
///
/// `mid == slice.len()` is in bounds and yields an empty second half.
pub fn try_split_at<T>(slice: &[T], mid: usize) -> Option<(&[T], &[T])> {
    let in_bounds = mid <= slice.len();
    in_bounds.then(|| slice.split_at(mid))
}
|
||||
|
||||
/// Divides one slice into an array and the tail at an index,
|
||||
/// returns `None` if `N` is out of bounds.
|
||||
pub fn try_split_array_at<T, const N: usize>(slice: &[T]) -> Option<([T; N], &[T])>
|
||||
where
|
||||
[T; N]: for<'a> TryFrom<&'a [T]>,
|
||||
{
|
||||
let (head, tail) = try_split_at(slice, N)?;
|
||||
let head = head.try_into().ok()?;
|
||||
Some((head, tail))
|
||||
}
|
||||
|
||||
/// Converts an fst Stream into an HashSet of Strings.
|
||||
pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>>
|
||||
where
|
||||
I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
|
||||
S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
|
||||
{
|
||||
let mut hashset = HashSet::new();
|
||||
let mut stream = stream.into_stream();
|
||||
while let Some(value) = stream.next() {
|
||||
hashset.insert(value.to_owned());
|
||||
}
|
||||
hashset
|
||||
}
|
||||
|
||||
// Converts an fst Stream into a Vec of Strings.
|
||||
pub fn fst_stream_into_vec<'f, I, S>(stream: I) -> Vec<String>
|
||||
where
|
||||
I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>,
|
||||
S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>,
|
||||
{
|
||||
let mut strings = Vec::new();
|
||||
let mut stream = stream.into_stream();
|
||||
while let Some(word) = stream.next() {
|
||||
let s = std::str::from_utf8(word).unwrap();
|
||||
strings.push(s.to_owned());
|
||||
}
|
||||
strings
|
||||
}
|
Reference in New Issue
Block a user