format the whole project

Tamo
2021-06-16 18:33:33 +02:00
parent ba30cef987
commit 9716fb3b36
68 changed files with 3327 additions and 2336 deletions

View File

@ -1,6 +1,7 @@
use std::iter::{Chain, FromIterator};
use std::ops::RangeInclusive;
use roaring::bitmap::{RoaringBitmap, IntoIter};
use roaring::bitmap::{IntoIter, RoaringBitmap};
pub struct AvailableDocumentsIds {
iter: Chain<IntoIter, RangeInclusive<u32>>,
@ -18,16 +19,12 @@ impl AvailableDocumentsIds {
None => 1..=0, // empty range iterator
};
AvailableDocumentsIds {
iter: available.into_iter().chain(iter),
}
},
AvailableDocumentsIds { iter: available.into_iter().chain(iter) }
}
None => {
let empty = RoaringBitmap::new().into_iter();
AvailableDocumentsIds {
iter: empty.chain(0..=u32::max_value()),
}
},
AvailableDocumentsIds { iter: empty.chain(0..=u32::max_value()) }
}
}
}
}
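A side note on this first hunk: `AvailableDocumentsIds` simply iterates over every u32 that is not yet used as an internal document id, the holes below the current maximum first, then everything above it. A minimal standalone sketch of that idea (it assumes the `roaring` crate as a dependency; `free_ids` and the example values are illustrative, not part of the codebase):

```rust
use roaring::RoaringBitmap;

/// Yields every u32 that is not present in `used`: the holes below the
/// highest used id first, then the untouched range above it.
fn free_ids(used: &RoaringBitmap) -> impl Iterator<Item = u32> + '_ {
    // First id guaranteed to be above every used id (None if u32::MAX is used).
    let above = used.max().map_or(Some(0), |max| max.checked_add(1));
    let tail = match above {
        Some(n) => n..=u32::MAX,
        None => 1..=0, // empty range, like in the hunk above
    };
    let holes_end = above.unwrap_or(u32::MAX);
    (0..holes_end).filter(move |id| !used.contains(*id)).chain(tail)
}

fn main() {
    let mut used = RoaringBitmap::new();
    used.insert(0);
    used.insert(2);
    // Prints 1, 3, 4 and 5: the hole at 1 first, then the ids above the maximum.
    for id in free_ids(&used).take(4) {
        println!("{}", id);
    }
}
```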

View File

@ -1,7 +1,7 @@
use chrono::Utc;
use roaring::RoaringBitmap;
use crate::{ExternalDocumentsIds, Index, FieldsDistribution, Result};
use crate::{ExternalDocumentsIds, FieldsDistribution, Index, Result};
pub struct ClearDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -13,9 +13,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64
update_id: u64,
) -> ClearDocuments<'t, 'u, 'i> {
ClearDocuments { wtxn, index, _update_id: update_id }
}
@ -80,8 +79,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
mod tests {
use heed::EnvOpenOptions;
use crate::update::{IndexDocuments, UpdateFormat};
use super::*;
use crate::update::{IndexDocuments, UpdateFormat};
#[test]
fn clear_documents() {

View File

@ -1,5 +1,5 @@
use std::collections::HashMap;
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use chrono::Utc;
use fst::IntoStreamer;
@ -7,11 +7,11 @@ use heed::types::{ByteSlice, Unit};
use roaring::RoaringBitmap;
use serde_json::Value;
use crate::error::{InternalError, FieldIdMapMissingEntry, UserError};
use super::ClearDocuments;
use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::{db_name, main_key};
use crate::{Index, DocumentId, FieldId, BEU32, SmallString32, ExternalDocumentsIds, Result};
use super::ClearDocuments;
use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
pub struct DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -26,11 +26,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> Result<DeleteDocuments<'t, 'u, 'i>>
{
let external_documents_ids = index
.external_documents_ids(wtxn)?
.into_static();
) -> Result<DeleteDocuments<'t, 'u, 'i>> {
let external_documents_ids = index.external_documents_ids(wtxn)?.into_static();
Ok(DeleteDocuments {
wtxn,
@ -84,12 +81,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
key: Some(main_key::PRIMARY_KEY_KEY),
}
})?;
let id_field = fields_ids_map.id(primary_key).ok_or_else(|| {
FieldIdMapMissingEntry::FieldName {
let id_field =
fields_ids_map.id(primary_key).ok_or_else(|| FieldIdMapMissingEntry::FieldName {
field_name: primary_key.to_string(),
process: "DeleteDocuments::execute",
}
})?;
})?;
let Index {
env: _env,
@ -130,7 +126,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
let external_id = match serde_json::from_slice(content).unwrap() {
Value::String(string) => SmallString32::from(string.as_str()),
Value::Number(number) => SmallString32::from(number.to_string()),
document_id => return Err(UserError::InvalidDocumentId { document_id }.into()),
document_id => {
return Err(UserError::InvalidDocumentId { document_id }.into())
}
};
external_ids.push(external_id);
}
@ -160,7 +158,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) {
match entry.get().checked_sub(count_diff) {
Some(0) | None => entry.remove(),
Some(count) => entry.insert(count)
Some(count) => entry.insert(count),
};
}
}
@ -206,9 +204,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
}
// We construct an FST set that contains the words to delete from the words FST.
let words_to_delete = words.iter().filter_map(|(word, must_remove)| {
if *must_remove { Some(word.as_ref()) } else { None }
});
let words_to_delete =
words.iter().filter_map(
|(word, must_remove)| {
if *must_remove {
Some(word.as_ref())
} else {
None
}
},
);
let words_to_delete = fst::Set::from_iter(words_to_delete)?;
let new_words_fst = {
@ -285,7 +290,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
// We delete the documents ids that are under the pairs of words,
// it is faster and uses no memory to iterate over all the words pairs than
// to compute the cartesian product of every word of the deleted documents.
let mut iter = word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
let mut iter =
word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
@ -300,7 +306,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// We delete the documents ids that are under the word level position docids.
let mut iter = word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
let mut iter =
word_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
@ -315,7 +322,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
drop(iter);
// We delete the documents ids that are under the word prefix level position docids.
let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
let mut iter =
word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
while let Some(result) = iter.next() {
let (bytes, mut docids) = result?;
let previous_len = docids.len();
@ -397,12 +405,11 @@ fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>(
convert: F,
) -> heed::Result<()>
where
C: heed::BytesDecode<'a, DItem=K> + heed::BytesEncode<'a, EItem=K>,
C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>,
F: Fn(K) -> DocumentId,
{
let mut iter = db.remap_key_type::<ByteSlice>()
.prefix_iter_mut(wtxn, &[field_id])?
.remap_key_type::<C>();
let mut iter =
db.remap_key_type::<ByteSlice>().prefix_iter_mut(wtxn, &[field_id])?.remap_key_type::<C>();
while let Some(result) = iter.next() {
let (key, ()) = result?;
@ -441,8 +448,8 @@ where
mod tests {
use heed::EnvOpenOptions;
use crate::update::{IndexDocuments, UpdateFormat};
use super::*;
use crate::update::{IndexDocuments, UpdateFormat};
#[test]
fn delete_documents_with_numbers_as_primary_key() {

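Beyond the formatting churn in this file, the recurring pattern in `DeleteDocuments` is to walk a posting-list database, subtract the deleted document ids from each bitmap, and drop entries that end up empty. A rough sketch of that pattern over a plain in-memory map (the LMDB/heed cursor handling is elided and the names are illustrative):

```rust
use std::collections::HashMap;

use roaring::RoaringBitmap;

/// Removes `deleted` from every posting list and drops the lists that become
/// empty, mirroring the `previous_len`/`is_empty` checks in the hunks above.
fn remove_from_postings(postings: &mut HashMap<String, RoaringBitmap>, deleted: &RoaringBitmap) {
    postings.retain(|_word, docids| {
        let previous_len = docids.len();
        *docids -= deleted;
        if docids.is_empty() {
            false // drop the entry entirely
        } else {
            // The real code rewrites the entry only when it actually changed.
            let _changed = docids.len() != previous_len;
            true
        }
    });
}

fn main() {
    let mut postings = HashMap::new();
    postings.insert("kevin".to_string(), (0u32..3).collect::<RoaringBitmap>());
    postings.insert("benoit".to_string(), (2u32..5).collect::<RoaringBitmap>());
    let deleted: RoaringBitmap = (0u32..3).collect();
    remove_from_postings(&mut postings, &deleted);
    // "kevin" lost all of its documents and was dropped, "benoit" kept 3 and 4.
    assert_eq!(postings.len(), 1);
}
```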
View File

@ -3,17 +3,18 @@ use std::fs::File;
use std::num::NonZeroUsize;
use chrono::Utc;
use grenad::{CompressionType, Reader, Writer, FileFuse};
use grenad::{CompressionType, FileFuse, Reader, Writer};
use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesEncode, Error};
use log::debug;
use roaring::RoaringBitmap;
use crate::error::InternalError;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database};
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::index_documents::{
create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod,
};
use crate::{Index, Result};
pub struct Facets<'t, 'u, 'i> {
@ -32,8 +33,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> Facets<'t, 'u, 'i>
{
) -> Facets<'t, 'u, 'i> {
Facets {
wtxn,
index,
@ -72,11 +72,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
)?;
// Clear the facet number levels.
clear_field_number_levels(
self.wtxn,
self.index.facet_id_f64_docids,
field_id,
)?;
clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?;
// Compute and store the faceted numbers documents ids.
let number_documents_ids = compute_faceted_documents_ids(
@ -96,8 +92,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
field_id,
)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &string_documents_ids)?;
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &number_documents_ids)?;
self.index.put_string_faceted_documents_ids(
self.wtxn,
field_id,
&string_documents_ids,
)?;
self.index.put_number_faceted_documents_ids(
self.wtxn,
field_id,
&number_documents_ids,
)?;
write_into_lmdb_database(
self.wtxn,
@ -112,12 +116,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
}
}
fn clear_field_number_levels<'t, >(
fn clear_field_number_levels<'t>(
wtxn: &'t mut heed::RwTxn,
db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
field_id: u8,
) -> heed::Result<()>
{
) -> heed::Result<()> {
let left = (field_id, 1, f64::MIN, f64::MIN);
let right = (field_id, u8::MAX, f64::MAX, f64::MAX);
let range = left..=right;
@ -133,8 +136,7 @@ fn compute_facet_number_levels<'t>(
level_group_size: NonZeroUsize,
min_level_size: NonZeroUsize,
field_id: u8,
) -> Result<Reader<FileFuse>>
{
) -> Result<Reader<FileFuse>> {
let first_level_size = db
.remap_key_type::<ByteSlice>()
.prefix_iter(rtxn, &[field_id])?
@ -143,9 +145,8 @@ fn compute_facet_number_levels<'t>(
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
// therefore we write the facet levels entries into a grenad file before transferring them.
let mut writer = tempfile::tempfile().and_then(|file| {
create_writer(compression_type, compression_level, file)
})?;
let mut writer = tempfile::tempfile()
.and_then(|file| create_writer(compression_type, compression_level, file))?;
let level_0_range = {
let left = (field_id, 0, f64::MIN, f64::MIN);
@ -196,8 +197,7 @@ fn compute_faceted_documents_ids(
rtxn: &heed::RoTxn,
db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
field_id: u8,
) -> Result<RoaringBitmap>
{
) -> Result<RoaringBitmap> {
let mut documents_ids = RoaringBitmap::new();
for result in db.prefix_iter(rtxn, &[field_id])? {
@ -215,8 +215,7 @@ fn write_number_entry(
left: f64,
right: f64,
ids: &RoaringBitmap,
) -> Result<()>
{
) -> Result<()> {
let key = (field_id, level, left, right);
let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?;
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;

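For intuition about `compute_facet_number_levels` above: level 0 holds one bitmap per exact facet number, and each higher level groups consecutive lower-level entries into (left, right) ranges whose bitmap is the union of the group, so a range query can cover wide spans with few lookups; the real code also stops once a level would fall under `min_level_size`. A simplified in-memory sketch of building one extra level (the actual implementation streams through LMDB and grenad instead, and the group size here is illustrative):

```rust
use roaring::RoaringBitmap;

/// One facet entry: a (left, right) value range and the documents inside it.
/// At level 0 every entry covers a single number, so left == right.
struct FacetEntry {
    left: f64,
    right: f64,
    docids: RoaringBitmap,
}

/// Builds the next level by grouping `group_size` consecutive entries and
/// unioning their bitmaps, so a whole group can be matched in one lookup.
fn build_next_level(lower: &[FacetEntry], group_size: usize) -> Vec<FacetEntry> {
    lower
        .chunks(group_size)
        .map(|group| {
            let mut docids = RoaringBitmap::new();
            for entry in group {
                docids |= &entry.docids;
            }
            FacetEntry { left: group[0].left, right: group[group.len() - 1].right, docids }
        })
        .collect()
}

fn main() {
    let level_0: Vec<FacetEntry> = (0u32..8)
        .map(|i| FacetEntry { left: i as f64, right: i as f64, docids: std::iter::once(i).collect() })
        .collect();
    let level_1 = build_next_level(&level_0, 4);
    // Two level-1 entries: [0.0, 3.0] and [4.0, 7.0], each holding 4 documents.
    assert_eq!(level_1.len(), 2);
}
```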
View File

@ -1,7 +1,7 @@
use std::borrow::Cow;
use std::collections::HashSet;
use std::fs::File;
use std::io::{self, Seek, SeekFrom, BufReader, BufRead};
use std::io::{self, BufRead, BufReader, Seek, SeekFrom};
use std::num::{NonZeroU32, NonZeroUsize};
use std::result::Result as StdResult;
use std::str;
@ -10,28 +10,26 @@ use std::time::Instant;
use bstr::ByteSlice as _;
use chrono::Utc;
use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType};
use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer};
use heed::types::ByteSlice;
use log::{debug, info, error};
use log::{debug, error, info};
use memmap::Mmap;
use rayon::prelude::*;
use rayon::ThreadPool;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use crate::error::{Error, InternalError};
use crate::{Index, Result};
use crate::update::{
Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep,
WordPrefixPairProximityDocids,
};
use self::store::{Store, Readers};
pub use self::merge_function::{
fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
};
use self::store::{Readers, Store};
pub use self::transform::{Transform, TransformOutput};
use crate::MergeFn;
use super::UpdateBuilder;
use crate::error::{Error, InternalError};
use crate::update::{
Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
WordsLevelPositions, WordsPrefixesFst,
};
use crate::{Index, MergeFn, Result};
mod merge_function;
mod store;
@ -48,7 +46,11 @@ pub enum WriteMethod {
GetMergePut,
}
pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
pub fn create_writer(
typ: CompressionType,
level: Option<u32>,
file: File,
) -> io::Result<Writer<File>> {
let mut builder = Writer::builder();
builder.compression_type(typ);
if let Some(level) = level {
@ -64,8 +66,7 @@ pub fn create_sorter<E>(
chunk_fusing_shrink_size: Option<u64>,
max_nb_chunks: Option<usize>,
max_memory: Option<usize>,
) -> Sorter<MergeFn<E>>
{
) -> Sorter<MergeFn<E>> {
let mut builder = Sorter::builder(merge);
if let Some(shrink_size) = chunk_fusing_shrink_size {
builder.file_fusing_shrink_size(shrink_size);
@ -83,7 +84,10 @@ pub fn create_sorter<E>(
builder.build()
}
pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Result<Reader<FileFuse>> {
pub fn writer_into_reader(
writer: Writer<File>,
shrink_size: Option<u64>,
) -> Result<Reader<FileFuse>> {
let mut file = writer.into_inner()?;
file.seek(SeekFrom::Start(0))?;
let file = if let Some(shrink_size) = shrink_size {
@ -97,8 +101,7 @@ pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Res
pub fn merge_readers<E>(
sources: Vec<Reader<FileFuse>>,
merge: MergeFn<E>,
) -> Merger<FileFuse, MergeFn<E>>
{
) -> Merger<FileFuse, MergeFn<E>> {
let mut builder = Merger::builder(merge);
builder.extend(sources);
builder.build()
@ -118,13 +121,7 @@ where
let before = Instant::now();
let merger = merge_readers(sources, merge);
merger_iter_into_lmdb_database(
wtxn,
database,
merger.into_merge_iter()?,
merge,
method,
)?;
merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?;
debug!("MTBL stores merged in {:.02?}!", before.elapsed());
Ok(())
@ -149,7 +146,7 @@ where
while let Some((k, v)) = reader.next()? {
out_iter.append(k, v)?;
}
},
}
WriteMethod::GetMergePut => {
while let Some((k, v)) = reader.next()? {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
@ -158,11 +155,11 @@ where
let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..];
let val = merge(k, &vals)?;
iter.put_current(k, &val)?;
},
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
},
}
}
}
}
@ -181,18 +178,12 @@ pub fn sorter_into_lmdb_database<E>(
) -> Result<()>
where
Error: From<E>,
Error: From<grenad::Error<E>>
Error: From<grenad::Error<E>>,
{
debug!("Writing MTBL sorter...");
let before = Instant::now();
merger_iter_into_lmdb_database(
wtxn,
database,
sorter.into_iter()?,
merge,
method,
)?;
merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?;
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
Ok(())
@ -214,7 +205,7 @@ where
while let Some((k, v)) = sorter.next()? {
out_iter.append(k, v)?;
}
},
}
WriteMethod::GetMergePut => {
while let Some((k, v)) = sorter.next()? {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
@ -226,14 +217,14 @@ where
InternalError::IndexingMergingKeys { process: "get-put-merge" }
})?;
iter.put_current(k, &val)?;
},
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
},
}
}
}
},
}
}
Ok(())
@ -341,9 +332,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// Early return when there is no document to add
if reader.buffer().is_empty() {
return Ok(DocumentAdditionResult {
nb_documents: 0,
})
return Ok(DocumentAdditionResult { nb_documents: 0 });
}
self.index.set_updated_at(self.wtxn, &Utc::now())?;
@ -367,7 +356,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let output = match self.update_format {
UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?,
UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?,
UpdateFormat::JsonStream => transform.output_from_json_stream(reader, &progress_callback)?,
UpdateFormat::JsonStream => {
transform.output_from_json_stream(reader, &progress_callback)?
}
};
let nb_documents = output.documents_count;
@ -380,7 +371,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()>
where
F: Fn(UpdateIndexingStep) + Sync
F: Fn(UpdateIndexingStep) + Sync,
{
let before_indexing = Instant::now();
@ -457,7 +448,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// settings if none have already been set.
backup_pool = rayon::ThreadPoolBuilder::new().build()?;
&backup_pool
},
}
};
let readers = pool.install(|| {
@ -595,11 +586,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let mut documents_ids = self.index.documents_ids(self.wtxn)?;
let contains_documents = !documents_ids.is_empty();
let write_method = if contains_documents {
WriteMethod::GetMergePut
} else {
WriteMethod::Append
};
let write_method =
if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append };
debug!("Writing using the write method: {:?}", write_method);
@ -634,7 +622,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
*self.index.docid_word_positions.as_polymorph(),
docid_word_positions_readers,
keep_first,
write_method
write_method,
)?;
database_count += 1;
@ -649,7 +637,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
*self.index.documents.as_polymorph(),
documents_readers,
keep_first,
write_method
write_method,
)?;
database_count += 1;
@ -730,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
fst_merge,
WriteMethod::GetMergePut,
)?;
},
}
DatabaseType::WordDocids => {
debug!("Writing the words docids into LMDB on disk...");
let db = *self.index.word_docids.as_polymorph();
@ -741,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
roaring_bitmap_merge,
write_method,
)?;
},
}
DatabaseType::FacetLevel0NumbersDocids => {
debug!("Writing the facet numbers docids into LMDB on disk...");
let db = *self.index.facet_id_f64_docids.as_polymorph();
@ -752,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
cbo_roaring_bitmap_merge,
write_method,
)?;
},
}
DatabaseType::FieldIdWordCountDocids => {
debug!("Writing the field id word count docids into LMDB on disk...");
let db = *self.index.field_id_word_count_docids.as_polymorph();
@ -763,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
cbo_roaring_bitmap_merge,
write_method,
)?;
},
}
DatabaseType::WordLevel0PositionDocids => {
debug!("Writing the word level 0 positions docids into LMDB on disk...");
let db = *self.index.word_level_position_docids.as_polymorph();
@ -848,9 +836,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
#[cfg(test)]
mod tests {
use super::*;
use heed::EnvOpenOptions;
use super::*;
#[test]
fn simple_document_replacement() {
let path = tempfile::tempdir().unwrap();
@ -1053,9 +1042,8 @@ mod tests {
assert_eq!(count, 3);
let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
let (kevin_id, _) = docs.iter().find(|(_, d)| {
d.get(0).unwrap() == br#""updated kevin""#
}).unwrap();
let (kevin_id, _) =
docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
let (id, doc) = docs[*kevin_id as usize];
assert_eq!(id, *kevin_id);

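As the hunks above show, `write_into_lmdb_database` and `sorter_into_lmdb_database` support two write methods: `Append` when the index is empty, since entries arrive sorted and can be appended blindly, and `GetMergePut` otherwise, where each key is looked up and merged with any existing value. A stripped-down sketch of the get-merge-put path over a `BTreeMap`, using a bitmap union in place of the byte-slice `roaring_bitmap_merge` (names simplified):

```rust
use std::collections::BTreeMap;

use roaring::RoaringBitmap;

/// Get-merge-put: for every incoming entry, merge it with the value already
/// stored under the same key, or insert it as-is when the key is new.
fn get_merge_put<I>(database: &mut BTreeMap<String, RoaringBitmap>, entries: I)
where
    I: IntoIterator<Item = (String, RoaringBitmap)>,
{
    for (key, new_docids) in entries {
        match database.get_mut(&key) {
            // The equivalent of `roaring_bitmap_merge` on an existing value.
            Some(old_docids) => *old_docids |= new_docids,
            None => {
                database.insert(key, new_docids);
            }
        }
    }
}

fn main() {
    let mut database = BTreeMap::new();
    get_merge_put(&mut database, vec![("word".to_string(), (0u32..2).collect::<RoaringBitmap>())]);
    get_merge_put(&mut database, vec![("word".to_string(), (5u32..7).collect::<RoaringBitmap>())]);
    // The two batches were merged under the same key: {0, 1, 5, 6}.
    assert_eq!(database["word"].len(), 4);
}
```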
View File

@ -8,25 +8,29 @@ use std::{cmp, iter};
use bstr::ByteSlice as _;
use fst::Set;
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
use heed::BytesEncode;
use linked_hash_map::LinkedHashMap;
use log::{debug, info};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
use meilisearch_tokenizer::token::SeparatorKind;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::Value;
use tempfile::tempfile;
use super::merge_function::{
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
};
use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
use crate::error::{Error, InternalError, SerializationError};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
FieldDocIdFacetStringCodec,
};
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge};
use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32};
const LMDB_MAX_KEY_LENGTH: usize = 511;
const ONE_KILOBYTE: usize = 1024 * 1024;
@ -56,7 +60,8 @@ pub struct Store<'s, A> {
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize,
field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>,
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids:
LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids_limit: usize,
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>,
@ -93,8 +98,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
chunk_compression_level: Option<u32>,
chunk_fusing_shrink_size: Option<u64>,
stop_words: Option<&'s Set<A>>,
) -> Result<Self>
{
) -> Result<Self> {
// We divide the max memory by the number of sorters the Store has.
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5));
let linked_hash_map_size = linked_hash_map_size.unwrap_or(500);
@ -172,12 +176,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Some(1024 * 1024 * 1024), // 1MB
);
let documents_writer = tempfile().and_then(|f| {
create_writer(chunk_compression_type, chunk_compression_level, f)
})?;
let docid_word_positions_writer = tempfile().and_then(|f| {
create_writer(chunk_compression_type, chunk_compression_level, f)
})?;
let documents_writer = tempfile()
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
let docid_word_positions_writer = tempfile()
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
let mut config = AnalyzerConfig::default();
if let Some(stop_words) = stop_words {
@ -224,7 +226,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> {
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.word_docids.get_refresh(word.as_bytes()) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
let word_vec = SmallVec32::from(word.as_bytes());
// A newly inserted element is appended at the end of the linked hash map.
@ -246,15 +250,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
field_id: FieldId,
value: OrderedFloat<f64>,
id: DocumentId,
) -> Result<()>
{
) -> Result<()> {
let sorter = &mut self.field_id_docid_facet_numbers_sorter;
Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?;
let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_number_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
// A newly inserted element is appended at the end of the linked hash map.
self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
@ -279,15 +284,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
field_id: FieldId,
value: String,
id: DocumentId,
) -> Result<()>
{
) -> Result<()> {
let sorter = &mut self.field_id_docid_facet_strings_sorter;
Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?;
let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_string_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
// A newly inserted element is appended at the end of the linked hash map.
self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
@ -309,10 +315,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// Save the documents ids under the words pairs proximities that it contains.
fn insert_words_pairs_proximities_docids<'a>(
&mut self,
words_pairs_proximities: impl IntoIterator<Item=((&'a str, &'a str), u8)>,
words_pairs_proximities: impl IntoIterator<Item = ((&'a str, &'a str), u8)>,
id: DocumentId,
) -> Result<()>
{
) -> Result<()> {
for ((w1, w2), prox) in words_pairs_proximities {
let w1 = SmallVec32::from(w1.as_bytes());
let w2 = SmallVec32::from(w2.as_bytes());
@ -320,7 +325,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// if get_refresh finds the element it is assured
// to be at the end of the linked hash map.
match self.words_pairs_proximities_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
// A newly inserted element is appended at the end of the linked hash map.
let ids = RoaringBitmap::from_iter(Some(id));
@ -337,7 +344,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// Removing front elements is equivalent to removing the LRUs.
let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front());
iter.take(overflow).for_each(|x| lrus.push(x));
Self::write_words_pairs_proximities(&mut self.words_pairs_proximities_docids_sorter, lrus)?;
Self::write_words_pairs_proximities(
&mut self.words_pairs_proximities_docids_sorter,
lrus,
)?;
}
Ok(())
@ -350,8 +360,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
facet_strings_values: &mut HashMap<FieldId, Vec<String>>,
record: &[u8],
) -> Result<()>
{
) -> Result<()> {
// We compute the list of words pairs proximities (self-join) and write it directly to disk.
let words_pair_proximities = compute_words_pair_proximities(&words_positions);
self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?;
@ -362,8 +371,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
}
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?;
Self::write_docid_word_positions(
&mut self.docid_word_positions_writer,
document_id,
words_positions,
)?;
Self::write_word_position_docids(
&mut self.word_level_position_docids_sorter,
document_id,
words_positions,
)?;
words_positions.clear();
@ -387,7 +404,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn write_words_pairs_proximities<E>(
sorter: &mut Sorter<MergeFn<E>>,
iter: impl IntoIterator<Item=((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
iter: impl IntoIterator<Item = ((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
) -> Result<()>
where
Error: From<E>,
@ -419,8 +436,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
writer: &mut Writer<File>,
id: DocumentId,
words_positions: &HashMap<String, SmallVec32<Position>>,
) -> Result<()>
{
) -> Result<()> {
// We prefix the words by the document id.
let mut key = id.to_be_bytes().to_vec();
let mut buffer = Vec::new();
@ -484,12 +500,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(())
}
fn write_facet_field_string_docids<I, E>(
sorter: &mut Sorter<MergeFn<E>>,
iter: I,
) -> Result<()>
fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
where
I: IntoIterator<Item=((FieldId, String), RoaringBitmap)>,
I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>,
Error: From<E>,
{
let mut key_buffer = Vec::new();
@ -510,12 +523,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(())
}
fn write_facet_field_number_docids<I, E>(
sorter: &mut Sorter<MergeFn<E>>,
iter: I,
) -> Result<()>
fn write_facet_field_number_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
where
I: IntoIterator<Item=((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
I: IntoIterator<Item = ((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
Error: From<E>,
{
let mut data_buffer = Vec::new();
@ -579,7 +589,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn write_word_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
where
I: IntoIterator<Item=(SmallVec32<u8>, RoaringBitmap)>,
I: IntoIterator<Item = (SmallVec32<u8>, RoaringBitmap)>,
Error: From<E>,
{
let mut key = Vec::new();
@ -611,7 +621,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
log_every_n: Option<usize>,
mut progress_callback: F,
) -> Result<Readers>
where F: FnMut(UpdateIndexingStep),
where
F: FnMut(UpdateIndexingStep),
{
debug!("{:?}: Indexing in a Store...", thread_index);
@ -629,7 +640,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
if count % num_threads == thread_index {
// This is a log routine that we do every `log_every_n` documents.
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) {
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
info!(
"We have seen {} documents so far ({:.02?}).",
format_count(count),
before.elapsed()
);
progress_callback(UpdateIndexingStep::IndexDocuments {
documents_seen: count,
total_documents: documents_count,
@ -638,12 +653,20 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
}
for (attr, content) in document.iter() {
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) {
let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr)
{
let value =
serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
let (facet_numbers, facet_strings) = extract_facet_values(&value);
facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers);
facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings);
facet_numbers_values
.entry(attr)
.or_insert_with(Vec::new)
.extend(facet_numbers);
facet_strings_values
.entry(attr)
.or_insert_with(Vec::new)
.extend(facet_strings);
if self.searchable_fields.contains(&attr) {
let content = match json_to_string(&value) {
@ -658,12 +681,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
last_pos = Some(pos);
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
words_positions
.entry(token.text().to_string())
.or_insert_with(SmallVec32::new)
.push(position);
}
if let Some(last_pos) = last_pos.filter(|p| *p <= 10) {
let key = (attr, last_pos as u8 + 1);
self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id);
self.field_id_word_count_docids
.entry(key)
.or_insert_with(RoaringBitmap::new)
.insert(document_id);
}
}
}
@ -713,7 +742,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
self.facet_field_string_docids,
)?;
let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut word_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut builder = fst::SetBuilder::memory();
let mut iter = self.word_docids_sorter.into_iter()?;
@ -737,37 +767,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.main_sorter.write_into(&mut main_wtr)?;
let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;
let mut words_pairs_proximities_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.words_pairs_proximities_docids_sorter
.write_into(&mut words_pairs_proximities_docids_wtr)?;
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut word_level_position_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut field_id_word_count_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?;
let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut facet_field_numbers_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut facet_field_strings_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?;
let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?;
let mut field_id_docid_facet_numbers_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_numbers_sorter
.write_into(&mut field_id_docid_facet_numbers_wtr)?;
let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?;
let mut field_id_docid_facet_strings_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_strings_sorter
.write_into(&mut field_id_docid_facet_strings_wtr)?;
let main = writer_into_reader(main_wtr, shrink_size)?;
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
let words_pairs_proximities_docids =
writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let word_level_position_docids =
writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let field_id_word_count_docids =
writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
let facet_field_numbers_docids =
writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
let facet_field_strings_docids =
writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
let field_id_docid_facet_numbers =
writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
let field_id_docid_facet_strings =
writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
let docid_word_positions =
writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
let documents = writer_into_reader(self.documents_writer, shrink_size)?;
Ok(Readers {
@ -792,8 +840,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
/// close to each other.
fn compute_words_pair_proximities(
word_positions: &HashMap<String, SmallVec32<Position>>,
) -> HashMap<(&str, &str), u8>
{
) -> HashMap<(&str, &str), u8> {
use itertools::Itertools;
let mut words_pair_proximities = HashMap::new();
@ -828,31 +875,34 @@ fn lmdb_key_valid_size(key: &[u8]) -> bool {
/// Take an iterator of tokens and compute their relative positions depending on separator kinds:
/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
/// else we keep the standard proximity of 1 between words.
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
fn process_tokens<'a>(
tokens: impl Iterator<Item = Token<'a>>,
) -> impl Iterator<Item = (usize, Token<'a>)> {
tokens
.skip_while(|token| token.is_separator().is_some())
.scan((0, None), |(offset, prev_kind), token| {
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
*offset += match *prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1,
None => 0,
};
*prev_kind = Some(token.kind)
}
TokenKind::Separator(SeparatorKind::Hard) => {
*prev_kind = Some(token.kind);
}
TokenKind::Separator(SeparatorKind::Soft)
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
*prev_kind = Some(token.kind);
}
_ => (),
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
*offset += match *prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1,
None => 0,
};
*prev_kind = Some(token.kind)
}
TokenKind::Separator(SeparatorKind::Hard) => {
*prev_kind = Some(token.kind);
}
TokenKind::Separator(SeparatorKind::Soft)
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
{
*prev_kind = Some(token.kind);
}
_ => (),
}
Some((*offset, token))
})
.filter(|(_, t)| t.is_word())
.filter(|(_, t)| t.is_word())
}
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
@ -865,18 +915,22 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
match value {
Value::Null => (),
Value::Bool(b) => output_strings.push(b.to_string()),
Value::Number(number) => if let Some(float) = number.as_f64() {
output_numbers.push(float);
},
Value::Number(number) => {
if let Some(float) = number.as_f64() {
output_numbers.push(float);
}
}
Value::String(string) => {
let string = string.trim().to_lowercase();
output_strings.push(string);
},
Value::Array(values) => if can_recurse {
for value in values {
inner_extract_facet_values(value, false, output_numbers, output_strings);
}
Value::Array(values) => {
if can_recurse {
for value in values {
inner_extract_facet_values(value, false, output_numbers, output_strings);
}
}
},
}
Value::Object(_) => (),
}
}
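The `process_tokens` doc comment above states the positioning rule: words separated by a soft separator are 1 position apart, while a hard separator pushes the next word 8 positions away. A self-contained sketch of that rule over a simplified token type (the real code works on `meilisearch_tokenizer::Token`; `Tok` here is an illustrative stand-in):

```rust
#[derive(Clone, Copy, PartialEq)]
enum Tok<'a> {
    Word(&'a str),
    SoftSep, // e.g. a space or a comma
    HardSep, // e.g. a full stop
}

/// Assigns a relative position to every word: +1 after a soft separator,
/// +8 after a hard one, mirroring the rule described in the doc comment above.
fn positions<'a>(tokens: impl Iterator<Item = Tok<'a>>) -> Vec<(usize, &'a str)> {
    let mut out = Vec::new();
    let mut offset = 0usize;
    let mut prev: Option<Tok<'a>> = None;
    for token in tokens {
        match token {
            Tok::Word(text) => {
                offset += match prev {
                    Some(Tok::HardSep) => 8,
                    Some(_) => 1,
                    None => 0,
                };
                out.push((offset, text));
                prev = Some(token);
            }
            Tok::HardSep => prev = Some(Tok::HardSep),
            // A soft separator does not override a pending hard one.
            Tok::SoftSep if prev != Some(Tok::HardSep) => prev = Some(Tok::SoftSep),
            Tok::SoftSep => (),
        }
    }
    out
}

fn main() {
    use Tok::*;
    let tokens = vec![Word("the"), SoftSep, Word("dog"), HardSep, Word("sleeps")];
    // [(0, "the"), (1, "dog"), (9, "sleeps")]
    println!("{:?}", positions(tokens.into_iter()));
}
```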

View File

@ -10,14 +10,15 @@ use log::info;
use roaring::RoaringBitmap;
use serde_json::{Map, Value};
use crate::error::{Error, UserError, InternalError};
use crate::index::db_name;
use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution};
use crate::{Index, Result};
use super::merge_function::merge_two_obkvs;
use super::{create_writer, create_sorter, IndexDocumentsMethod};
use super::{create_sorter, create_writer, IndexDocumentsMethod};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{
ExternalDocumentsIds, FieldId, FieldsDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32,
};
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
@ -64,7 +65,11 @@ impl Transform<'_, '_> {
self.output_from_generic_json(reader, false, progress_callback)
}
pub fn output_from_json_stream<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
pub fn output_from_json_stream<R, F>(
self,
reader: R,
progress_callback: F,
) -> Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
@ -86,14 +91,16 @@ impl Transform<'_, '_> {
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
// Deserialize the whole batch of documents in memory.
let mut documents: Peekable<Box<dyn Iterator<Item=serde_json::Result<Map<String, Value>>>>> = if is_stream {
let mut documents: Peekable<
Box<dyn Iterator<Item = serde_json::Result<Map<String, Value>>>>,
> = if is_stream {
let iter = serde_json::Deserializer::from_reader(reader).into_iter();
let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>;
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable()
} else {
let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?;
let iter = vec.into_iter().map(Ok);
let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>;
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable()
};
@ -104,15 +111,16 @@ impl Transform<'_, '_> {
Err(_) => {
let error = documents.next().unwrap().unwrap_err();
return Err(UserError::SerdeJson(error).into());
},
}
};
let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
let alternative_name =
first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
let (primary_key_id, primary_key) = compute_primary_key_pair(
self.index.primary_key(self.rtxn)?,
&mut fields_ids_map,
alternative_name,
self.autogenerate_docids
self.autogenerate_docids,
)?;
if documents.peek().is_none() {
@ -173,9 +181,11 @@ impl Transform<'_, '_> {
Some(value) => match value {
Value::String(string) => Cow::Borrowed(string.as_str()),
Value::Number(number) => Cow::Owned(number.to_string()),
content => return Err(UserError::InvalidDocumentId {
document_id: content.clone(),
}.into()),
content => {
return Err(
UserError::InvalidDocumentId { document_id: content.clone() }.into()
)
}
},
None => {
if !self.autogenerate_docids {
@ -183,7 +193,7 @@ impl Transform<'_, '_> {
}
let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
Cow::Borrowed(uuid)
},
}
};
// We iterate in the fields ids ordered.
@ -194,7 +204,8 @@ impl Transform<'_, '_> {
// and this should be the document id we return the one we generated.
if let Some(value) = document.get(name) {
// We serialize the attribute values.
serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?;
serde_json::to_writer(&mut json_buffer, value)
.map_err(InternalError::SerdeJson)?;
writer.insert(field_id, &json_buffer)?;
}
@ -202,7 +213,8 @@ impl Transform<'_, '_> {
if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
}.into());
}
.into());
}
}
@ -248,9 +260,9 @@ impl Transform<'_, '_> {
// Extract the position of the primary key in the current headers, None if not found.
let primary_key_pos = match self.index.primary_key(self.rtxn)? {
Some(primary_key) => {
// The primary key is known so we must find the position in the CSV headers.
headers.iter().position(|h| h == primary_key)
},
// The primary key is known so we must find the position in the CSV headers.
headers.iter().position(|h| h == primary_key)
}
None => headers.iter().position(is_primary_key),
};
@ -261,7 +273,7 @@ impl Transform<'_, '_> {
self.index.primary_key(self.rtxn)?,
&mut fields_ids_map,
alternative_name,
self.autogenerate_docids
self.autogenerate_docids,
)?;
// The primary key field is not present in the header, so we need to create it.
@ -308,27 +320,30 @@ impl Transform<'_, '_> {
// We validate the document id [a-zA-Z0-9\-_].
match validate_document_id(&external_id) {
Some(valid) => valid,
None => return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
}.into()),
None => {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
}
.into())
}
}
},
}
None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
};
// When the primary_key_field_id is found in the fields ids list
// we return the generated document id instead of the record field.
let iter = fields_ids.iter()
.map(|(fi, i)| {
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
(fi, field)
});
let iter = fields_ids.iter().map(|(fi, i)| {
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
(fi, field)
});
// We retrieve the field id based on the fields ids map fields ids order.
for (field_id, field) in iter {
// We serialize the attribute values as JSON strings.
json_buffer.clear();
serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?;
serde_json::to_writer(&mut json_buffer, &field)
.map_err(InternalError::SerdeJson)?;
writer.insert(*field_id, &json_buffer)?;
}
@ -410,26 +425,27 @@ impl Transform<'_, '_> {
IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
IndexDocumentsMethod::UpdateDocuments => {
let key = BEU32::new(docid);
let base_obkv = self.index.documents.get(&self.rtxn, &key)?
.ok_or(InternalError::DatabaseMissingEntry {
let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS,
key: None,
})?;
},
)?;
let update_obkv = obkv::KvReader::new(update_obkv);
merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
(docid, obkv_buffer.as_slice())
}
}
},
}
None => {
// If this user id is new we add it to the external documents ids map
// for new ids and into the list of new documents.
let new_docid = available_documents_ids.next()
.ok_or(UserError::DocumentLimitReached)?;
let new_docid =
available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?;
new_external_documents_ids_builder.insert(external_id, new_docid as u64)?;
new_documents_ids.insert(new_docid);
(new_docid, update_obkv)
},
}
};
// We insert the document under the documents ids map into the final file.
@ -450,7 +466,8 @@ impl Transform<'_, '_> {
// We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?;
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
let mut writer =
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
// Once we have written all the documents into the final sorter, we write the documents
// into this writer, extract the file and reset the seek to be able to read it again.
@ -485,8 +502,7 @@ impl Transform<'_, '_> {
primary_key: String,
old_fields_ids_map: FieldsIdsMap,
new_fields_ids_map: FieldsIdsMap,
) -> Result<TransformOutput>
{
) -> Result<TransformOutput> {
let fields_distribution = self.index.fields_distribution(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
let documents_ids = self.index.documents_ids(self.rtxn)?;
@ -494,7 +510,8 @@ impl Transform<'_, '_> {
// We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?;
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
let mut writer =
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
let mut obkv_buffer = Vec::new();
for result in self.index.documents.iter(self.rtxn)? {
@ -561,20 +578,19 @@ fn compute_primary_key_pair(
return Err(UserError::MissingPrimaryKey.into());
}
DEFAULT_PRIMARY_KEY_NAME.to_string()
},
}
};
let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
Ok((id, name))
},
}
}
}
fn validate_document_id(document_id: &str) -> Option<&str> {
let document_id = document_id.trim();
Some(document_id).filter(|id| {
!id.is_empty() && id.chars().all(|c| {
matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')
})
!id.is_empty()
&& id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
})
}
@ -583,8 +599,7 @@ mod test {
use super::*;
mod compute_primary_key {
use super::compute_primary_key_pair;
use super::FieldsIdsMap;
use super::{compute_primary_key_pair, FieldsIdsMap};
#[test]
fn should_return_primary_key_if_is_some() {
@ -594,7 +609,8 @@ mod test {
Some("toto"),
&mut fields_map,
Some("tata".to_string()),
false);
false,
);
assert_eq!(result.unwrap(), (0u8, "toto".to_string()));
assert_eq!(fields_map.len(), 1);
}
@ -602,11 +618,8 @@ mod test {
#[test]
fn should_return_alternative_if_primary_is_none() {
let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(
None,
&mut fields_map,
Some("tata".to_string()),
false);
let result =
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
assert_eq!(result.unwrap(), (0u8, "tata".to_string()));
assert_eq!(fields_map.len(), 1);
}
@ -614,23 +627,15 @@ mod test {
#[test]
fn should_return_default_if_both_are_none() {
let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(
None,
&mut fields_map,
None,
true);
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
assert_eq!(result.unwrap(), (0u8, "id".to_string()));
assert_eq!(fields_map.len(), 1);
}
#[test]
fn should_return_err_if_both_are_none_and_recompute_is_false(){
fn should_return_err_if_both_are_none_and_recompute_is_false() {
let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(
None,
&mut fields_map,
None,
false);
let result = compute_primary_key_pair(None, &mut fields_map, None, false);
assert!(result.is_err());
assert_eq!(fields_map.len(), 0);
}
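A note on the replace/update branch visible earlier in this file: with `ReplaceDocuments` the incoming record wins as-is, while `UpdateDocuments` fetches the stored record and layers the update's fields on top of it. A hedged sketch of that merge over `serde_json` maps rather than obkv buffers (`resolve` and the sample documents are illustrative, not the project's API):

```rust
use serde_json::{json, Map, Value};

#[allow(dead_code)]
enum IndexDocumentsMethod {
    ReplaceDocuments,
    UpdateDocuments,
}

/// Picks the record to store: the update as-is, or the stored base with the
/// update's fields layered on top (the role played by `merge_two_obkvs` above).
fn resolve(
    method: IndexDocumentsMethod,
    base: Option<Map<String, Value>>,
    update: Map<String, Value>,
) -> Map<String, Value> {
    match (method, base) {
        (IndexDocumentsMethod::UpdateDocuments, Some(mut base)) => {
            for (key, value) in update {
                base.insert(key, value); // fields from the update win
            }
            base
        }
        _ => update,
    }
}

fn main() {
    let base = json!({ "id": 1, "name": "kevin", "age": 23 });
    let update = json!({ "id": 1, "name": "updated kevin" });
    let merged = resolve(
        IndexDocumentsMethod::UpdateDocuments,
        base.as_object().cloned(),
        update.as_object().cloned().unwrap(),
    );
    // { "age": 23, "id": 1, "name": "updated kevin" }
    println!("{}", Value::Object(merged));
}
```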

View File

@ -2,7 +2,9 @@ pub use self::available_documents_ids::AvailableDocumentsIds;
pub use self::clear_documents::ClearDocuments;
pub use self::delete_documents::DeleteDocuments;
pub use self::facets::Facets;
pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat};
pub use self::index_documents::{
DocumentAdditionResult, IndexDocuments, IndexDocumentsMethod, UpdateFormat,
};
pub use self::settings::{Setting, Settings};
pub use self::update_builder::UpdateBuilder;
pub use self::update_step::UpdateIndexingStep;

View File

@ -34,17 +34,24 @@ impl<T> Setting<T> {
}
impl<T: Serialize> Serialize for Setting<T> {
fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error> where S: Serializer {
fn serialize<S>(&self, serializer: S) -> StdResult<S::Ok, S::Error>
where
S: Serializer,
{
match self {
Self::Set(value) => Some(value),
// Usually not_set isn't serialized by setting skip_serializing_if field attribute
Self::NotSet | Self::Reset => None,
}.serialize(serializer)
}
.serialize(serializer)
}
}
impl<'de, T: Deserialize<'de>> Deserialize<'de> for Setting<T> {
fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error> where D: Deserializer<'de> {
fn deserialize<D>(deserializer: D) -> StdResult<Self, D::Error>
where
D: Deserializer<'de>,
{
Deserialize::deserialize(deserializer).map(|x| match x {
Some(x) => Self::Set(x),
None => Self::Reset, // Reset is forced by sending null value
@ -141,11 +148,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
pub fn set_stop_words(&mut self, stop_words: BTreeSet<String>) {
self.stop_words = if stop_words.is_empty() {
Setting::Reset
} else {
Setting::Set(stop_words)
}
self.stop_words =
if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) }
}
pub fn reset_distinct_field(&mut self) {
@ -161,11 +165,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
}
pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) {
self.synonyms = if synonyms.is_empty() {
Setting::Reset
} else {
Setting::Set(synonyms)
}
self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) }
}
pub fn reset_primary_key(&mut self) {
@ -178,7 +178,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
fn reindex<F>(&mut self, cb: &F, old_fields_ids_map: FieldsIdsMap) -> Result<()>
where
F: Fn(UpdateIndexingStep, u64) + Sync
F: Fn(UpdateIndexingStep, u64) + Sync,
{
let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
let update_id = self.update_id;
@ -203,7 +203,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
};
// There has already been a document addition, so the primary key should be set by now.
let primary_key = self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?;
let primary_key =
self.index.primary_key(&self.wtxn)?.ok_or(UserError::MissingPrimaryKey)?;
// We remap the documents fields based on the new `FieldsIdsMap`.
let output = transform.remap_index_documents(
@ -236,21 +237,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
Setting::Set(ref fields) => {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
// fields are deduplicated, only the first occurrence is taken into account
let names: Vec<_> = fields
.iter()
.unique()
.map(String::as_str)
.collect();
let names: Vec<_> = fields.iter().unique().map(String::as_str).collect();
for name in names.iter() {
fields_ids_map
.insert(name)
.ok_or(UserError::AttributeLimitReached)?;
fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?;
}
self.index.put_displayed_fields(self.wtxn, &names)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Setting::Reset => { self.index.delete_displayed_fields(self.wtxn)?; }
Setting::Reset => {
self.index.delete_displayed_fields(self.wtxn)?;
}
Setting::NotSet => return Ok(false),
}
Ok(true)
@ -260,14 +257,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
match self.distinct_field {
Setting::Set(ref attr) => {
let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
fields_ids_map
.insert(attr)
.ok_or(UserError::AttributeLimitReached)?;
fields_ids_map.insert(attr).ok_or(UserError::AttributeLimitReached)?;
self.index.put_distinct_field(self.wtxn, &attr)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Setting::Reset => { self.index.delete_distinct_field(self.wtxn)?; },
Setting::Reset => {
self.index.delete_distinct_field(self.wtxn)?;
}
Setting::NotSet => return Ok(false),
}
Ok(true)
@ -285,30 +282,24 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let mut new_fields_ids_map = FieldsIdsMap::new();
// fields are deduplicated, only the first occurrence is taken into account
let names = fields
.iter()
.unique()
.map(String::as_str)
.collect::<Vec<_>>();
let names = fields.iter().unique().map(String::as_str).collect::<Vec<_>>();
// Add all the searchable attributes to the field map, and then add the
// remaining fields from the old field map to the new one
for name in names.iter() {
new_fields_ids_map
.insert(&name)
.ok_or(UserError::AttributeLimitReached)?;
new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
}
for (_, name) in old_fields_ids_map.iter() {
new_fields_ids_map
.insert(&name)
.ok_or(UserError::AttributeLimitReached)?;
new_fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
}
self.index.put_searchable_fields(self.wtxn, &names)?;
self.index.put_fields_ids_map(self.wtxn, &new_fields_ids_map)?;
}
Setting::Reset => { self.index.delete_searchable_fields(self.wtxn)?; }
Setting::Reset => {
self.index.delete_searchable_fields(self.wtxn)?;
}
Setting::NotSet => return Ok(false),
}
Ok(true)
@ -323,7 +314,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
let fst = fst::Set::from_iter(stop_words)?;
// Does the new FST differ from the previous one?
if current.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes()) {
if current
.map_or(true, |current| current.as_fst().as_bytes() != fst.as_fst().as_bytes())
{
// we want to re-create our FST.
self.index.put_stop_words(self.wtxn, &fst)?;
Ok(true)
@ -343,9 +336,13 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
analyzer
.analyze(text)
.tokens()
.filter_map(|token|
if token.is_word() { Some(token.text().to_string()) } else { None }
)
.filter_map(|token| {
if token.is_word() {
Some(token.text().to_string())
} else {
None
}
})
.collect::<Vec<_>>()
}
@ -360,25 +357,20 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
for (word, synonyms) in synonyms {
// Normalize both the word and associated synonyms.
let normalized_word = normalize(&analyzer, word);
let normalized_synonyms = synonyms
.iter()
.map(|synonym| normalize(&analyzer, synonym));
let normalized_synonyms =
synonyms.iter().map(|synonym| normalize(&analyzer, synonym));
// Store the normalized synonyms under the normalized word,
// merging the possible duplicate words.
let entry = new_synonyms
.entry(normalized_word)
.or_insert_with(Vec::new);
let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
entry.extend(normalized_synonyms);
}
// Make sure that we don't have duplicate synonyms.
new_synonyms
.iter_mut()
.for_each(|(_, synonyms)| {
synonyms.sort_unstable();
synonyms.dedup();
});
new_synonyms.iter_mut().for_each(|(_, synonyms)| {
synonyms.sort_unstable();
synonyms.dedup();
});
let old_synonyms = self.index.synonyms(self.wtxn)?;
@ -406,7 +398,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.index.put_filterable_fields(self.wtxn, &new_facets)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Setting::Reset => { self.index.delete_filterable_fields(self.wtxn)?; }
Setting::Reset => {
self.index.delete_filterable_fields(self.wtxn)?;
}
Setting::NotSet => (),
}
Ok(())
@ -427,7 +421,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
self.index.put_criteria(self.wtxn, &new_criteria)?;
self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
}
Setting::Reset => { self.index.delete_criteria(self.wtxn)?; }
Setting::Reset => {
self.index.delete_criteria(self.wtxn)?;
}
Setting::NotSet => (),
}
Ok(())
@ -445,7 +441,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} else {
Err(UserError::PrimaryKeyCannotBeChanged.into())
}
},
}
Setting::Reset => {
if self.index.number_of_documents(&self.wtxn)? == 0 {
self.index.delete_primary_key(self.wtxn)?;
@ -453,14 +449,14 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
} else {
Err(UserError::PrimaryKeyCannotBeReset.into())
}
},
}
Setting::NotSet => Ok(()),
}
}
pub fn execute<F>(mut self, progress_callback: F) -> Result<()>
where
F: Fn(UpdateIndexingStep, u64) + Sync
where
F: Fn(UpdateIndexingStep, u64) + Sync,
{
self.index.set_updated_at(self.wtxn, &Utc::now())?;
@ -493,17 +489,16 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
#[cfg(test)]
mod tests {
use heed::EnvOpenOptions;
use heed::types::ByteSlice;
use maplit::{btreeset, hashmap, hashset};
use big_s::S;
use heed::types::ByteSlice;
use heed::EnvOpenOptions;
use maplit::{btreeset, hashmap, hashset};
use super::*;
use crate::error::Error;
use crate::update::{IndexDocuments, UpdateFormat};
use crate::{Criterion, FilterCondition, SearchResult};
use super::*;
#[test]
fn set_and_reset_searchable_fields() {
let path = tempfile::tempdir().unwrap();
@ -674,7 +669,7 @@ mod tests {
// Set the filterable fields to be the age.
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_filterable_fields(hashset!{ S("age") });
builder.set_filterable_fields(hashset! { S("age") });
builder.execute(|_, _| ()).unwrap();
// Then index some documents.
@ -692,12 +687,15 @@ mod tests {
// Check that the displayed fields are correctly set.
let rtxn = index.read_txn().unwrap();
let fields_ids = index.filterable_fields(&rtxn).unwrap();
assert_eq!(fields_ids, hashset!{ S("age") });
assert_eq!(fields_ids, hashset! { S("age") });
// Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood.
let count = index.facet_id_f64_docids
let count = index
.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
.prefix_iter(&rtxn, &[0, 0])
.unwrap()
.count();
assert_eq!(count, 3);
drop(rtxn);
@ -718,9 +716,12 @@ mod tests {
let rtxn = index.read_txn().unwrap();
// Only count the field_id 0 and level 0 facet values.
// TODO we must support typed CSVs for numbers to be understood.
let count = index.facet_id_f64_docids
let count = index
.facet_id_f64_docids
.remap_key_type::<ByteSlice>()
.prefix_iter(&rtxn, &[0, 0]).unwrap().count();
.prefix_iter(&rtxn, &[0, 0])
.unwrap()
.count();
assert_eq!(count, 4);
}
@ -969,7 +970,7 @@ mod tests {
let mut wtxn = index.write_txn().unwrap();
let mut builder = Settings::new(&mut wtxn, &index, 0);
builder.set_displayed_fields(vec!["hello".to_string()]);
builder.set_filterable_fields(hashset!{ S("age"), S("toto") });
builder.set_filterable_fields(hashset! { S("age"), S("toto") });
builder.set_criteria(vec!["asc(toto)".to_string()]);
builder.execute(|_, _| ()).unwrap();
wtxn.commit().unwrap();
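
As a reading aid for the reformatted Settings code above, here is a minimal usage sketch. It is not part of the commit; it only reuses calls that already appear in the tests in this diff (Settings::new, set_filterable_fields, execute, and the index transaction helpers) and assumes an already-opened Index, as those tests do.

use big_s::S;
use maplit::hashset;

// Minimal sketch, not from the commit: exercises the Setting::Set path of the
// filterable-fields setting using only calls shown in the tests above.
fn example_set_filterable_fields(index: &Index) {
    // Write the new setting inside a write transaction.
    let mut wtxn = index.write_txn().unwrap();
    let mut builder = Settings::new(&mut wtxn, &index, 0);
    builder.set_filterable_fields(hashset! { S("age") });
    builder.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();

    // Read the stored value back to confirm the update was applied.
    let rtxn = index.read_txn().unwrap();
    assert_eq!(index.filterable_fields(&rtxn).unwrap(), hashset! { S("age") });
}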

View File

@ -1,8 +1,8 @@
use grenad::CompressionType;
use rayon::ThreadPool;
use super::{ClearDocuments, DeleteDocuments, Facets, IndexDocuments, Settings};
use crate::{Index, Result};
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
pub struct UpdateBuilder<'a> {
pub(crate) log_every_n: Option<usize>,
@ -67,8 +67,7 @@ impl<'a> UpdateBuilder<'a> {
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> ClearDocuments<'t, 'u, 'i>
{
) -> ClearDocuments<'t, 'u, 'i> {
ClearDocuments::new(wtxn, index, self.update_id)
}
@ -76,8 +75,7 @@ impl<'a> UpdateBuilder<'a> {
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> Result<DeleteDocuments<'t, 'u, 'i>>
{
) -> Result<DeleteDocuments<'t, 'u, 'i>> {
DeleteDocuments::new(wtxn, index, self.update_id)
}
@ -85,8 +83,7 @@ impl<'a> UpdateBuilder<'a> {
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> IndexDocuments<'t, 'u, 'i, 'a>
{
) -> IndexDocuments<'t, 'u, 'i, 'a> {
let mut builder = IndexDocuments::new(wtxn, index, self.update_id);
builder.log_every_n = self.log_every_n;
@ -105,8 +102,7 @@ impl<'a> UpdateBuilder<'a> {
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> Settings<'a, 't, 'u, 'i>
{
) -> Settings<'a, 't, 'u, 'i> {
let mut builder = Settings::new(wtxn, index, self.update_id);
builder.log_every_n = self.log_every_n;
@ -125,8 +121,7 @@ impl<'a> UpdateBuilder<'a> {
self,
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> Facets<'t, 'u, 'i>
{
) -> Facets<'t, 'u, 'i> {
let mut builder = Facets::new(wtxn, index, self.update_id);
builder.chunk_compression_type = self.chunk_compression_type;
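
A hypothetical sketch of how the UpdateBuilder methods reformatted above hand out the individual update builders: the UpdateBuilder constructor is not shown in this diff, so the UpdateBuilder::new(0) call below is an assumption, while the settings call and the Settings methods match the signatures and tests shown earlier.

// Hypothetical sketch, not from the commit: obtain a Settings builder through
// UpdateBuilder and run a small update with it.
fn example_update_builder(index: &Index) {
    let mut wtxn = index.write_txn().unwrap();
    let update_builder = UpdateBuilder::new(0); // assumed constructor taking an update id
    let mut settings = update_builder.settings(&mut wtxn, &index);
    settings.set_displayed_fields(vec!["title".to_string()]);
    settings.execute(|_, _| ()).unwrap();
    wtxn.commit().unwrap();
}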

View File

@ -1,15 +1,13 @@
use std::str;
use crate::Index;
use fst::Streamer;
use grenad::CompressionType;
use heed::types::ByteSlice;
use crate::Result;
use crate::update::index_documents::WriteMethod;
use crate::update::index_documents::{
create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database,
create_sorter, roaring_bitmap_merge, sorter_into_lmdb_database, WriteMethod,
};
use crate::{Index, Result};
pub struct WordPrefixDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -22,7 +20,10 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
}
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordPrefixDocids<'t, 'u, 'i> {
WordPrefixDocids {
wtxn,
index,

View File

@ -1,18 +1,17 @@
use std::str;
use fst::automaton::{Automaton, Str};
use fst::{Streamer, IntoStreamer};
use fst::{IntoStreamer, Streamer};
use grenad::CompressionType;
use heed::BytesEncode;
use heed::types::ByteSlice;
use heed::BytesEncode;
use log::debug;
use crate::{Index, Result};
use crate::heed_codec::StrStrU8Codec;
use crate::update::index_documents::{
WriteMethod, create_sorter, sorter_into_lmdb_database,
cbo_roaring_bitmap_merge,
cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod,
};
use crate::{Index, Result};
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -28,8 +27,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordPrefixPairProximityDocids<'t, 'u, 'i>
{
) -> WordPrefixPairProximityDocids<'t, 'u, 'i> {
WordPrefixPairProximityDocids {
wtxn,
index,

View File

@ -1,25 +1,23 @@
use std::{cmp, str};
use std::convert::TryFrom;
use std::fs::File;
use std::num::NonZeroU32;
use std::{cmp, str};
use fst::automaton::{self, Automaton};
use fst::{Streamer, IntoStreamer};
use grenad::{CompressionType, Reader, Writer, FileFuse};
use fst::{IntoStreamer, Streamer};
use grenad::{CompressionType, FileFuse, Reader, Writer};
use heed::types::{ByteSlice, DecodeIgnore, Str};
use heed::{BytesEncode, Error};
use log::debug;
use roaring::RoaringBitmap;
use crate::error::InternalError;
use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
use crate::Result;
use crate::update::index_documents::WriteMethod;
use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec};
use crate::update::index_documents::{
create_writer, create_sorter, writer_into_reader, write_into_lmdb_database,
cbo_roaring_bitmap_merge, sorter_into_lmdb_database
cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database,
write_into_lmdb_database, writer_into_reader, WriteMethod,
};
use crate::{Index, TreeLevel};
use crate::{Index, Result, TreeLevel};
pub struct WordsLevelPositions<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -34,7 +32,10 @@ pub struct WordsLevelPositions<'t, 'u, 'i> {
}
impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
) -> WordsLevelPositions<'t, 'u, 'i> {
WordsLevelPositions {
wtxn,
index,
@ -144,7 +145,9 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
self.wtxn,
*self.index.word_prefix_level_position_docids.as_polymorph(),
entries,
|_, _| Err(InternalError::IndexingMergingKeys { process: "word prefix level position" }),
|_, _| {
Err(InternalError::IndexingMergingKeys { process: "word prefix level position" })
},
WriteMethod::Append,
)?;
@ -176,13 +179,11 @@ fn compute_positions_levels(
shrink_size: Option<u64>,
level_group_size: NonZeroU32,
min_level_size: NonZeroU32,
) -> Result<Reader<FileFuse>>
{
) -> Result<Reader<FileFuse>> {
// It is forbidden to keep a cursor and write in a database at the same time with LMDB
// therefore we write the facet levels entries into a grenad file before transferring them.
let mut writer = tempfile::tempfile().and_then(|file| {
create_writer(compression_type, compression_level, file)
})?;
let mut writer = tempfile::tempfile()
.and_then(|file| create_writer(compression_type, compression_level, file))?;
for result in words_db.iter(rtxn)? {
let (word, ()) = result?;
@ -193,7 +194,8 @@ fn compute_positions_levels(
left..=right
};
let first_level_size = words_positions_db.remap_data_type::<DecodeIgnore>()
let first_level_size = words_positions_db
.remap_data_type::<DecodeIgnore>()
.range(rtxn, &level_0_range)?
.fold(Ok(0u32), |count, result| result.and(count).map(|c| c + 1))?;
@ -253,8 +255,7 @@ fn write_level_entry(
left: u32,
right: u32,
ids: &RoaringBitmap,
) -> Result<()>
{
) -> Result<()> {
let key = (word, level, left, right);
let key = StrLevelPositionCodec::bytes_encode(&key).ok_or(Error::Encoding)?;
let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?;

View File

@ -2,7 +2,8 @@ use std::iter::FromIterator;
use std::str;
use fst::Streamer;
use crate::{Index, SmallString32, Result};
use crate::{Index, Result, SmallString32};
pub struct WordsPrefixesFst<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -17,8 +18,7 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
update_id: u64,
) -> WordsPrefixesFst<'t, 'u, 'i>
{
) -> WordsPrefixesFst<'t, 'u, 'i> {
WordsPrefixesFst {
wtxn,
index,
@ -55,7 +55,6 @@ impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
for n in 1..=self.max_prefix_length {
let mut current_prefix = SmallString32::new();
let mut current_prefix_count = 0;
let mut builder = fst::SetBuilder::memory();