format the whole project

Tamo
2021-06-16 18:33:33 +02:00
parent ba30cef987
commit 9716fb3b36
68 changed files with 3327 additions and 2336 deletions
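This is a mechanical formatting pass over the whole workspace, presumably a `cargo fmt` run: import lists are merged and sorted alphabetically, declarations that overflow the line width are wrapped with one element per line and a trailing comma, and match arms that turn into blocks drop their trailing commas. A minimal sketch of the before/after shape, using only the standard library; the `open_reader` helper is illustrative and not part of the project:

```rust
// Illustrative only: a declaration that used to read
//
//   use std::io::{self, Seek, SeekFrom, BufReader, BufRead};
//   pub fn open_reader(path: &std::path::Path, capacity: usize, start_offset: u64) -> io::Result<BufReader<File>> { ... }
//
// comes out of the formatter with the import list sorted and the parameters
// wrapped, one per line, with a trailing comma:
use std::fs::File;
use std::io::{self, BufRead, BufReader, Seek, SeekFrom};

pub fn open_reader(
    path: &std::path::Path,
    capacity: usize,
    start_offset: u64,
) -> io::Result<BufReader<File>> {
    let mut reader = BufReader::with_capacity(capacity, File::open(path)?);
    reader.seek(SeekFrom::Start(start_offset))?; // uses the `Seek` import
    reader.fill_buf()?; // uses the `BufRead` import
    Ok(reader)
}
```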

View File

@ -1,7 +1,7 @@
use std::borrow::Cow;
use std::collections::HashSet;
use std::fs::File;
use std::io::{self, Seek, SeekFrom, BufReader, BufRead};
use std::io::{self, BufRead, BufReader, Seek, SeekFrom};
use std::num::{NonZeroU32, NonZeroUsize};
use std::result::Result as StdResult;
use std::str;
@ -10,28 +10,26 @@ use std::time::Instant;
use bstr::ByteSlice as _;
use chrono::Utc;
use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType};
use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer};
use heed::types::ByteSlice;
use log::{debug, info, error};
use log::{debug, error, info};
use memmap::Mmap;
use rayon::prelude::*;
use rayon::ThreadPool;
use serde::{Serialize, Deserialize};
use serde::{Deserialize, Serialize};
use crate::error::{Error, InternalError};
use crate::{Index, Result};
use crate::update::{
Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep,
WordPrefixPairProximityDocids,
};
use self::store::{Store, Readers};
pub use self::merge_function::{
fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
};
use self::store::{Readers, Store};
pub use self::transform::{Transform, TransformOutput};
use crate::MergeFn;
use super::UpdateBuilder;
use crate::error::{Error, InternalError};
use crate::update::{
Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
WordsLevelPositions, WordsPrefixesFst,
};
use crate::{Index, MergeFn, Result};
mod merge_function;
mod store;
@ -48,7 +46,11 @@ pub enum WriteMethod {
GetMergePut,
}
pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
pub fn create_writer(
typ: CompressionType,
level: Option<u32>,
file: File,
) -> io::Result<Writer<File>> {
let mut builder = Writer::builder();
builder.compression_type(typ);
if let Some(level) = level {
@ -64,8 +66,7 @@ pub fn create_sorter<E>(
chunk_fusing_shrink_size: Option<u64>,
max_nb_chunks: Option<usize>,
max_memory: Option<usize>,
) -> Sorter<MergeFn<E>>
{
) -> Sorter<MergeFn<E>> {
let mut builder = Sorter::builder(merge);
if let Some(shrink_size) = chunk_fusing_shrink_size {
builder.file_fusing_shrink_size(shrink_size);
@ -83,7 +84,10 @@ pub fn create_sorter<E>(
builder.build()
}
pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Result<Reader<FileFuse>> {
pub fn writer_into_reader(
writer: Writer<File>,
shrink_size: Option<u64>,
) -> Result<Reader<FileFuse>> {
let mut file = writer.into_inner()?;
file.seek(SeekFrom::Start(0))?;
let file = if let Some(shrink_size) = shrink_size {
@ -97,8 +101,7 @@ pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Res
pub fn merge_readers<E>(
sources: Vec<Reader<FileFuse>>,
merge: MergeFn<E>,
) -> Merger<FileFuse, MergeFn<E>>
{
) -> Merger<FileFuse, MergeFn<E>> {
let mut builder = Merger::builder(merge);
builder.extend(sources);
builder.build()
@ -118,13 +121,7 @@ where
let before = Instant::now();
let merger = merge_readers(sources, merge);
merger_iter_into_lmdb_database(
wtxn,
database,
merger.into_merge_iter()?,
merge,
method,
)?;
merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?;
debug!("MTBL stores merged in {:.02?}!", before.elapsed());
Ok(())
@ -149,7 +146,7 @@ where
while let Some((k, v)) = reader.next()? {
out_iter.append(k, v)?;
}
},
}
WriteMethod::GetMergePut => {
while let Some((k, v)) = reader.next()? {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
@ -158,11 +155,11 @@ where
let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..];
let val = merge(k, &vals)?;
iter.put_current(k, &val)?;
},
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
},
}
}
}
}
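For orientation, the two `WriteMethod` branches above differ in how they write into LMDB: `Append` streams the already sorted key/value pairs straight into the database, which is only done when it starts out empty, while `GetMergePut` looks for an existing value under the key and combines old and new through the merge function before writing the result back. A minimal, std-only sketch of the GetMergePut idea, with a `HashMap` standing in for the LMDB database and illustrative names:

```rust
use std::collections::HashMap;

/// Sketch of the GetMergePut strategy: merge the incoming value with any value
/// already stored under the same key, then write the result back. Append would
/// be a plain insert of the pairs in key order.
fn get_merge_put(
    db: &mut HashMap<Vec<u8>, Vec<u8>>,
    key: &[u8],
    value: &[u8],
    merge: impl Fn(&[u8], &[u8]) -> Vec<u8>,
) {
    let merged = match db.get(key) {
        Some(old) => merge(old, value),
        None => value.to_vec(),
    };
    db.insert(key.to_vec(), merged);
}
```

In the real code the merge function is one of `fst_merge`, `roaring_bitmap_merge`, `cbo_roaring_bitmap_merge`, or `keep_first`, depending on the database being filled.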
@ -181,18 +178,12 @@ pub fn sorter_into_lmdb_database<E>(
) -> Result<()>
where
Error: From<E>,
Error: From<grenad::Error<E>>
Error: From<grenad::Error<E>>,
{
debug!("Writing MTBL sorter...");
let before = Instant::now();
merger_iter_into_lmdb_database(
wtxn,
database,
sorter.into_iter()?,
merge,
method,
)?;
merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?;
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
Ok(())
@ -214,7 +205,7 @@ where
while let Some((k, v)) = sorter.next()? {
out_iter.append(k, v)?;
}
},
}
WriteMethod::GetMergePut => {
while let Some((k, v)) = sorter.next()? {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
@ -226,14 +217,14 @@ where
InternalError::IndexingMergingKeys { process: "get-put-merge" }
})?;
iter.put_current(k, &val)?;
},
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
},
}
}
}
},
}
}
Ok(())
@ -341,9 +332,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// Early return when there is no document to add
if reader.buffer().is_empty() {
return Ok(DocumentAdditionResult {
nb_documents: 0,
})
return Ok(DocumentAdditionResult { nb_documents: 0 });
}
self.index.set_updated_at(self.wtxn, &Utc::now())?;
@ -367,7 +356,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let output = match self.update_format {
UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?,
UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?,
UpdateFormat::JsonStream => transform.output_from_json_stream(reader, &progress_callback)?,
UpdateFormat::JsonStream => {
transform.output_from_json_stream(reader, &progress_callback)?
}
};
let nb_documents = output.documents_count;
@ -380,7 +371,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()>
where
F: Fn(UpdateIndexingStep) + Sync
F: Fn(UpdateIndexingStep) + Sync,
{
let before_indexing = Instant::now();
@ -457,7 +448,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
// settings if none have already been set.
backup_pool = rayon::ThreadPoolBuilder::new().build()?;
&backup_pool
},
}
};
let readers = pool.install(|| {
@ -595,11 +586,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
let mut documents_ids = self.index.documents_ids(self.wtxn)?;
let contains_documents = !documents_ids.is_empty();
let write_method = if contains_documents {
WriteMethod::GetMergePut
} else {
WriteMethod::Append
};
let write_method =
if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append };
debug!("Writing using the write method: {:?}", write_method);
@ -634,7 +622,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
*self.index.docid_word_positions.as_polymorph(),
docid_word_positions_readers,
keep_first,
write_method
write_method,
)?;
database_count += 1;
@ -649,7 +637,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
*self.index.documents.as_polymorph(),
documents_readers,
keep_first,
write_method
write_method,
)?;
database_count += 1;
@ -730,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
fst_merge,
WriteMethod::GetMergePut,
)?;
},
}
DatabaseType::WordDocids => {
debug!("Writing the words docids into LMDB on disk...");
let db = *self.index.word_docids.as_polymorph();
@ -741,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
roaring_bitmap_merge,
write_method,
)?;
},
}
DatabaseType::FacetLevel0NumbersDocids => {
debug!("Writing the facet numbers docids into LMDB on disk...");
let db = *self.index.facet_id_f64_docids.as_polymorph();
@ -752,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
cbo_roaring_bitmap_merge,
write_method,
)?;
},
}
DatabaseType::FieldIdWordCountDocids => {
debug!("Writing the field id word count docids into LMDB on disk...");
let db = *self.index.field_id_word_count_docids.as_polymorph();
@ -763,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
cbo_roaring_bitmap_merge,
write_method,
)?;
},
}
DatabaseType::WordLevel0PositionDocids => {
debug!("Writing the word level 0 positions docids into LMDB on disk...");
let db = *self.index.word_level_position_docids.as_polymorph();
@ -848,9 +836,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
#[cfg(test)]
mod tests {
use super::*;
use heed::EnvOpenOptions;
use super::*;
#[test]
fn simple_document_replacement() {
let path = tempfile::tempdir().unwrap();
@ -1053,9 +1042,8 @@ mod tests {
assert_eq!(count, 3);
let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
let (kevin_id, _) = docs.iter().find(|(_, d)| {
d.get(0).unwrap() == br#""updated kevin""#
}).unwrap();
let (kevin_id, _) =
docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
let (id, doc) = docs[*kevin_id as usize];
assert_eq!(id, *kevin_id);

View File

@ -8,25 +8,29 @@ use std::{cmp, iter};
use bstr::ByteSlice as _;
use fst::Set;
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
use heed::BytesEncode;
use linked_hash_map::LinkedHashMap;
use log::{debug, info};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
use meilisearch_tokenizer::token::SeparatorKind;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use serde_json::Value;
use tempfile::tempfile;
use super::merge_function::{
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
};
use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
use crate::error::{Error, InternalError, SerializationError};
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
FieldDocIdFacetStringCodec,
};
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
use crate::update::UpdateIndexingStep;
use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result};
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge};
use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32};
const LMDB_MAX_KEY_LENGTH: usize = 511;
const ONE_KILOBYTE: usize = 1024 * 1024;
@ -56,7 +60,8 @@ pub struct Store<'s, A> {
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
word_docids_limit: usize,
field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>,
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids:
LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
words_pairs_proximities_docids_limit: usize,
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>,
@ -93,8 +98,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
chunk_compression_level: Option<u32>,
chunk_fusing_shrink_size: Option<u64>,
stop_words: Option<&'s Set<A>>,
) -> Result<Self>
{
) -> Result<Self> {
// We divide the max memory by the number of sorters the Store has.
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5));
let linked_hash_map_size = linked_hash_map_size.unwrap_or(500);
@ -172,12 +176,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Some(1024 * 1024 * 1024), // 1MB
);
let documents_writer = tempfile().and_then(|f| {
create_writer(chunk_compression_type, chunk_compression_level, f)
})?;
let docid_word_positions_writer = tempfile().and_then(|f| {
create_writer(chunk_compression_type, chunk_compression_level, f)
})?;
let documents_writer = tempfile()
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
let docid_word_positions_writer = tempfile()
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
let mut config = AnalyzerConfig::default();
if let Some(stop_words) = stop_words {
@ -224,7 +226,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> {
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.word_docids.get_refresh(word.as_bytes()) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
let word_vec = SmallVec32::from(word.as_bytes());
// A newly inserted element is appended at the end of the linked hash map.
@ -246,15 +250,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
field_id: FieldId,
value: OrderedFloat<f64>,
id: DocumentId,
) -> Result<()>
{
) -> Result<()> {
let sorter = &mut self.field_id_docid_facet_numbers_sorter;
Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?;
let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_number_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
// A newly inserted element is appended at the end of the linked hash map.
self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
@ -279,15 +284,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
field_id: FieldId,
value: String,
id: DocumentId,
) -> Result<()>
{
) -> Result<()> {
let sorter = &mut self.field_id_docid_facet_strings_sorter;
Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?;
let key = (field_id, value);
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
match self.facet_field_string_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
// A newly inserted element is appended at the end of the linked hash map.
self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
@ -309,10 +315,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// Save the documents ids under the words pairs proximities that it contains.
fn insert_words_pairs_proximities_docids<'a>(
&mut self,
words_pairs_proximities: impl IntoIterator<Item=((&'a str, &'a str), u8)>,
words_pairs_proximities: impl IntoIterator<Item = ((&'a str, &'a str), u8)>,
id: DocumentId,
) -> Result<()>
{
) -> Result<()> {
for ((w1, w2), prox) in words_pairs_proximities {
let w1 = SmallVec32::from(w1.as_bytes());
let w2 = SmallVec32::from(w2.as_bytes());
@ -320,7 +325,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// if get_refresh finds the element it is assured
// to be at the end of the linked hash map.
match self.words_pairs_proximities_docids.get_refresh(&key) {
Some(old) => { old.insert(id); },
Some(old) => {
old.insert(id);
}
None => {
// A newly inserted element is appended at the end of the linked hash map.
let ids = RoaringBitmap::from_iter(Some(id));
@ -337,7 +344,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
// Removing front elements is equivalent to removing the LRUs.
let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front());
iter.take(overflow).for_each(|x| lrus.push(x));
Self::write_words_pairs_proximities(&mut self.words_pairs_proximities_docids_sorter, lrus)?;
Self::write_words_pairs_proximities(
&mut self.words_pairs_proximities_docids_sorter,
lrus,
)?;
}
Ok(())
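The `LinkedHashMap` caches in this store all follow the LRU discipline the comments describe: `get_refresh` moves an existing entry to the back of the map, new entries are appended at the back, and when a cache grows past its limit the front entries, the least recently used ones, are popped and flushed into a sorter. A minimal sketch of that pattern, assuming the `linked-hash-map` crate; the `touch` helper and the `flush` callback are illustrative, not from the codebase:

```rust
use linked_hash_map::LinkedHashMap;

// Record a document id under `word`, keeping at most `limit` entries in memory.
// Entries evicted from the front (the least recently used ones) are handed to
// `flush`, which in the real code writes them into a grenad sorter.
fn touch(
    map: &mut LinkedHashMap<String, Vec<u32>>,
    word: &str,
    doc_id: u32,
    limit: usize,
    mut flush: impl FnMut(String, Vec<u32>),
) {
    match map.get_refresh(word) {
        // A hit moves the entry to the back of the map (most recently used).
        Some(ids) => ids.push(doc_id),
        // A newly inserted entry is appended at the back of the map.
        None => {
            map.insert(word.to_string(), vec![doc_id]);
        }
    }
    // Popping front entries is equivalent to evicting the LRUs.
    while map.len() > limit {
        if let Some((evicted_word, ids)) = map.pop_front() {
            flush(evicted_word, ids);
        }
    }
}
```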
@ -350,8 +360,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
facet_strings_values: &mut HashMap<FieldId, Vec<String>>,
record: &[u8],
) -> Result<()>
{
) -> Result<()> {
// We compute the list of words pairs proximities (self-join) and write it directly to disk.
let words_pair_proximities = compute_words_pair_proximities(&words_positions);
self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?;
@ -362,8 +371,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
}
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?;
Self::write_docid_word_positions(
&mut self.docid_word_positions_writer,
document_id,
words_positions,
)?;
Self::write_word_position_docids(
&mut self.word_level_position_docids_sorter,
document_id,
words_positions,
)?;
words_positions.clear();
@ -387,7 +404,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn write_words_pairs_proximities<E>(
sorter: &mut Sorter<MergeFn<E>>,
iter: impl IntoIterator<Item=((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
iter: impl IntoIterator<Item = ((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
) -> Result<()>
where
Error: From<E>,
@ -419,8 +436,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
writer: &mut Writer<File>,
id: DocumentId,
words_positions: &HashMap<String, SmallVec32<Position>>,
) -> Result<()>
{
) -> Result<()> {
// We prefix the words by the document id.
let mut key = id.to_be_bytes().to_vec();
let mut buffer = Vec::new();
@ -484,12 +500,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(())
}
fn write_facet_field_string_docids<I, E>(
sorter: &mut Sorter<MergeFn<E>>,
iter: I,
) -> Result<()>
fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
where
I: IntoIterator<Item=((FieldId, String), RoaringBitmap)>,
I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>,
Error: From<E>,
{
let mut key_buffer = Vec::new();
@ -510,12 +523,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
Ok(())
}
fn write_facet_field_number_docids<I, E>(
sorter: &mut Sorter<MergeFn<E>>,
iter: I,
) -> Result<()>
fn write_facet_field_number_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
where
I: IntoIterator<Item=((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
I: IntoIterator<Item = ((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
Error: From<E>,
{
let mut data_buffer = Vec::new();
@ -579,7 +589,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
fn write_word_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
where
I: IntoIterator<Item=(SmallVec32<u8>, RoaringBitmap)>,
I: IntoIterator<Item = (SmallVec32<u8>, RoaringBitmap)>,
Error: From<E>,
{
let mut key = Vec::new();
@ -611,7 +621,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
log_every_n: Option<usize>,
mut progress_callback: F,
) -> Result<Readers>
where F: FnMut(UpdateIndexingStep),
where
F: FnMut(UpdateIndexingStep),
{
debug!("{:?}: Indexing in a Store...", thread_index);
@ -629,7 +640,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
if count % num_threads == thread_index {
// This is a log routine that we do every `log_every_n` documents.
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) {
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
info!(
"We have seen {} documents so far ({:.02?}).",
format_count(count),
before.elapsed()
);
progress_callback(UpdateIndexingStep::IndexDocuments {
documents_seen: count,
total_documents: documents_count,
@ -638,12 +653,20 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
}
for (attr, content) in document.iter() {
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) {
let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr)
{
let value =
serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
let (facet_numbers, facet_strings) = extract_facet_values(&value);
facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers);
facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings);
facet_numbers_values
.entry(attr)
.or_insert_with(Vec::new)
.extend(facet_numbers);
facet_strings_values
.entry(attr)
.or_insert_with(Vec::new)
.extend(facet_strings);
if self.searchable_fields.contains(&attr) {
let content = match json_to_string(&value) {
@ -658,12 +681,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
last_pos = Some(pos);
let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
words_positions
.entry(token.text().to_string())
.or_insert_with(SmallVec32::new)
.push(position);
}
if let Some(last_pos) = last_pos.filter(|p| *p <= 10) {
let key = (attr, last_pos as u8 + 1);
self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id);
self.field_id_word_count_docids
.entry(key)
.or_insert_with(RoaringBitmap::new)
.insert(document_id);
}
}
}
@ -713,7 +742,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
self.facet_field_string_docids,
)?;
let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut word_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut builder = fst::SetBuilder::memory();
let mut iter = self.word_docids_sorter.into_iter()?;
@ -737,37 +767,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.main_sorter.write_into(&mut main_wtr)?;
let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;
let mut words_pairs_proximities_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.words_pairs_proximities_docids_sorter
.write_into(&mut words_pairs_proximities_docids_wtr)?;
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut word_level_position_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut field_id_word_count_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?;
let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut facet_field_numbers_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
let mut facet_field_strings_docids_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?;
let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?;
let mut field_id_docid_facet_numbers_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_numbers_sorter
.write_into(&mut field_id_docid_facet_numbers_wtr)?;
let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?;
let mut field_id_docid_facet_strings_wtr =
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
self.field_id_docid_facet_strings_sorter
.write_into(&mut field_id_docid_facet_strings_wtr)?;
let main = writer_into_reader(main_wtr, shrink_size)?;
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
let words_pairs_proximities_docids =
writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
let word_level_position_docids =
writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
let field_id_word_count_docids =
writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
let facet_field_numbers_docids =
writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
let facet_field_strings_docids =
writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
let field_id_docid_facet_numbers =
writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
let field_id_docid_facet_strings =
writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
let docid_word_positions =
writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
let documents = writer_into_reader(self.documents_writer, shrink_size)?;
Ok(Readers {
@ -792,8 +840,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
/// close to each other.
fn compute_words_pair_proximities(
word_positions: &HashMap<String, SmallVec32<Position>>,
) -> HashMap<(&str, &str), u8>
{
) -> HashMap<(&str, &str), u8> {
use itertools::Itertools;
let mut words_pair_proximities = HashMap::new();
@ -828,31 +875,34 @@ fn lmdb_key_valid_size(key: &[u8]) -> bool {
/// take an iterator on tokens and compute their relative position depending on separator kinds
/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
/// else we keep the standard proximity of 1 between words.
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
fn process_tokens<'a>(
tokens: impl Iterator<Item = Token<'a>>,
) -> impl Iterator<Item = (usize, Token<'a>)> {
tokens
.skip_while(|token| token.is_separator().is_some())
.scan((0, None), |(offset, prev_kind), token| {
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
*offset += match *prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1,
None => 0,
};
*prev_kind = Some(token.kind)
}
TokenKind::Separator(SeparatorKind::Hard) => {
*prev_kind = Some(token.kind);
}
TokenKind::Separator(SeparatorKind::Soft)
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
*prev_kind = Some(token.kind);
}
_ => (),
match token.kind {
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
*offset += match *prev_kind {
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
Some(_) => 1,
None => 0,
};
*prev_kind = Some(token.kind)
}
TokenKind::Separator(SeparatorKind::Hard) => {
*prev_kind = Some(token.kind);
}
TokenKind::Separator(SeparatorKind::Soft)
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
{
*prev_kind = Some(token.kind);
}
_ => (),
}
Some((*offset, token))
})
.filter(|(_, t)| t.is_word())
.filter(|(_, t)| t.is_word())
}
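To make the offset rule above concrete, here is a simplified, self-contained version of the same bookkeeping that works on plain strings instead of `meilisearch_tokenizer` tokens; the `relative_positions` helper is illustrative only. Words separated by soft separators sit 1 position apart, and a hard separator in between pushes the next word 8 positions further:

```rust
// Simplified sketch of the relative-position rule: "." stands for a hard
// separator and " " for a soft one; only words are kept in the output.
fn relative_positions(tokens: &[&str]) -> Vec<(usize, String)> {
    let mut offset = 0;
    let mut saw_hard = false;
    let mut out: Vec<(usize, String)> = Vec::new();
    for token in tokens {
        match *token {
            "." => saw_hard = true,
            " " => (),
            word => {
                if !out.is_empty() {
                    offset += if saw_hard { 8 } else { 1 };
                }
                saw_hard = false;
                out.push((offset, word.to_string()));
            }
        }
    }
    out
}

// relative_positions(&["new", " ", "york", ".", "subway"])
// -> [(0, "new"), (1, "york"), (9, "subway")]
```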
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
@ -865,18 +915,22 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
match value {
Value::Null => (),
Value::Bool(b) => output_strings.push(b.to_string()),
Value::Number(number) => if let Some(float) = number.as_f64() {
output_numbers.push(float);
},
Value::Number(number) => {
if let Some(float) = number.as_f64() {
output_numbers.push(float);
}
}
Value::String(string) => {
let string = string.trim().to_lowercase();
output_strings.push(string);
},
Value::Array(values) => if can_recurse {
for value in values {
inner_extract_facet_values(value, false, output_numbers, output_strings);
}
Value::Array(values) => {
if can_recurse {
for value in values {
inner_extract_facet_values(value, false, output_numbers, output_strings);
}
}
},
}
Value::Object(_) => (),
}
}

View File

@ -10,14 +10,15 @@ use log::info;
use roaring::RoaringBitmap;
use serde_json::{Map, Value};
use crate::error::{Error, UserError, InternalError};
use crate::index::db_name;
use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution};
use crate::{Index, Result};
use super::merge_function::merge_two_obkvs;
use super::{create_writer, create_sorter, IndexDocumentsMethod};
use super::{create_sorter, create_writer, IndexDocumentsMethod};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs};
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::{
ExternalDocumentsIds, FieldId, FieldsDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32,
};
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
@ -64,7 +65,11 @@ impl Transform<'_, '_> {
self.output_from_generic_json(reader, false, progress_callback)
}
pub fn output_from_json_stream<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
pub fn output_from_json_stream<R, F>(
self,
reader: R,
progress_callback: F,
) -> Result<TransformOutput>
where
R: Read,
F: Fn(UpdateIndexingStep) + Sync,
@ -86,14 +91,16 @@ impl Transform<'_, '_> {
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
// Deserialize the whole batch of documents in memory.
let mut documents: Peekable<Box<dyn Iterator<Item=serde_json::Result<Map<String, Value>>>>> = if is_stream {
let mut documents: Peekable<
Box<dyn Iterator<Item = serde_json::Result<Map<String, Value>>>>,
> = if is_stream {
let iter = serde_json::Deserializer::from_reader(reader).into_iter();
let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>;
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable()
} else {
let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?;
let iter = vec.into_iter().map(Ok);
let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>;
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
iter.peekable()
};
@ -104,15 +111,16 @@ impl Transform<'_, '_> {
Err(_) => {
let error = documents.next().unwrap().unwrap_err();
return Err(UserError::SerdeJson(error).into());
},
}
};
let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
let alternative_name =
first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
let (primary_key_id, primary_key) = compute_primary_key_pair(
self.index.primary_key(self.rtxn)?,
&mut fields_ids_map,
alternative_name,
self.autogenerate_docids
self.autogenerate_docids,
)?;
if documents.peek().is_none() {
@ -173,9 +181,11 @@ impl Transform<'_, '_> {
Some(value) => match value {
Value::String(string) => Cow::Borrowed(string.as_str()),
Value::Number(number) => Cow::Owned(number.to_string()),
content => return Err(UserError::InvalidDocumentId {
document_id: content.clone(),
}.into()),
content => {
return Err(
UserError::InvalidDocumentId { document_id: content.clone() }.into()
)
}
},
None => {
if !self.autogenerate_docids {
@ -183,7 +193,7 @@ impl Transform<'_, '_> {
}
let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
Cow::Borrowed(uuid)
},
}
};
// We iterate over the fields ids in order.
@ -194,7 +204,8 @@ impl Transform<'_, '_> {
// and this should be the document id, we return the one we generated.
if let Some(value) = document.get(name) {
// We serialize the attribute values.
serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?;
serde_json::to_writer(&mut json_buffer, value)
.map_err(InternalError::SerdeJson)?;
writer.insert(field_id, &json_buffer)?;
}
@ -202,7 +213,8 @@ impl Transform<'_, '_> {
if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
}.into());
}
.into());
}
}
@ -248,9 +260,9 @@ impl Transform<'_, '_> {
// Extract the position of the primary key in the current headers, None if not found.
let primary_key_pos = match self.index.primary_key(self.rtxn)? {
Some(primary_key) => {
// The primary key is known so we must find the position in the CSV headers.
headers.iter().position(|h| h == primary_key)
},
// The primary key is known so we must find the position in the CSV headers.
headers.iter().position(|h| h == primary_key)
}
None => headers.iter().position(is_primary_key),
};
@ -261,7 +273,7 @@ impl Transform<'_, '_> {
self.index.primary_key(self.rtxn)?,
&mut fields_ids_map,
alternative_name,
self.autogenerate_docids
self.autogenerate_docids,
)?;
// The primary key field is not present in the header, so we need to create it.
@ -308,27 +320,30 @@ impl Transform<'_, '_> {
// We validate the document id [a-zA-Z0-9\-_].
match validate_document_id(&external_id) {
Some(valid) => valid,
None => return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
}.into()),
None => {
return Err(UserError::InvalidDocumentId {
document_id: Value::from(external_id),
}
.into())
}
}
},
}
None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
};
// When the primary_key_field_id is found in the fields ids list
// we return the generated document id instead of the record field.
let iter = fields_ids.iter()
.map(|(fi, i)| {
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
(fi, field)
});
let iter = fields_ids.iter().map(|(fi, i)| {
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
(fi, field)
});
// We retrieve the field id based on the fields ids map order.
for (field_id, field) in iter {
// We serialize the attribute values as JSON strings.
json_buffer.clear();
serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?;
serde_json::to_writer(&mut json_buffer, &field)
.map_err(InternalError::SerdeJson)?;
writer.insert(*field_id, &json_buffer)?;
}
@ -410,26 +425,27 @@ impl Transform<'_, '_> {
IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
IndexDocumentsMethod::UpdateDocuments => {
let key = BEU32::new(docid);
let base_obkv = self.index.documents.get(&self.rtxn, &key)?
.ok_or(InternalError::DatabaseMissingEntry {
let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
InternalError::DatabaseMissingEntry {
db_name: db_name::DOCUMENTS,
key: None,
})?;
},
)?;
let update_obkv = obkv::KvReader::new(update_obkv);
merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
(docid, obkv_buffer.as_slice())
}
}
},
}
None => {
// If this user id is new we add it to the external documents ids map
// for new ids and into the list of new documents.
let new_docid = available_documents_ids.next()
.ok_or(UserError::DocumentLimitReached)?;
let new_docid =
available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?;
new_external_documents_ids_builder.insert(external_id, new_docid as u64)?;
new_documents_ids.insert(new_docid);
(new_docid, update_obkv)
},
}
};
// We insert the document under the documents ids map into the final file.
@ -450,7 +466,8 @@ impl Transform<'_, '_> {
// We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?;
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
let mut writer =
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
// Once we have written all the documents into the final sorter, we write the documents
// into this writer, extract the file and reset the seek to be able to read it again.
@ -485,8 +502,7 @@ impl Transform<'_, '_> {
primary_key: String,
old_fields_ids_map: FieldsIdsMap,
new_fields_ids_map: FieldsIdsMap,
) -> Result<TransformOutput>
{
) -> Result<TransformOutput> {
let fields_distribution = self.index.fields_distribution(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
let documents_ids = self.index.documents_ids(self.rtxn)?;
@ -494,7 +510,8 @@ impl Transform<'_, '_> {
// We create a final writer to write the new documents in order from the sorter.
let file = tempfile::tempfile()?;
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
let mut writer =
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
let mut obkv_buffer = Vec::new();
for result in self.index.documents.iter(self.rtxn)? {
@ -561,20 +578,19 @@ fn compute_primary_key_pair(
return Err(UserError::MissingPrimaryKey.into());
}
DEFAULT_PRIMARY_KEY_NAME.to_string()
},
}
};
let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
Ok((id, name))
},
}
}
}
fn validate_document_id(document_id: &str) -> Option<&str> {
let document_id = document_id.trim();
Some(document_id).filter(|id| {
!id.is_empty() && id.chars().all(|c| {
matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')
})
!id.is_empty()
&& id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
})
}
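A few illustrative cases for the validation above, not tests from the repository: the id is trimmed first, and only ids made entirely of `[a-zA-Z0-9_-]` characters are accepted.

```rust
#[test]
fn document_id_validation_examples() {
    // surrounding whitespace is trimmed, [a-zA-Z0-9_-] is accepted
    assert_eq!(validate_document_id("  kevin-01_a  "), Some("kevin-01_a"));
    // spaces and punctuation are rejected, as is an id that is empty after trimming
    assert_eq!(validate_document_id("invalid id!"), None);
    assert_eq!(validate_document_id("   "), None);
}
```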
@ -583,8 +599,7 @@ mod test {
use super::*;
mod compute_primary_key {
use super::compute_primary_key_pair;
use super::FieldsIdsMap;
use super::{compute_primary_key_pair, FieldsIdsMap};
#[test]
fn should_return_primary_key_if_is_some() {
@ -594,7 +609,8 @@ mod test {
Some("toto"),
&mut fields_map,
Some("tata".to_string()),
false);
false,
);
assert_eq!(result.unwrap(), (0u8, "toto".to_string()));
assert_eq!(fields_map.len(), 1);
}
@ -602,11 +618,8 @@ mod test {
#[test]
fn should_return_alternative_if_primary_is_none() {
let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(
None,
&mut fields_map,
Some("tata".to_string()),
false);
let result =
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
assert_eq!(result.unwrap(), (0u8, "tata".to_string()));
assert_eq!(fields_map.len(), 1);
}
@ -614,23 +627,15 @@ mod test {
#[test]
fn should_return_default_if_both_are_none() {
let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(
None,
&mut fields_map,
None,
true);
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
assert_eq!(result.unwrap(), (0u8, "id".to_string()));
assert_eq!(fields_map.len(), 1);
}
#[test]
fn should_return_err_if_both_are_none_and_recompute_is_false(){
fn should_return_err_if_both_are_none_and_recompute_is_false() {
let mut fields_map = FieldsIdsMap::new();
let result = compute_primary_key_pair(
None,
&mut fields_map,
None,
false);
let result = compute_primary_key_pair(None, &mut fields_map, None, false);
assert!(result.is_err());
assert_eq!(fields_map.len(), 0);
}