mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-08-02 19:59:58 +00:00
format the whole project
This commit is contained in:
@ -1,7 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, Seek, SeekFrom, BufReader, BufRead};
|
||||
use std::io::{self, BufRead, BufReader, Seek, SeekFrom};
|
||||
use std::num::{NonZeroU32, NonZeroUsize};
|
||||
use std::result::Result as StdResult;
|
||||
use std::str;
|
||||
@ -10,28 +10,26 @@ use std::time::Instant;
|
||||
|
||||
use bstr::ByteSlice as _;
|
||||
use chrono::Utc;
|
||||
use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionType};
|
||||
use grenad::{CompressionType, FileFuse, Merger, MergerIter, Reader, Sorter, Writer};
|
||||
use heed::types::ByteSlice;
|
||||
use log::{debug, info, error};
|
||||
use log::{debug, error, info};
|
||||
use memmap::Mmap;
|
||||
use rayon::prelude::*;
|
||||
use rayon::ThreadPool;
|
||||
use serde::{Serialize, Deserialize};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::error::{Error, InternalError};
|
||||
use crate::{Index, Result};
|
||||
use crate::update::{
|
||||
Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep,
|
||||
WordPrefixPairProximityDocids,
|
||||
};
|
||||
use self::store::{Store, Readers};
|
||||
pub use self::merge_function::{
|
||||
fst_merge, cbo_roaring_bitmap_merge, roaring_bitmap_merge, keep_first
|
||||
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
|
||||
};
|
||||
use self::store::{Readers, Store};
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
|
||||
use crate::MergeFn;
|
||||
use super::UpdateBuilder;
|
||||
use crate::error::{Error, InternalError};
|
||||
use crate::update::{
|
||||
Facets, UpdateIndexingStep, WordPrefixDocids, WordPrefixPairProximityDocids,
|
||||
WordsLevelPositions, WordsPrefixesFst,
|
||||
};
|
||||
use crate::{Index, MergeFn, Result};
|
||||
|
||||
mod merge_function;
|
||||
mod store;
|
||||
@ -48,7 +46,11 @@ pub enum WriteMethod {
|
||||
GetMergePut,
|
||||
}
|
||||
|
||||
pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> {
|
||||
pub fn create_writer(
|
||||
typ: CompressionType,
|
||||
level: Option<u32>,
|
||||
file: File,
|
||||
) -> io::Result<Writer<File>> {
|
||||
let mut builder = Writer::builder();
|
||||
builder.compression_type(typ);
|
||||
if let Some(level) = level {
|
||||
@ -64,8 +66,7 @@ pub fn create_sorter<E>(
|
||||
chunk_fusing_shrink_size: Option<u64>,
|
||||
max_nb_chunks: Option<usize>,
|
||||
max_memory: Option<usize>,
|
||||
) -> Sorter<MergeFn<E>>
|
||||
{
|
||||
) -> Sorter<MergeFn<E>> {
|
||||
let mut builder = Sorter::builder(merge);
|
||||
if let Some(shrink_size) = chunk_fusing_shrink_size {
|
||||
builder.file_fusing_shrink_size(shrink_size);
|
||||
@ -83,7 +84,10 @@ pub fn create_sorter<E>(
|
||||
builder.build()
|
||||
}
|
||||
|
||||
pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Result<Reader<FileFuse>> {
|
||||
pub fn writer_into_reader(
|
||||
writer: Writer<File>,
|
||||
shrink_size: Option<u64>,
|
||||
) -> Result<Reader<FileFuse>> {
|
||||
let mut file = writer.into_inner()?;
|
||||
file.seek(SeekFrom::Start(0))?;
|
||||
let file = if let Some(shrink_size) = shrink_size {
|
||||
@ -97,8 +101,7 @@ pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> Res
|
||||
pub fn merge_readers<E>(
|
||||
sources: Vec<Reader<FileFuse>>,
|
||||
merge: MergeFn<E>,
|
||||
) -> Merger<FileFuse, MergeFn<E>>
|
||||
{
|
||||
) -> Merger<FileFuse, MergeFn<E>> {
|
||||
let mut builder = Merger::builder(merge);
|
||||
builder.extend(sources);
|
||||
builder.build()
|
||||
@ -118,13 +121,7 @@ where
|
||||
let before = Instant::now();
|
||||
|
||||
let merger = merge_readers(sources, merge);
|
||||
merger_iter_into_lmdb_database(
|
||||
wtxn,
|
||||
database,
|
||||
merger.into_merge_iter()?,
|
||||
merge,
|
||||
method,
|
||||
)?;
|
||||
merger_iter_into_lmdb_database(wtxn, database, merger.into_merge_iter()?, merge, method)?;
|
||||
|
||||
debug!("MTBL stores merged in {:.02?}!", before.elapsed());
|
||||
Ok(())
|
||||
@ -149,7 +146,7 @@ where
|
||||
while let Some((k, v)) = reader.next()? {
|
||||
out_iter.append(k, v)?;
|
||||
}
|
||||
},
|
||||
}
|
||||
WriteMethod::GetMergePut => {
|
||||
while let Some((k, v)) = reader.next()? {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
|
||||
@ -158,11 +155,11 @@ where
|
||||
let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..];
|
||||
let val = merge(k, &vals)?;
|
||||
iter.put_current(k, &val)?;
|
||||
},
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -181,18 +178,12 @@ pub fn sorter_into_lmdb_database<E>(
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
Error: From<grenad::Error<E>>
|
||||
Error: From<grenad::Error<E>>,
|
||||
{
|
||||
debug!("Writing MTBL sorter...");
|
||||
let before = Instant::now();
|
||||
|
||||
merger_iter_into_lmdb_database(
|
||||
wtxn,
|
||||
database,
|
||||
sorter.into_iter()?,
|
||||
merge,
|
||||
method,
|
||||
)?;
|
||||
merger_iter_into_lmdb_database(wtxn, database, sorter.into_iter()?, merge, method)?;
|
||||
|
||||
debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
|
||||
Ok(())
|
||||
@ -214,7 +205,7 @@ where
|
||||
while let Some((k, v)) = sorter.next()? {
|
||||
out_iter.append(k, v)?;
|
||||
}
|
||||
},
|
||||
}
|
||||
WriteMethod::GetMergePut => {
|
||||
while let Some((k, v)) = sorter.next()? {
|
||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
|
||||
@ -226,14 +217,14 @@ where
|
||||
InternalError::IndexingMergingKeys { process: "get-put-merge" }
|
||||
})?;
|
||||
iter.put_current(k, &val)?;
|
||||
},
|
||||
}
|
||||
_ => {
|
||||
drop(iter);
|
||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -341,9 +332,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
|
||||
// Early return when there is no document to add
|
||||
if reader.buffer().is_empty() {
|
||||
return Ok(DocumentAdditionResult {
|
||||
nb_documents: 0,
|
||||
})
|
||||
return Ok(DocumentAdditionResult { nb_documents: 0 });
|
||||
}
|
||||
|
||||
self.index.set_updated_at(self.wtxn, &Utc::now())?;
|
||||
@ -367,7 +356,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
let output = match self.update_format {
|
||||
UpdateFormat::Csv => transform.output_from_csv(reader, &progress_callback)?,
|
||||
UpdateFormat::Json => transform.output_from_json(reader, &progress_callback)?,
|
||||
UpdateFormat::JsonStream => transform.output_from_json_stream(reader, &progress_callback)?,
|
||||
UpdateFormat::JsonStream => {
|
||||
transform.output_from_json_stream(reader, &progress_callback)?
|
||||
}
|
||||
};
|
||||
|
||||
let nb_documents = output.documents_count;
|
||||
@ -380,7 +371,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
|
||||
pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> Result<()>
|
||||
where
|
||||
F: Fn(UpdateIndexingStep) + Sync
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
{
|
||||
let before_indexing = Instant::now();
|
||||
|
||||
@ -457,7 +448,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
// settings if none have already been set.
|
||||
backup_pool = rayon::ThreadPoolBuilder::new().build()?;
|
||||
&backup_pool
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
let readers = pool.install(|| {
|
||||
@ -595,11 +586,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
|
||||
let mut documents_ids = self.index.documents_ids(self.wtxn)?;
|
||||
let contains_documents = !documents_ids.is_empty();
|
||||
let write_method = if contains_documents {
|
||||
WriteMethod::GetMergePut
|
||||
} else {
|
||||
WriteMethod::Append
|
||||
};
|
||||
let write_method =
|
||||
if contains_documents { WriteMethod::GetMergePut } else { WriteMethod::Append };
|
||||
|
||||
debug!("Writing using the write method: {:?}", write_method);
|
||||
|
||||
@ -634,7 +622,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
*self.index.docid_word_positions.as_polymorph(),
|
||||
docid_word_positions_readers,
|
||||
keep_first,
|
||||
write_method
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
@ -649,7 +637,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
*self.index.documents.as_polymorph(),
|
||||
documents_readers,
|
||||
keep_first,
|
||||
write_method
|
||||
write_method,
|
||||
)?;
|
||||
|
||||
database_count += 1;
|
||||
@ -730,7 +718,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
fst_merge,
|
||||
WriteMethod::GetMergePut,
|
||||
)?;
|
||||
},
|
||||
}
|
||||
DatabaseType::WordDocids => {
|
||||
debug!("Writing the words docids into LMDB on disk...");
|
||||
let db = *self.index.word_docids.as_polymorph();
|
||||
@ -741,7 +729,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
},
|
||||
}
|
||||
DatabaseType::FacetLevel0NumbersDocids => {
|
||||
debug!("Writing the facet numbers docids into LMDB on disk...");
|
||||
let db = *self.index.facet_id_f64_docids.as_polymorph();
|
||||
@ -752,7 +740,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
cbo_roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
},
|
||||
}
|
||||
DatabaseType::FieldIdWordCountDocids => {
|
||||
debug!("Writing the field id word count docids into LMDB on disk...");
|
||||
let db = *self.index.field_id_word_count_docids.as_polymorph();
|
||||
@ -763,7 +751,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
cbo_roaring_bitmap_merge,
|
||||
write_method,
|
||||
)?;
|
||||
},
|
||||
}
|
||||
DatabaseType::WordLevel0PositionDocids => {
|
||||
debug!("Writing the word level 0 positions docids into LMDB on disk...");
|
||||
let db = *self.index.word_level_position_docids.as_polymorph();
|
||||
@ -848,9 +836,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use heed::EnvOpenOptions;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn simple_document_replacement() {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
@ -1053,9 +1042,8 @@ mod tests {
|
||||
assert_eq!(count, 3);
|
||||
|
||||
let docs = index.documents(&rtxn, vec![0, 1, 2]).unwrap();
|
||||
let (kevin_id, _) = docs.iter().find(|(_, d)| {
|
||||
d.get(0).unwrap() == br#""updated kevin""#
|
||||
}).unwrap();
|
||||
let (kevin_id, _) =
|
||||
docs.iter().find(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
|
||||
let (id, doc) = docs[*kevin_id as usize];
|
||||
assert_eq!(id, *kevin_id);
|
||||
|
||||
|
@ -8,25 +8,29 @@ use std::{cmp, iter};
|
||||
|
||||
use bstr::ByteSlice as _;
|
||||
use fst::Set;
|
||||
use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
|
||||
use grenad::{CompressionType, FileFuse, Reader, Sorter, Writer};
|
||||
use heed::BytesEncode;
|
||||
use linked_hash_map::LinkedHashMap;
|
||||
use log::{debug, info};
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
|
||||
use meilisearch_tokenizer::token::SeparatorKind;
|
||||
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
use tempfile::tempfile;
|
||||
|
||||
use super::merge_function::{
|
||||
cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
|
||||
};
|
||||
use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
|
||||
use crate::error::{Error, InternalError, SerializationError};
|
||||
use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec};
|
||||
use crate::heed_codec::facet::{FieldDocIdFacetStringCodec, FieldDocIdFacetF64Codec};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec,
|
||||
FieldDocIdFacetStringCodec,
|
||||
};
|
||||
use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
|
||||
use crate::update::UpdateIndexingStep;
|
||||
use crate::{json_to_string, SmallVec32, Position, DocumentId, FieldId, Result};
|
||||
|
||||
use super::{MergeFn, create_writer, create_sorter, writer_into_reader};
|
||||
use super::merge_function::{fst_merge, keep_first, roaring_bitmap_merge, cbo_roaring_bitmap_merge};
|
||||
use crate::{json_to_string, DocumentId, FieldId, Position, Result, SmallVec32};
|
||||
|
||||
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
||||
const ONE_KILOBYTE: usize = 1024 * 1024;
|
||||
@ -56,7 +60,8 @@ pub struct Store<'s, A> {
|
||||
word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>,
|
||||
word_docids_limit: usize,
|
||||
field_id_word_count_docids: HashMap<(FieldId, u8), RoaringBitmap>,
|
||||
words_pairs_proximities_docids: LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
|
||||
words_pairs_proximities_docids:
|
||||
LinkedHashMap<(SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap>,
|
||||
words_pairs_proximities_docids_limit: usize,
|
||||
facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat<f64>), RoaringBitmap>,
|
||||
facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>,
|
||||
@ -93,8 +98,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
chunk_compression_level: Option<u32>,
|
||||
chunk_fusing_shrink_size: Option<u64>,
|
||||
stop_words: Option<&'s Set<A>>,
|
||||
) -> Result<Self>
|
||||
{
|
||||
) -> Result<Self> {
|
||||
// We divide the max memory by the number of sorter the Store have.
|
||||
let max_memory = max_memory.map(|mm| cmp::max(ONE_KILOBYTE, mm / 5));
|
||||
let linked_hash_map_size = linked_hash_map_size.unwrap_or(500);
|
||||
@ -172,12 +176,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
Some(1024 * 1024 * 1024), // 1MB
|
||||
);
|
||||
|
||||
let documents_writer = tempfile().and_then(|f| {
|
||||
create_writer(chunk_compression_type, chunk_compression_level, f)
|
||||
})?;
|
||||
let docid_word_positions_writer = tempfile().and_then(|f| {
|
||||
create_writer(chunk_compression_type, chunk_compression_level, f)
|
||||
})?;
|
||||
let documents_writer = tempfile()
|
||||
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
|
||||
let docid_word_positions_writer = tempfile()
|
||||
.and_then(|f| create_writer(chunk_compression_type, chunk_compression_level, f))?;
|
||||
|
||||
let mut config = AnalyzerConfig::default();
|
||||
if let Some(stop_words) = stop_words {
|
||||
@ -224,7 +226,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> Result<()> {
|
||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||
match self.word_docids.get_refresh(word.as_bytes()) {
|
||||
Some(old) => { old.insert(id); },
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
let word_vec = SmallVec32::from(word.as_bytes());
|
||||
// A newly inserted element is append at the end of the linked hash map.
|
||||
@ -246,15 +250,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
field_id: FieldId,
|
||||
value: OrderedFloat<f64>,
|
||||
id: DocumentId,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
let sorter = &mut self.field_id_docid_facet_numbers_sorter;
|
||||
Self::write_field_id_docid_facet_number_value(sorter, field_id, id, value)?;
|
||||
|
||||
let key = (field_id, value);
|
||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||
match self.facet_field_number_docids.get_refresh(&key) {
|
||||
Some(old) => { old.insert(id); },
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
// A newly inserted element is append at the end of the linked hash map.
|
||||
self.facet_field_number_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
|
||||
@ -279,15 +284,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
field_id: FieldId,
|
||||
value: String,
|
||||
id: DocumentId,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
let sorter = &mut self.field_id_docid_facet_strings_sorter;
|
||||
Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?;
|
||||
|
||||
let key = (field_id, value);
|
||||
// if get_refresh finds the element it is assured to be at the end of the linked hash map.
|
||||
match self.facet_field_string_docids.get_refresh(&key) {
|
||||
Some(old) => { old.insert(id); },
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
// A newly inserted element is append at the end of the linked hash map.
|
||||
self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id)));
|
||||
@ -309,10 +315,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
// Save the documents ids under the words pairs proximities that it contains.
|
||||
fn insert_words_pairs_proximities_docids<'a>(
|
||||
&mut self,
|
||||
words_pairs_proximities: impl IntoIterator<Item=((&'a str, &'a str), u8)>,
|
||||
words_pairs_proximities: impl IntoIterator<Item = ((&'a str, &'a str), u8)>,
|
||||
id: DocumentId,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
for ((w1, w2), prox) in words_pairs_proximities {
|
||||
let w1 = SmallVec32::from(w1.as_bytes());
|
||||
let w2 = SmallVec32::from(w2.as_bytes());
|
||||
@ -320,7 +325,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
// if get_refresh finds the element it is assured
|
||||
// to be at the end of the linked hash map.
|
||||
match self.words_pairs_proximities_docids.get_refresh(&key) {
|
||||
Some(old) => { old.insert(id); },
|
||||
Some(old) => {
|
||||
old.insert(id);
|
||||
}
|
||||
None => {
|
||||
// A newly inserted element is append at the end of the linked hash map.
|
||||
let ids = RoaringBitmap::from_iter(Some(id));
|
||||
@ -337,7 +344,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
// Removing front elements is equivalent to removing the LRUs.
|
||||
let iter = iter::from_fn(|| self.words_pairs_proximities_docids.pop_front());
|
||||
iter.take(overflow).for_each(|x| lrus.push(x));
|
||||
Self::write_words_pairs_proximities(&mut self.words_pairs_proximities_docids_sorter, lrus)?;
|
||||
Self::write_words_pairs_proximities(
|
||||
&mut self.words_pairs_proximities_docids_sorter,
|
||||
lrus,
|
||||
)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -350,8 +360,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
facet_numbers_values: &mut HashMap<FieldId, Vec<f64>>,
|
||||
facet_strings_values: &mut HashMap<FieldId, Vec<String>>,
|
||||
record: &[u8],
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
// We compute the list of words pairs proximities (self-join) and write it directly to disk.
|
||||
let words_pair_proximities = compute_words_pair_proximities(&words_positions);
|
||||
self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?;
|
||||
@ -362,8 +371,16 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
}
|
||||
|
||||
self.documents_writer.insert(document_id.to_be_bytes(), record)?;
|
||||
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
|
||||
Self::write_word_position_docids(&mut self.word_level_position_docids_sorter, document_id, words_positions)?;
|
||||
Self::write_docid_word_positions(
|
||||
&mut self.docid_word_positions_writer,
|
||||
document_id,
|
||||
words_positions,
|
||||
)?;
|
||||
Self::write_word_position_docids(
|
||||
&mut self.word_level_position_docids_sorter,
|
||||
document_id,
|
||||
words_positions,
|
||||
)?;
|
||||
|
||||
words_positions.clear();
|
||||
|
||||
@ -387,7 +404,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
|
||||
fn write_words_pairs_proximities<E>(
|
||||
sorter: &mut Sorter<MergeFn<E>>,
|
||||
iter: impl IntoIterator<Item=((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
|
||||
iter: impl IntoIterator<Item = ((SmallVec32<u8>, SmallVec32<u8>, u8), RoaringBitmap)>,
|
||||
) -> Result<()>
|
||||
where
|
||||
Error: From<E>,
|
||||
@ -419,8 +436,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
writer: &mut Writer<File>,
|
||||
id: DocumentId,
|
||||
words_positions: &HashMap<String, SmallVec32<Position>>,
|
||||
) -> Result<()>
|
||||
{
|
||||
) -> Result<()> {
|
||||
// We prefix the words by the document id.
|
||||
let mut key = id.to_be_bytes().to_vec();
|
||||
let mut buffer = Vec::new();
|
||||
@ -484,12 +500,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_facet_field_string_docids<I, E>(
|
||||
sorter: &mut Sorter<MergeFn<E>>,
|
||||
iter: I,
|
||||
) -> Result<()>
|
||||
fn write_facet_field_string_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item=((FieldId, String), RoaringBitmap)>,
|
||||
I: IntoIterator<Item = ((FieldId, String), RoaringBitmap)>,
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut key_buffer = Vec::new();
|
||||
@ -510,12 +523,9 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn write_facet_field_number_docids<I, E>(
|
||||
sorter: &mut Sorter<MergeFn<E>>,
|
||||
iter: I,
|
||||
) -> Result<()>
|
||||
fn write_facet_field_number_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item=((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
|
||||
I: IntoIterator<Item = ((FieldId, OrderedFloat<f64>), RoaringBitmap)>,
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut data_buffer = Vec::new();
|
||||
@ -579,7 +589,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
|
||||
fn write_word_docids<I, E>(sorter: &mut Sorter<MergeFn<E>>, iter: I) -> Result<()>
|
||||
where
|
||||
I: IntoIterator<Item=(SmallVec32<u8>, RoaringBitmap)>,
|
||||
I: IntoIterator<Item = (SmallVec32<u8>, RoaringBitmap)>,
|
||||
Error: From<E>,
|
||||
{
|
||||
let mut key = Vec::new();
|
||||
@ -611,7 +621,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
log_every_n: Option<usize>,
|
||||
mut progress_callback: F,
|
||||
) -> Result<Readers>
|
||||
where F: FnMut(UpdateIndexingStep),
|
||||
where
|
||||
F: FnMut(UpdateIndexingStep),
|
||||
{
|
||||
debug!("{:?}: Indexing in a Store...", thread_index);
|
||||
|
||||
@ -629,7 +640,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
if count % num_threads == thread_index {
|
||||
// This is a log routine that we do every `log_every_n` documents.
|
||||
if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) {
|
||||
info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
|
||||
info!(
|
||||
"We have seen {} documents so far ({:.02?}).",
|
||||
format_count(count),
|
||||
before.elapsed()
|
||||
);
|
||||
progress_callback(UpdateIndexingStep::IndexDocuments {
|
||||
documents_seen: count,
|
||||
total_documents: documents_count,
|
||||
@ -638,12 +653,20 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
}
|
||||
|
||||
for (attr, content) in document.iter() {
|
||||
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr) {
|
||||
let value = serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
|
||||
if self.faceted_fields.contains(&attr) || self.searchable_fields.contains(&attr)
|
||||
{
|
||||
let value =
|
||||
serde_json::from_slice(content).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
let (facet_numbers, facet_strings) = extract_facet_values(&value);
|
||||
facet_numbers_values.entry(attr).or_insert_with(Vec::new).extend(facet_numbers);
|
||||
facet_strings_values.entry(attr).or_insert_with(Vec::new).extend(facet_strings);
|
||||
facet_numbers_values
|
||||
.entry(attr)
|
||||
.or_insert_with(Vec::new)
|
||||
.extend(facet_numbers);
|
||||
facet_strings_values
|
||||
.entry(attr)
|
||||
.or_insert_with(Vec::new)
|
||||
.extend(facet_strings);
|
||||
|
||||
if self.searchable_fields.contains(&attr) {
|
||||
let content = match json_to_string(&value) {
|
||||
@ -658,12 +681,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
|
||||
last_pos = Some(pos);
|
||||
let position = (attr as usize * MAX_POSITION + pos) as u32;
|
||||
words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
|
||||
words_positions
|
||||
.entry(token.text().to_string())
|
||||
.or_insert_with(SmallVec32::new)
|
||||
.push(position);
|
||||
}
|
||||
|
||||
if let Some(last_pos) = last_pos.filter(|p| *p <= 10) {
|
||||
let key = (attr, last_pos as u8 + 1);
|
||||
self.field_id_word_count_docids.entry(key).or_insert_with(RoaringBitmap::new).insert(document_id);
|
||||
self.field_id_word_count_docids
|
||||
.entry(key)
|
||||
.or_insert_with(RoaringBitmap::new)
|
||||
.insert(document_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -713,7 +742,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
self.facet_field_string_docids,
|
||||
)?;
|
||||
|
||||
let mut word_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut word_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
||||
let mut iter = self.word_docids_sorter.into_iter()?;
|
||||
@ -737,37 +767,55 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
let mut main_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.main_sorter.write_into(&mut main_wtr)?;
|
||||
|
||||
let mut words_pairs_proximities_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.words_pairs_proximities_docids_sorter.write_into(&mut words_pairs_proximities_docids_wtr)?;
|
||||
let mut words_pairs_proximities_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.words_pairs_proximities_docids_sorter
|
||||
.write_into(&mut words_pairs_proximities_docids_wtr)?;
|
||||
|
||||
let mut word_level_position_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut word_level_position_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.word_level_position_docids_sorter.write_into(&mut word_level_position_docids_wtr)?;
|
||||
|
||||
let mut field_id_word_count_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut field_id_word_count_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_word_count_docids_sorter.write_into(&mut field_id_word_count_docids_wtr)?;
|
||||
|
||||
let mut facet_field_numbers_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut facet_field_numbers_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.facet_field_numbers_docids_sorter.write_into(&mut facet_field_numbers_docids_wtr)?;
|
||||
|
||||
let mut facet_field_strings_docids_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
let mut facet_field_strings_docids_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.facet_field_strings_docids_sorter.write_into(&mut facet_field_strings_docids_wtr)?;
|
||||
|
||||
let mut field_id_docid_facet_numbers_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_docid_facet_numbers_sorter.write_into(&mut field_id_docid_facet_numbers_wtr)?;
|
||||
let mut field_id_docid_facet_numbers_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_docid_facet_numbers_sorter
|
||||
.write_into(&mut field_id_docid_facet_numbers_wtr)?;
|
||||
|
||||
let mut field_id_docid_facet_strings_wtr = tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_docid_facet_strings_sorter.write_into(&mut field_id_docid_facet_strings_wtr)?;
|
||||
let mut field_id_docid_facet_strings_wtr =
|
||||
tempfile().and_then(|f| create_writer(comp_type, comp_level, f))?;
|
||||
self.field_id_docid_facet_strings_sorter
|
||||
.write_into(&mut field_id_docid_facet_strings_wtr)?;
|
||||
|
||||
let main = writer_into_reader(main_wtr, shrink_size)?;
|
||||
let word_docids = writer_into_reader(word_docids_wtr, shrink_size)?;
|
||||
let words_pairs_proximities_docids = writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
|
||||
let word_level_position_docids = writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
|
||||
let field_id_word_count_docids = writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
|
||||
let facet_field_numbers_docids = writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
|
||||
let facet_field_strings_docids = writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_numbers = writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_strings = writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
|
||||
let docid_word_positions = writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
|
||||
let words_pairs_proximities_docids =
|
||||
writer_into_reader(words_pairs_proximities_docids_wtr, shrink_size)?;
|
||||
let word_level_position_docids =
|
||||
writer_into_reader(word_level_position_docids_wtr, shrink_size)?;
|
||||
let field_id_word_count_docids =
|
||||
writer_into_reader(field_id_word_count_docids_wtr, shrink_size)?;
|
||||
let facet_field_numbers_docids =
|
||||
writer_into_reader(facet_field_numbers_docids_wtr, shrink_size)?;
|
||||
let facet_field_strings_docids =
|
||||
writer_into_reader(facet_field_strings_docids_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_numbers =
|
||||
writer_into_reader(field_id_docid_facet_numbers_wtr, shrink_size)?;
|
||||
let field_id_docid_facet_strings =
|
||||
writer_into_reader(field_id_docid_facet_strings_wtr, shrink_size)?;
|
||||
let docid_word_positions =
|
||||
writer_into_reader(self.docid_word_positions_writer, shrink_size)?;
|
||||
let documents = writer_into_reader(self.documents_writer, shrink_size)?;
|
||||
|
||||
Ok(Readers {
|
||||
@ -792,8 +840,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
|
||||
/// close to each other.
|
||||
fn compute_words_pair_proximities(
|
||||
word_positions: &HashMap<String, SmallVec32<Position>>,
|
||||
) -> HashMap<(&str, &str), u8>
|
||||
{
|
||||
) -> HashMap<(&str, &str), u8> {
|
||||
use itertools::Itertools;
|
||||
|
||||
let mut words_pair_proximities = HashMap::new();
|
||||
@ -828,31 +875,34 @@ fn lmdb_key_valid_size(key: &[u8]) -> bool {
|
||||
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
||||
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
|
||||
/// else we keep the standart proximity of 1 between words.
|
||||
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
fn process_tokens<'a>(
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator().is_some())
|
||||
.scan((0, None), |(offset, prev_kind), token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => (),
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
||||
|
||||
fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
|
||||
@ -865,18 +915,22 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<String>) {
|
||||
match value {
|
||||
Value::Null => (),
|
||||
Value::Bool(b) => output_strings.push(b.to_string()),
|
||||
Value::Number(number) => if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
},
|
||||
Value::Number(number) => {
|
||||
if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
}
|
||||
}
|
||||
Value::String(string) => {
|
||||
let string = string.trim().to_lowercase();
|
||||
output_strings.push(string);
|
||||
},
|
||||
Value::Array(values) => if can_recurse {
|
||||
for value in values {
|
||||
inner_extract_facet_values(value, false, output_numbers, output_strings);
|
||||
}
|
||||
Value::Array(values) => {
|
||||
if can_recurse {
|
||||
for value in values {
|
||||
inner_extract_facet_values(value, false, output_numbers, output_strings);
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
Value::Object(_) => (),
|
||||
}
|
||||
}
|
||||
|
@ -10,14 +10,15 @@ use log::info;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
use crate::error::{Error, UserError, InternalError};
|
||||
use crate::index::db_name;
|
||||
use crate::update::index_documents::merge_function::{merge_obkvs, keep_latest_obkv};
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use crate::{BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution};
|
||||
use crate::{Index, Result};
|
||||
use super::merge_function::merge_two_obkvs;
|
||||
use super::{create_writer, create_sorter, IndexDocumentsMethod};
|
||||
use super::{create_sorter, create_writer, IndexDocumentsMethod};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::index::db_name;
|
||||
use crate::update::index_documents::merge_function::{keep_latest_obkv, merge_obkvs};
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use crate::{
|
||||
ExternalDocumentsIds, FieldId, FieldsDistribution, FieldsIdsMap, Index, MergeFn, Result, BEU32,
|
||||
};
|
||||
|
||||
const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
|
||||
|
||||
@ -64,7 +65,11 @@ impl Transform<'_, '_> {
|
||||
self.output_from_generic_json(reader, false, progress_callback)
|
||||
}
|
||||
|
||||
pub fn output_from_json_stream<R, F>(self, reader: R, progress_callback: F) -> Result<TransformOutput>
|
||||
pub fn output_from_json_stream<R, F>(
|
||||
self,
|
||||
reader: R,
|
||||
progress_callback: F,
|
||||
) -> Result<TransformOutput>
|
||||
where
|
||||
R: Read,
|
||||
F: Fn(UpdateIndexingStep) + Sync,
|
||||
@ -86,14 +91,16 @@ impl Transform<'_, '_> {
|
||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn).unwrap();
|
||||
|
||||
// Deserialize the whole batch of documents in memory.
|
||||
let mut documents: Peekable<Box<dyn Iterator<Item=serde_json::Result<Map<String, Value>>>>> = if is_stream {
|
||||
let mut documents: Peekable<
|
||||
Box<dyn Iterator<Item = serde_json::Result<Map<String, Value>>>>,
|
||||
> = if is_stream {
|
||||
let iter = serde_json::Deserializer::from_reader(reader).into_iter();
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>;
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
|
||||
iter.peekable()
|
||||
} else {
|
||||
let vec: Vec<_> = serde_json::from_reader(reader).map_err(UserError::SerdeJson)?;
|
||||
let iter = vec.into_iter().map(Ok);
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item=_>>;
|
||||
let iter = Box::new(iter) as Box<dyn Iterator<Item = _>>;
|
||||
iter.peekable()
|
||||
};
|
||||
|
||||
@ -104,15 +111,16 @@ impl Transform<'_, '_> {
|
||||
Err(_) => {
|
||||
let error = documents.next().unwrap().unwrap_err();
|
||||
return Err(UserError::SerdeJson(error).into());
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
let alternative_name = first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
|
||||
let alternative_name =
|
||||
first.and_then(|doc| doc.keys().find(|f| is_primary_key(f)).cloned());
|
||||
let (primary_key_id, primary_key) = compute_primary_key_pair(
|
||||
self.index.primary_key(self.rtxn)?,
|
||||
&mut fields_ids_map,
|
||||
alternative_name,
|
||||
self.autogenerate_docids
|
||||
self.autogenerate_docids,
|
||||
)?;
|
||||
|
||||
if documents.peek().is_none() {
|
||||
@ -173,9 +181,11 @@ impl Transform<'_, '_> {
|
||||
Some(value) => match value {
|
||||
Value::String(string) => Cow::Borrowed(string.as_str()),
|
||||
Value::Number(number) => Cow::Owned(number.to_string()),
|
||||
content => return Err(UserError::InvalidDocumentId {
|
||||
document_id: content.clone(),
|
||||
}.into()),
|
||||
content => {
|
||||
return Err(
|
||||
UserError::InvalidDocumentId { document_id: content.clone() }.into()
|
||||
)
|
||||
}
|
||||
},
|
||||
None => {
|
||||
if !self.autogenerate_docids {
|
||||
@ -183,7 +193,7 @@ impl Transform<'_, '_> {
|
||||
}
|
||||
let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer);
|
||||
Cow::Borrowed(uuid)
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// We iterate in the fields ids ordered.
|
||||
@ -194,7 +204,8 @@ impl Transform<'_, '_> {
|
||||
// and this should be the document id we return the one we generated.
|
||||
if let Some(value) = document.get(name) {
|
||||
// We serialize the attribute values.
|
||||
serde_json::to_writer(&mut json_buffer, value).map_err(InternalError::SerdeJson)?;
|
||||
serde_json::to_writer(&mut json_buffer, value)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(field_id, &json_buffer)?;
|
||||
}
|
||||
|
||||
@ -202,7 +213,8 @@ impl Transform<'_, '_> {
|
||||
if field_id == primary_key_id && validate_document_id(&external_id).is_none() {
|
||||
return Err(UserError::InvalidDocumentId {
|
||||
document_id: Value::from(external_id),
|
||||
}.into());
|
||||
}
|
||||
.into());
|
||||
}
|
||||
}
|
||||
|
||||
@ -248,9 +260,9 @@ impl Transform<'_, '_> {
|
||||
// Extract the position of the primary key in the current headers, None if not found.
|
||||
let primary_key_pos = match self.index.primary_key(self.rtxn)? {
|
||||
Some(primary_key) => {
|
||||
// The primary key is known so we must find the position in the CSV headers.
|
||||
headers.iter().position(|h| h == primary_key)
|
||||
},
|
||||
// The primary key is known so we must find the position in the CSV headers.
|
||||
headers.iter().position(|h| h == primary_key)
|
||||
}
|
||||
None => headers.iter().position(is_primary_key),
|
||||
};
|
||||
|
||||
@ -261,7 +273,7 @@ impl Transform<'_, '_> {
|
||||
self.index.primary_key(self.rtxn)?,
|
||||
&mut fields_ids_map,
|
||||
alternative_name,
|
||||
self.autogenerate_docids
|
||||
self.autogenerate_docids,
|
||||
)?;
|
||||
|
||||
// The primary key field is not present in the header, so we need to create it.
|
||||
@ -308,27 +320,30 @@ impl Transform<'_, '_> {
|
||||
// We validate the document id [a-zA-Z0-9\-_].
|
||||
match validate_document_id(&external_id) {
|
||||
Some(valid) => valid,
|
||||
None => return Err(UserError::InvalidDocumentId {
|
||||
document_id: Value::from(external_id),
|
||||
}.into()),
|
||||
None => {
|
||||
return Err(UserError::InvalidDocumentId {
|
||||
document_id: Value::from(external_id),
|
||||
}
|
||||
.into())
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
None => uuid::Uuid::new_v4().to_hyphenated().encode_lower(&mut uuid_buffer),
|
||||
};
|
||||
|
||||
// When the primary_key_field_id is found in the fields ids list
|
||||
// we return the generated document id instead of the record field.
|
||||
let iter = fields_ids.iter()
|
||||
.map(|(fi, i)| {
|
||||
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
|
||||
(fi, field)
|
||||
});
|
||||
let iter = fields_ids.iter().map(|(fi, i)| {
|
||||
let field = if *fi == primary_key_id { external_id } else { &record[*i] };
|
||||
(fi, field)
|
||||
});
|
||||
|
||||
// We retrieve the field id based on the fields ids map fields ids order.
|
||||
for (field_id, field) in iter {
|
||||
// We serialize the attribute values as JSON strings.
|
||||
json_buffer.clear();
|
||||
serde_json::to_writer(&mut json_buffer, &field).map_err(InternalError::SerdeJson)?;
|
||||
serde_json::to_writer(&mut json_buffer, &field)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
writer.insert(*field_id, &json_buffer)?;
|
||||
}
|
||||
|
||||
@ -410,26 +425,27 @@ impl Transform<'_, '_> {
|
||||
IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
|
||||
IndexDocumentsMethod::UpdateDocuments => {
|
||||
let key = BEU32::new(docid);
|
||||
let base_obkv = self.index.documents.get(&self.rtxn, &key)?
|
||||
.ok_or(InternalError::DatabaseMissingEntry {
|
||||
let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry {
|
||||
db_name: db_name::DOCUMENTS,
|
||||
key: None,
|
||||
})?;
|
||||
},
|
||||
)?;
|
||||
let update_obkv = obkv::KvReader::new(update_obkv);
|
||||
merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
|
||||
(docid, obkv_buffer.as_slice())
|
||||
}
|
||||
}
|
||||
},
|
||||
}
|
||||
None => {
|
||||
// If this user id is new we add it to the external documents ids map
|
||||
// for new ids and into the list of new documents.
|
||||
let new_docid = available_documents_ids.next()
|
||||
.ok_or(UserError::DocumentLimitReached)?;
|
||||
let new_docid =
|
||||
available_documents_ids.next().ok_or(UserError::DocumentLimitReached)?;
|
||||
new_external_documents_ids_builder.insert(external_id, new_docid as u64)?;
|
||||
new_documents_ids.insert(new_docid);
|
||||
(new_docid, update_obkv)
|
||||
},
|
||||
}
|
||||
};
|
||||
|
||||
// We insert the document under the documents ids map into the final file.
|
||||
@ -450,7 +466,8 @@ impl Transform<'_, '_> {
|
||||
|
||||
// We create a final writer to write the new documents in order from the sorter.
|
||||
let file = tempfile::tempfile()?;
|
||||
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
|
||||
let mut writer =
|
||||
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
|
||||
|
||||
// Once we have written all the documents into the final sorter, we write the documents
|
||||
// into this writer, extract the file and reset the seek to be able to read it again.
|
||||
@ -485,8 +502,7 @@ impl Transform<'_, '_> {
|
||||
primary_key: String,
|
||||
old_fields_ids_map: FieldsIdsMap,
|
||||
new_fields_ids_map: FieldsIdsMap,
|
||||
) -> Result<TransformOutput>
|
||||
{
|
||||
) -> Result<TransformOutput> {
|
||||
let fields_distribution = self.index.fields_distribution(self.rtxn)?;
|
||||
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
|
||||
let documents_ids = self.index.documents_ids(self.rtxn)?;
|
||||
@ -494,7 +510,8 @@ impl Transform<'_, '_> {
|
||||
|
||||
// We create a final writer to write the new documents in order from the sorter.
|
||||
let file = tempfile::tempfile()?;
|
||||
let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
|
||||
let mut writer =
|
||||
create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
|
||||
|
||||
let mut obkv_buffer = Vec::new();
|
||||
for result in self.index.documents.iter(self.rtxn)? {
|
||||
@ -561,20 +578,19 @@ fn compute_primary_key_pair(
|
||||
return Err(UserError::MissingPrimaryKey.into());
|
||||
}
|
||||
DEFAULT_PRIMARY_KEY_NAME.to_string()
|
||||
},
|
||||
}
|
||||
};
|
||||
let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?;
|
||||
Ok((id, name))
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn validate_document_id(document_id: &str) -> Option<&str> {
|
||||
let document_id = document_id.trim();
|
||||
Some(document_id).filter(|id| {
|
||||
!id.is_empty() && id.chars().all(|c| {
|
||||
matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')
|
||||
})
|
||||
!id.is_empty()
|
||||
&& id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
|
||||
})
|
||||
}
|
||||
|
||||
@ -583,8 +599,7 @@ mod test {
|
||||
use super::*;
|
||||
|
||||
mod compute_primary_key {
|
||||
use super::compute_primary_key_pair;
|
||||
use super::FieldsIdsMap;
|
||||
use super::{compute_primary_key_pair, FieldsIdsMap};
|
||||
|
||||
#[test]
|
||||
fn should_return_primary_key_if_is_some() {
|
||||
@ -594,7 +609,8 @@ mod test {
|
||||
Some("toto"),
|
||||
&mut fields_map,
|
||||
Some("tata".to_string()),
|
||||
false);
|
||||
false,
|
||||
);
|
||||
assert_eq!(result.unwrap(), (0u8, "toto".to_string()));
|
||||
assert_eq!(fields_map.len(), 1);
|
||||
}
|
||||
@ -602,11 +618,8 @@ mod test {
|
||||
#[test]
|
||||
fn should_return_alternative_if_primary_is_none() {
|
||||
let mut fields_map = FieldsIdsMap::new();
|
||||
let result = compute_primary_key_pair(
|
||||
None,
|
||||
&mut fields_map,
|
||||
Some("tata".to_string()),
|
||||
false);
|
||||
let result =
|
||||
compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false);
|
||||
assert_eq!(result.unwrap(), (0u8, "tata".to_string()));
|
||||
assert_eq!(fields_map.len(), 1);
|
||||
}
|
||||
@ -614,23 +627,15 @@ mod test {
|
||||
#[test]
|
||||
fn should_return_default_if_both_are_none() {
|
||||
let mut fields_map = FieldsIdsMap::new();
|
||||
let result = compute_primary_key_pair(
|
||||
None,
|
||||
&mut fields_map,
|
||||
None,
|
||||
true);
|
||||
let result = compute_primary_key_pair(None, &mut fields_map, None, true);
|
||||
assert_eq!(result.unwrap(), (0u8, "id".to_string()));
|
||||
assert_eq!(fields_map.len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn should_return_err_if_both_are_none_and_recompute_is_false(){
|
||||
fn should_return_err_if_both_are_none_and_recompute_is_false() {
|
||||
let mut fields_map = FieldsIdsMap::new();
|
||||
let result = compute_primary_key_pair(
|
||||
None,
|
||||
&mut fields_map,
|
||||
None,
|
||||
false);
|
||||
let result = compute_primary_key_pair(None, &mut fields_map, None, false);
|
||||
assert!(result.is_err());
|
||||
assert_eq!(fields_map.len(), 0);
|
||||
}
|
||||
|
Reference in New Issue
Block a user