mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-14 00:36:25 +00:00
refactor faceted and searchable pipeline
This commit is contained in:
@ -1,4 +1,4 @@
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::collections::HashMap;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
@ -12,6 +12,7 @@ use serde_json::Value;
|
||||
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||
|
||||
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
|
||||
@ -25,10 +26,7 @@ pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, R
|
||||
pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
stop_words: Option<&fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
|
||||
puffin::profile_function!();
|
||||
@ -56,8 +54,33 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
// initialize tokenizer.
|
||||
let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None);
|
||||
let tokenizer = builder.build();
|
||||
// TODO: Fix ugly allocation
|
||||
let old_stop_words = settings_diff.old.stop_words.as_ref();
|
||||
let old_separators: Option<Vec<_>> =
|
||||
settings_diff.old.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
|
||||
let old_dictionary: Option<Vec<_>> =
|
||||
settings_diff.old.dictionary.map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut del_builder = tokenizer_builder(
|
||||
old_stop_words,
|
||||
old_separators.as_deref(),
|
||||
old_dictionary.as_deref(),
|
||||
None,
|
||||
);
|
||||
let del_tokenizer = del_builder.build();
|
||||
|
||||
// TODO: Fix ugly allocation
|
||||
let new_stop_words = settings_diff.new.stop_words.as_ref();
|
||||
let new_separators: Option<Vec<_>> =
|
||||
settings_diff.new.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
|
||||
let new_dictionary: Option<Vec<_>> =
|
||||
settings_diff.new.dictionary.map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut add_builder = tokenizer_builder(
|
||||
new_stop_words,
|
||||
new_separators.as_deref(),
|
||||
new_dictionary.as_deref(),
|
||||
None,
|
||||
);
|
||||
let add_tokenizer = add_builder.build();
|
||||
|
||||
// iterate over documents.
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
@ -69,7 +92,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
|
||||
if !searchable_fields_changed(
|
||||
&KvReader::<FieldId>::new(value),
|
||||
&settings_diff.new.searchable_fields_ids,
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -85,11 +111,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
// deletions
|
||||
lang_safe_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
&settings_diff.old,
|
||||
&del_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Deletion,
|
||||
&mut del_buffers,
|
||||
@ -99,11 +122,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
// additions
|
||||
lang_safe_tokens_from_document(
|
||||
&obkv,
|
||||
searchable_fields,
|
||||
&tokenizer,
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
&settings_diff.new,
|
||||
&add_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Addition,
|
||||
&mut add_buffers,
|
||||
@ -118,8 +138,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||
value_buffer.clear();
|
||||
del_add_from_two_obkvs(
|
||||
KvReader::<FieldId>::new(del_obkv),
|
||||
KvReader::<FieldId>::new(add_obkv),
|
||||
&KvReader::<FieldId>::new(del_obkv),
|
||||
&KvReader::<FieldId>::new(add_obkv),
|
||||
&mut value_buffer,
|
||||
)?;
|
||||
|
||||
@ -160,7 +180,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
searchable_fields: &Option<Vec<FieldId>>,
|
||||
) -> bool {
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
|
||||
@ -206,14 +226,10 @@ fn tokenizer_builder<'a>(
|
||||
|
||||
/// Extract words mapped with their positions of a document,
|
||||
/// ensuring no Language detection mistakes was made.
|
||||
#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
|
||||
fn lang_safe_tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
settings: &InnerIndexSettings,
|
||||
tokenizer: &Tokenizer,
|
||||
stop_words: Option<&fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
buffers: &'a mut Buffers,
|
||||
@ -222,7 +238,7 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
searchable_fields,
|
||||
&settings.searchable_fields_ids,
|
||||
tokenizer,
|
||||
max_positions_per_attributes,
|
||||
del_add,
|
||||
@ -246,12 +262,14 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
// then we don't rerun the extraction.
|
||||
if !script_language.is_empty() {
|
||||
// build a new temporary tokenizer including the allow list.
|
||||
let mut builder = tokenizer_builder(
|
||||
stop_words,
|
||||
allowed_separators,
|
||||
dictionary,
|
||||
Some(&script_language),
|
||||
);
|
||||
// TODO: Fix ugly allocation
|
||||
let stop_words = settings.stop_words.as_ref();
|
||||
let separators: Option<Vec<_>> =
|
||||
settings.allowed_separators.map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary: Option<Vec<_>> =
|
||||
settings.dictionary.map(|s| s.iter().map(String::as_str).collect());
|
||||
let mut builder =
|
||||
tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
|
||||
let tokenizer = builder.build();
|
||||
|
||||
script_language_word_count.clear();
|
||||
@ -259,7 +277,7 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
// rerun the extraction.
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
searchable_fields,
|
||||
&settings.searchable_fields_ids,
|
||||
&tokenizer,
|
||||
max_positions_per_attributes,
|
||||
del_add,
|
||||
@ -276,7 +294,7 @@ fn lang_safe_tokens_from_document<'a>(
|
||||
/// Extract words mapped with their positions of a document.
|
||||
fn tokens_from_document<'a>(
|
||||
obkv: &KvReader<FieldId>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
searchable_fields: &Option<Vec<FieldId>>,
|
||||
tokenizer: &Tokenizer,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
|
@ -10,6 +10,7 @@ use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
||||
};
|
||||
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
/// Extracts the facet number and the documents ids where this facet number appear.
|
||||
@ -20,6 +21,7 @@ use crate::Result;
|
||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
fid_docid_facet_number: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
|
@ -15,6 +15,7 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
@ -25,6 +26,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
puffin::profile_function!();
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashSet};
|
||||
use std::collections::BTreeMap;
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
@ -20,6 +20,7 @@ use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
@ -43,7 +44,7 @@ pub struct ExtractedFacetValues {
|
||||
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
) -> Result<ExtractedFacetValues> {
|
||||
puffin::profile_function!();
|
||||
@ -82,7 +83,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if faceted_fields.contains(&field_id) {
|
||||
let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id);
|
||||
let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id);
|
||||
if delete_faceted || add_faceted {
|
||||
numbers_key_buffer.clear();
|
||||
strings_key_buffer.clear();
|
||||
|
||||
@ -99,11 +102,12 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
let del_add_obkv = obkv::KvReader::new(field_bytes);
|
||||
let del_value = match del_add_obkv.get(DelAdd::Deletion) {
|
||||
let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted)
|
||||
{
|
||||
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||
None => None,
|
||||
};
|
||||
let add_value = match del_add_obkv.get(DelAdd::Addition) {
|
||||
let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) {
|
||||
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
|
||||
None => None,
|
||||
};
|
||||
|
@ -10,6 +10,7 @@ use super::helpers::{
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
@ -23,6 +24,7 @@ const MAX_COUNTED_WORDS: usize = 30;
|
||||
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
|
@ -1,20 +1,22 @@
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::BytesDecode;
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use obkv::KvReaderU16;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
|
||||
try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
|
||||
writer_into_reader, GrenadParameters,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::update::MergeFn;
|
||||
use crate::{DocumentId, FieldId, Result};
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the word and the documents ids where this word appear.
|
||||
///
|
||||
@ -27,7 +29,7 @@ use crate::{DocumentId, FieldId, Result};
|
||||
pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
exact_attributes: &HashSet<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
@ -43,7 +45,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|x| x / 3),
|
||||
max_memory,
|
||||
);
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut del_words = BTreeSet::new();
|
||||
@ -85,30 +87,29 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
add_words.clear();
|
||||
}
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|x| x / 3),
|
||||
);
|
||||
|
||||
let mut exact_word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|x| x / 3),
|
||||
);
|
||||
|
||||
let mut word_fid_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut word_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut exact_word_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut word: Option<String> = None;
|
||||
let mut deletions = RoaringBitmap::new();
|
||||
let mut additions = RoaringBitmap::new();
|
||||
let mut exact_deletions = RoaringBitmap::new();
|
||||
let mut exact_additions = RoaringBitmap::new();
|
||||
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
|
||||
// TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
@ -117,20 +118,69 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
word_fid_docids_writer.insert(key, value)?;
|
||||
}
|
||||
|
||||
let (word, fid) = StrBEU16Codec::bytes_decode(key)
|
||||
let (w, fid) = StrBEU16Codec::bytes_decode(key)
|
||||
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
|
||||
// every words contained in an attribute set to exact must be pushed in the exact_words list.
|
||||
if exact_attributes.contains(&fid) {
|
||||
exact_word_docids_sorter.insert(word.as_bytes(), value)?;
|
||||
if let Some(word) = word {
|
||||
if word.as_str() != w {
|
||||
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
|
||||
docids_into_writers(
|
||||
&word,
|
||||
&exact_deletions,
|
||||
&exact_additions,
|
||||
&mut exact_word_docids_writer,
|
||||
);
|
||||
let word = Some(w.to_string());
|
||||
// clear buffers
|
||||
deletions.clear();
|
||||
additions.clear();
|
||||
exact_deletions.clear();
|
||||
exact_additions.clear();
|
||||
}
|
||||
} else {
|
||||
word_docids_sorter.insert(word.as_bytes(), value)?;
|
||||
let word = Some(w.to_string());
|
||||
}
|
||||
|
||||
// merge all deletions
|
||||
let obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(value) = obkv.get(DelAdd::Deletion) {
|
||||
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
|
||||
let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| {
|
||||
SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?;
|
||||
if delete_from_exact {
|
||||
exact_deletions |= docids;
|
||||
} else {
|
||||
deletions |= docids
|
||||
}
|
||||
}
|
||||
// merge all additions
|
||||
if let Some(value) = obkv.get(DelAdd::Addition) {
|
||||
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
|
||||
let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| {
|
||||
SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?;
|
||||
if add_in_exact {
|
||||
exact_additions |= docids;
|
||||
} else {
|
||||
additions |= docids
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(word) = word {
|
||||
docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer);
|
||||
docids_into_writers(
|
||||
&word,
|
||||
&exact_deletions,
|
||||
&exact_additions,
|
||||
&mut exact_word_docids_writer,
|
||||
);
|
||||
}
|
||||
|
||||
Ok((
|
||||
sorter_into_reader(word_docids_sorter, indexer)?,
|
||||
sorter_into_reader(exact_word_docids_sorter, indexer)?,
|
||||
writer_into_reader(word_docids_writer)?,
|
||||
writer_into_reader(exact_word_docids_writer)?,
|
||||
writer_into_reader(word_fid_docids_writer)?,
|
||||
))
|
||||
}
|
||||
@ -178,3 +228,45 @@ fn words_into_sorter(
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn docids_into_writers<W>(
|
||||
word: &str,
|
||||
deletions: &RoaringBitmap,
|
||||
additions: &RoaringBitmap,
|
||||
writer: &mut grenad::Writer<W>,
|
||||
) -> Result<()>
|
||||
where
|
||||
W: std::io::Write,
|
||||
{
|
||||
if deletions == additions {
|
||||
// if the same value is deleted and added, do nothing.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Write each value in the same KvDelAdd before inserting it in the final writer.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
// deletions:
|
||||
if !deletions.is_empty() && !deletions.is_subset(additions) {
|
||||
obkv.insert(
|
||||
DelAdd::Deletion,
|
||||
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
);
|
||||
}
|
||||
// additions:
|
||||
if !additions.is_empty() {
|
||||
obkv.insert(
|
||||
DelAdd::Addition,
|
||||
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
);
|
||||
}
|
||||
|
||||
// insert everything in the same writer.
|
||||
writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
||||
@ -23,6 +24,7 @@ use crate::{DocumentId, Result};
|
||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
|
@ -11,6 +11,7 @@ use super::helpers::{
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::update::MergeFn;
|
||||
use crate::{bucketed_position, DocumentId, Result};
|
||||
|
||||
@ -22,6 +23,7 @@ use crate::{bucketed_position, DocumentId, Result};
|
||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
puffin::profile_function!();
|
||||
|
||||
|
@ -31,8 +31,8 @@ use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::{FieldId, FieldsIdsMap, Result};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result};
|
||||
|
||||
/// Extract data for each databases from obkv documents in parallel.
|
||||
/// Send data in grenad file over provided Sender.
|
||||
@ -43,18 +43,10 @@ pub(crate) fn data_from_obkv_documents(
|
||||
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
searchable_fields: Option<HashSet<FieldId>>,
|
||||
faceted_fields: HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
field_id_map: FieldsIdsMap,
|
||||
stop_words: Option<fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&[&str]>,
|
||||
dictionary: Option<&[&str]>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
exact_attributes: HashSet<FieldId>,
|
||||
proximity_precision: ProximityPrecision,
|
||||
embedders: EmbeddingConfigs,
|
||||
) -> Result<()> {
|
||||
puffin::profile_function!();
|
||||
|
||||
@ -67,8 +59,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
field_id_map.clone(),
|
||||
embedders.clone(),
|
||||
settings_diff,
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
@ -81,13 +72,9 @@ pub(crate) fn data_from_obkv_documents(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
&searchable_fields,
|
||||
&faceted_fields,
|
||||
primary_key_id,
|
||||
geo_fields_ids,
|
||||
&stop_words,
|
||||
&allowed_separators,
|
||||
&dictionary,
|
||||
settings_diff,
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
@ -100,13 +87,12 @@ pub(crate) fn data_from_obkv_documents(
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
"field-id-wordcount-docids",
|
||||
);
|
||||
|
||||
let exact_attributes = exact_attributes.clone();
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
@ -118,10 +104,9 @@ pub(crate) fn data_from_obkv_documents(
|
||||
>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff,
|
||||
lmdb_writer_sx.clone(),
|
||||
move |doc_word_pos, indexer| {
|
||||
extract_word_docids(doc_word_pos, indexer, &exact_attributes)
|
||||
},
|
||||
extract_word_docids,
|
||||
|(
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
@ -139,6 +124,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
TypedChunk::WordPositionDocids,
|
||||
@ -152,6 +138,7 @@ pub(crate) fn data_from_obkv_documents(
|
||||
>(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
@ -161,22 +148,22 @@ pub(crate) fn data_from_obkv_documents(
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_number_docids,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
"field-id-facet-number-docids",
|
||||
);
|
||||
|
||||
if proximity_precision == ProximityPrecision::ByWord {
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
}
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff,
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
"word-pair-proximity-docids",
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
@ -195,12 +182,17 @@ pub(crate) fn data_from_obkv_documents(
|
||||
fn run_extraction_task<FE, FS, M>(
|
||||
chunk: grenad::Reader<CursorClonableMmap>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
extract_fn: FE,
|
||||
serialize_fn: FS,
|
||||
name: &'static str,
|
||||
) where
|
||||
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M>
|
||||
FE: Fn(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
GrenadParameters,
|
||||
&InnerIndexSettingsDiff,
|
||||
) -> Result<M>
|
||||
+ Sync
|
||||
+ Send
|
||||
+ 'static,
|
||||
@ -213,7 +205,7 @@ fn run_extraction_task<FE, FS, M>(
|
||||
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks");
|
||||
let _entered = child_span.enter();
|
||||
puffin::profile_scope!("extract_multiple_chunks", name);
|
||||
match extract_fn(chunk, indexer) {
|
||||
match extract_fn(chunk, indexer, settings_diff) {
|
||||
Ok(chunk) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||
}
|
||||
@ -230,8 +222,7 @@ fn send_original_documents_data(
|
||||
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
field_id_map: FieldsIdsMap,
|
||||
embedders: EmbeddingConfigs,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<()> {
|
||||
let original_documents_chunk =
|
||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
@ -306,13 +297,9 @@ fn send_and_extract_flattened_documents_data(
|
||||
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
searchable_fields: &Option<HashSet<FieldId>>,
|
||||
faceted_fields: &HashSet<FieldId>,
|
||||
primary_key_id: FieldId,
|
||||
geo_fields_ids: Option<(FieldId, FieldId)>,
|
||||
stop_words: &Option<fst::Set<Vec<u8>>>,
|
||||
allowed_separators: &Option<&[&str]>,
|
||||
dictionary: &Option<&[&str]>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
@ -341,10 +328,7 @@ fn send_and_extract_flattened_documents_data(
|
||||
extract_docid_word_positions(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
searchable_fields,
|
||||
stop_words.as_ref(),
|
||||
*allowed_separators,
|
||||
*dictionary,
|
||||
settings_diff,
|
||||
max_positions_per_attributes,
|
||||
)?;
|
||||
|
||||
@ -367,7 +351,7 @@ fn send_and_extract_flattened_documents_data(
|
||||
} = extract_fid_docid_facet_values(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
faceted_fields,
|
||||
settings_diff,
|
||||
geo_fields_ids,
|
||||
)?;
|
||||
|
||||
|
Reference in New Issue
Block a user