refactor faceted and searchable pipeline

ManyTheFish
2024-03-26 13:27:43 +01:00
parent a7e368aaa6
commit b5e4a55af6
14 changed files with 420 additions and 339 deletions
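
Note on the shape of this refactor: every extractor below loses its loose per-feature parameters (searchable_fields, faceted_fields, stop_words, exact_attributes, ...) and instead receives a single &InnerIndexSettingsDiff. The real type lives in milli's update/settings.rs and is not part of this diff; the following is only a minimal sketch, assuming nothing beyond the fields implied by the call sites below, to make the settings_diff.old / settings_diff.new accesses easier to follow.

use std::collections::HashSet;

type FieldId = u16;

// Hypothetical reconstruction; field types are guesses from usage
// (milli stores stop words as an fst::Set, simplified to strings here).
struct InnerIndexSettings {
    stop_words: Option<Vec<String>>,
    allowed_separators: Option<Vec<String>>,
    dictionary: Option<Vec<String>>,
    searchable_fields_ids: Option<Vec<FieldId>>,
    faceted_fields_ids: HashSet<FieldId>,
    exact_attributes: HashSet<FieldId>,
}

// Snapshots of the settings before and after the update: extractors read
// `old` to compute deletions and `new` to compute additions.
struct InnerIndexSettingsDiff {
    old: InnerIndexSettings,
    new: InnerIndexSettings,
}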

View File

@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::HashMap;
use std::convert::TryInto;
use std::fs::File;
use std::io::BufReader;
@@ -12,6 +12,7 @@ use serde_json::Value;
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
use crate::error::{InternalError, SerializationError};
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
+use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
@@ -25,10 +26,7 @@ pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, R
pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
-searchable_fields: &Option<HashSet<FieldId>>,
-stop_words: Option<&fst::Set<Vec<u8>>>,
-allowed_separators: Option<&[&str]>,
-dictionary: Option<&[&str]>,
+settings_diff: &InnerIndexSettingsDiff,
max_positions_per_attributes: Option<u32>,
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
puffin::profile_function!();
@@ -56,8 +54,33 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let mut value_buffer = Vec::new();
// initialize tokenizer.
-let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None);
-let tokenizer = builder.build();
+// TODO: Fix ugly allocation
+let old_stop_words = settings_diff.old.stop_words.as_ref();
+let old_separators: Option<Vec<_>> =
+settings_diff.old.allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+let old_dictionary: Option<Vec<_>> =
+settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+let mut del_builder = tokenizer_builder(
+old_stop_words,
+old_separators.as_deref(),
+old_dictionary.as_deref(),
+None,
+);
+let del_tokenizer = del_builder.build();
+
+// TODO: Fix ugly allocation
+let new_stop_words = settings_diff.new.stop_words.as_ref();
+let new_separators: Option<Vec<_>> =
+settings_diff.new.allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+let new_dictionary: Option<Vec<_>> =
+settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+let mut add_builder = tokenizer_builder(
+new_stop_words,
+new_separators.as_deref(),
+new_dictionary.as_deref(),
+None,
+);
+let add_tokenizer = add_builder.build();
// iterate over documents.
let mut cursor = obkv_documents.into_cursor()?;
@@ -69,7 +92,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
let obkv = KvReader::<FieldId>::new(value);
// if the searchable fields didn't change, skip the searchable indexing for this document.
-if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
+if !searchable_fields_changed(
+&KvReader::<FieldId>::new(value),
+&settings_diff.new.searchable_fields_ids,
+) {
continue;
}
@@ -85,11 +111,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
// deletions
lang_safe_tokens_from_document(
&obkv,
-searchable_fields,
-&tokenizer,
-stop_words,
-allowed_separators,
-dictionary,
+&settings_diff.old,
+&del_tokenizer,
max_positions_per_attributes,
DelAdd::Deletion,
&mut del_buffers,
@@ -99,11 +122,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
// additions
lang_safe_tokens_from_document(
&obkv,
-searchable_fields,
-&tokenizer,
-stop_words,
-allowed_separators,
-dictionary,
+&settings_diff.new,
+&add_tokenizer,
max_positions_per_attributes,
DelAdd::Addition,
&mut add_buffers,
@@ -118,8 +138,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
value_buffer.clear();
del_add_from_two_obkvs(
-KvReader::<FieldId>::new(del_obkv),
-KvReader::<FieldId>::new(add_obkv),
+&KvReader::<FieldId>::new(del_obkv),
+&KvReader::<FieldId>::new(add_obkv),
&mut value_buffer,
)?;
@@ -160,7 +180,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
/// Check if any searchable fields of a document changed.
fn searchable_fields_changed(
obkv: &KvReader<FieldId>,
-searchable_fields: &Option<HashSet<FieldId>>,
+searchable_fields: &Option<Vec<FieldId>>,
) -> bool {
for (field_id, field_bytes) in obkv.iter() {
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
@@ -206,14 +226,10 @@ fn tokenizer_builder<'a>(
/// Extract words mapped with their positions of a document,
/// ensuring no Language detection mistakes was made.
-#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
fn lang_safe_tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
-searchable_fields: &Option<HashSet<FieldId>>,
+settings: &InnerIndexSettings,
tokenizer: &Tokenizer,
-stop_words: Option<&fst::Set<Vec<u8>>>,
-allowed_separators: Option<&[&str]>,
-dictionary: Option<&[&str]>,
max_positions_per_attributes: u32,
del_add: DelAdd,
buffers: &'a mut Buffers,
@@ -222,7 +238,7 @@ fn lang_safe_tokens_from_document<'a>(
tokens_from_document(
obkv,
-searchable_fields,
+&settings.searchable_fields_ids,
tokenizer,
max_positions_per_attributes,
del_add,
@@ -246,12 +262,14 @@ fn lang_safe_tokens_from_document<'a>(
// then we don't rerun the extraction.
if !script_language.is_empty() {
// build a new temporary tokenizer including the allow list.
-let mut builder = tokenizer_builder(
-stop_words,
-allowed_separators,
-dictionary,
-Some(&script_language),
-);
+// TODO: Fix ugly allocation
+let stop_words = settings.stop_words.as_ref();
+let separators: Option<Vec<_>> =
+settings.allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
+let dictionary: Option<Vec<_>> =
+settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+let mut builder = tokenizer_builder(
+stop_words,
+separators.as_deref(),
+dictionary.as_deref(),
+Some(&script_language),
+);
let tokenizer = builder.build();
script_language_word_count.clear();
@@ -259,7 +277,7 @@ fn lang_safe_tokens_from_document<'a>(
// rerun the extraction.
tokens_from_document(
obkv,
-searchable_fields,
+&settings.searchable_fields_ids,
&tokenizer,
max_positions_per_attributes,
del_add,
@@ -276,7 +294,7 @@ fn lang_safe_tokens_from_document<'a>(
/// Extract words mapped with their positions of a document.
fn tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
-searchable_fields: &Option<HashSet<FieldId>>,
+searchable_fields: &Option<Vec<FieldId>>,
tokenizer: &Tokenizer,
max_positions_per_attributes: u32,
del_add: DelAdd,
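
Aside: the two tokenizers built above are the heart of the del/add split. The deletion side of each document must be re-tokenized with the old settings, so the words being removed match what was originally written to the index, while the addition side uses the new settings. A toy illustration of that idea (a whitespace tokenizer stands in for charabia; all names here are invented):

struct Settings {
    stop_words: Vec<String>,
}

struct SettingsDiff {
    old: Settings,
    new: Settings,
}

// stand-in for the del_tokenizer / add_tokenizer pair built in the hunk above.
fn tokenize(text: &str, settings: &Settings) -> Vec<String> {
    text.split_whitespace()
        .map(str::to_lowercase)
        .filter(|word| !settings.stop_words.contains(word))
        .collect()
}

fn main() {
    let diff = SettingsDiff {
        old: Settings { stop_words: vec![] },
        new: Settings { stop_words: vec!["the".to_string()] },
    };
    // deletions are tokenized with the old settings...
    let del_tokens = tokenize("the quick fox", &diff.old);
    // ...additions with the new ones.
    let add_tokens = tokenize("the quick fox", &diff.new);
    assert_eq!(del_tokens, ["the", "quick", "fox"]);
    assert_eq!(add_tokens, ["quick", "fox"]);
}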

View File

@@ -10,6 +10,7 @@ use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
};
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
use crate::Result;
/// Extracts the facet number and the documents ids where this facet number appear.
@@ -20,6 +21,7 @@ use crate::Result;
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
fid_docid_facet_number: grenad::Reader<R>,
indexer: GrenadParameters,
+_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

View File

@@ -15,6 +15,7 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::{
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
};
+use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
/// Extracts the facet string and the documents ids where this facet string appear.
@@ -25,6 +26,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
+_settings_diff: &InnerIndexSettingsDiff,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
puffin::profile_function!();

View File

@@ -1,5 +1,5 @@
use std::borrow::Cow;
-use std::collections::{BTreeMap, HashSet};
+use std::collections::BTreeMap;
use std::convert::TryInto;
use std::fs::File;
use std::io::{self, BufReader};
@@ -20,6 +20,7 @@ use crate::error::InternalError;
use crate::facet::value_encoding::f64_into_bytes;
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
use crate::update::index_documents::{create_writer, writer_into_reader};
+use crate::update::settings::InnerIndexSettingsDiff;
use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
/// The length of the elements that are always in the buffer when inserting new values.
@@ -43,7 +44,7 @@ pub struct ExtractedFacetValues {
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
-faceted_fields: &HashSet<FieldId>,
+settings_diff: &InnerIndexSettingsDiff,
geo_fields_ids: Option<(FieldId, FieldId)>,
) -> Result<ExtractedFacetValues> {
puffin::profile_function!();
@@ -82,7 +83,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
let obkv = obkv::KvReader::new(value);
for (field_id, field_bytes) in obkv.iter() {
-if faceted_fields.contains(&field_id) {
+let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id);
+let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id);
+if delete_faceted || add_faceted {
numbers_key_buffer.clear();
strings_key_buffer.clear();
@@ -99,11 +102,12 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
strings_key_buffer.extend_from_slice(docid_bytes);
let del_add_obkv = obkv::KvReader::new(field_bytes);
-let del_value = match del_add_obkv.get(DelAdd::Deletion) {
+let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted)
+{
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
None => None,
};
-let add_value = match del_add_obkv.get(DelAdd::Addition) {
+let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) {
Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
None => None,
};
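
Aside: the .filter(|_| delete_faceted) / .filter(|_| add_faceted) calls above gate each side of the del/add pair independently, so a field that enters or leaves the faceted set only produces the relevant half of the update. A self-contained illustration of that Option::filter gating (the gate helper is invented, not milli API):

// keep a del/add side only if the field is faceted on that side.
fn gate(raw: Option<&[u8]>, faceted: bool) -> Option<&[u8]> {
    raw.filter(|_| faceted)
}

fn main() {
    let bytes: Option<&[u8]> = Some(b"42");
    // field removed from the faceted set: the addition side is dropped...
    assert_eq!(gate(bytes, false), None);
    // ...while the deletion side, faceted under the old settings, is kept.
    assert_eq!(gate(bytes, true), Some(&b"42"[..]));
}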

View File

@@ -10,6 +10,7 @@ use super::helpers::{
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
use crate::Result;
const MAX_COUNTED_WORDS: usize = 30;
@@ -23,6 +24,7 @@ const MAX_COUNTED_WORDS: usize = 30;
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
+_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

View File

@@ -1,20 +1,22 @@
-use std::collections::{BTreeSet, HashSet};
+use std::collections::BTreeSet;
use std::fs::File;
use std::io::{self, BufReader};
-use heed::BytesDecode;
+use heed::{BytesDecode, BytesEncode};
use obkv::KvReaderU16;
use roaring::RoaringBitmap;
use super::helpers::{
-create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
-try_split_array_at, writer_into_reader, GrenadParameters,
+create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
+writer_into_reader, GrenadParameters,
};
use crate::error::SerializationError;
use crate::heed_codec::StrBEU16Codec;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
use crate::update::MergeFn;
-use crate::{DocumentId, FieldId, Result};
+use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
/// Extracts the word and the documents ids where this word appear.
///
@@ -27,7 +29,7 @@ use crate::{DocumentId, FieldId, Result};
pub fn extract_word_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
-exact_attributes: &HashSet<FieldId>,
+settings_diff: &InnerIndexSettingsDiff,
) -> Result<(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
@@ -43,7 +45,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
indexer.max_nb_chunks,
-max_memory.map(|x| x / 3),
+max_memory,
);
let mut key_buffer = Vec::new();
let mut del_words = BTreeSet::new();
@@ -85,30 +87,29 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
add_words.clear();
}
-let mut word_docids_sorter = create_sorter(
-grenad::SortAlgorithm::Unstable,
-merge_deladd_cbo_roaring_bitmaps,
-indexer.chunk_compression_type,
-indexer.chunk_compression_level,
-indexer.max_nb_chunks,
-max_memory.map(|x| x / 3),
-);
-let mut exact_word_docids_sorter = create_sorter(
-grenad::SortAlgorithm::Unstable,
-merge_deladd_cbo_roaring_bitmaps,
-indexer.chunk_compression_type,
-indexer.chunk_compression_level,
-indexer.max_nb_chunks,
-max_memory.map(|x| x / 3),
-);
let mut word_fid_docids_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
+let mut word_docids_writer = create_writer(
+indexer.chunk_compression_type,
+indexer.chunk_compression_level,
+tempfile::tempfile()?,
+);
+let mut exact_word_docids_writer = create_writer(
+indexer.chunk_compression_type,
+indexer.chunk_compression_level,
+tempfile::tempfile()?,
+);
+let mut word: Option<String> = None;
+let mut deletions = RoaringBitmap::new();
+let mut additions = RoaringBitmap::new();
+let mut exact_deletions = RoaringBitmap::new();
+let mut exact_additions = RoaringBitmap::new();
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
+// TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
while let Some((key, value)) = iter.next()? {
@@ -117,20 +118,69 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
word_fid_docids_writer.insert(key, value)?;
}
-let (word, fid) = StrBEU16Codec::bytes_decode(key)
+let (w, fid) = StrBEU16Codec::bytes_decode(key)
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
-// every words contained in an attribute set to exact must be pushed in the exact_words list.
-if exact_attributes.contains(&fid) {
-exact_word_docids_sorter.insert(word.as_bytes(), value)?;
-} else {
-word_docids_sorter.insert(word.as_bytes(), value)?;
-}
+// if the word changed, flush the docids accumulated for the previous one.
+if let Some(word) = word.as_ref() {
+if word.as_str() != w {
+docids_into_writers(word, &deletions, &additions, &mut word_docids_writer)?;
+docids_into_writers(
+word,
+&exact_deletions,
+&exact_additions,
+&mut exact_word_docids_writer,
+)?;
+// clear buffers
+deletions.clear();
+additions.clear();
+exact_deletions.clear();
+exact_additions.clear();
+}
+}
+word = Some(w.to_string());
+// merge all deletions
+let obkv = KvReaderDelAdd::new(value);
+if let Some(value) = obkv.get(DelAdd::Deletion) {
+let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
+let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| {
+SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) }
+})?;
+if delete_from_exact {
+exact_deletions |= docids;
+} else {
+deletions |= docids;
+}
+}
+
+// merge all additions
+if let Some(value) = obkv.get(DelAdd::Addition) {
+let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
+let docids = CboRoaringBitmapCodec::bytes_decode(value).map_err(|_| {
+SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) }
+})?;
+if add_in_exact {
+exact_additions |= docids;
+} else {
+additions |= docids;
+}
+}
}
+// flush the docids accumulated for the last word.
+if let Some(word) = word {
+docids_into_writers(&word, &deletions, &additions, &mut word_docids_writer)?;
+docids_into_writers(
+&word,
+&exact_deletions,
+&exact_additions,
+&mut exact_word_docids_writer,
+)?;
+}
Ok((
-sorter_into_reader(word_docids_sorter, indexer)?,
-sorter_into_reader(exact_word_docids_sorter, indexer)?,
+writer_into_reader(word_docids_writer)?,
+writer_into_reader(exact_word_docids_writer)?,
writer_into_reader(word_fid_docids_writer)?,
))
}
@@ -178,3 +228,45 @@ fn words_into_sorter(
Ok(())
}
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
+fn docids_into_writers<W>(
+word: &str,
+deletions: &RoaringBitmap,
+additions: &RoaringBitmap,
+writer: &mut grenad::Writer<W>,
+) -> Result<()>
+where
+W: std::io::Write,
+{
+if deletions == additions {
+// if the same value is deleted and added, do nothing.
+return Ok(());
+}
+
+// Write each value in the same KvDelAdd before inserting it in the final writer.
+let mut obkv = KvWriterDelAdd::memory();
+
+// deletions:
+if !deletions.is_empty() && !deletions.is_subset(additions) {
+obkv.insert(
+DelAdd::Deletion,
+CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
+SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
+})?,
+)?;
+}
+
+// additions:
+if !additions.is_empty() {
+obkv.insert(
+DelAdd::Addition,
+CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
+SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
+})?,
+)?;
+}
+
+// insert everything in the same writer.
+writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?;
+
+Ok(())
+}
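
Aside: the rewritten loop above drops two sorters (and their max_memory / 3 budgets) by relying on the fact that word_fid_docids_sorter streams its entries already sorted by word: del/add docids are accumulated per word and flushed to plain writers as soon as the key changes, with one final flush after the loop. A minimal sketch of that flush-on-key-change pattern, using vectors in place of grenad writers and RoaringBitmaps:

fn group_sorted(entries: &[(&str, u32)]) -> Vec<(String, Vec<u32>)> {
    let mut out = Vec::new();
    let mut word: Option<String> = None;
    let mut acc: Vec<u32> = Vec::new();
    for &(w, docid) in entries {
        // the key changed: flush the docids accumulated for the previous word.
        if word.as_deref().map_or(false, |current| current != w) {
            out.push((word.take().unwrap(), std::mem::take(&mut acc)));
        }
        word = Some(w.to_string());
        acc.push(docid);
    }
    // flush the last word.
    if let Some(word) = word {
        out.push((word, acc));
    }
    out
}

fn main() {
    let entries = [("cat", 1), ("cat", 4), ("dog", 2)];
    assert_eq!(
        group_sorted(&entries),
        [("cat".to_string(), vec![1, 4]), ("dog".to_string(), vec![2])]
    );
}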

View File

@@ -13,6 +13,7 @@ use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::proximity::{index_proximity, MAX_DISTANCE};
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
use crate::{DocumentId, Result};
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
@@ -23,6 +24,7 @@ use crate::{DocumentId, Result};
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
+_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

View File

@@ -11,6 +11,7 @@ use super::helpers::{
use crate::error::SerializationError;
use crate::index::db_name::DOCID_WORD_POSITIONS;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
use crate::update::MergeFn;
use crate::{bucketed_position, DocumentId, Result};
@@ -22,6 +23,7 @@ use crate::{bucketed_position, DocumentId, Result};
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
docid_word_positions: grenad::Reader<R>,
indexer: GrenadParameters,
+_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

View File

@@ -31,8 +31,8 @@ use self::extract_word_position_docids::extract_word_position_docids;
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::proximity::ProximityPrecision;
-use crate::vector::EmbeddingConfigs;
-use crate::{FieldId, FieldsIdsMap, Result};
+use crate::update::settings::InnerIndexSettingsDiff;
+use crate::{FieldId, Result};
/// Extract data for each databases from obkv documents in parallel.
/// Send data in grenad file over provided Sender.
@@ -43,18 +43,10 @@ pub(crate) fn data_from_obkv_documents(
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
-searchable_fields: Option<HashSet<FieldId>>,
-faceted_fields: HashSet<FieldId>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
-field_id_map: FieldsIdsMap,
-stop_words: Option<fst::Set<Vec<u8>>>,
-allowed_separators: Option<&[&str]>,
-dictionary: Option<&[&str]>,
+settings_diff: &InnerIndexSettingsDiff,
max_positions_per_attributes: Option<u32>,
-exact_attributes: HashSet<FieldId>,
-proximity_precision: ProximityPrecision,
-embedders: EmbeddingConfigs,
) -> Result<()> {
puffin::profile_function!();
@@ -67,8 +59,7 @@ pub(crate) fn data_from_obkv_documents(
original_documents_chunk,
indexer,
lmdb_writer_sx.clone(),
-field_id_map.clone(),
-embedders.clone(),
+settings_diff,
)
})
.collect::<Result<()>>()
@@ -81,13 +72,9 @@ pub(crate) fn data_from_obkv_documents(
flattened_obkv_chunks,
indexer,
lmdb_writer_sx.clone(),
-&searchable_fields,
-&faceted_fields,
primary_key_id,
geo_fields_ids,
-&stop_words,
-&allowed_separators,
-&dictionary,
+settings_diff,
max_positions_per_attributes,
)
})
@@ -100,13 +87,12 @@ pub(crate) fn data_from_obkv_documents(
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(),
indexer,
+settings_diff,
lmdb_writer_sx.clone(),
extract_fid_word_count_docids,
TypedChunk::FieldIdWordCountDocids,
"field-id-wordcount-docids",
);
-let exact_attributes = exact_attributes.clone();
run_extraction_task::<
_,
_,
@@ -118,10 +104,9 @@
>(
docid_word_positions_chunk.clone(),
indexer,
+settings_diff,
lmdb_writer_sx.clone(),
-move |doc_word_pos, indexer| {
-extract_word_docids(doc_word_pos, indexer, &exact_attributes)
-},
+extract_word_docids,
|(
word_docids_reader,
exact_word_docids_reader,
@@ -139,6 +124,7 @@ pub(crate) fn data_from_obkv_documents(
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
docid_word_positions_chunk.clone(),
indexer,
+settings_diff,
lmdb_writer_sx.clone(),
extract_word_position_docids,
TypedChunk::WordPositionDocids,
@@ -152,6 +138,7 @@
>(
fid_docid_facet_strings_chunk.clone(),
indexer,
+settings_diff,
lmdb_writer_sx.clone(),
extract_facet_string_docids,
TypedChunk::FieldIdFacetStringDocids,
@@ -161,22 +148,22 @@
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
fid_docid_facet_numbers_chunk.clone(),
indexer,
+settings_diff,
lmdb_writer_sx.clone(),
extract_facet_number_docids,
TypedChunk::FieldIdFacetNumberDocids,
"field-id-facet-number-docids",
);
-if proximity_precision == ProximityPrecision::ByWord {
-run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
-docid_word_positions_chunk.clone(),
-indexer,
-lmdb_writer_sx.clone(),
-extract_word_pair_proximity_docids,
-TypedChunk::WordPairProximityDocids,
-"word-pair-proximity-docids",
-);
-}
+run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
+docid_word_positions_chunk.clone(),
+indexer,
+settings_diff,
+lmdb_writer_sx.clone(),
+extract_word_pair_proximity_docids,
+TypedChunk::WordPairProximityDocids,
+"word-pair-proximity-docids",
+);
}
Ok(())
@@ -195,12 +182,17 @@
fn run_extraction_task<FE, FS, M>(
chunk: grenad::Reader<CursorClonableMmap>,
indexer: GrenadParameters,
+settings_diff: &InnerIndexSettingsDiff,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
extract_fn: FE,
serialize_fn: FS,
name: &'static str,
) where
-FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M>
+FE: Fn(
+grenad::Reader<CursorClonableMmap>,
+GrenadParameters,
+&InnerIndexSettingsDiff,
+) -> Result<M>
+ Sync
+ Send
+ 'static,
@@ -213,7 +205,7 @@ fn run_extraction_task<FE, FS, M>(
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
let _entered = child_span.enter();
puffin::profile_scope!("extract_multiple_chunks", name);
-match extract_fn(chunk, indexer) {
+match extract_fn(chunk, indexer, settings_diff) {
Ok(chunk) => {
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
}
@@ -230,8 +222,7 @@ fn send_original_documents_data(
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
-field_id_map: FieldsIdsMap,
-embedders: EmbeddingConfigs,
+settings_diff: &InnerIndexSettingsDiff,
) -> Result<()> {
let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@@ -306,13 +297,9 @@ fn send_and_extract_flattened_documents_data(
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
indexer: GrenadParameters,
lmdb_writer_sx: Sender<Result<TypedChunk>>,
-searchable_fields: &Option<HashSet<FieldId>>,
-faceted_fields: &HashSet<FieldId>,
primary_key_id: FieldId,
geo_fields_ids: Option<(FieldId, FieldId)>,
-stop_words: &Option<fst::Set<Vec<u8>>>,
-allowed_separators: &Option<&[&str]>,
-dictionary: &Option<&[&str]>,
+settings_diff: &InnerIndexSettingsDiff,
max_positions_per_attributes: Option<u32>,
) -> Result<(
grenad::Reader<CursorClonableMmap>,
@@ -341,10 +328,7 @@ fn send_and_extract_flattened_documents_data(
extract_docid_word_positions(
flattened_documents_chunk.clone(),
indexer,
-searchable_fields,
-stop_words.as_ref(),
-*allowed_separators,
-*dictionary,
+settings_diff,
max_positions_per_attributes,
)?;
@@ -367,7 +351,7 @@ fn send_and_extract_flattened_documents_data(
} = extract_fid_docid_facet_values(
flattened_documents_chunk.clone(),
indexer,
-faceted_fields,
+settings_diff,
geo_fields_ids,
)?;
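
Taken together, these signature changes give every extractor the same shape, Fn(chunk, GrenadParameters, &InnerIndexSettingsDiff) -> Result<M>, which is what allows run_extraction_task to take plain function items (extract_word_docids, extract_facet_number_docids, ...) instead of ad-hoc closures capturing exact_attributes and friends. A toy version of that uniform dispatch, with invented stand-in types:

// invented stand-ins for GrenadParameters and InnerIndexSettingsDiff.
struct Params;
struct SettingsDiff;

fn run_extraction_task<FE>(chunk: &[u8], params: &Params, diff: &SettingsDiff, extract_fn: FE, name: &str)
where
    FE: Fn(&[u8], &Params, &SettingsDiff) -> usize,
{
    // every extractor is invoked with the same three arguments.
    println!("{name}: {} entries", extract_fn(chunk, params, diff));
}

// two extractors sharing the uniform signature, passable by name.
fn extract_word_docids(chunk: &[u8], _: &Params, _: &SettingsDiff) -> usize {
    chunk.len()
}

fn extract_facet_number_docids(chunk: &[u8], _: &Params, _: &SettingsDiff) -> usize {
    chunk.len() / 2
}

fn main() {
    let chunk = b"0123456789";
    run_extraction_task(chunk, &Params, &SettingsDiff, extract_word_docids, "word-docids");
    run_extraction_task(chunk, &Params, &SettingsDiff, extract_facet_number_docids, "facet-number-docids");
}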