Merge branch 'main' into merge-release-v1.8.1-in-main

Many the fish
2024-05-29 11:31:03 +02:00
committed by GitHub
105 changed files with 5863 additions and 1031 deletions


@@ -29,8 +29,6 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
settings_diff: &InnerIndexSettingsDiff,
max_positions_per_attributes: Option<u32>,
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
puffin::profile_function!();
let max_positions_per_attributes = max_positions_per_attributes
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
let max_memory = indexer.max_memory_by_thread();
@@ -186,7 +184,7 @@ fn searchable_fields_changed(
) -> bool {
let searchable_fields = &settings_diff.new.searchable_fields_ids;
for (field_id, field_bytes) in obkv.iter() {
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
if searchable_fields.contains(&field_id) {
let del_add = KvReaderDelAdd::new(field_bytes);
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
// if both fields are None, check the next field.
@@ -298,7 +296,7 @@ fn lang_safe_tokens_from_document<'a>(
/// Extract words mapped with their positions of a document.
fn tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<Vec<FieldId>>,
searchable_fields: &[FieldId],
tokenizer: &Tokenizer,
max_positions_per_attributes: u32,
del_add: DelAdd,
@@ -309,7 +307,7 @@ fn tokens_from_document<'a>(
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
for (field_id, field_bytes) in obkv.iter() {
// if field is searchable.
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
if searchable_fields.as_ref().contains(&field_id) {
// extract deletion or addition only.
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
// parse json.

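The hunks above also simplify the searchable-field check: `searchable_fields` goes from an `Option<Vec<FieldId>>`, where `None` meant "every field is searchable", to a plain slice that the caller always resolves upstream. A minimal before/after sketch, assuming `FieldId` is a `u16` alias as in milli:

```rust
type FieldId = u16; // assumption: milli aliases FieldId to u16

// Before: `None` meant "all fields are searchable".
fn is_searchable_before(searchable_fields: &Option<Vec<FieldId>>, field_id: FieldId) -> bool {
    searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id))
}

// After: the concrete list is always provided, so the check is a plain lookup.
fn is_searchable_after(searchable_fields: &[FieldId], field_id: FieldId) -> bool {
    searchable_fields.contains(&field_id)
}
```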

@@ -23,8 +23,6 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut facet_number_docids_sorter = create_sorter(


@@ -28,8 +28,6 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let options = NormalizerOption { lossy: true, ..Default::default() };


@@ -37,7 +37,7 @@ pub struct ExtractedFacetValues {
/// Extracts the facet values of each faceted field of each document.
///
/// Returns the generated grenad reader containing the docid the fid and the orginal value as key
/// Returns the generated grenad reader containing the docid the fid and the original value as key
/// and the normalized value as value extracted from the given chunk of documents.
/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
@@ -46,8 +46,6 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<ExtractedFacetValues> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut fid_docid_facet_numbers_sorter = create_sorter(

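The doc comment in this hunk describes entries keyed by the docid, the fid, and the original facet value, with the normalized value as the payload. A hypothetical sketch of composing such a key (the helper name and exact byte layout are illustrative, not the crate's actual code):

```rust
// Illustrative only: a big-endian (docid, fid, original value) key,
// following the order the doc comment above describes, so entries
// sort by document, then field, then value.
fn facet_value_key(docid: u32, fid: u16, original_value: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(4 + 2 + original_value.len());
    key.extend_from_slice(&docid.to_be_bytes());
    key.extend_from_slice(&fid.to_be_bytes());
    key.extend_from_slice(original_value.as_bytes());
    key
}
```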

@@ -26,8 +26,6 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut fid_word_count_docids_sorter = create_sorter(


@@ -21,8 +21,6 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
primary_key_id: FieldId,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,


@@ -10,16 +10,16 @@ use bytemuck::cast_slice;
use grenad::Writer;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat;
use serde_json::{from_slice, Value};
use serde_json::Value;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::UserError;
use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
use crate::vector::Embedder;
use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors};
use crate::{DocumentId, Result, ThreadPoolNoAbort};
/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@@ -31,6 +31,10 @@ pub struct ExtractedVectorPoints {
pub remove_vectors: grenad::Reader<BufReader<File>>,
// docid -> prompt
pub prompts: grenad::Reader<BufReader<File>>,
// embedder
pub embedder_name: String,
pub embedder: Arc<Embedder>,
}
enum VectorStateDelta {
@@ -65,6 +69,19 @@ impl VectorStateDelta {
}
}
struct EmbedderVectorExtractor {
embedder_name: String,
embedder: Arc<Embedder>,
prompt: Arc<Prompt>,
// (docid, _index) -> KvWriterDelAdd -> Vector
manual_vectors_writer: Writer<BufWriter<File>>,
// (docid) -> (prompt)
prompts_writer: Writer<BufWriter<File>>,
// (docid) -> ()
remove_vectors_writer: Writer<BufWriter<File>>,
}
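The struct above bundles one set of writers per embedder: `extract_vector_points` is now called once and fans out internally, returning one `ExtractedVectorPoints` per configured embedder instead of being invoked per embedder by the caller. A minimal sketch of that fan-out shape, using illustrative types rather than milli's real ones:

```rust
// Minimal sketch, not milli's real types: one output bundle per embedder.
struct ExtractorSketch {
    embedder_name: String,
    // the real struct also carries the prompt and three grenad writers
}

fn build_extractors(embedder_names: &[&str]) -> Vec<ExtractorSketch> {
    embedder_names
        .iter()
        .map(|name| ExtractorSketch { embedder_name: name.to_string() })
        .collect()
}

fn main() {
    // with two configured embedders, extraction yields two result bundles
    let extractors = build_extractors(&["default", "openai"]);
    assert_eq!(extractors.len(), 2);
    assert_eq!(extractors[0].embedder_name, "default");
}
```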
/// Extracts the embedding vector contained in each document under the `_vectors` field.
///
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
@@ -73,34 +90,52 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
prompt: &Prompt,
embedder_name: &str,
) -> Result<ExtractedVectorPoints> {
puffin::profile_function!();
) -> Result<Vec<ExtractedVectorPoints>> {
let reindex_vectors = settings_diff.reindex_vectors();
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
// the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
// filter the old vector fid if the settings has been changed forcing reindexing.
let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors);
// (docid, _index) -> KvWriterDelAdd -> Vector
let mut manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
// (docid) -> (prompt)
let mut prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut extractors = Vec::new();
for (embedder_name, (embedder, prompt)) in
settings_diff.new.embedding_configs.clone().into_iter()
{
// (docid, _index) -> KvWriterDelAdd -> Vector
let manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> ()
let mut remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> (prompt)
let prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> ()
let remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
extractors.push(EmbedderVectorExtractor {
embedder_name,
embedder,
prompt,
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
});
}
let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?;
@@ -114,152 +149,138 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
key_buffer.clear();
key_buffer.extend_from_slice(docid_bytes);
// since we only needs the primary key when we throw an error we create this getter to
// since we only need the primary key when we throw an error we create this getter to
// lazily get it when needed
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
// the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id("_vectors");
// filter the old vector fid if the settings has been changed forcing reindexing.
let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid)
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
let new_vectors_fid = new_fields_ids_map.id("_vectors");
let vectors_field = {
let del = old_vectors_fid
.and_then(|vectors_fid| obkv.get(vectors_fid))
.map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
.transpose()?
.flatten();
let add = new_vectors_fid
.and_then(|vectors_fid| obkv.get(vectors_fid))
.map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
.transpose()?
.flatten();
(del, add)
};
for EmbedderVectorExtractor {
embedder_name,
embedder: _,
prompt,
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
} in extractors.iter_mut()
{
let delta = match parsed_vectors.remove(embedder_name) {
(Some(old), Some(new)) => {
// no autogeneration
let del_vectors = old.into_array_of_vectors();
let add_vectors = new.into_array_of_vectors();
let (del_map, add_map) = vectors_field;
let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
let delta = match (del_value, add_value) {
(Some(old), Some(new)) => {
// no autogeneration
let del_vectors = extract_vectors(old, document_id, embedder_name)?;
let add_vectors = extract_vectors(new, document_id, embedder_name)?;
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
VectorStateDelta::ManualDelta(del_vectors, add_vectors)
}
(Some(_old), None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
DelAdd::Addition,
new_fields_ids_map,
)?)
} else {
VectorStateDelta::NowRemoved
}
}
(None, Some(new)) => {
// was possibly autogenerated, remove all vectors for that document
let add_vectors = extract_vectors(new, document_id, embedder_name)?;
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
VectorStateDelta::WasGeneratedNowManual(add_vectors)
}
(None, None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = Some(prompt)
// TODO: this filter works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
.filter(|_| !settings_diff.reindex_vectors())
.map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
});
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
VectorStateDelta::NowGenerated(new_prompt)
} else {
tracing::trace!("⏭️ Prompt unmodified, skipping");
VectorStateDelta::NoChange
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
} else {
VectorStateDelta::NowRemoved
}
}
};
// and we finally push the unique vectors into the writer
push_vectors_diff(
&mut remove_vectors_writer,
&mut prompts_writer,
&mut manual_vectors_writer,
&mut key_buffer,
delta,
settings_diff,
)?;
VectorStateDelta::ManualDelta(del_vectors, add_vectors)
}
(Some(_old), None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
DelAdd::Addition,
new_fields_ids_map,
)?)
} else {
VectorStateDelta::NowRemoved
}
}
(None, Some(new)) => {
// was possibly autogenerated, remove all vectors for that document
let add_vectors = new.into_array_of_vectors();
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
VectorStateDelta::WasGeneratedNowManual(add_vectors)
}
(None, None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = Some(&prompt)
// TODO: this filter works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
.filter(|_| !settings_diff.reindex_vectors())
.map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map)
.unwrap_or_default()
});
let new_prompt =
prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
VectorStateDelta::NowGenerated(new_prompt)
} else {
tracing::trace!("⏭️ Prompt unmodified, skipping");
VectorStateDelta::NoChange
}
} else {
VectorStateDelta::NowRemoved
}
}
};
// and we finally push the unique vectors into the writer
push_vectors_diff(
remove_vectors_writer,
prompts_writer,
manual_vectors_writer,
&mut key_buffer,
delta,
reindex_vectors,
)?;
}
}
Ok(ExtractedVectorPoints {
// docid, _index -> KvWriterDelAdd -> Vector
manual_vectors: writer_into_reader(manual_vectors_writer)?,
// docid -> ()
remove_vectors: writer_into_reader(remove_vectors_writer)?,
// docid -> prompt
prompts: writer_into_reader(prompts_writer)?,
})
}
let mut results = Vec::new();
fn to_vector_map(
obkv: KvReaderDelAdd,
side: DelAdd,
document_id: &impl Fn() -> Value,
) -> Result<Option<serde_json::Map<String, Value>>> {
Ok(if let Some(value) = obkv.get(side) {
let Ok(value) = from_slice(value) else {
let value = from_slice(value).map_err(InternalError::SerdeJson)?;
return Err(crate::Error::UserError(UserError::InvalidVectorsMapType {
document_id: document_id(),
value,
}));
};
Some(value)
} else {
None
})
for EmbedderVectorExtractor {
embedder_name,
embedder,
prompt: _,
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
} in extractors
{
results.push(ExtractedVectorPoints {
// docid, _index -> KvWriterDelAdd -> Vector
manual_vectors: writer_into_reader(manual_vectors_writer)?,
// docid -> ()
remove_vectors: writer_into_reader(remove_vectors_writer)?,
// docid -> prompt
prompts: writer_into_reader(prompts_writer)?,
embedder,
embedder_name,
})
}
Ok(results)
}
/// Computes the diff between both Del and Add numbers and
@@ -270,14 +291,13 @@ fn push_vectors_diff(
manual_vectors_writer: &mut Writer<BufWriter<File>>,
key_buffer: &mut Vec<u8>,
delta: VectorStateDelta,
settings_diff: &InnerIndexSettingsDiff,
reindex_vectors: bool,
) -> Result<()> {
puffin::profile_function!();
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
if must_remove
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
&& !settings_diff.reindex_vectors()
&& !reindex_vectors
{
key_buffer.truncate(TRUNCATE_SIZE);
remove_vectors_writer.insert(&key_buffer, [])?;
@@ -308,7 +328,7 @@ fn push_vectors_diff(
EitherOrBoth::Left(vector) => {
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
if !settings_diff.reindex_vectors() {
if !reindex_vectors {
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
@@ -336,26 +356,6 @@ fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
}
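`compare_vectors` above orders `f32` slices lexicographically by wrapping each component in `OrderedFloat`, which gives floats a total order (so `Iterator::cmp` is usable even though `f32` is only `PartialOrd`). A small runnable illustration, assuming the `ordered-float` crate as imported in this file:

```rust
use std::cmp::Ordering;

use ordered_float::OrderedFloat;

// same body as `compare_vectors` in the diff above
fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
    a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
}

fn main() {
    assert_eq!(compare_vectors(&[1.0, 2.0], &[1.0, 3.0]), Ordering::Less);
    // a strict prefix compares as smaller, like string ordering
    assert_eq!(compare_vectors(&[1.0], &[1.0, 0.0]), Ordering::Less);
}
```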
/// Extracts the vectors from a JSON value.
fn extract_vectors(
value: Value,
document_id: impl Fn() -> Value,
name: &str,
) -> Result<Vec<Vec<f32>>> {
// FIXME: ugly clone of the vectors here
match serde_json::from_value(value.clone()) {
Ok(vectors) => {
Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors).unwrap_or_default())
}
Err(_) => Err(UserError::InvalidVectorsType {
document_id: document_id(),
value,
subfield: name.to_owned(),
}
.into()),
}
}
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
pub fn extract_embeddings<R: io::Read + io::Seek>(
// docid, prompt
@@ -364,7 +364,6 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
embedder: Arc<Embedder>,
request_threads: &ThreadPoolNoAbort,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk


@@ -36,8 +36,6 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
)> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut word_fid_docids_sorter = create_sorter(
@@ -167,8 +165,6 @@ fn words_into_sorter(
add_words: &BTreeSet<Vec<u8>>,
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
puffin::profile_function!();
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};


@@ -26,7 +26,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
@@ -71,8 +70,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
// if we change document, we fill the sorter
if current_document_id.map_or(false, |id| id != document_id) {
puffin::profile_scope!("Document into sorter");
// FIXME: span inside of a hot loop might degrade performance and create big reports
let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter");
let _entered = span.enter();
@@ -163,7 +160,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
}
if let Some(document_id) = current_document_id {
puffin::profile_scope!("Final document into sorter");
// FIXME: span inside of a hot loop might degrade performance and create big reports
let span = tracing::trace_span!(target: "indexing::details", "final_document_into_sorter");
let _entered = span.enter();
@@ -176,7 +172,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
)?;
}
{
puffin::profile_scope!("sorter_into_reader");
// FIXME: span inside of a hot loop might degrade performance and create big reports
let span = tracing::trace_span!(target: "indexing::details", "sorter_into_reader");
let _entered = span.enter();

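These hunks drop the `puffin::profile_scope!` calls and keep only the `tracing` spans that already instrumented the same regions. A minimal sketch of the remaining pattern, assuming a `tracing` subscriber is installed elsewhere in the process:

```rust
// The span is recorded for as long as `_entered` lives; dropping the
// guard at the end of the scope closes the span.
fn document_into_sorter_scope() {
    let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter");
    let _entered = span.enter();
    // ... hot-loop work previously also wrapped in puffin::profile_scope!
}
```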

@@ -25,8 +25,6 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let max_memory = indexer.max_memory_by_thread();
let mut word_position_docids_sorter = create_sorter(
@@ -104,8 +102,6 @@ fn words_position_into_sorter(
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
puffin::profile_function!();
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};


@@ -46,8 +46,6 @@ pub(crate) fn data_from_obkv_documents(
settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>,
) -> Result<()> {
puffin::profile_function!();
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|| {
original_obkv_chunks
@@ -88,7 +86,6 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx.clone(),
extract_fid_word_count_docids,
TypedChunk::FieldIdWordCountDocids,
"field-id-wordcount-docids",
);
run_extraction_task::<
_,
@@ -115,7 +112,6 @@ pub(crate) fn data_from_obkv_documents(
word_fid_docids_reader,
}
},
"word-docids",
);
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
@@ -125,7 +121,6 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx.clone(),
extract_word_position_docids,
TypedChunk::WordPositionDocids,
"word-position-docids",
);
run_extraction_task::<
@@ -139,7 +134,6 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx.clone(),
extract_facet_string_docids,
TypedChunk::FieldIdFacetStringDocids,
"field-id-facet-string-docids",
);
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
@@ -149,7 +143,6 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx.clone(),
extract_facet_number_docids,
TypedChunk::FieldIdFacetNumberDocids,
"field-id-facet-number-docids",
);
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
@@ -159,7 +152,6 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids,
TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids",
);
}
@@ -183,7 +175,6 @@ fn run_extraction_task<FE, FS, M>(
lmdb_writer_sx: Sender<Result<TypedChunk>>,
extract_fn: FE,
serialize_fn: FS,
name: &'static str,
) where
FE: Fn(
grenad::Reader<CursorClonableMmap>,
@@ -201,7 +192,7 @@ fn run_extraction_task<FE, FS, M>(
rayon::spawn(move || {
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
let _entered = child_span.enter();
puffin::profile_scope!("extract_multiple_chunks", name);
match extract_fn(chunk, indexer, &settings_diff) {
Ok(chunk) => {
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
@@ -224,27 +215,31 @@ fn send_original_documents_data(
let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
let request_threads = ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
// no point in indexing vectors without embedders
&& (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
if index_vectors {
let settings_diff = settings_diff.clone();
let original_documents_chunk = original_documents_chunk.clone();
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
let result = extract_vector_points(
documents_chunk_cloned.clone(),
indexer,
&settings_diff,
&prompt,
&name,
);
match result {
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) {
Ok(extracted_vectors) => {
for ExtractedVectorPoints {
manual_vectors,
remove_vectors,
prompts,
embedder_name,
embedder,
} in extracted_vectors
{
let embeddings = match extract_embeddings(
prompts,
indexer,
@@ -253,28 +248,26 @@ fn send_original_documents_data(
) {
Ok(results) => Some(results),
Err(error) => {
let _ = lmdb_writer_sx_cloned.send(Err(error));
let _ = lmdb_writer_sx.send(Err(error));
None
}
};
if !(remove_vectors.is_empty()
&& manual_vectors.is_empty()
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
{
let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
remove_vectors,
embeddings,
expected_dimension: embedder.dimensions(),
manual_vectors,
embedder_name: name,
embedder_name,
}));
}
}
Err(error) => {
let _ = lmdb_writer_sx_cloned.send(Err(error));
}
}
Err(error) => {
let _ = lmdb_writer_sx.send(Err(error));
}
}
});
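The last hunks replace the bare reindex check with a guard that also requires at least one configured embedder, since extracting vectors when no embedder exists is wasted work. A hedged restatement of that boolean as a standalone function (parameter names are illustrative, not the crate's):

```rust
// Sketch of the `index_vectors` guard in the diff above.
fn should_index_vectors(
    reindex_vectors: bool,
    settings_update_only: bool,
    embedder_count: usize,
) -> bool {
    (reindex_vectors || !settings_update_only) && embedder_count > 0
}

fn main() {
    // a pure settings update with no embedders configured skips extraction
    assert!(!should_index_vectors(false, true, 0));
    // a document indexing run with one embedder goes ahead
    assert!(should_index_vectors(false, false, 1));
}
```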