Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-09-14 00:36:25 +00:00)
Merge branch 'main' into merge-release-v1.8.1-in-main
@@ -29,8 +29,6 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
settings_diff: &InnerIndexSettingsDiff,
max_positions_per_attributes: Option<u32>,
) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
puffin::profile_function!();

let max_positions_per_attributes = max_positions_per_attributes
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
let max_memory = indexer.max_memory_by_thread();
@@ -186,7 +184,7 @@ fn searchable_fields_changed(
) -> bool {
let searchable_fields = &settings_diff.new.searchable_fields_ids;
for (field_id, field_bytes) in obkv.iter() {
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
if searchable_fields.contains(&field_id) {
let del_add = KvReaderDelAdd::new(field_bytes);
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
// if both fields are None, check the next field.
@@ -298,7 +296,7 @@ fn lang_safe_tokens_from_document<'a>(
/// Extract words mapped with their positions of a document.
fn tokens_from_document<'a>(
obkv: &KvReader<FieldId>,
searchable_fields: &Option<Vec<FieldId>>,
searchable_fields: &[FieldId],
tokenizer: &Tokenizer,
max_positions_per_attributes: u32,
del_add: DelAdd,
@@ -309,7 +307,7 @@ fn tokens_from_document<'a>(
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
for (field_id, field_bytes) in obkv.iter() {
// if field is searchable.
if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
if searchable_fields.as_ref().contains(&field_id) {
// extract deletion or addition only.
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
// parse json.
@@ -23,8 +23,6 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();

let mut facet_number_docids_sorter = create_sorter(
@@ -28,8 +28,6 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();
let options = NormalizerOption { lossy: true, ..Default::default() };
@@ -37,7 +37,7 @@ pub struct ExtractedFacetValues {

/// Extracts the facet values of each faceted field of each document.
///
/// Returns the generated grenad reader containing the docid the fid and the orginal value as key
/// Returns the generated grenad reader containing the docid the fid and the original value as key
/// and the normalized value as value extracted from the given chunk of documents.
/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
@@ -46,8 +46,6 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<ExtractedFacetValues> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();

let mut fid_docid_facet_numbers_sorter = create_sorter(
@@ -26,8 +26,6 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();

let mut fid_word_count_docids_sorter = create_sorter(
@@ -21,8 +21,6 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
primary_key_id: FieldId,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

let mut writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
@@ -10,16 +10,16 @@ use bytemuck::cast_slice;
use grenad::Writer;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat;
use serde_json::{from_slice, Value};
use serde_json::Value;

use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::error::UserError;
use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::try_split_at;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
use crate::vector::Embedder;
use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors};
use crate::{DocumentId, Result, ThreadPoolNoAbort};

/// The length of the elements that are always in the buffer when inserting new values.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@@ -31,6 +31,10 @@ pub struct ExtractedVectorPoints {
pub remove_vectors: grenad::Reader<BufReader<File>>,
// docid -> prompt
pub prompts: grenad::Reader<BufReader<File>>,

// embedder
pub embedder_name: String,
pub embedder: Arc<Embedder>,
}

enum VectorStateDelta {
@@ -65,6 +69,19 @@ impl VectorStateDelta {
}
}

struct EmbedderVectorExtractor {
embedder_name: String,
embedder: Arc<Embedder>,
prompt: Arc<Prompt>,

// (docid, _index) -> KvWriterDelAdd -> Vector
manual_vectors_writer: Writer<BufWriter<File>>,
// (docid) -> (prompt)
prompts_writer: Writer<BufWriter<File>>,
// (docid) -> ()
remove_vectors_writer: Writer<BufWriter<File>>,
}

/// Extracts the embedding vector contained in each document under the `_vectors` field.
///
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
@@ -73,34 +90,52 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
obkv_documents: grenad::Reader<R>,
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
prompt: &Prompt,
embedder_name: &str,
) -> Result<ExtractedVectorPoints> {
puffin::profile_function!();
) -> Result<Vec<ExtractedVectorPoints>> {
let reindex_vectors = settings_diff.reindex_vectors();

let old_fields_ids_map = &settings_diff.old.fields_ids_map;
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
// the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
// filter the old vector fid if the settings has been changed forcing reindexing.
let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors);

// (docid, _index) -> KvWriterDelAdd -> Vector
let mut manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);

// (docid) -> (prompt)
let mut prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
let mut extractors = Vec::new();
for (embedder_name, (embedder, prompt)) in
settings_diff.new.embedding_configs.clone().into_iter()
{
// (docid, _index) -> KvWriterDelAdd -> Vector
let manual_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);

// (docid) -> ()
let mut remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);
// (docid) -> (prompt)
let prompts_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);

// (docid) -> ()
let remove_vectors_writer = create_writer(
indexer.chunk_compression_type,
indexer.chunk_compression_level,
tempfile::tempfile()?,
);

extractors.push(EmbedderVectorExtractor {
embedder_name,
embedder,
prompt,
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
});
}

let mut key_buffer = Vec::new();
let mut cursor = obkv_documents.into_cursor()?;
@@ -114,152 +149,138 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
key_buffer.clear();
key_buffer.extend_from_slice(docid_bytes);

// since we only needs the primary key when we throw an error we create this getter to
// since we only need the primary key when we throw an error we create this getter to
// lazily get it when needed
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };

// the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id("_vectors");
// filter the old vector fid if the settings has been changed forcing reindexing.
let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid)
.map_err(|error| error.to_crate_error(document_id().to_string()))?;

let new_vectors_fid = new_fields_ids_map.id("_vectors");
let vectors_field = {
let del = old_vectors_fid
.and_then(|vectors_fid| obkv.get(vectors_fid))
.map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
.transpose()?
.flatten();
let add = new_vectors_fid
.and_then(|vectors_fid| obkv.get(vectors_fid))
.map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
.transpose()?
.flatten();
(del, add)
};
for EmbedderVectorExtractor {
embedder_name,
embedder: _,
prompt,
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
} in extractors.iter_mut()
{
let delta = match parsed_vectors.remove(embedder_name) {
(Some(old), Some(new)) => {
// no autogeneration
let del_vectors = old.into_array_of_vectors();
let add_vectors = new.into_array_of_vectors();

let (del_map, add_map) = vectors_field;

let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
let add_value = add_map.and_then(|mut map| map.remove(embedder_name));

let delta = match (del_value, add_value) {
(Some(old), Some(new)) => {
// no autogeneration
let del_vectors = extract_vectors(old, document_id, embedder_name)?;
let add_vectors = extract_vectors(new, document_id, embedder_name)?;

if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}

VectorStateDelta::ManualDelta(del_vectors, add_vectors)
}
(Some(_old), None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
DelAdd::Addition,
new_fields_ids_map,
)?)
} else {
VectorStateDelta::NowRemoved
}
}
(None, Some(new)) => {
// was possibly autogenerated, remove all vectors for that document
let add_vectors = extract_vectors(new, document_id, embedder_name)?;
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}

VectorStateDelta::WasGeneratedNowManual(add_vectors)
}
(None, None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());

if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = Some(prompt)
// TODO: this filter works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
.filter(|_| !settings_diff.reindex_vectors())
.map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
});
let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
VectorStateDelta::NowGenerated(new_prompt)
} else {
tracing::trace!("⏭️ Prompt unmodified, skipping");
VectorStateDelta::NoChange
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
} else {
VectorStateDelta::NowRemoved
}
}
};
// and we finally push the unique vectors into the writer
push_vectors_diff(
&mut remove_vectors_writer,
&mut prompts_writer,
&mut manual_vectors_writer,
&mut key_buffer,
delta,
settings_diff,
)?;
VectorStateDelta::ManualDelta(del_vectors, add_vectors)
}
(Some(_old), None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept {
// becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render(
obkv,
DelAdd::Addition,
new_fields_ids_map,
)?)
} else {
VectorStateDelta::NowRemoved
}
}
(None, Some(new)) => {
// was possibly autogenerated, remove all vectors for that document
let add_vectors = new.into_array_of_vectors();
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}

VectorStateDelta::WasGeneratedNowManual(add_vectors)
}
(None, None) => {
// Do we keep this document?
let document_is_kept = obkv
.iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some());

if document_is_kept {
// Don't give up if the old prompt was failing
let old_prompt = Some(&prompt)
// TODO: this filter works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
.filter(|_| !settings_diff.reindex_vectors())
.map(|p| {
p.render(obkv, DelAdd::Deletion, old_fields_ids_map)
.unwrap_or_default()
});
let new_prompt =
prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
if old_prompt.as_ref() != Some(&new_prompt) {
let old_prompt = old_prompt.unwrap_or_default();
tracing::trace!(
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
);
VectorStateDelta::NowGenerated(new_prompt)
} else {
tracing::trace!("⏭️ Prompt unmodified, skipping");
VectorStateDelta::NoChange
}
} else {
VectorStateDelta::NowRemoved
}
}
};
// and we finally push the unique vectors into the writer
push_vectors_diff(
remove_vectors_writer,
prompts_writer,
manual_vectors_writer,
&mut key_buffer,
delta,
reindex_vectors,
)?;
}
}

Ok(ExtractedVectorPoints {
// docid, _index -> KvWriterDelAdd -> Vector
manual_vectors: writer_into_reader(manual_vectors_writer)?,
// docid -> ()
remove_vectors: writer_into_reader(remove_vectors_writer)?,
// docid -> prompt
prompts: writer_into_reader(prompts_writer)?,
})
}
let mut results = Vec::new();
fn to_vector_map(
obkv: KvReaderDelAdd,
side: DelAdd,
document_id: &impl Fn() -> Value,
) -> Result<Option<serde_json::Map<String, Value>>> {
Ok(if let Some(value) = obkv.get(side) {
let Ok(value) = from_slice(value) else {
let value = from_slice(value).map_err(InternalError::SerdeJson)?;
return Err(crate::Error::UserError(UserError::InvalidVectorsMapType {
document_id: document_id(),
value,
}));
};
Some(value)
} else {
None
})
for EmbedderVectorExtractor {
embedder_name,
embedder,
prompt: _,
manual_vectors_writer,
prompts_writer,
remove_vectors_writer,
} in extractors
{
results.push(ExtractedVectorPoints {
// docid, _index -> KvWriterDelAdd -> Vector
manual_vectors: writer_into_reader(manual_vectors_writer)?,
// docid -> ()
remove_vectors: writer_into_reader(remove_vectors_writer)?,
// docid -> prompt
prompts: writer_into_reader(prompts_writer)?,

embedder,
embedder_name,
})
}

Ok(results)
}

/// Computes the diff between both Del and Add numbers and
@@ -270,14 +291,13 @@ fn push_vectors_diff(
manual_vectors_writer: &mut Writer<BufWriter<File>>,
key_buffer: &mut Vec<u8>,
delta: VectorStateDelta,
settings_diff: &InnerIndexSettingsDiff,
reindex_vectors: bool,
) -> Result<()> {
puffin::profile_function!();
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
if must_remove
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
&& !settings_diff.reindex_vectors()
&& !reindex_vectors
{
key_buffer.truncate(TRUNCATE_SIZE);
remove_vectors_writer.insert(&key_buffer, [])?;
@@ -308,7 +328,7 @@ fn push_vectors_diff(
EitherOrBoth::Left(vector) => {
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
if !settings_diff.reindex_vectors() {
if !reindex_vectors {
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
@@ -336,26 +356,6 @@ fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
}

/// Extracts the vectors from a JSON value.
fn extract_vectors(
value: Value,
document_id: impl Fn() -> Value,
name: &str,
) -> Result<Vec<Vec<f32>>> {
// FIXME: ugly clone of the vectors here
match serde_json::from_value(value.clone()) {
Ok(vectors) => {
Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors).unwrap_or_default())
}
Err(_) => Err(UserError::InvalidVectorsType {
document_id: document_id(),
value,
subfield: name.to_owned(),
}
.into()),
}
}

#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
pub fn extract_embeddings<R: io::Read + io::Seek>(
// docid, prompt
@@ -364,7 +364,6 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
embedder: Arc<Embedder>,
request_threads: &ThreadPoolNoAbort,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk
@@ -36,8 +36,6 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
grenad::Reader<BufReader<File>>,
grenad::Reader<BufReader<File>>,
)> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();

let mut word_fid_docids_sorter = create_sorter(
@@ -167,8 +165,6 @@ fn words_into_sorter(
add_words: &BTreeSet<Vec<u8>>,
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
puffin::profile_function!();

use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
@@ -26,7 +26,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
@@ -71,8 +70,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(

// if we change document, we fill the sorter
if current_document_id.map_or(false, |id| id != document_id) {
puffin::profile_scope!("Document into sorter");

// FIXME: span inside of a hot loop might degrade performance and create big reports
let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter");
let _entered = span.enter();
@@ -163,7 +160,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
}

if let Some(document_id) = current_document_id {
puffin::profile_scope!("Final document into sorter");
// FIXME: span inside of a hot loop might degrade performance and create big reports
let span = tracing::trace_span!(target: "indexing::details", "final_document_into_sorter");
let _entered = span.enter();
@@ -176,7 +172,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
)?;
}
{
puffin::profile_scope!("sorter_into_reader");
// FIXME: span inside of a hot loop might degrade performance and create big reports
let span = tracing::trace_span!(target: "indexing::details", "sorter_into_reader");
let _entered = span.enter();
@@ -25,8 +25,6 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
indexer: GrenadParameters,
_settings_diff: &InnerIndexSettingsDiff,
) -> Result<grenad::Reader<BufReader<File>>> {
puffin::profile_function!();

let max_memory = indexer.max_memory_by_thread();

let mut word_position_docids_sorter = create_sorter(
@@ -104,8 +102,6 @@ fn words_position_into_sorter(
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
) -> Result<()> {
puffin::profile_function!();

use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
@@ -46,8 +46,6 @@ pub(crate) fn data_from_obkv_documents(
settings_diff: Arc<InnerIndexSettingsDiff>,
max_positions_per_attributes: Option<u32>,
) -> Result<()> {
puffin::profile_function!();

let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|| {
original_obkv_chunks
@@ -88,7 +86,6 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx.clone(),
extract_fid_word_count_docids,
TypedChunk::FieldIdWordCountDocids,
"field-id-wordcount-docids",
);
run_extraction_task::<
_,
@@ -115,7 +112,6 @@ pub(crate) fn data_from_obkv_documents(
word_fid_docids_reader,
}
},
"word-docids",
);

run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
@@ -125,7 +121,6 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx.clone(),
extract_word_position_docids,
TypedChunk::WordPositionDocids,
"word-position-docids",
);

run_extraction_task::<
@@ -139,7 +134,6 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx.clone(),
extract_facet_string_docids,
TypedChunk::FieldIdFacetStringDocids,
"field-id-facet-string-docids",
);

run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
@@ -149,7 +143,6 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx.clone(),
extract_facet_number_docids,
TypedChunk::FieldIdFacetNumberDocids,
"field-id-facet-number-docids",
);

run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
@@ -159,7 +152,6 @@ pub(crate) fn data_from_obkv_documents(
lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids,
TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids",
);
}
@@ -183,7 +175,6 @@ fn run_extraction_task<FE, FS, M>(
lmdb_writer_sx: Sender<Result<TypedChunk>>,
extract_fn: FE,
serialize_fn: FS,
name: &'static str,
) where
FE: Fn(
grenad::Reader<CursorClonableMmap>,
@@ -201,7 +192,7 @@ fn run_extraction_task<FE, FS, M>(
rayon::spawn(move || {
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks");
let _entered = child_span.enter();
puffin::profile_scope!("extract_multiple_chunks", name);

match extract_fn(chunk, indexer, &settings_diff) {
Ok(chunk) => {
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
@@ -224,27 +215,31 @@ fn send_original_documents_data(
let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;

let documents_chunk_cloned = original_documents_chunk.clone();
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();

let request_threads = ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;

if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() {
let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
// no point in indexing vectors without embedders
&& (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());

if index_vectors {
let settings_diff = settings_diff.clone();

let original_documents_chunk = original_documents_chunk.clone();
let lmdb_writer_sx = lmdb_writer_sx.clone();
rayon::spawn(move || {
for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() {
let result = extract_vector_points(
documents_chunk_cloned.clone(),
indexer,
&settings_diff,
&prompt,
&name,
);
match result {
Ok(ExtractedVectorPoints { manual_vectors, remove_vectors, prompts }) => {
match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) {
Ok(extracted_vectors) => {
for ExtractedVectorPoints {
manual_vectors,
remove_vectors,
prompts,
embedder_name,
embedder,
} in extracted_vectors
{
let embeddings = match extract_embeddings(
prompts,
indexer,
@@ -253,28 +248,26 @@ fn send_original_documents_data(
) {
Ok(results) => Some(results),
Err(error) => {
let _ = lmdb_writer_sx_cloned.send(Err(error));
let _ = lmdb_writer_sx.send(Err(error));
None
}
};

if !(remove_vectors.is_empty()
&& manual_vectors.is_empty()
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
{
let _ = lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints {
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
remove_vectors,
embeddings,
expected_dimension: embedder.dimensions(),
manual_vectors,
embedder_name: name,
embedder_name,
}));
}
}

Err(error) => {
let _ = lmdb_writer_sx_cloned.send(Err(error));
}
}
Err(error) => {
let _ = lmdb_writer_sx.send(Err(error));
}
}
});