mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-10-02 17:56:28 +00:00
Merge branch 'indexer-edition-2024' into indexer-edition-2024-doc-chunks
This commit is contained in:
@@ -0,0 +1,319 @@
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{io, mem, str};
|
||||
|
||||
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
use obkv::{KvReader, KvWriterU16};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepLatestObkv};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
|
||||
|
||||
/// Extracts the word and positions where this word appear and
|
||||
/// prefixes it by the document id.
|
||||
///
|
||||
/// Returns the generated internal documents ids and a grenad reader
|
||||
/// with the list of extracted words from the given chunk of documents.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_positions_per_attributes = max_positions_per_attributes
|
||||
.map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let force_reindexing = settings_diff.reindex_searchable();
|
||||
|
||||
// initialize destination values.
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
let mut docid_word_positions_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
KeepLatestObkv,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
// initialize buffers.
|
||||
let mut del_buffers = Buffers::default();
|
||||
let mut add_buffers = Buffers::default();
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
|
||||
// initialize tokenizer.
|
||||
let old_stop_words = settings_diff.old.stop_words.as_ref();
|
||||
let old_separators: Option<Vec<_>> = settings_diff
|
||||
.old
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let old_dictionary: Option<Vec<_>> =
|
||||
settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let del_builder =
|
||||
tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref());
|
||||
let del_tokenizer = del_builder.into_tokenizer();
|
||||
|
||||
let new_stop_words = settings_diff.new.stop_words.as_ref();
|
||||
let new_separators: Option<Vec<_>> = settings_diff
|
||||
.new
|
||||
.allowed_separators
|
||||
.as_ref()
|
||||
.map(|s| s.iter().map(String::as_str).collect());
|
||||
let new_dictionary: Option<Vec<_>> =
|
||||
settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let add_builder =
|
||||
tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref());
|
||||
let add_tokenizer = add_builder.into_tokenizer();
|
||||
|
||||
// iterate over documents.
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let document_id = key
|
||||
.try_into()
|
||||
.map(u32::from_be_bytes)
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let obkv = KvReader::<FieldId>::from_slice(value);
|
||||
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !force_reindexing && !searchable_fields_changed(obkv, settings_diff) {
|
||||
continue;
|
||||
}
|
||||
|
||||
documents_ids.push(document_id);
|
||||
|
||||
// Update key buffer prefix.
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&document_id.to_be_bytes());
|
||||
|
||||
// Tokenize deletions and additions in 2 diffferent threads.
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
// deletions
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
&settings_diff.old,
|
||||
&del_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Deletion,
|
||||
&mut del_buffers,
|
||||
)
|
||||
},
|
||||
|| {
|
||||
// additions
|
||||
tokens_from_document(
|
||||
obkv,
|
||||
&settings_diff.new,
|
||||
&add_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
DelAdd::Addition,
|
||||
&mut add_buffers,
|
||||
)
|
||||
},
|
||||
);
|
||||
|
||||
let del_obkv = del?;
|
||||
let add_obkv = add?;
|
||||
|
||||
// merge deletions and additions.
|
||||
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||
value_buffer.clear();
|
||||
del_add_from_two_obkvs(
|
||||
KvReader::<FieldId>::from_slice(del_obkv),
|
||||
KvReader::<FieldId>::from_slice(add_obkv),
|
||||
&mut value_buffer,
|
||||
)?;
|
||||
|
||||
// write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
|
||||
let obkv = KvReader::<FieldId>::from_slice(&value_buffer);
|
||||
for (field_id, value) in obkv.iter() {
|
||||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
docid_word_positions_sorter.insert(&key_buffer, value)?;
|
||||
}
|
||||
}
|
||||
|
||||
// the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
|
||||
sorter_into_reader(docid_word_positions_sorter, indexer)
|
||||
}
|
||||
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> bool {
|
||||
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.contains(&field_id) {
|
||||
let del_add = KvReaderDelAdd::from_slice(field_bytes);
|
||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
(None, None) => (),
|
||||
// if both contains a value and values are the same, check the next field.
|
||||
(Some(del), Some(add)) if del == add => (),
|
||||
// otherwise the fields are different, return true.
|
||||
_otherwise => return true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
false
|
||||
}
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<Vec<u8>>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
) -> TokenizerBuilder<'a, Vec<u8>> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
tokenizer_builder.words_dict(dictionary);
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
|
||||
tokenizer_builder
|
||||
}
|
||||
|
||||
/// Extract words mapped with their positions of a document.
|
||||
fn tokens_from_document<'a>(
|
||||
obkv: &'a KvReader<FieldId>,
|
||||
settings: &InnerIndexSettings,
|
||||
tokenizer: &Tokenizer<'_>,
|
||||
max_positions_per_attributes: u32,
|
||||
del_add: DelAdd,
|
||||
buffers: &'a mut Buffers,
|
||||
) -> Result<&'a [u8]> {
|
||||
buffers.obkv_buffer.clear();
|
||||
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
// if field is searchable.
|
||||
if settings.searchable_fields_ids.contains(&field_id) {
|
||||
// extract deletion or addition only.
|
||||
if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) {
|
||||
// parse json.
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
// prepare writing destination.
|
||||
buffers.obkv_positions_buffer.clear();
|
||||
let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
|
||||
|
||||
// convert json into a unique string.
|
||||
buffers.field_buffer.clear();
|
||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||
// create an iterator of token with their positions.
|
||||
let locales = settings.localized_searchable_fields_ids.locales(field_id);
|
||||
let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales))
|
||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||
|
||||
for (index, token) in tokens {
|
||||
// keep a word only if it is not empty and fit in a LMDB key.
|
||||
let token = token.lemma().trim();
|
||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||
let position: u16 = index
|
||||
.try_into()
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
writer.insert(position, token.as_bytes())?;
|
||||
}
|
||||
}
|
||||
|
||||
// write positions into document.
|
||||
let positions = writer.into_inner()?;
|
||||
document_writer.insert(field_id, positions)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// returns a KV<FieldId, KV<u16, String>>
|
||||
Ok(document_writer.into_inner().map(|v| v.as_slice())?)
|
||||
}
|
||||
|
||||
/// Transform a JSON value into a string that can be indexed.
|
||||
fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a str> {
|
||||
fn inner(value: &Value, output: &mut String) -> bool {
|
||||
use std::fmt::Write;
|
||||
match value {
|
||||
Value::Null | Value::Object(_) => false,
|
||||
Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(),
|
||||
Value::Number(number) => write!(output, "{}", number).is_ok(),
|
||||
Value::String(string) => write!(output, "{}", string).is_ok(),
|
||||
Value::Array(array) => {
|
||||
let mut count = 0;
|
||||
for value in array {
|
||||
if inner(value, output) {
|
||||
output.push_str(". ");
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
// check that at least one value was written
|
||||
count != 0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Value::String(string) = value {
|
||||
Some(string)
|
||||
} else if inner(value, buffer) {
|
||||
Some(buffer)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
||||
/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
|
||||
/// else we keep the standard proximity of 1 between words.
|
||||
fn process_tokens<'a>(
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator())
|
||||
.scan((0, None), |(offset, prev_kind), mut token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => token.kind = TokenKind::Unknown,
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
||||
|
||||
/// Scratch buffers reused across the fields and documents being tokenized.
#[derive(Default)]
struct Buffers {
    /// Receives the text of a field when its JSON value has to be rendered
    /// into a string; must be cleared between each field.
    field_buffer: String,
    /// Backs the obkv written for a whole document.
    obkv_buffer: Vec<u8>,
    /// Backs the obkv of the tokens of one field, keyed by their positions.
    obkv_positions_buffer: Vec<u8>,
}
|
@@ -0,0 +1,58 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, sorter_into_reader, GrenadParameters, MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
||||
};
|
||||
use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
/// Extracts the facet number and the documents ids where this facet number appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet numbers and
|
||||
/// documents ids from the given chunk of docid facet number positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
fid_docid_facet_number: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_number_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = fid_docid_facet_number.into_cursor()?;
|
||||
while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
|
||||
let (field_id, document_id, number) =
|
||||
FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: number };
|
||||
let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in KvReaderDelAdd::from_slice(deladd_obkv_bytes).iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
facet_number_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
sorter_into_reader(facet_number_docids_sorter, indexer)
|
||||
}
|
@@ -0,0 +1,303 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::iter::FromIterator;
|
||||
use std::{io, str};
|
||||
|
||||
use charabia::normalizer::{Normalize, NormalizerOption};
|
||||
use charabia::{Language, StrDetection, Token};
|
||||
use heed::types::SerdeJson;
|
||||
use heed::BytesEncode;
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
|
||||
use crate::localized_attributes_rules::LocalizedFieldIds;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
if settings_diff.settings_update_only() {
|
||||
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
|
||||
} else {
|
||||
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
|
||||
extract_facet_string_docids_document_update(
|
||||
docid_fid_facet_string,
|
||||
indexer,
|
||||
localized_field_ids,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
localized_field_ids: &LocalizedFieldIds,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdBtreesetString,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let locales = localized_field_ids.locales(field_id);
|
||||
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// as the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
obkv.insert(deladd_key, val)?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
let key: (u16, &str) = (field_id, hyper_normalized_value.as_ref());
|
||||
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
|
||||
}
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted facet strings and
|
||||
/// documents ids from the given chunk of docid facet string positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
MergeDeladdBtreesetString,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
|
||||
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
|
||||
|
||||
let are_same_locales = old_locales == new_locales;
|
||||
|
||||
if is_same_value && are_same_locales {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
{
|
||||
let old_hyper_normalized_value = normalize_facet_string(normalized_value, old_locales);
|
||||
let new_hyper_normalized_value = if are_same_locales {
|
||||
&old_hyper_normalized_value
|
||||
} else {
|
||||
&normalize_facet_string(normalized_value, new_locales)
|
||||
};
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// if the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
if old_hyper_normalized_value == new_hyper_normalized_value.as_str() {
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
obkv.insert(deladd_key, val)?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref());
|
||||
let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
} else {
|
||||
// if the facet string is different, we need to insert the deletion and addition in different obkv because the related key is different.
|
||||
// deletion
|
||||
if deladd_reader.get(DelAdd::Deletion).is_some() {
|
||||
// insert old value
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Deletion, val)?;
|
||||
obkv.finish()?;
|
||||
let key: (u16, &str) = (field_id, old_hyper_normalized_value.as_ref());
|
||||
let key_bytes =
|
||||
BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
// addition
|
||||
if deladd_reader.get(DelAdd::Addition).is_some() {
|
||||
// insert new value
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Addition, val)?;
|
||||
obkv.finish()?;
|
||||
let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref());
|
||||
let key_bytes =
|
||||
BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?;
|
||||
normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
}
|
||||
|
||||
let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
|
||||
let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in deladd_reader.iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
|
||||
}
|
||||
|
||||
let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?;
|
||||
sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized))
|
||||
}
|
||||
|
||||
/// Normalizes the facet string and truncates it to the max length.
|
||||
fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String {
|
||||
let options: NormalizerOption = NormalizerOption { lossy: true, ..Default::default() };
|
||||
let mut detection = StrDetection::new(facet_string, locales);
|
||||
|
||||
let script = detection.script();
|
||||
// Detect the language of the facet string only if several locales are explicitly provided.
|
||||
let language = match locales {
|
||||
Some(&[language]) => Some(language),
|
||||
Some(multiple_locales) if multiple_locales.len() > 1 => detection.language(),
|
||||
_ => None,
|
||||
};
|
||||
|
||||
let token = Token {
|
||||
lemma: std::borrow::Cow::Borrowed(facet_string),
|
||||
script,
|
||||
language,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
// truncate the facet string to the max length
|
||||
token
|
||||
.normalize(&options)
|
||||
.lemma
|
||||
.char_indices()
|
||||
.take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
|
||||
.map(|(_, c)| c)
|
||||
.collect()
|
||||
}
|
@@ -0,0 +1,574 @@
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::mem::size_of;
|
||||
|
||||
use bytemuck::bytes_of;
|
||||
use grenad::Sorter;
|
||||
use heed::BytesEncode;
|
||||
use itertools::{merge_join_by, EitherOrBoth, Itertools};
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::{from_slice, Value};
|
||||
use FilterableValues::{Empty, Null, Values};
|
||||
|
||||
use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepFirst};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
|
||||
|
||||
/// The extracted facet values stored in grenad files by type.
|
||||
pub struct ExtractedFacetValues {
|
||||
pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
|
||||
}
|
||||
|
||||
/// Extracts the facet values of each faceted field of each document.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid the fid and the original value as key
|
||||
/// and the normalized value as value extracted from the given chunk of documents.
|
||||
/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<ExtractedFacetValues> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
KeepFirst,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut fid_docid_facet_strings_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
KeepFirst,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 2),
|
||||
true,
|
||||
);
|
||||
|
||||
// The tuples represents the Del and Add side for a bitmap
|
||||
let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
|
||||
|
||||
// We create two buffers for mutable ref issues with closures.
|
||||
let mut numbers_key_buffer = Vec::new();
|
||||
let mut strings_key_buffer = Vec::new();
|
||||
|
||||
let old_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.old.faceted_fields_ids.iter().copied().collect();
|
||||
let new_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.new.faceted_fields_ids.iter().copied().collect();
|
||||
|
||||
if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
let get_document_json_value = move |field_id, side| {
|
||||
obkv.get(field_id)
|
||||
.map(KvReaderDelAdd::from_slice)
|
||||
.and_then(|kv| kv.get(side))
|
||||
.map(from_slice)
|
||||
.transpose()
|
||||
.map_err(InternalError::SerdeJson)
|
||||
};
|
||||
// iterate over the faceted fields instead of over the whole document.
|
||||
for eob in
|
||||
merge_join_by(old_faceted_fids.iter(), new_faceted_fids.iter(), |old, new| {
|
||||
old.cmp(new)
|
||||
})
|
||||
{
|
||||
let (field_id, del_value, add_value) = match eob {
|
||||
EitherOrBoth::Left(&field_id) => {
|
||||
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
|
||||
|
||||
// deletion only
|
||||
(field_id, del_value, None)
|
||||
}
|
||||
EitherOrBoth::Right(&field_id) => {
|
||||
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
|
||||
|
||||
// addition only
|
||||
(field_id, None, add_value)
|
||||
}
|
||||
EitherOrBoth::Both(&field_id, _) => {
|
||||
// during settings update, recompute the changing settings only.
|
||||
if settings_diff.settings_update_only {
|
||||
continue;
|
||||
}
|
||||
|
||||
let del_value = get_document_json_value(field_id, DelAdd::Deletion)?;
|
||||
let add_value = get_document_json_value(field_id, DelAdd::Addition)?;
|
||||
|
||||
(field_id, del_value, add_value)
|
||||
}
|
||||
};
|
||||
|
||||
if del_value.is_some() || add_value.is_some() {
|
||||
numbers_key_buffer.clear();
|
||||
strings_key_buffer.clear();
|
||||
|
||||
// Set key to the field_id
|
||||
// Note: this encoding is consistent with FieldIdCodec
|
||||
numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
|
||||
let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
|
||||
let document = DocumentId::from_be_bytes(document);
|
||||
|
||||
// For the other extraction tasks, prefix the key with the field_id and the document_id
|
||||
numbers_key_buffer.extend_from_slice(docid_bytes);
|
||||
strings_key_buffer.extend_from_slice(docid_bytes);
|
||||
|
||||
// We insert the document id on the Del and the Add side if the field exists.
|
||||
let (ref mut del_exists, ref mut add_exists) =
|
||||
facet_exists_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_null, ref mut add_is_null) =
|
||||
facet_is_null_docids.entry(field_id).or_default();
|
||||
let (ref mut del_is_empty, ref mut add_is_empty) =
|
||||
facet_is_empty_docids.entry(field_id).or_default();
|
||||
|
||||
if del_value.is_some() {
|
||||
del_exists.insert(document);
|
||||
}
|
||||
if add_value.is_some() {
|
||||
add_exists.insert(document);
|
||||
}
|
||||
|
||||
let del_geo_support = settings_diff
|
||||
.old
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let add_geo_support = settings_diff
|
||||
.new
|
||||
.geo_fields_ids
|
||||
.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
|
||||
let del_filterable_values =
|
||||
del_value.map(|value| extract_facet_values(&value, del_geo_support));
|
||||
let add_filterable_values =
|
||||
add_value.map(|value| extract_facet_values(&value, add_geo_support));
|
||||
|
||||
// Those closures are just here to simplify things a bit.
|
||||
let mut insert_numbers_diff = |del_numbers, add_numbers| {
|
||||
insert_numbers_diff(
|
||||
&mut fid_docid_facet_numbers_sorter,
|
||||
&mut numbers_key_buffer,
|
||||
del_numbers,
|
||||
add_numbers,
|
||||
)
|
||||
};
|
||||
let mut insert_strings_diff = |del_strings, add_strings| {
|
||||
insert_strings_diff(
|
||||
&mut fid_docid_facet_strings_sorter,
|
||||
&mut strings_key_buffer,
|
||||
del_strings,
|
||||
add_strings,
|
||||
)
|
||||
};
|
||||
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(None, None) => (),
|
||||
(Some(del_filterable_values), None) => match del_filterable_values {
|
||||
Null => {
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
},
|
||||
(None, Some(add_filterable_values)) => match add_filterable_values {
|
||||
Null => {
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
Empty => {
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
Values { numbers, strings } => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
}
|
||||
},
|
||||
(Some(del_filterable_values), Some(add_filterable_values)) => {
|
||||
match (del_filterable_values, add_filterable_values) {
|
||||
(Null, Null) | (Empty, Empty) => (),
|
||||
(Null, Empty) => {
|
||||
del_is_null.insert(document);
|
||||
add_is_empty.insert(document);
|
||||
}
|
||||
(Empty, Null) => {
|
||||
del_is_empty.insert(document);
|
||||
add_is_null.insert(document);
|
||||
}
|
||||
(Null, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_null.insert(document);
|
||||
}
|
||||
(Empty, Values { numbers, strings }) => {
|
||||
insert_numbers_diff(vec![], numbers)?;
|
||||
insert_strings_diff(vec![], strings)?;
|
||||
del_is_empty.insert(document);
|
||||
}
|
||||
(Values { numbers, strings }, Null) => {
|
||||
add_is_null.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(Values { numbers, strings }, Empty) => {
|
||||
add_is_empty.insert(document);
|
||||
insert_numbers_diff(numbers, vec![])?;
|
||||
insert_strings_diff(strings, vec![])?;
|
||||
}
|
||||
(
|
||||
Values { numbers: del_numbers, strings: del_strings },
|
||||
Values { numbers: add_numbers, strings: add_strings },
|
||||
) => {
|
||||
insert_numbers_diff(del_numbers, add_numbers)?;
|
||||
insert_strings_diff(del_strings, add_strings)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut facet_exists_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
|
||||
|
||||
let mut facet_is_null_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
|
||||
|
||||
let mut facet_is_empty_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
|
||||
deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
|
||||
facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
|
||||
}
|
||||
let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
|
||||
|
||||
Ok(ExtractedFacetValues {
|
||||
fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
|
||||
fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
|
||||
fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
|
||||
fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
|
||||
fid_facet_exists_docids_chunk: facet_exists_docids_reader,
|
||||
})
|
||||
}
|
||||
|
||||
/// Generates a vector of bytes containing a DelAdd obkv with two bitmaps.
|
||||
fn deladd_obkv_cbo_roaring_bitmaps(
|
||||
buffer: &mut Vec<u8>,
|
||||
del_bitmap: &RoaringBitmap,
|
||||
add_bitmap: &RoaringBitmap,
|
||||
) -> io::Result<()> {
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(buffer);
|
||||
let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
|
||||
let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
|
||||
obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
|
||||
obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
|
||||
obkv.finish()
|
||||
}
|
||||
|
||||
/// Truncates a string to the biggest valid LMDB key size.
|
||||
fn truncate_str(s: &str) -> &str {
|
||||
let index = s
|
||||
.char_indices()
|
||||
.map(|(idx, _)| idx)
|
||||
.chain(std::iter::once(s.len()))
|
||||
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
|
||||
.last();
|
||||
|
||||
&s[..index.unwrap_or(0)]
|
||||
}
|
||||
|
||||
/// Computes the diff between both Del and Add numbers and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn insert_numbers_diff(
|
||||
fid_docid_facet_numbers_sorter: &mut Sorter<KeepFirst>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_numbers: Vec<f64>,
|
||||
mut add_numbers: Vec<f64>,
|
||||
) -> Result<()> {
|
||||
// We sort and dedup the float numbers
|
||||
del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||
add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||
del_numbers.dedup_by_key(|f| OrderedFloat(*f));
|
||||
add_numbers.dedup_by_key(|f| OrderedFloat(*f));
|
||||
|
||||
let merged_numbers_iter = itertools::merge_join_by(
|
||||
del_numbers.into_iter().map(OrderedFloat),
|
||||
add_numbers.into_iter().map(OrderedFloat),
|
||||
|del, add| del.cmp(add),
|
||||
);
|
||||
|
||||
// insert facet numbers in sorter
|
||||
for eob in merged_numbers_iter {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
match eob {
|
||||
EitherOrBoth::Both(_, _) => (), // no need to touch anything
|
||||
EitherOrBoth::Left(OrderedFloat(number)) => {
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
// We insert only the Del part of the Obkv to inform
|
||||
// that we only want to remove all those numbers.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Deletion, bytes_of(&()))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
EitherOrBoth::Right(OrderedFloat(number)) => {
|
||||
if let Some(value_bytes) = f64_into_bytes(number) {
|
||||
key_buffer.extend_from_slice(&value_bytes);
|
||||
key_buffer.extend_from_slice(&number.to_be_bytes());
|
||||
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
// that we only want to remove all those numbers.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, bytes_of(&()))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Computes the diff between both Del and Add strings and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn insert_strings_diff(
|
||||
fid_docid_facet_strings_sorter: &mut Sorter<KeepFirst>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_strings: Vec<(String, String)>,
|
||||
mut add_strings: Vec<(String, String)>,
|
||||
) -> Result<()> {
|
||||
// We sort and dedup the normalized and original strings
|
||||
del_strings.sort_unstable();
|
||||
add_strings.sort_unstable();
|
||||
del_strings.dedup();
|
||||
add_strings.dedup();
|
||||
|
||||
let del_strings = del_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||
let add_strings = add_strings.iter().chunk_by(|(normalized, _)| normalized);
|
||||
|
||||
let merged_strings_iter = itertools::merge_join_by(
|
||||
del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||
add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
|
||||
|(normalized_del, _), (normalized_add, _)| normalized_del.cmp(normalized_add),
|
||||
);
|
||||
|
||||
// insert normalized and original facet string in sorter
|
||||
for eob in merged_strings_iter {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let (side, normalized, original) = match eob {
|
||||
EitherOrBoth::Both((normalized, del), (_, add)) => {
|
||||
let merged_strings_iter =
|
||||
itertools::merge_join_by(del, add, |(_, original_del), (_, original_add)| {
|
||||
original_del.cmp(original_add)
|
||||
});
|
||||
|
||||
// FIXME: we're in a bit of a pickle here, because we're only saving **one** original value per side,
|
||||
// but we possibly have multiple original values that changed in the case where the field is an
|
||||
// array of multiple values that normalize to the same value.
|
||||
// (e.g. "foo" = ["bar", "Bar", "bAr", "baR"]. I'm not judging why you would do that ¯\_(ツ)_/¯)
|
||||
//
|
||||
// We'll work best effort by ignoring when the same value appears in both sides, deleting the first
|
||||
// value that is only in the old version, and adding the first value that is only in the new version
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
let mut del = None;
|
||||
let mut add = None;
|
||||
let mut both = None;
|
||||
|
||||
for eob in merged_strings_iter {
|
||||
match eob {
|
||||
EitherOrBoth::Both((_normalized, original), _) => {
|
||||
both = match both {
|
||||
Some(both) => Some(both),
|
||||
None => Some(original),
|
||||
}
|
||||
}
|
||||
EitherOrBoth::Left((_normalized, original)) => {
|
||||
del = match del {
|
||||
Some(del) => Some(del),
|
||||
None => Some(original),
|
||||
};
|
||||
}
|
||||
EitherOrBoth::Right((_normalized, original)) => {
|
||||
add = match add {
|
||||
Some(add) => Some(add),
|
||||
None => Some(original),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(del) = del {
|
||||
obkv.insert(DelAdd::Deletion, del)?;
|
||||
}
|
||||
if let Some(add) = add
|
||||
// prefer the newly added, but if there is none, keep a value in the list of values
|
||||
// since the normalized value appears both in old and new, we should never remove it.
|
||||
.or(both)
|
||||
{
|
||||
obkv.insert(DelAdd::Addition, add)?;
|
||||
}
|
||||
|
||||
let truncated = truncate_str(normalized);
|
||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||
continue;
|
||||
}
|
||||
EitherOrBoth::Left((_normalized, mut original)) => {
|
||||
// FIXME: we only consider the first value for the purpose of facet search
|
||||
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||
let Some((normalized, original)) = original.next() else {
|
||||
continue;
|
||||
};
|
||||
(DelAdd::Deletion, normalized, original)
|
||||
}
|
||||
EitherOrBoth::Right((_normalized, mut original)) => {
|
||||
// FIXME: we only consider the first value for the purpose of facet search
|
||||
// another structure is needed, able to retain all originals associated with a normalized value.
|
||||
let Some((normalized, original)) = original.next() else {
|
||||
continue;
|
||||
};
|
||||
(DelAdd::Addition, normalized, original)
|
||||
}
|
||||
};
|
||||
let truncated = truncate_str(normalized);
|
||||
key_buffer.extend_from_slice(truncated.as_bytes());
|
||||
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(side, original)?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// What a document field contains, from the point of view of facet extraction.
enum FilterableValues {
    /// The field holds the JSON `null` value.
    Null,
    /// The field holds an empty string `""`, an empty array `[]`, or an empty object `{}`.
    Empty,
    /// All the numbers and strings found in the field.
    ///
    /// Each string is a `(normalized, original)` pair.
    Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
}
|
||||
|
||||
/// Extracts the facet values of a JSON field.
|
||||
fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
|
||||
fn inner_extract_facet_values(
|
||||
value: &Value,
|
||||
can_recurse: bool,
|
||||
output_numbers: &mut Vec<f64>,
|
||||
output_strings: &mut Vec<(String, String)>,
|
||||
geo_field: bool,
|
||||
) {
|
||||
match value {
|
||||
Value::Null => (),
|
||||
Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())),
|
||||
Value::Number(number) => {
|
||||
if let Some(float) = number.as_f64() {
|
||||
output_numbers.push(float);
|
||||
}
|
||||
}
|
||||
Value::String(original) => {
|
||||
// if we're working on a geofield it MUST be something we can parse or else there was an internal error
|
||||
// in the enrich pipeline. But since the enrich pipeline worked, we want to avoid crashing at all costs.
|
||||
if geo_field {
|
||||
if let Ok(float) = original.parse() {
|
||||
output_numbers.push(float);
|
||||
} else {
|
||||
tracing::warn!(
|
||||
"Internal error, could not parse a geofield that has been validated. Please open an issue."
|
||||
)
|
||||
}
|
||||
}
|
||||
let normalized = crate::normalize_facet(original);
|
||||
output_strings.push((normalized, original.clone()));
|
||||
}
|
||||
Value::Array(values) => {
|
||||
if can_recurse {
|
||||
for value in values {
|
||||
inner_extract_facet_values(
|
||||
value,
|
||||
false,
|
||||
output_numbers,
|
||||
output_strings,
|
||||
geo_field,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
Value::Object(_) => (),
|
||||
}
|
||||
}
|
||||
|
||||
match value {
|
||||
Value::Null => FilterableValues::Null,
|
||||
Value::String(s) if s.is_empty() => FilterableValues::Empty,
|
||||
Value::Array(a) if a.is_empty() => FilterableValues::Empty,
|
||||
Value::Object(o) if o.is_empty() => FilterableValues::Empty,
|
||||
otherwise => {
|
||||
let mut numbers = Vec::new();
|
||||
let mut strings = Vec::new();
|
||||
inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings, geo_field);
|
||||
FilterableValues::Values { numbers, strings }
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,96 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::Result;
|
||||
|
||||
/// Largest number of words of a field that is recorded as a word count.
///
/// `extract_fid_word_count_docids` counts at most `MAX_COUNTED_WORDS + 1` words per
/// field and discards any count above this limit.
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
/// Extracts the field id word count and the documents ids where
|
||||
/// this field id with this amount of words appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted field id word counts
|
||||
/// and documents ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut value_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
let deletion = del_add_reader
|
||||
// get deleted words
|
||||
.get(DelAdd::Deletion)
|
||||
// count deleted words
|
||||
.map(|deletion| {
|
||||
KvReaderU16::from_slice(deletion).iter().take(MAX_COUNTED_WORDS + 1).count()
|
||||
})
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
let addition = del_add_reader
|
||||
// get added words
|
||||
.get(DelAdd::Addition)
|
||||
// count added words
|
||||
.map(|addition| {
|
||||
KvReaderU16::from_slice(addition).iter().take(MAX_COUNTED_WORDS + 1).count()
|
||||
})
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
|
||||
if deletion != addition {
|
||||
// Insert deleted word count in sorter if exist.
|
||||
if let Some(word_count) = deletion {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
// Insert added word count in sorter if exist.
|
||||
if let Some(word_count) = addition {
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(fid_bytes);
|
||||
key_buffer.push(word_count as u8);
|
||||
fid_word_count_docids_sorter
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sorter_into_reader(fid_word_count_docids_sorter, indexer)
|
||||
}
|
@@ -0,0 +1,103 @@
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use concat_arrays::concat_arrays;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::GeoError;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::extract_finite_float_from_value;
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, InternalError, Result};
|
||||
|
||||
/// Extracts the geographical coordinates contained in each document under the `_geo` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_geo_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
primary_key_id: FieldId,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
// since we only need the primary key when we throw an error
|
||||
// we create this getter to lazily get it when needed
|
||||
let document_id = || -> Value {
|
||||
let reader = KvReaderDelAdd::from_slice(obkv.get(primary_key_id).unwrap());
|
||||
let document_id =
|
||||
reader.get(DelAdd::Deletion).or(reader.get(DelAdd::Addition)).unwrap();
|
||||
serde_json::from_slice(document_id).unwrap()
|
||||
};
|
||||
|
||||
// extract old version
|
||||
let del_lat_lng = extract_lat_lng(obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
|
||||
// extract new version
|
||||
let add_lat_lng = extract_lat_lng(obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
|
||||
|
||||
if del_lat_lng != add_lat_lng {
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
if let Some([lat, lng]) = del_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Deletion, bytes)?;
|
||||
}
|
||||
if let Some([lat, lng]) = add_lat_lng {
|
||||
#[allow(clippy::drop_non_drop)]
|
||||
let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
|
||||
obkv.insert(DelAdd::Addition, bytes)?;
|
||||
}
|
||||
let bytes = obkv.into_inner()?;
|
||||
writer.insert(docid_bytes, bytes)?;
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
|
||||
/// Extract the finite floats lat and lng from two bytes slices.
|
||||
fn extract_lat_lng(
|
||||
document: &obkv::KvReader<FieldId>,
|
||||
settings: &InnerIndexSettings,
|
||||
deladd: DelAdd,
|
||||
document_id: impl Fn() -> Value,
|
||||
) -> Result<Option<[f64; 2]>> {
|
||||
match settings.geo_fields_ids {
|
||||
Some((lat_fid, lng_fid)) => {
|
||||
let lat =
|
||||
document.get(lat_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd));
|
||||
let lng =
|
||||
document.get(lng_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd));
|
||||
let (lat, lng) = match (lat, lng) {
|
||||
(Some(lat), Some(lng)) => (lat, lng),
|
||||
(Some(_), None) => {
|
||||
return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
|
||||
}
|
||||
(None, Some(_)) => {
|
||||
return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
|
||||
}
|
||||
(None, None) => return Ok(None),
|
||||
};
|
||||
let lat = extract_finite_float_from_value(
|
||||
serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
|
||||
)
|
||||
.map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
|
||||
|
||||
let lng = extract_finite_float_from_value(
|
||||
serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
|
||||
)
|
||||
.map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
|
||||
Ok(Some([lat, lng]))
|
||||
}
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
@@ -0,0 +1,841 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter};
|
||||
use std::mem::size_of;
|
||||
use std::str::from_utf8;
|
||||
use std::sync::Arc;
|
||||
|
||||
use bytemuck::cast_slice;
|
||||
use grenad::Writer;
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::error::FaultSource;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::prompt::{FieldsIdsMapWithMetadata, Prompt};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
|
||||
use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME};
|
||||
use crate::vector::settings::ReindexAction;
|
||||
use crate::vector::{Embedder, Embedding};
|
||||
use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
// Equal to the byte width of a `DocumentId`.
// NOTE(review): presumably the docid prefix of the keys built in this file; the usages
// are outside of the reviewed chunk, confirm before relying on it.
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||
|
||||
pub struct ExtractedVectorPoints {
|
||||
// docid, _index -> KvWriterDelAdd -> Vector
|
||||
pub manual_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> ()
|
||||
pub remove_vectors: grenad::Reader<BufReader<File>>,
|
||||
// docid -> prompt
|
||||
pub prompts: grenad::Reader<BufReader<File>>,
|
||||
|
||||
// embedder
|
||||
pub embedder_name: String,
|
||||
pub embedder: Arc<Embedder>,
|
||||
pub add_to_user_provided: RoaringBitmap,
|
||||
pub remove_from_user_provided: RoaringBitmap,
|
||||
}
|
||||
|
||||
/// How the vectors of a document change for a given embedder.
enum VectorStateDelta {
    /// Nothing changes.
    NoChange,
    /// Remove all vectors, generated or manual, from this document.
    NowRemoved,

    /// The document now carries these vectors; `into_values` returns them as the
    /// vectors to add.
    NowManual(Vec<Vec<f32>>),

    /// Add the vector computed from the specified prompt and remove any previous vector.
    ///
    /// Note: changing the value of the prompt **does require** recording this delta.
    NowGenerated(String),
}
|
||||
|
||||
impl VectorStateDelta {
|
||||
fn into_values(self) -> (bool, String, Vec<Vec<f32>>) {
|
||||
match self {
|
||||
VectorStateDelta::NoChange => Default::default(),
|
||||
VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
|
||||
// We always delete the previous vectors
|
||||
VectorStateDelta::NowManual(add) => (true, Default::default(), add),
|
||||
VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct EmbedderVectorExtractor {
|
||||
embedder_name: String,
|
||||
embedder: Arc<Embedder>,
|
||||
prompt: Arc<Prompt>,
|
||||
|
||||
// (docid) -> (prompt)
|
||||
prompts_writer: Writer<BufWriter<File>>,
|
||||
// (docid) -> ()
|
||||
remove_vectors_writer: Writer<BufWriter<File>>,
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
manual_vectors_writer: Writer<BufWriter<File>>,
|
||||
// The docids of the documents that contains a user defined embedding
|
||||
add_to_user_provided: RoaringBitmap,
|
||||
|
||||
action: ExtractionAction,
|
||||
}
|
||||
|
||||
struct DocumentOperation {
|
||||
// The docids of the documents that contains an auto-generated embedding
|
||||
remove_from_user_provided: RoaringBitmap,
|
||||
}
|
||||
|
||||
enum ExtractionAction {
|
||||
SettingsFullReindex,
|
||||
SettingsRegeneratePrompts { old_prompt: Arc<Prompt> },
|
||||
DocumentOperation(DocumentOperation),
|
||||
}
|
||||
|
||||
/// Accumulates the documents that provided no vectors for an embedder.
///
/// Only the first offending document is kept by id; later ones are merely counted.
struct ManualEmbedderErrors {
    /// Name of the embedder the errors are reported for.
    embedder_name: String,
    /// Id of the first offending document, as rendered JSON.
    docid: String,
    /// Number of further offending documents recorded for the same embedder.
    other_docids: usize,
}
|
||||
|
||||
impl ManualEmbedderErrors {
|
||||
pub fn push_error(
|
||||
errors: &mut Option<ManualEmbedderErrors>,
|
||||
embedder_name: &str,
|
||||
document_id: impl Fn() -> Value,
|
||||
) {
|
||||
match errors {
|
||||
Some(errors) => {
|
||||
if errors.embedder_name == embedder_name {
|
||||
errors.other_docids = errors.other_docids.saturating_add(1)
|
||||
}
|
||||
}
|
||||
None => {
|
||||
*errors = Some(Self {
|
||||
embedder_name: embedder_name.to_owned(),
|
||||
docid: document_id().to_string(),
|
||||
other_docids: 0,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn to_result(
|
||||
errors: Option<ManualEmbedderErrors>,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
) -> Result<()> {
|
||||
match errors {
|
||||
Some(errors) => {
|
||||
let embedder_name = &errors.embedder_name;
|
||||
let mut msg = format!(
|
||||
r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document {}{}",
|
||||
errors.docid,
|
||||
if errors.other_docids != 0 {
|
||||
format!(" and at least {} other document(s)", errors.other_docids)
|
||||
} else {
|
||||
"".to_string()
|
||||
}
|
||||
);
|
||||
|
||||
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||
|
||||
let mut hint_count = 0;
|
||||
|
||||
for (vector_misspelling, count) in
|
||||
possible_embedding_mistakes.vector_mistakes().take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||
.embedder_mistakes(embedder_name, unused_vectors_distribution)
|
||||
.take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
if hint_count == 0 {
|
||||
msg += &format!(
|
||||
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||
);
|
||||
}
|
||||
|
||||
Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
|
||||
}
|
||||
None => Ok(()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||
///
|
||||
/// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
obkv_documents: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedders_configs: &[IndexEmbeddingConfig],
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
) -> Result<(Vec<ExtractedVectorPoints>, UnusedVectorsDistribution)> {
|
||||
let mut unused_vectors_distribution = UnusedVectorsDistribution::new();
|
||||
let mut manual_errors = None;
|
||||
let reindex_vectors = settings_diff.reindex_vectors();
|
||||
|
||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||
let old_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids);
|
||||
|
||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||
let new_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids);
|
||||
|
||||
// the vector field id may have changed
|
||||
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let mut extractors = Vec::new();
|
||||
|
||||
let mut configs = settings_diff.new.embedding_configs.clone().into_inner();
|
||||
let old_configs = &settings_diff.old.embedding_configs;
|
||||
|
||||
if reindex_vectors {
|
||||
for (name, action) in settings_diff.embedding_config_updates.iter() {
|
||||
if let Some(action) = action.reindex() {
|
||||
let Some((embedder_name, (embedder, prompt, _quantized))) =
|
||||
configs.remove_entry(name)
|
||||
else {
|
||||
tracing::error!(embedder = name, "Requested embedder config not found");
|
||||
continue;
|
||||
};
|
||||
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> (prompt)
|
||||
let prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let action = match action {
|
||||
ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex,
|
||||
ReindexAction::RegeneratePrompts => {
|
||||
let Some((_, old_prompt, _quantized)) = old_configs.get(name) else {
|
||||
tracing::error!(embedder = name, "Old embedder config not found");
|
||||
continue;
|
||||
};
|
||||
|
||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt }
|
||||
}
|
||||
};
|
||||
|
||||
extractors.push(EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided: RoaringBitmap::new(),
|
||||
action,
|
||||
});
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// document operation
|
||||
|
||||
for (embedder_name, (embedder, prompt, _quantized)) in configs.into_iter() {
|
||||
// (docid, _index) -> KvWriterDelAdd -> Vector
|
||||
let manual_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> (prompt)
|
||||
let prompts_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
// (docid) -> ()
|
||||
let remove_vectors_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
extractors.push(EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided: RoaringBitmap::new(),
|
||||
action: ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided: RoaringBitmap::new(),
|
||||
}),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
// this must always be serialized as (docid, external_docid);
|
||||
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
|
||||
let (docid_bytes, external_id_bytes) =
|
||||
try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
|
||||
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||
let docid = DocumentId::from_be_bytes(docid_bytes);
|
||||
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(docid_bytes.as_slice());
|
||||
|
||||
// since we only need the primary key when we throw an error we create this getter to
|
||||
// lazily get it when needed
|
||||
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
|
||||
|
||||
let mut parsed_vectors = ParsedVectorsDiff::new(
|
||||
docid,
|
||||
embedders_configs,
|
||||
obkv,
|
||||
old_vectors_fid,
|
||||
new_vectors_fid,
|
||||
)
|
||||
.map_err(|error| error.to_crate_error(document_id().to_string()))?;
|
||||
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided,
|
||||
action,
|
||||
} in extractors.iter_mut()
|
||||
{
|
||||
let embedder_is_manual = matches!(**embedder, Embedder::UserProvided(_));
|
||||
|
||||
let (old, new) = parsed_vectors.remove(embedder_name);
|
||||
let delta = match action {
|
||||
ExtractionAction::SettingsFullReindex => match old {
|
||||
// A full reindex can be triggered either by:
|
||||
// 1. a new embedder
|
||||
// 2. an existing embedder changed so that it must regenerate all generated embeddings.
|
||||
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
|
||||
VectorState::Inline(vectors) => {
|
||||
if !vectors.must_regenerate() {
|
||||
add_to_user_provided.insert(docid);
|
||||
}
|
||||
|
||||
match vectors.into_array_of_vectors() {
|
||||
Some(add_vectors) => {
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(
|
||||
crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
),
|
||||
));
|
||||
}
|
||||
VectorStateDelta::NowManual(add_vectors)
|
||||
}
|
||||
None => VectorStateDelta::NoChange,
|
||||
}
|
||||
}
|
||||
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
|
||||
VectorState::Manual => VectorStateDelta::NoChange,
|
||||
// generated vectors must be regenerated
|
||||
VectorState::Generated => {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(
|
||||
&mut manual_errors,
|
||||
embedder_name.as_str(),
|
||||
document_id,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
regenerate_prompt(obkv, prompt, &new_fields_ids_map)?
|
||||
}
|
||||
},
|
||||
// prompt regeneration is only triggered for existing embedders
|
||||
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
|
||||
if old.must_regenerate() {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(
|
||||
&mut manual_errors,
|
||||
embedder_name.as_str(),
|
||||
document_id,
|
||||
);
|
||||
continue;
|
||||
}
|
||||
regenerate_if_prompt_changed(
|
||||
obkv,
|
||||
(old_prompt, prompt),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
)?
|
||||
} else {
|
||||
// we can simply ignore user provided vectors as they are not regenerated and are
|
||||
// already in the DB since this is an existing embedder
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
}
|
||||
ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided,
|
||||
}) => extract_vector_document_diff(
|
||||
docid,
|
||||
obkv,
|
||||
prompt,
|
||||
(add_to_user_provided, remove_from_user_provided),
|
||||
(old, new),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
document_id,
|
||||
embedder_name,
|
||||
embedder_is_manual,
|
||||
&mut manual_errors,
|
||||
)?,
|
||||
};
|
||||
// and we finally push the unique vectors into the writer
|
||||
push_vectors_diff(
|
||||
remove_vectors_writer,
|
||||
prompts_writer,
|
||||
manual_vectors_writer,
|
||||
&mut key_buffer,
|
||||
delta,
|
||||
)?;
|
||||
}
|
||||
|
||||
unused_vectors_distribution.append(parsed_vectors);
|
||||
}
|
||||
|
||||
ManualEmbedderErrors::to_result(
|
||||
manual_errors,
|
||||
possible_embedding_mistakes,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
|
||||
let mut results = Vec::new();
|
||||
|
||||
for EmbedderVectorExtractor {
|
||||
embedder_name,
|
||||
embedder,
|
||||
prompt: _,
|
||||
prompts_writer,
|
||||
remove_vectors_writer,
|
||||
action,
|
||||
manual_vectors_writer,
|
||||
add_to_user_provided,
|
||||
} in extractors
|
||||
{
|
||||
let remove_from_user_provided =
|
||||
if let ExtractionAction::DocumentOperation(DocumentOperation {
|
||||
remove_from_user_provided,
|
||||
}) = action
|
||||
{
|
||||
remove_from_user_provided
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
results.push(ExtractedVectorPoints {
|
||||
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
||||
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
||||
prompts: writer_into_reader(prompts_writer)?,
|
||||
embedder,
|
||||
embedder_name,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
})
|
||||
}
|
||||
|
||||
Ok((results, unused_vectors_distribution))
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)] // feel free to find efficient way to factor arguments
|
||||
fn extract_vector_document_diff(
|
||||
docid: DocumentId,
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
prompt: &Prompt,
|
||||
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
|
||||
(old, new): (VectorState, VectorState),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
document_id: impl Fn() -> Value,
|
||||
embedder_name: &str,
|
||||
embedder_is_manual: bool,
|
||||
manual_errors: &mut Option<ManualEmbedderErrors>,
|
||||
) -> Result<VectorStateDelta> {
|
||||
match (old.must_regenerate(), new.must_regenerate()) {
|
||||
(true, true) | (false, false) => {}
|
||||
(true, false) => {
|
||||
add_to_user_provided.insert(docid);
|
||||
}
|
||||
(false, true) => {
|
||||
remove_from_user_provided.insert(docid);
|
||||
}
|
||||
}
|
||||
|
||||
let delta = match (old, new) {
|
||||
// regardless of the previous state, if a document now contains inline _vectors, they must
|
||||
// be extracted manually
|
||||
(_old, VectorState::Inline(new)) => match new.into_array_of_vectors() {
|
||||
Some(add_vectors) => {
|
||||
if add_vectors.len() > usize::from(u8::MAX) {
|
||||
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
|
||||
document_id().to_string(),
|
||||
add_vectors.len(),
|
||||
)));
|
||||
}
|
||||
|
||||
VectorStateDelta::NowManual(add_vectors)
|
||||
}
|
||||
None => VectorStateDelta::NoChange,
|
||||
},
|
||||
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
|
||||
// document changed
|
||||
(VectorState::Generated, VectorState::Generated) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id);
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
// Don't give up if the old prompt was failing
|
||||
let old_prompt = Some(&prompt).map(|p| {
|
||||
p.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map)
|
||||
.unwrap_or_default()
|
||||
});
|
||||
let new_prompt =
|
||||
prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
if old_prompt.as_ref() != Some(&new_prompt) {
|
||||
let old_prompt = old_prompt.unwrap_or_default();
|
||||
tracing::trace!(
|
||||
"🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
|
||||
);
|
||||
VectorStateDelta::NowGenerated(new_prompt)
|
||||
} else {
|
||||
tracing::trace!("⏭️ Prompt unmodified, skipping");
|
||||
VectorStateDelta::NoChange
|
||||
}
|
||||
} else {
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
// inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from
|
||||
// the previous version of the document.
|
||||
// Manual -> Generated is also not possible without an Inline to the right (which is handled above)
|
||||
// Generated -> Generated is handled above, so not possible
|
||||
// As a result, this code is unreachable
|
||||
(_not_generated, VectorState::Generated) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
if embedder_is_manual {
|
||||
ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id);
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
// becomes autogenerated
|
||||
VectorStateDelta::NowGenerated(prompt.render_kvdeladd(
|
||||
obkv,
|
||||
DelAdd::Addition,
|
||||
new_fields_ids_map,
|
||||
)?)
|
||||
} else {
|
||||
// make sure the document is always removed from user provided on removal
|
||||
remove_from_user_provided.insert(docid);
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
// inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous
|
||||
// version of the document.
|
||||
// however the Rust type system cannot know that.
|
||||
(_manual, VectorState::Manual) => {
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
// if the new version of documents has the vectors in the DB,
|
||||
// then they are user-provided and nothing possibly changed
|
||||
VectorStateDelta::NoChange
|
||||
} else {
|
||||
// make sure the document is always removed from user provided on removal
|
||||
remove_from_user_provided.insert(docid);
|
||||
VectorStateDelta::NowRemoved
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(delta)
|
||||
}
|
||||
|
||||
fn regenerate_if_prompt_changed(
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
(old_prompt, new_prompt): (&Prompt, &Prompt),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
) -> Result<VectorStateDelta> {
|
||||
let old_prompt = old_prompt
|
||||
.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map)
|
||||
.unwrap_or(Default::default());
|
||||
let new_prompt = new_prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
|
||||
if new_prompt == old_prompt {
|
||||
return Ok(VectorStateDelta::NoChange);
|
||||
}
|
||||
Ok(VectorStateDelta::NowGenerated(new_prompt))
|
||||
}
|
||||
|
||||
fn regenerate_prompt(
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
prompt: &Prompt,
|
||||
new_fields_ids_map: &FieldsIdsMapWithMetadata,
|
||||
) -> Result<VectorStateDelta> {
|
||||
let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
|
||||
Ok(VectorStateDelta::NowGenerated(prompt))
|
||||
}
|
||||
|
||||
/// We cannot compute the diff between both Del and Add vectors.
|
||||
/// We'll push every vector and compute the difference later in TypedChunk.
|
||||
fn push_vectors_diff(
|
||||
remove_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
prompts_writer: &mut Writer<BufWriter<File>>,
|
||||
manual_vectors_writer: &mut Writer<BufWriter<File>>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
delta: VectorStateDelta,
|
||||
) -> Result<()> {
|
||||
let (must_remove, prompt, mut add_vectors) = delta.into_values();
|
||||
if must_remove {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
remove_vectors_writer.insert(&key_buffer, [])?;
|
||||
}
|
||||
if !prompt.is_empty() {
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
prompts_writer.insert(&key_buffer, prompt.as_bytes())?;
|
||||
}
|
||||
|
||||
// We sort and dedup the vectors
|
||||
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
|
||||
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
|
||||
|
||||
// insert vectors into the writer
|
||||
for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
|
||||
// Generate the key by extending the unique index to it.
|
||||
key_buffer.truncate(TRUNCATE_SIZE);
|
||||
let index = u16::try_from(i).unwrap();
|
||||
key_buffer.extend_from_slice(&index.to_be_bytes());
|
||||
|
||||
// We insert only the Add part of the Obkv to inform
|
||||
// that we only want to remove all those vectors.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
|
||||
let bytes = obkv.into_inner()?;
|
||||
manual_vectors_writer.insert(&key_buffer, bytes)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Compares two vectors by using the OrderingFloat helper.
|
||||
fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
|
||||
a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_embeddings<R: io::Read + io::Seek>(
|
||||
// docid, prompt
|
||||
prompt_reader: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
embedder: Arc<Embedder>,
|
||||
embedder_name: &str,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
request_threads: &ThreadPoolNoAbort,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
|
||||
let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk
|
||||
|
||||
// docid, state with embedding
|
||||
let mut state_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut chunks = Vec::with_capacity(n_chunks);
|
||||
let mut current_chunk = Vec::with_capacity(n_vectors_per_chunk);
|
||||
let mut current_chunk_ids = Vec::with_capacity(n_vectors_per_chunk);
|
||||
let mut chunks_ids = Vec::with_capacity(n_chunks);
|
||||
let mut cursor = prompt_reader.into_cursor()?;
|
||||
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
// SAFETY: precondition, the grenad value was saved from a string
|
||||
let prompt = unsafe { std::str::from_utf8_unchecked(value) };
|
||||
if current_chunk.len() == current_chunk.capacity() {
|
||||
chunks.push(std::mem::replace(
|
||||
&mut current_chunk,
|
||||
Vec::with_capacity(n_vectors_per_chunk),
|
||||
));
|
||||
chunks_ids.push(std::mem::replace(
|
||||
&mut current_chunk_ids,
|
||||
Vec::with_capacity(n_vectors_per_chunk),
|
||||
));
|
||||
};
|
||||
current_chunk.push(prompt.to_owned());
|
||||
current_chunk_ids.push(docid);
|
||||
|
||||
if chunks.len() == chunks.capacity() {
|
||||
let chunked_embeds = embed_chunks(
|
||||
&embedder,
|
||||
std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)),
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
|
||||
for (docid, embeddings) in chunks_ids
|
||||
.iter()
|
||||
.flat_map(|docids| docids.iter())
|
||||
.zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter()))
|
||||
{
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?;
|
||||
}
|
||||
chunks_ids.clear();
|
||||
}
|
||||
}
|
||||
|
||||
// send last chunk
|
||||
if !chunks.is_empty() {
|
||||
let chunked_embeds = embed_chunks(
|
||||
&embedder,
|
||||
std::mem::take(&mut chunks),
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
for (docid, embeddings) in chunks_ids
|
||||
.iter()
|
||||
.flat_map(|docids| docids.iter())
|
||||
.zip(chunked_embeds.iter().flat_map(|embeds| embeds.iter()))
|
||||
{
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?;
|
||||
}
|
||||
}
|
||||
|
||||
if !current_chunk.is_empty() {
|
||||
let embeds = embed_chunks(
|
||||
&embedder,
|
||||
vec![std::mem::take(&mut current_chunk)],
|
||||
embedder_name,
|
||||
possible_embedding_mistakes,
|
||||
unused_vectors_distribution,
|
||||
request_threads,
|
||||
)?;
|
||||
|
||||
if let Some(embeds) = embeds.first() {
|
||||
for (docid, embeddings) in current_chunk_ids.iter().zip(embeds.iter()) {
|
||||
state_writer.insert(docid.to_be_bytes(), cast_slice(embeddings))?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
writer_into_reader(state_writer)
|
||||
}
|
||||
|
||||
fn embed_chunks(
|
||||
embedder: &Embedder,
|
||||
text_chunks: Vec<Vec<String>>,
|
||||
embedder_name: &str,
|
||||
possible_embedding_mistakes: &PossibleEmbeddingMistakes,
|
||||
unused_vectors_distribution: &UnusedVectorsDistribution,
|
||||
request_threads: &ThreadPoolNoAbort,
|
||||
) -> Result<Vec<Vec<Embedding>>> {
|
||||
match embedder.embed_chunks(text_chunks, request_threads) {
|
||||
Ok(chunks) => Ok(chunks),
|
||||
Err(error) => {
|
||||
if let FaultSource::Bug = error.fault {
|
||||
Err(crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(
|
||||
error.into(),
|
||||
)))
|
||||
} else {
|
||||
let mut msg =
|
||||
format!(r"While embedding documents for embedder `{embedder_name}`: {error}");
|
||||
|
||||
if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
|
||||
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||
}
|
||||
|
||||
let mut hint_count = 0;
|
||||
|
||||
for (vector_misspelling, count) in
|
||||
possible_embedding_mistakes.vector_mistakes().take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||
.embedder_mistakes(embedder_name, unused_vectors_distribution)
|
||||
.take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
if hint_count == 0 {
|
||||
if let EmbedErrorKind::ManualEmbed(_) = &error.kind {
|
||||
msg += &format!(
|
||||
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -0,0 +1,243 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use obkv::KvReaderU16;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::sorter_into_reader;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the word and the documents ids where this word appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted words and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
///
|
||||
/// The first returned reader is the one for normal word_docids, and the second one is for
|
||||
/// exact_word_docids
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_fid_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
true,
|
||||
);
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut del_words = BTreeSet::new();
|
||||
let mut add_words = BTreeSet::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let (fid_bytes, _) = try_split_array_at(fid_bytes)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
let fid = u16::from_be_bytes(fid_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (_pos, word) in KvReaderU16::from_slice(deletion).iter() {
|
||||
del_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (_pos, word) in KvReaderU16::from_slice(addition).iter() {
|
||||
add_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
words_into_sorter(
|
||||
document_id,
|
||||
fid,
|
||||
&mut key_buffer,
|
||||
&del_words,
|
||||
&add_words,
|
||||
&mut word_fid_docids_sorter,
|
||||
)?;
|
||||
|
||||
del_words.clear();
|
||||
add_words.clear();
|
||||
}
|
||||
|
||||
let mut word_fid_docids_writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut exact_word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / 3),
|
||||
true,
|
||||
);
|
||||
|
||||
let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
|
||||
let mut buffer = Vec::new();
|
||||
// NOTE: replacing sorters by bitmap merging is less efficient, so, use sorters.
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// only keep the value if their is a change to apply in the DB.
|
||||
if !is_noop_del_add_obkv(KvReaderDelAdd::from_slice(value)) {
|
||||
word_fid_docids_writer.insert(key, value)?;
|
||||
}
|
||||
|
||||
let (w, fid) = StrBEU16Codec::bytes_decode(key)
|
||||
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
|
||||
// merge all deletions
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = obkv.get(DelAdd::Deletion) {
|
||||
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Deletion, value)?;
|
||||
if delete_from_exact {
|
||||
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
} else {
|
||||
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
// merge all additions
|
||||
if let Some(value) = obkv.get(DelAdd::Addition) {
|
||||
let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
obkv.insert(DelAdd::Addition, value)?;
|
||||
if add_in_exact {
|
||||
exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
} else {
|
||||
word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((
|
||||
sorter_into_reader(word_docids_sorter, indexer)?,
|
||||
sorter_into_reader(exact_word_docids_sorter, indexer)?,
|
||||
writer_into_reader(word_fid_docids_writer)?,
|
||||
))
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn words_into_sorter(
|
||||
document_id: DocumentId,
|
||||
fid: FieldId,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_words: &BTreeSet<Vec<u8>>,
|
||||
add_words: &BTreeSet<Vec<u8>>,
|
||||
word_fid_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let word_bytes = match eob {
|
||||
Left(word_bytes) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
Right(word_bytes) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
Both(word_bytes, _) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
word_bytes
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn docids_into_writers<W>(
|
||||
word: &str,
|
||||
deletions: &RoaringBitmap,
|
||||
additions: &RoaringBitmap,
|
||||
writer: &mut grenad::Writer<W>,
|
||||
) -> Result<()>
|
||||
where
|
||||
W: std::io::Write,
|
||||
{
|
||||
if deletions == additions {
|
||||
// if the same value is deleted and added, do nothing.
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Write each value in the same KvDelAdd before inserting it in the final writer.
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
// deletions:
|
||||
if !deletions.is_empty() && !deletions.is_subset(additions) {
|
||||
obkv.insert(
|
||||
DelAdd::Deletion,
|
||||
CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
)?;
|
||||
}
|
||||
// additions:
|
||||
if !additions.is_empty() {
|
||||
obkv.insert(
|
||||
DelAdd::Addition,
|
||||
CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
|
||||
SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
|
||||
})?,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert everything in the same writer.
|
||||
writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?;
|
||||
|
||||
Ok(())
|
||||
}
|
@@ -0,0 +1,261 @@
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::{cmp, io};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{DocumentId, Result};
|
||||
|
||||
/// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted word pairs proximities and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
// early return if the data shouldn't be deleted nor created.
|
||||
if settings_diff.settings_update_only && !settings_diff.reindex_proximities() {
|
||||
let writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
return writer_into_reader(writer);
|
||||
}
|
||||
|
||||
let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
|
||||
let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
|
||||
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
|
||||
.map(|_| {
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory.map(|m| m / MAX_DISTANCE as usize),
|
||||
true,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
let mut del_word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
let mut add_word_positions: VecDeque<(String, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
let mut del_word_pair_proximity = BTreeMap::new();
|
||||
let mut add_word_pair_proximity = BTreeMap::new();
|
||||
let mut current_document_id = None;
|
||||
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
// if we change document, we fill the sorter
|
||||
if current_document_id.map_or(false, |id| id != document_id) {
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "document_into_sorter");
|
||||
let _entered = span.enter();
|
||||
|
||||
document_word_positions_into_sorter(
|
||||
current_document_id.unwrap(),
|
||||
&del_word_pair_proximity,
|
||||
&add_word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorters,
|
||||
)?;
|
||||
del_word_pair_proximity.clear();
|
||||
add_word_pair_proximity.clear();
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let (del, add): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
if !any_deletion {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// deletions
|
||||
if let Some(deletion) = KvReaderDelAdd::from_slice(value).get(DelAdd::Deletion) {
|
||||
for (position, word) in KvReaderU16::from_slice(deletion).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while del_word_positions.front().map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
}) {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut del_word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
let word = std::str::from_utf8(word)?;
|
||||
del_word_positions.push_back((word.to_string(), position));
|
||||
}
|
||||
|
||||
while !del_word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut del_word_positions,
|
||||
&mut del_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
|| {
|
||||
if !any_addition {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// additions
|
||||
if let Some(addition) = KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
|
||||
for (position, word) in KvReaderU16::from_slice(addition).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while add_word_positions.front().map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
}) {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut add_word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
let word = std::str::from_utf8(word)?;
|
||||
add_word_positions.push_back((word.to_string(), position));
|
||||
}
|
||||
|
||||
while !add_word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(
|
||||
&mut add_word_positions,
|
||||
&mut add_word_pair_proximity,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
},
|
||||
);
|
||||
|
||||
del?;
|
||||
add?;
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "final_document_into_sorter");
|
||||
let _entered = span.enter();
|
||||
|
||||
document_word_positions_into_sorter(
|
||||
document_id,
|
||||
&del_word_pair_proximity,
|
||||
&add_word_pair_proximity,
|
||||
&mut word_pair_proximity_docids_sorters,
|
||||
)?;
|
||||
}
|
||||
{
|
||||
// FIXME: span inside of a hot loop might degrade performance and create big reports
|
||||
let span = tracing::trace_span!(target: "indexing::details", "sorter_into_reader");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
tempfile::tempfile()?,
|
||||
);
|
||||
|
||||
for sorter in word_pair_proximity_docids_sorters {
|
||||
sorter.write_into_stream_writer(&mut writer)?;
|
||||
}
|
||||
|
||||
writer_into_reader(writer)
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
|
||||
///
|
||||
/// This list is used by the engine to calculate the documents containing words that are
|
||||
/// close to each other.
|
||||
fn document_word_positions_into_sorter(
|
||||
document_id: DocumentId,
|
||||
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeDeladdCboRoaringBitmaps>],
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut key_buffer = Vec::new();
|
||||
for eob in
|
||||
merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
|
||||
d.cmp(a)
|
||||
})
|
||||
{
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let ((w1, w2), prox) = match eob {
|
||||
Left(key_value) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
Right(key_value) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
Both(key_value, _) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key_value
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.push(*prox);
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
|
||||
word_pair_proximity_docids_sorters[*prox as usize - 1]
|
||||
.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn word_positions_into_word_pair_proximity(
|
||||
word_positions: &mut VecDeque<(String, u16)>,
|
||||
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||
) -> Result<()> {
|
||||
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||
for (word, position) in word_positions.iter() {
|
||||
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||
word_pair_proximity
|
||||
.entry((head_word.clone(), word.clone()))
|
||||
.and_modify(|p| {
|
||||
*p = cmp::min(*p, prox);
|
||||
})
|
||||
.or_insert(prox);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
@@ -0,0 +1,138 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{bucketed_position, DocumentId, Result};
|
||||
|
||||
/// Extracts the word positions and the documents ids where this word appear.
|
||||
///
|
||||
/// Returns a grenad reader with the list of extracted words at positions and
|
||||
/// documents ids from the given chunk of docid word positions.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
docid_word_positions: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
_settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let mut word_position_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
max_memory,
|
||||
true,
|
||||
);
|
||||
|
||||
let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||
let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
|
||||
let mut current_document_id: Option<u32> = None;
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut cursor = docid_word_positions.into_cursor()?;
|
||||
while let Some((key, value)) = cursor.move_on_next()? {
|
||||
let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = DocumentId::from_be_bytes(document_id_bytes);
|
||||
|
||||
if current_document_id.map_or(false, |id| document_id != id) {
|
||||
words_position_into_sorter(
|
||||
current_document_id.unwrap(),
|
||||
&mut key_buffer,
|
||||
&del_word_positions,
|
||||
&add_word_positions,
|
||||
&mut word_position_docids_sorter,
|
||||
)?;
|
||||
del_word_positions.clear();
|
||||
add_word_positions.clear();
|
||||
}
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (position, word_bytes) in KvReaderU16::from_slice(deletion).iter() {
|
||||
let position = bucketed_position(position);
|
||||
del_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (position, word_bytes) in KvReaderU16::from_slice(addition).iter() {
|
||||
let position = bucketed_position(position);
|
||||
add_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(document_id) = current_document_id {
|
||||
words_position_into_sorter(
|
||||
document_id,
|
||||
&mut key_buffer,
|
||||
&del_word_positions,
|
||||
&add_word_positions,
|
||||
&mut word_position_docids_sorter,
|
||||
)?;
|
||||
}
|
||||
|
||||
// TODO remove noop DelAdd OBKV
|
||||
let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
|
||||
|
||||
Ok(word_position_docids_reader)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
fn words_position_into_sorter(
|
||||
document_id: DocumentId,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
word_position_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
|
||||
{
|
||||
buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut buffer);
|
||||
let (position, word_bytes) = match eob {
|
||||
Left(key) => {
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
Right(key) => {
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
Both(key, _) => {
|
||||
// both values needs to be kept because it will be used in other extractors.
|
||||
value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
|
||||
value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
|
||||
key
|
||||
}
|
||||
};
|
||||
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(word_bytes);
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(&position.to_be_bytes());
|
||||
word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
406
crates/milli/src/update/index_documents/extract/mod.rs
Normal file
406
crates/milli/src/update/index_documents/extract/mod.rs
Normal file
@@ -0,0 +1,406 @@
|
||||
mod extract_docid_word_positions;
|
||||
mod extract_facet_number_docids;
|
||||
mod extract_facet_string_docids;
|
||||
mod extract_fid_docid_facet_values;
|
||||
mod extract_fid_word_count_docids;
|
||||
mod extract_geo_points;
|
||||
mod extract_vector_points;
|
||||
mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod extract_word_position_docids;
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::BufReader;
|
||||
use std::sync::{Arc, OnceLock};
|
||||
|
||||
use crossbeam_channel::Sender;
|
||||
use rayon::prelude::*;
|
||||
|
||||
use self::extract_docid_word_positions::extract_docid_word_positions;
|
||||
use self::extract_facet_number_docids::extract_facet_number_docids;
|
||||
use self::extract_facet_string_docids::extract_facet_string_docids;
|
||||
use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues};
|
||||
use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
|
||||
use self::extract_geo_points::extract_geo_points;
|
||||
use self::extract_vector_points::{
|
||||
extract_embeddings, extract_vector_points, ExtractedVectorPoints,
|
||||
};
|
||||
use self::extract_word_docids::extract_word_docids;
|
||||
use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
|
||||
use self::extract_word_position_docids::extract_word_position_docids;
|
||||
use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
|
||||
use super::{helpers, TypedChunk};
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::error::PossibleEmbeddingMistakes;
|
||||
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
|
||||
|
||||
/// Extract data for each databases from obkv documents in parallel.
|
||||
/// Send data in grenad file over provided Sender.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
|
||||
pub(crate) fn data_from_obkv_documents(
|
||||
original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
primary_key_id: FieldId,
|
||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||
) -> Result<()> {
|
||||
let (original_pipeline_result, flattened_pipeline_result): (Result<_>, Result<_>) = rayon::join(
|
||||
|| {
|
||||
original_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|original_documents_chunk| {
|
||||
send_original_documents_data(
|
||||
original_documents_chunk,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
embedders_configs.clone(),
|
||||
settings_diff.clone(),
|
||||
possible_embedding_mistakes.clone(),
|
||||
)
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
|| {
|
||||
flattened_obkv_chunks
|
||||
.par_bridge()
|
||||
.map(|flattened_obkv_chunks| {
|
||||
send_and_extract_flattened_documents_data(
|
||||
flattened_obkv_chunks,
|
||||
indexer,
|
||||
lmdb_writer_sx.clone(),
|
||||
primary_key_id,
|
||||
settings_diff.clone(),
|
||||
max_positions_per_attributes,
|
||||
)
|
||||
})
|
||||
.map(|result| {
|
||||
if let Ok((
|
||||
ref docid_word_positions_chunk,
|
||||
(ref fid_docid_facet_numbers_chunk, ref fid_docid_facet_strings_chunk),
|
||||
)) = result
|
||||
{
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_fid_word_count_docids,
|
||||
TypedChunk::FieldIdWordCountDocids,
|
||||
);
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
grenad::Reader<BufReader<File>>,
|
||||
),
|
||||
>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_docids,
|
||||
|(
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
)| {
|
||||
TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
exact_word_docids_reader,
|
||||
word_fid_docids_reader,
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_position_docids,
|
||||
TypedChunk::WordPositionDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<
|
||||
_,
|
||||
_,
|
||||
(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
|
||||
>(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_string_docids,
|
||||
TypedChunk::FieldIdFacetStringDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_facet_number_docids,
|
||||
TypedChunk::FieldIdFacetNumberDocids,
|
||||
);
|
||||
|
||||
run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>(
|
||||
docid_word_positions_chunk.clone(),
|
||||
indexer,
|
||||
settings_diff.clone(),
|
||||
lmdb_writer_sx.clone(),
|
||||
extract_word_pair_proximity_docids,
|
||||
TypedChunk::WordPairProximityDocids,
|
||||
);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
})
|
||||
.collect::<Result<()>>()
|
||||
},
|
||||
);
|
||||
|
||||
original_pipeline_result.and(flattened_pipeline_result)
|
||||
}
|
||||
|
||||
/// Spawn a new task to extract data for a specific DB using extract_fn.
|
||||
/// Generated grenad chunks are merged using the merge_fn.
|
||||
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
|
||||
/// and sent into lmdb_writer_sx.
|
||||
fn run_extraction_task<FE, FS, M>(
|
||||
chunk: grenad::Reader<CursorClonableMmap>,
|
||||
indexer: GrenadParameters,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
extract_fn: FE,
|
||||
serialize_fn: FS,
|
||||
) where
|
||||
FE: Fn(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
GrenadParameters,
|
||||
&InnerIndexSettingsDiff,
|
||||
) -> Result<M>
|
||||
+ Sync
|
||||
+ Send
|
||||
+ 'static,
|
||||
FS: Fn(M) -> TypedChunk + Sync + Send + 'static,
|
||||
M: Send,
|
||||
{
|
||||
let current_span = tracing::Span::current();
|
||||
|
||||
rayon::spawn(move || {
|
||||
let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: ¤t_span, "extract_multiple_chunks");
|
||||
let _entered = child_span.enter();
|
||||
|
||||
match extract_fn(chunk, indexer, &settings_diff) {
|
||||
Ok(chunk) => {
|
||||
let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk)));
|
||||
}
|
||||
Err(e) => {
|
||||
let _ = lmdb_writer_sx.send(Err(e));
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
fn request_threads() -> &'static ThreadPoolNoAbort {
|
||||
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
|
||||
|
||||
REQUEST_THREADS.get_or_init(|| {
|
||||
ThreadPoolNoAbortBuilder::new()
|
||||
.num_threads(crate::vector::REQUEST_PARALLELISM)
|
||||
.thread_name(|index| format!("embedding-request-{index}"))
|
||||
.build()
|
||||
.unwrap()
|
||||
})
|
||||
}
|
||||
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
/// - documents
|
||||
fn send_original_documents_data(
|
||||
original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
embedders_configs: Arc<Vec<IndexEmbeddingConfig>>,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
possible_embedding_mistakes: Arc<PossibleEmbeddingMistakes>,
|
||||
) -> Result<()> {
|
||||
let original_documents_chunk =
|
||||
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
|
||||
let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
|
||||
// no point in indexing vectors without embedders
|
||||
&& (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
|
||||
|
||||
if index_vectors {
|
||||
let settings_diff = settings_diff.clone();
|
||||
let embedders_configs = embedders_configs.clone();
|
||||
|
||||
let original_documents_chunk = original_documents_chunk.clone();
|
||||
let lmdb_writer_sx = lmdb_writer_sx.clone();
|
||||
rayon::spawn(move || {
|
||||
match extract_vector_points(
|
||||
original_documents_chunk.clone(),
|
||||
indexer,
|
||||
&embedders_configs,
|
||||
&settings_diff,
|
||||
&possible_embedding_mistakes,
|
||||
) {
|
||||
Ok((extracted_vectors, unused_vectors_distribution)) => {
|
||||
for ExtractedVectorPoints {
|
||||
manual_vectors,
|
||||
remove_vectors,
|
||||
prompts,
|
||||
embedder_name,
|
||||
embedder,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
} in extracted_vectors
|
||||
{
|
||||
let embeddings = match extract_embeddings(
|
||||
prompts,
|
||||
indexer,
|
||||
embedder.clone(),
|
||||
&embedder_name,
|
||||
&possible_embedding_mistakes,
|
||||
&unused_vectors_distribution,
|
||||
request_threads(),
|
||||
) {
|
||||
Ok(results) => Some(results),
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
None
|
||||
}
|
||||
};
|
||||
if !(remove_vectors.is_empty()
|
||||
&& manual_vectors.is_empty()
|
||||
&& embeddings.as_ref().map_or(true, |e| e.is_empty()))
|
||||
{
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints {
|
||||
remove_vectors,
|
||||
embeddings,
|
||||
expected_dimension: embedder.dimensions(),
|
||||
manual_vectors,
|
||||
embedder_name,
|
||||
add_to_user_provided,
|
||||
remove_from_user_provided,
|
||||
}));
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(error) => {
|
||||
let _ = lmdb_writer_sx.send(Err(error));
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// TODO: create a custom internal error
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk)));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Extract chunked data and send it into lmdb_writer_sx sender:
|
||||
/// - documents_ids
|
||||
/// - docid_word_positions
|
||||
/// - docid_fid_facet_numbers
|
||||
/// - docid_fid_facet_strings
|
||||
/// - docid_fid_facet_exists
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn send_and_extract_flattened_documents_data(
|
||||
flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
|
||||
indexer: GrenadParameters,
|
||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||
primary_key_id: FieldId,
|
||||
settings_diff: Arc<InnerIndexSettingsDiff>,
|
||||
max_positions_per_attributes: Option<u32>,
|
||||
) -> Result<(
|
||||
grenad::Reader<CursorClonableMmap>,
|
||||
(grenad::Reader<CursorClonableMmap>, grenad::Reader<CursorClonableMmap>),
|
||||
)> {
|
||||
let flattened_documents_chunk =
|
||||
flattened_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
|
||||
|
||||
if settings_diff.run_geo_indexing() {
|
||||
let documents_chunk_cloned = flattened_documents_chunk.clone();
|
||||
let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
|
||||
let settings_diff = settings_diff.clone();
|
||||
rayon::spawn(move || {
|
||||
let result =
|
||||
extract_geo_points(documents_chunk_cloned, indexer, primary_key_id, &settings_diff);
|
||||
let _ = match result {
|
||||
Ok(geo_points) => lmdb_writer_sx_cloned.send(Ok(TypedChunk::GeoPoints(geo_points))),
|
||||
Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
|
||||
rayon::join(
|
||||
|| {
|
||||
let docid_word_positions_chunk = extract_docid_word_positions(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
max_positions_per_attributes,
|
||||
)?;
|
||||
|
||||
// send docid_word_positions_chunk to DB writer
|
||||
let docid_word_positions_chunk =
|
||||
unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
|
||||
|
||||
Ok(docid_word_positions_chunk)
|
||||
},
|
||||
|| {
|
||||
let ExtractedFacetValues {
|
||||
fid_docid_facet_numbers_chunk,
|
||||
fid_docid_facet_strings_chunk,
|
||||
fid_facet_is_null_docids_chunk,
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
fid_facet_exists_docids_chunk,
|
||||
} = extract_fid_docid_facet_values(
|
||||
flattened_documents_chunk.clone(),
|
||||
indexer,
|
||||
&settings_diff,
|
||||
)?;
|
||||
|
||||
// send fid_docid_facet_numbers_chunk to DB writer
|
||||
let fid_docid_facet_numbers_chunk =
|
||||
unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
|
||||
fid_docid_facet_numbers_chunk.clone(),
|
||||
)));
|
||||
|
||||
// send fid_docid_facet_strings_chunk to DB writer
|
||||
let fid_docid_facet_strings_chunk =
|
||||
unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
|
||||
fid_docid_facet_strings_chunk.clone(),
|
||||
)));
|
||||
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(fid_facet_is_null_docids_chunk)));
|
||||
|
||||
let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(
|
||||
fid_facet_is_empty_docids_chunk,
|
||||
)));
|
||||
|
||||
let _ = lmdb_writer_sx
|
||||
.send(Ok(TypedChunk::FieldIdFacetExistsDocids(fid_facet_exists_docids_chunk)));
|
||||
|
||||
Ok((fid_docid_facet_numbers_chunk, fid_docid_facet_strings_chunk))
|
||||
},
|
||||
);
|
||||
|
||||
Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
|
||||
}
|
Reference in New Issue
Block a user