Refactor Settings Indexing process

**Changes:**
The transform structure is now relying on FieldIdMapWithMetadata and AttributePatterns to prepare
the obkv documents during a settings reindexing.
The InnerIndexSettingsDiff and InnerIndexSettings structs are now relying on FieldIdMapWithMetadata, FilterableAttributesRule and AttributePatterns to define the fields and the databases that should be reindexed.
The faceted_fields_ids, localized_searchable_fields_ids and localized_faceted_fields_ids have been removed in favor of the FieldIdMapWithMetadata.
We are now relying on the FieldIdMapWithMetadata to retain vectors_fids from the facets and the searchables.

The searchable database computing is now relying on the FieldIdMapWithMetadata to know if a field is searchable and retrieve the locales.

The facet database computing is now relying on the FieldIdMapWithMetadata to compute the facet databases, the facet-search and retrieve the locales.

The facet level database computing is now relying on the FieldIdMapWithMetadata, and the facet level databases are cleared depending on the settings differences (clear_facet_levels_based_on_settings_diff).

The vector point extraction uses the FieldIdMapWithMetadata instead of FieldsIdsMapWithMetadata.

**Impact:**
- Dump import
- Settings update
This commit is contained in:
ManyTheFish
2025-03-03 10:32:02 +01:00
parent 286d310287
commit 659855c88e
12 changed files with 375 additions and 272 deletions

View File

@@ -150,9 +150,14 @@ fn searchable_fields_changed(
obkv: &KvReader<FieldId>,
settings_diff: &InnerIndexSettingsDiff,
) -> bool {
let searchable_fields = &settings_diff.new.searchable_fields_ids;
for (field_id, field_bytes) in obkv.iter() {
if searchable_fields.contains(&field_id) {
let Some(metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else {
// If the field id is not in the fields ids map, skip it.
// This happens for the vectors sub-fields. for example:
// "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered.
continue;
};
if metadata.is_searchable() {
let del_add = KvReaderDelAdd::from_slice(field_bytes);
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
// if both fields are None, check the next field.
@@ -200,8 +205,14 @@ fn tokens_from_document<'a>(
buffers.obkv_buffer.clear();
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
for (field_id, field_bytes) in obkv.iter() {
let Some(metadata) = settings.fields_ids_map.metadata(field_id) else {
// If the field id is not in the fields ids map, skip it.
// This happens for the vectors sub-fields. for example:
// "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered.
continue;
};
// if field is searchable.
if settings.searchable_fields_ids.contains(&field_id) {
if metadata.is_searchable() {
// extract deletion or addition only.
if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) {
// parse json.
@@ -216,7 +227,7 @@ fn tokens_from_document<'a>(
buffers.field_buffer.clear();
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
// create an iterator of token with their positions.
let locales = settings.localized_searchable_fields_ids.locales(field_id);
let locales = metadata.locales(&settings.localized_attributes_rules);
let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales))
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);

View File

@@ -12,12 +12,11 @@ use heed::BytesEncode;
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
use crate::localized_attributes_rules::LocalizedFieldIds;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::index_documents::helpers::{
MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
/// Extracts the facet string and the documents ids where this facet string appear.
@@ -33,13 +32,10 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
if settings_diff.settings_update_only() {
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
} else {
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
let facet_search = settings_diff.new.facet_search;
extract_facet_string_docids_document_update(
docid_fid_facet_string,
indexer,
localized_field_ids,
facet_search,
&settings_diff.new,
)
}
}
@@ -52,8 +48,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
docid_fid_facet_string: grenad::Reader<R>,
indexer: GrenadParameters,
localized_field_ids: &LocalizedFieldIds,
facet_search: bool,
settings: &InnerIndexSettings,
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
let max_memory = indexer.max_memory_by_thread();
@@ -92,6 +87,14 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
let Some(metadata) = settings.fields_ids_map.metadata(field_id) else {
unreachable!("metadata not found for field_id: {}", field_id)
};
if !metadata.is_faceted(&settings.filterable_attributes_rules) {
continue;
}
let (document_id_bytes, normalized_value_bytes) =
try_split_array_at::<_, 4>(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);
@@ -99,8 +102,10 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
let normalized_value = str::from_utf8(normalized_value_bytes)?;
// Facet search normalization
if facet_search {
let locales = localized_field_ids.locales(field_id);
let features =
metadata.filterable_attributes_features(&settings.filterable_attributes_rules);
if features.is_facet_searchable() {
let locales = metadata.locales(&settings.localized_attributes_rules);
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
@@ -178,8 +183,15 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes);
let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
let Some(old_metadata) = settings_diff.old.fields_ids_map.metadata(field_id) else {
unreachable!("old metadata not found for field_id: {}", field_id)
};
let Some(new_metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else {
unreachable!("new metadata not found for field_id: {}", field_id)
};
let old_locales = old_metadata.locales(&settings_diff.old.localized_attributes_rules);
let new_locales = new_metadata.locales(&settings_diff.new.localized_attributes_rules);
let are_same_locales = old_locales == new_locales;
let reindex_facet_search =
@@ -197,10 +209,15 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
// Facet search normalization
if settings_diff.new.facet_search {
let new_filterable_features = new_metadata
.filterable_attributes_features(&settings_diff.new.filterable_attributes_rules);
let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales);
let old_hyper_normalized_value;
let old_filterable_features = old_metadata
.filterable_attributes_features(&settings_diff.old.filterable_attributes_rules);
let old_hyper_normalized_value = if !settings_diff.old.facet_search
|| deladd_reader.get(DelAdd::Deletion).is_none()
|| !old_filterable_features.is_facet_searchable()
{
// if the facet search is disabled in the old settings or if no facet string is deleted,
// we don't need to normalize the facet string.
@@ -215,7 +232,9 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
// if the facet string is the same, we can put the deletion and addition in the same obkv.
if old_hyper_normalized_value == Some(&new_hyper_normalized_value) {
if old_hyper_normalized_value == Some(&new_hyper_normalized_value)
&& new_filterable_features.is_facet_searchable()
{
// nothing to do if we delete and re-add the value.
if is_same_value {
continue;
@@ -249,7 +268,9 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
}
// addition
if deladd_reader.get(DelAdd::Addition).is_some() {
if new_filterable_features.is_facet_searchable()
&& deladd_reader.get(DelAdd::Addition).is_some()
{
// insert new value
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
buffer.clear();

View File

@@ -76,9 +76,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
let mut strings_key_buffer = Vec::new();
let old_faceted_fids: BTreeSet<_> =
settings_diff.old.faceted_fields_ids.iter().copied().collect();
settings_diff.list_faceted_fields_from_fid_map(DelAdd::Deletion);
let new_faceted_fids: BTreeSet<_> =
settings_diff.new.faceted_fields_ids.iter().copied().collect();
settings_diff.list_faceted_fields_from_fid_map(DelAdd::Addition);
if !settings_diff.settings_update_only || settings_diff.reindex_facets() {
let mut cursor = obkv_documents.into_cursor()?;

View File

@@ -15,8 +15,9 @@ use serde_json::Value;
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
use crate::constants::RESERVED_VECTORS_FIELD_NAME;
use crate::error::FaultSource;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::index::IndexEmbeddingConfig;
use crate::prompt::{FieldsIdsMapWithMetadata, Prompt};
use crate::prompt::Prompt;
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
use crate::update::settings::InnerIndexSettingsDiff;
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
@@ -190,12 +191,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
let reindex_vectors = settings_diff.reindex_vectors();
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
let old_fields_ids_map =
FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids);
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
let new_fields_ids_map =
FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids);
// the vector field id may have changed
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
@@ -383,7 +380,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
);
continue;
}
regenerate_prompt(obkv, prompt, &new_fields_ids_map)?
regenerate_prompt(obkv, prompt, new_fields_ids_map)?
}
},
// prompt regeneration is only triggered for existing embedders
@@ -400,7 +397,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
regenerate_if_prompt_changed(
obkv,
(old_prompt, prompt),
(&old_fields_ids_map, &new_fields_ids_map),
(old_fields_ids_map, new_fields_ids_map),
)?
} else {
// we can simply ignore user provided vectors as they are not regenerated and are
@@ -416,7 +413,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
prompt,
(add_to_user_provided, remove_from_user_provided),
(old, new),
(&old_fields_ids_map, &new_fields_ids_map),
(old_fields_ids_map, new_fields_ids_map),
document_id,
embedder_name,
embedder_is_manual,
@@ -486,10 +483,7 @@ fn extract_vector_document_diff(
prompt: &Prompt,
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
(old, new): (VectorState, VectorState),
(old_fields_ids_map, new_fields_ids_map): (
&FieldsIdsMapWithMetadata,
&FieldsIdsMapWithMetadata,
),
(old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata),
document_id: impl Fn() -> Value,
embedder_name: &str,
embedder_is_manual: bool,
@@ -611,10 +605,7 @@ fn extract_vector_document_diff(
fn regenerate_if_prompt_changed(
obkv: &obkv::KvReader<FieldId>,
(old_prompt, new_prompt): (&Prompt, &Prompt),
(old_fields_ids_map, new_fields_ids_map): (
&FieldsIdsMapWithMetadata,
&FieldsIdsMapWithMetadata,
),
(old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata),
) -> Result<VectorStateDelta> {
let old_prompt = old_prompt
.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map)
@@ -630,7 +621,7 @@ fn regenerate_if_prompt_changed(
fn regenerate_prompt(
obkv: &obkv::KvReader<FieldId>,
prompt: &Prompt,
new_fields_ids_map: &FieldsIdsMapWithMetadata,
new_fields_ids_map: &FieldIdMapWithMetadata,
) -> Result<VectorStateDelta> {
let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;