mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-10-02 17:56:28 +00:00
Refactor Settings Indexing process
**Changes:** The transform structure now relies on FieldIdMapWithMetadata and AttributePatterns to prepare the obkv documents during a settings reindexing. The InnerIndexSettingsDiff and InnerIndexSettings structs now rely on FieldIdMapWithMetadata, FilterableAttributesRule and AttributePatterns to define the fields and the databases that should be reindexed. The faceted_fields_ids, localized_searchable_fields_ids and localized_faceted_fields_ids have been removed in favor of the FieldIdMapWithMetadata. We now rely on the FieldIdMapWithMetadata to retain vectors_fids from the facets and the searchables. The searchable database computing now relies on the FieldIdMapWithMetadata to know whether a field is searchable and to retrieve the locales. The facet database computing now relies on the FieldIdMapWithMetadata to compute the facet databases and the facet-search, and to retrieve the locales. The facet level database computing now relies on the FieldIdMapWithMetadata, and the facet level databases are cleared depending on the settings differences (clear_facet_levels_based_on_settings_diff). The vector point extraction uses the FieldIdMapWithMetadata instead of FieldsIdsMapWithMetadata. **Impact:** - Dump import - Settings update
This commit is contained in:
@@ -150,9 +150,14 @@ fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> bool {
|
||||
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.contains(&field_id) {
|
||||
let Some(metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else {
|
||||
// If the field id is not in the fields ids map, skip it.
|
||||
// This happens for the vectors sub-fields. for example:
|
||||
// "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered.
|
||||
continue;
|
||||
};
|
||||
if metadata.is_searchable() {
|
||||
let del_add = KvReaderDelAdd::from_slice(field_bytes);
|
||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
@@ -200,8 +205,14 @@ fn tokens_from_document<'a>(
|
||||
buffers.obkv_buffer.clear();
|
||||
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
let Some(metadata) = settings.fields_ids_map.metadata(field_id) else {
|
||||
// If the field id is not in the fields ids map, skip it.
|
||||
// This happens for the vectors sub-fields. for example:
|
||||
// "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered.
|
||||
continue;
|
||||
};
|
||||
// if field is searchable.
|
||||
if settings.searchable_fields_ids.contains(&field_id) {
|
||||
if metadata.is_searchable() {
|
||||
// extract deletion or addition only.
|
||||
if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) {
|
||||
// parse json.
|
||||
@@ -216,7 +227,7 @@ fn tokens_from_document<'a>(
|
||||
buffers.field_buffer.clear();
|
||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||
// create an iterator of token with their positions.
|
||||
let locales = settings.localized_searchable_fields_ids.locales(field_id);
|
||||
let locales = metadata.locales(&settings.localized_attributes_rules);
|
||||
let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales))
|
||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||
|
||||
|
@@ -12,12 +12,11 @@ use heed::BytesEncode;
|
||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
|
||||
use crate::localized_attributes_rules::LocalizedFieldIds;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
@@ -33,13 +32,10 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
if settings_diff.settings_update_only() {
|
||||
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
|
||||
} else {
|
||||
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
|
||||
let facet_search = settings_diff.new.facet_search;
|
||||
extract_facet_string_docids_document_update(
|
||||
docid_fid_facet_string,
|
||||
indexer,
|
||||
localized_field_ids,
|
||||
facet_search,
|
||||
&settings_diff.new,
|
||||
)
|
||||
}
|
||||
}
|
||||
@@ -52,8 +48,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
localized_field_ids: &LocalizedFieldIds,
|
||||
facet_search: bool,
|
||||
settings: &InnerIndexSettings,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
@@ -92,6 +87,14 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let Some(metadata) = settings.fields_ids_map.metadata(field_id) else {
|
||||
unreachable!("metadata not found for field_id: {}", field_id)
|
||||
};
|
||||
|
||||
if !metadata.is_faceted(&settings.filterable_attributes_rules) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
@@ -99,8 +102,10 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
if facet_search {
|
||||
let locales = localized_field_ids.locales(field_id);
|
||||
let features =
|
||||
metadata.filterable_attributes_features(&settings.filterable_attributes_rules);
|
||||
if features.is_facet_searchable() {
|
||||
let locales = metadata.locales(&settings.localized_attributes_rules);
|
||||
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
@@ -178,8 +183,15 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
|
||||
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
|
||||
let Some(old_metadata) = settings_diff.old.fields_ids_map.metadata(field_id) else {
|
||||
unreachable!("old metadata not found for field_id: {}", field_id)
|
||||
};
|
||||
let Some(new_metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else {
|
||||
unreachable!("new metadata not found for field_id: {}", field_id)
|
||||
};
|
||||
|
||||
let old_locales = old_metadata.locales(&settings_diff.old.localized_attributes_rules);
|
||||
let new_locales = new_metadata.locales(&settings_diff.new.localized_attributes_rules);
|
||||
|
||||
let are_same_locales = old_locales == new_locales;
|
||||
let reindex_facet_search =
|
||||
@@ -197,10 +209,15 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
|
||||
// Facet search normalization
|
||||
if settings_diff.new.facet_search {
|
||||
let new_filterable_features = new_metadata
|
||||
.filterable_attributes_features(&settings_diff.new.filterable_attributes_rules);
|
||||
let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales);
|
||||
let old_hyper_normalized_value;
|
||||
let old_filterable_features = old_metadata
|
||||
.filterable_attributes_features(&settings_diff.old.filterable_attributes_rules);
|
||||
let old_hyper_normalized_value = if !settings_diff.old.facet_search
|
||||
|| deladd_reader.get(DelAdd::Deletion).is_none()
|
||||
|| !old_filterable_features.is_facet_searchable()
|
||||
{
|
||||
// if the facet search is disabled in the old settings or if no facet string is deleted,
|
||||
// we don't need to normalize the facet string.
|
||||
@@ -215,7 +232,9 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// if the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
if old_hyper_normalized_value == Some(&new_hyper_normalized_value) {
|
||||
if old_hyper_normalized_value == Some(&new_hyper_normalized_value)
|
||||
&& new_filterable_features.is_facet_searchable()
|
||||
{
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
@@ -249,7 +268,9 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
}
|
||||
|
||||
// addition
|
||||
if deladd_reader.get(DelAdd::Addition).is_some() {
|
||||
if new_filterable_features.is_facet_searchable()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some()
|
||||
{
|
||||
// insert new value
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
buffer.clear();
|
||||
|
@@ -76,9 +76,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
let mut strings_key_buffer = Vec::new();
|
||||
|
||||
let old_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.old.faceted_fields_ids.iter().copied().collect();
|
||||
settings_diff.list_faceted_fields_from_fid_map(DelAdd::Deletion);
|
||||
let new_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.new.faceted_fields_ids.iter().copied().collect();
|
||||
settings_diff.list_faceted_fields_from_fid_map(DelAdd::Addition);
|
||||
|
||||
if !settings_diff.settings_update_only || settings_diff.reindex_facets() {
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
|
@@ -15,8 +15,9 @@ use serde_json::Value;
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::constants::RESERVED_VECTORS_FIELD_NAME;
|
||||
use crate::error::FaultSource;
|
||||
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::prompt::{FieldsIdsMapWithMetadata, Prompt};
|
||||
use crate::prompt::Prompt;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
|
||||
@@ -190,12 +191,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
let reindex_vectors = settings_diff.reindex_vectors();
|
||||
|
||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||
let old_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids);
|
||||
|
||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||
let new_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids);
|
||||
|
||||
// the vector field id may have changed
|
||||
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
@@ -383,7 +380,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
);
|
||||
continue;
|
||||
}
|
||||
regenerate_prompt(obkv, prompt, &new_fields_ids_map)?
|
||||
regenerate_prompt(obkv, prompt, new_fields_ids_map)?
|
||||
}
|
||||
},
|
||||
// prompt regeneration is only triggered for existing embedders
|
||||
@@ -400,7 +397,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
regenerate_if_prompt_changed(
|
||||
obkv,
|
||||
(old_prompt, prompt),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
(old_fields_ids_map, new_fields_ids_map),
|
||||
)?
|
||||
} else {
|
||||
// we can simply ignore user provided vectors as they are not regenerated and are
|
||||
@@ -416,7 +413,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
prompt,
|
||||
(add_to_user_provided, remove_from_user_provided),
|
||||
(old, new),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
(old_fields_ids_map, new_fields_ids_map),
|
||||
document_id,
|
||||
embedder_name,
|
||||
embedder_is_manual,
|
||||
@@ -486,10 +483,7 @@ fn extract_vector_document_diff(
|
||||
prompt: &Prompt,
|
||||
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
|
||||
(old, new): (VectorState, VectorState),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
(old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata),
|
||||
document_id: impl Fn() -> Value,
|
||||
embedder_name: &str,
|
||||
embedder_is_manual: bool,
|
||||
@@ -611,10 +605,7 @@ fn extract_vector_document_diff(
|
||||
fn regenerate_if_prompt_changed(
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
(old_prompt, new_prompt): (&Prompt, &Prompt),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
(old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata),
|
||||
) -> Result<VectorStateDelta> {
|
||||
let old_prompt = old_prompt
|
||||
.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map)
|
||||
@@ -630,7 +621,7 @@ fn regenerate_if_prompt_changed(
|
||||
fn regenerate_prompt(
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
prompt: &Prompt,
|
||||
new_fields_ids_map: &FieldsIdsMapWithMetadata,
|
||||
new_fields_ids_map: &FieldIdMapWithMetadata,
|
||||
) -> Result<VectorStateDelta> {
|
||||
let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
|
||||
|
Reference in New Issue
Block a user