mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-28 09:11:00 +00:00
Refactor Settings Indexing process
**Changes:**
The Transform structure now relies on FieldIdMapWithMetadata and AttributePatterns to prepare the obkv documents during a settings reindexing. The InnerIndexSettingsDiff and InnerIndexSettings structs now rely on FieldIdMapWithMetadata, FilterableAttributesRule and AttributePatterns to define the fields and the databases that should be reindexed. The faceted_fields_ids, localized_searchable_fields_ids and localized_faceted_fields_ids have been removed in favor of the FieldIdMapWithMetadata. We now rely on the FieldIdMapWithMetadata to retain vectors_fids from the facets and the searchables. The searchable database computation now relies on the FieldIdMapWithMetadata to determine whether a field is searchable and to retrieve its locales. The facet database computation now relies on the FieldIdMapWithMetadata to compute the facet databases and the facet search, and to retrieve the locales. The facet level database computation now relies on the FieldIdMapWithMetadata, and the facet level databases are cleared depending on the settings differences (clear_facet_levels_based_on_settings_diff). The vector point extraction uses the FieldIdMapWithMetadata instead of FieldsIdsMapWithMetadata.

**Impact:**
- Dump import
- Settings update
This commit is contained in:
@ -95,12 +95,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
// If the settings specify that a _geo field must be used, we must check the
|
||||
// validity of it in all the documents of this batch and this is when we return `Some`.
|
||||
let geo_field_id = match documents_batch_index.id(RESERVED_GEO_FIELD_NAME) {
|
||||
Some(geo_field_id)
|
||||
if index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME)
|
||||
|| index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME) =>
|
||||
{
|
||||
Some(geo_field_id)
|
||||
}
|
||||
Some(geo_field_id) if index.is_geo_enabled(rtxn)? => Some(geo_field_id),
|
||||
_otherwise => None,
|
||||
};
|
||||
|
||||
|
@ -150,9 +150,14 @@ fn searchable_fields_changed(
|
||||
obkv: &KvReader<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> bool {
|
||||
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.contains(&field_id) {
|
||||
let Some(metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else {
|
||||
// If the field id is not in the fields ids map, skip it.
|
||||
// This happens for the vectors sub-fields. For example:
|
||||
// "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered.
|
||||
continue;
|
||||
};
|
||||
if metadata.is_searchable() {
|
||||
let del_add = KvReaderDelAdd::from_slice(field_bytes);
|
||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
@ -200,8 +205,14 @@ fn tokens_from_document<'a>(
|
||||
buffers.obkv_buffer.clear();
|
||||
let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
let Some(metadata) = settings.fields_ids_map.metadata(field_id) else {
|
||||
// If the field id is not in the fields ids map, skip it.
|
||||
// This happens for the vectors sub-fields. For example:
|
||||
// "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered.
|
||||
continue;
|
||||
};
|
||||
// if field is searchable.
|
||||
if settings.searchable_fields_ids.contains(&field_id) {
|
||||
if metadata.is_searchable() {
|
||||
// extract deletion or addition only.
|
||||
if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) {
|
||||
// parse json.
|
||||
@ -216,7 +227,7 @@ fn tokens_from_document<'a>(
|
||||
buffers.field_buffer.clear();
|
||||
if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
|
||||
// create an iterator of token with their positions.
|
||||
let locales = settings.localized_searchable_fields_ids.locales(field_id);
|
||||
let locales = metadata.locales(&settings.localized_attributes_rules);
|
||||
let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales))
|
||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||
|
||||
|
@ -12,12 +12,11 @@ use heed::BytesEncode;
|
||||
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
|
||||
use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
|
||||
use crate::localized_attributes_rules::LocalizedFieldIds;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// Extracts the facet string and the documents ids where this facet string appear.
|
||||
@ -33,13 +32,10 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
if settings_diff.settings_update_only() {
|
||||
extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff)
|
||||
} else {
|
||||
let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids;
|
||||
let facet_search = settings_diff.new.facet_search;
|
||||
extract_facet_string_docids_document_update(
|
||||
docid_fid_facet_string,
|
||||
indexer,
|
||||
localized_field_ids,
|
||||
facet_search,
|
||||
&settings_diff.new,
|
||||
)
|
||||
}
|
||||
}
|
||||
@ -52,8 +48,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
|
||||
fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
docid_fid_facet_string: grenad::Reader<R>,
|
||||
indexer: GrenadParameters,
|
||||
localized_field_ids: &LocalizedFieldIds,
|
||||
facet_search: bool,
|
||||
settings: &InnerIndexSettings,
|
||||
) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
@ -92,6 +87,14 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let Some(metadata) = settings.fields_ids_map.metadata(field_id) else {
|
||||
unreachable!("metadata not found for field_id: {}", field_id)
|
||||
};
|
||||
|
||||
if !metadata.is_faceted(&settings.filterable_attributes_rules) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let (document_id_bytes, normalized_value_bytes) =
|
||||
try_split_array_at::<_, 4>(bytes).unwrap();
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
@ -99,8 +102,10 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
let normalized_value = str::from_utf8(normalized_value_bytes)?;
|
||||
|
||||
// Facet search normalization
|
||||
if facet_search {
|
||||
let locales = localized_field_ids.locales(field_id);
|
||||
let features =
|
||||
metadata.filterable_attributes_features(&settings.filterable_attributes_rules);
|
||||
if features.is_facet_searchable() {
|
||||
let locales = metadata.locales(&settings.localized_attributes_rules);
|
||||
let hyper_normalized_value = normalize_facet_string(normalized_value, locales);
|
||||
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
@ -178,8 +183,15 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
|
||||
let field_id = FieldId::from_be_bytes(field_id_bytes);
|
||||
|
||||
let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id);
|
||||
let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id);
|
||||
let Some(old_metadata) = settings_diff.old.fields_ids_map.metadata(field_id) else {
|
||||
unreachable!("old metadata not found for field_id: {}", field_id)
|
||||
};
|
||||
let Some(new_metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else {
|
||||
unreachable!("new metadata not found for field_id: {}", field_id)
|
||||
};
|
||||
|
||||
let old_locales = old_metadata.locales(&settings_diff.old.localized_attributes_rules);
|
||||
let new_locales = new_metadata.locales(&settings_diff.new.localized_attributes_rules);
|
||||
|
||||
let are_same_locales = old_locales == new_locales;
|
||||
let reindex_facet_search =
|
||||
@ -197,10 +209,15 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
|
||||
// Facet search normalization
|
||||
if settings_diff.new.facet_search {
|
||||
let new_filterable_features = new_metadata
|
||||
.filterable_attributes_features(&settings_diff.new.filterable_attributes_rules);
|
||||
let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales);
|
||||
let old_hyper_normalized_value;
|
||||
let old_filterable_features = old_metadata
|
||||
.filterable_attributes_features(&settings_diff.old.filterable_attributes_rules);
|
||||
let old_hyper_normalized_value = if !settings_diff.old.facet_search
|
||||
|| deladd_reader.get(DelAdd::Deletion).is_none()
|
||||
|| !old_filterable_features.is_facet_searchable()
|
||||
{
|
||||
// if the facet search is disabled in the old settings or if no facet string is deleted,
|
||||
// we don't need to normalize the facet string.
|
||||
@ -215,7 +232,9 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
let set = BTreeSet::from_iter(std::iter::once(normalized_value));
|
||||
|
||||
// if the facet string is the same, we can put the deletion and addition in the same obkv.
|
||||
if old_hyper_normalized_value == Some(&new_hyper_normalized_value) {
|
||||
if old_hyper_normalized_value == Some(&new_hyper_normalized_value)
|
||||
&& new_filterable_features.is_facet_searchable()
|
||||
{
|
||||
// nothing to do if we delete and re-add the value.
|
||||
if is_same_value {
|
||||
continue;
|
||||
@ -249,7 +268,9 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
}
|
||||
|
||||
// addition
|
||||
if deladd_reader.get(DelAdd::Addition).is_some() {
|
||||
if new_filterable_features.is_facet_searchable()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some()
|
||||
{
|
||||
// insert new value
|
||||
let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?;
|
||||
buffer.clear();
|
||||
|
@ -76,9 +76,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
let mut strings_key_buffer = Vec::new();
|
||||
|
||||
let old_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.old.faceted_fields_ids.iter().copied().collect();
|
||||
settings_diff.list_faceted_fields_from_fid_map(DelAdd::Deletion);
|
||||
let new_faceted_fids: BTreeSet<_> =
|
||||
settings_diff.new.faceted_fields_ids.iter().copied().collect();
|
||||
settings_diff.list_faceted_fields_from_fid_map(DelAdd::Addition);
|
||||
|
||||
if !settings_diff.settings_update_only || settings_diff.reindex_facets() {
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
|
@ -15,8 +15,9 @@ use serde_json::Value;
|
||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||
use crate::constants::RESERVED_VECTORS_FIELD_NAME;
|
||||
use crate::error::FaultSource;
|
||||
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::prompt::{FieldsIdsMapWithMetadata, Prompt};
|
||||
use crate::prompt::Prompt;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
|
||||
@ -190,12 +191,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
let reindex_vectors = settings_diff.reindex_vectors();
|
||||
|
||||
let old_fields_ids_map = &settings_diff.old.fields_ids_map;
|
||||
let old_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids);
|
||||
|
||||
let new_fields_ids_map = &settings_diff.new.fields_ids_map;
|
||||
let new_fields_ids_map =
|
||||
FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids);
|
||||
|
||||
// the vector field id may have changed
|
||||
let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME);
|
||||
@ -383,7 +380,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
);
|
||||
continue;
|
||||
}
|
||||
regenerate_prompt(obkv, prompt, &new_fields_ids_map)?
|
||||
regenerate_prompt(obkv, prompt, new_fields_ids_map)?
|
||||
}
|
||||
},
|
||||
// prompt regeneration is only triggered for existing embedders
|
||||
@ -400,7 +397,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
regenerate_if_prompt_changed(
|
||||
obkv,
|
||||
(old_prompt, prompt),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
(old_fields_ids_map, new_fields_ids_map),
|
||||
)?
|
||||
} else {
|
||||
// we can simply ignore user provided vectors as they are not regenerated and are
|
||||
@ -416,7 +413,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
prompt,
|
||||
(add_to_user_provided, remove_from_user_provided),
|
||||
(old, new),
|
||||
(&old_fields_ids_map, &new_fields_ids_map),
|
||||
(old_fields_ids_map, new_fields_ids_map),
|
||||
document_id,
|
||||
embedder_name,
|
||||
embedder_is_manual,
|
||||
@ -486,10 +483,7 @@ fn extract_vector_document_diff(
|
||||
prompt: &Prompt,
|
||||
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
|
||||
(old, new): (VectorState, VectorState),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
(old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata),
|
||||
document_id: impl Fn() -> Value,
|
||||
embedder_name: &str,
|
||||
embedder_is_manual: bool,
|
||||
@ -611,10 +605,7 @@ fn extract_vector_document_diff(
|
||||
fn regenerate_if_prompt_changed(
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
(old_prompt, new_prompt): (&Prompt, &Prompt),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
&FieldsIdsMapWithMetadata,
|
||||
),
|
||||
(old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata),
|
||||
) -> Result<VectorStateDelta> {
|
||||
let old_prompt = old_prompt
|
||||
.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map)
|
||||
@ -630,7 +621,7 @@ fn regenerate_if_prompt_changed(
|
||||
fn regenerate_prompt(
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
prompt: &Prompt,
|
||||
new_fields_ids_map: &FieldsIdsMapWithMetadata,
|
||||
new_fields_ids_map: &FieldIdMapWithMetadata,
|
||||
) -> Result<VectorStateDelta> {
|
||||
let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?;
|
||||
|
||||
|
@ -26,6 +26,7 @@ use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};
|
||||
pub use self::enrich::{extract_finite_float_from_value, DocumentId};
|
||||
pub use self::helpers::*;
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
use super::facet::clear_facet_levels_based_on_settings_diff;
|
||||
use super::new::StdResult;
|
||||
use crate::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError};
|
||||
@ -215,9 +216,8 @@ where
|
||||
flattened_documents,
|
||||
} = output;
|
||||
|
||||
// update the internal facet and searchable list,
|
||||
// update the searchable list,
|
||||
// because they might have changed due to the nested documents flattening.
|
||||
settings_diff.new.recompute_facets(self.wtxn, self.index)?;
|
||||
settings_diff.new.recompute_searchables(self.wtxn, self.index)?;
|
||||
|
||||
let settings_diff = Arc::new(settings_diff);
|
||||
@ -465,6 +465,11 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
// If the settings are only being updated, we may have to clear some of the facet levels.
|
||||
if settings_diff.settings_update_only() {
|
||||
clear_facet_levels_based_on_settings_diff(self.wtxn, self.index, &settings_diff)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}).map_err(InternalError::from)??;
|
||||
|
||||
@ -765,18 +770,19 @@ mod tests {
|
||||
use bumpalo::Bump;
|
||||
use fst::IntoStreamer;
|
||||
use heed::RwTxn;
|
||||
use maplit::hashset;
|
||||
use maplit::{btreeset, hashset};
|
||||
|
||||
use super::*;
|
||||
use crate::constants::RESERVED_GEO_FIELD_NAME;
|
||||
use crate::documents::mmap_from_objects;
|
||||
use crate::filterable_attributes_rules::filtered_matching_field_names;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::progress::Progress;
|
||||
use crate::search::TermsMatchingStrategy;
|
||||
use crate::update::new::indexer;
|
||||
use crate::update::Setting;
|
||||
use crate::{all_obkv_to_json, db_snap, Filter, Search, UserError};
|
||||
use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError};
|
||||
|
||||
#[test]
|
||||
fn simple_document_replacement() {
|
||||
@ -1006,7 +1012,9 @@ mod tests {
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME)));
|
||||
settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
|
||||
RESERVED_GEO_FIELD_NAME.to_string(),
|
||||
)]);
|
||||
})
|
||||
.unwrap();
|
||||
}
|
||||
@ -1018,7 +1026,9 @@ mod tests {
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME)));
|
||||
settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
|
||||
RESERVED_GEO_FIELD_NAME.to_string(),
|
||||
)]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
@ -1234,15 +1244,24 @@ mod tests {
|
||||
let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")];
|
||||
settings.set_searchable_fields(searchable_fields);
|
||||
|
||||
let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin"));
|
||||
let faceted_fields = vec![
|
||||
FilterableAttributesRule::Field("title".to_string()),
|
||||
FilterableAttributesRule::Field("nested.object".to_string()),
|
||||
FilterableAttributesRule::Field("nested.machin".to_string()),
|
||||
];
|
||||
settings.set_filterable_fields(faceted_fields);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||
assert_eq!(facets, hashset!(S("title"), S("nested.object"), S("nested.machin")));
|
||||
let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let facets =
|
||||
filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| {
|
||||
features.is_filterable()
|
||||
});
|
||||
assert_eq!(facets, btreeset!("title", "nested.object", "nested.machin"));
|
||||
|
||||
// testing the simple query search
|
||||
let mut search = crate::Search::new(&rtxn, &index);
|
||||
@ -1438,7 +1457,9 @@ mod tests {
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(hashset!(String::from("dog")));
|
||||
settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
|
||||
"dog".to_string(),
|
||||
)]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
@ -1457,9 +1478,14 @@ mod tests {
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let hidden = index.faceted_fields(&rtxn).unwrap();
|
||||
let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let facets =
|
||||
filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| {
|
||||
features.is_filterable()
|
||||
});
|
||||
|
||||
assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain")));
|
||||
assert_eq!(facets, btreeset!("dog", "dog.race", "dog.race.bernese mountain"));
|
||||
|
||||
for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] {
|
||||
let mut search = crate::Search::new(&rtxn, &index);
|
||||
@ -1480,9 +1506,14 @@ mod tests {
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||
let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let facets =
|
||||
filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| {
|
||||
features.is_filterable()
|
||||
});
|
||||
|
||||
assert_eq!(facets, hashset!());
|
||||
assert_eq!(facets, btreeset!());
|
||||
|
||||
// update the settings to test the sortable
|
||||
index
|
||||
@ -1506,10 +1537,6 @@ mod tests {
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||
|
||||
assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain")));
|
||||
|
||||
let mut search = crate::Search::new(&rtxn, &index);
|
||||
search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S(
|
||||
"dog.race.bernese mountain",
|
||||
@ -1717,8 +1744,13 @@ mod tests {
|
||||
|
||||
let check_ok = |index: &Index| {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||
assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));
|
||||
let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let facets =
|
||||
filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| {
|
||||
features.is_filterable()
|
||||
});
|
||||
assert_eq!(facets, btreeset!("colour", "colour.green", "colour.green.blue"));
|
||||
|
||||
let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap();
|
||||
let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap();
|
||||
@ -1738,7 +1770,7 @@ mod tests {
|
||||
assert_eq!(bitmap_colour_blue.into_iter().collect::<Vec<_>>(), vec![7]);
|
||||
};
|
||||
|
||||
let faceted_fields = hashset!(S("colour"));
|
||||
let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())];
|
||||
|
||||
let index = TempIndex::new();
|
||||
index.add_documents(content()).unwrap();
|
||||
@ -1823,8 +1855,13 @@ mod tests {
|
||||
|
||||
let check_ok = |index: &Index| {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||
assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue")));
|
||||
let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let facets =
|
||||
filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| {
|
||||
features.is_filterable()
|
||||
});
|
||||
assert_eq!(facets, btreeset!("colour", "colour.green", "colour.green.blue"));
|
||||
|
||||
let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap();
|
||||
let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap();
|
||||
@ -1844,7 +1881,7 @@ mod tests {
|
||||
assert_eq!(bitmap_colour_blue.into_iter().collect::<Vec<_>>(), vec![3]);
|
||||
};
|
||||
|
||||
let faceted_fields = hashset!(S("colour"));
|
||||
let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())];
|
||||
|
||||
let index = TempIndex::new();
|
||||
index.add_documents(content()).unwrap();
|
||||
@ -1887,8 +1924,13 @@ mod tests {
|
||||
|
||||
let check_ok = |index: &Index| {
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
let facets = index.faceted_fields(&rtxn).unwrap();
|
||||
assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue")));
|
||||
let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap();
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
let facets =
|
||||
filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| {
|
||||
features.is_filterable()
|
||||
});
|
||||
assert_eq!(facets, btreeset!("tags", "tags.green", "tags.green.blue"));
|
||||
|
||||
let tags_id = index.fields_ids_map(&rtxn).unwrap().id("tags").unwrap();
|
||||
let tags_green_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green").unwrap();
|
||||
@ -1907,7 +1949,7 @@ mod tests {
|
||||
assert_eq!(bitmap_tags_blue.into_iter().collect::<Vec<_>>(), vec![12]);
|
||||
};
|
||||
|
||||
let faceted_fields = hashset!(S("tags"));
|
||||
let faceted_fields = vec![FilterableAttributesRule::Field("tags".to_string())];
|
||||
|
||||
let index = TempIndex::new();
|
||||
index.add_documents(content()).unwrap();
|
||||
@ -2259,7 +2301,9 @@ mod tests {
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(hashset! { S("title") });
|
||||
settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
|
||||
"title".to_string(),
|
||||
)]);
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
@ -3115,7 +3159,10 @@ mod tests {
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("docid"));
|
||||
settings.set_filterable_fields(hashset! { S("label"), S("label2") });
|
||||
settings.set_filterable_fields(vec![
|
||||
FilterableAttributesRule::Field("label".to_string()),
|
||||
FilterableAttributesRule::Field("label2".to_string()),
|
||||
]);
|
||||
})
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
@ -3294,7 +3341,9 @@ mod tests {
|
||||
index
|
||||
.update_settings_using_wtxn(&mut wtxn, |settings| {
|
||||
settings.set_primary_key(S("id"));
|
||||
settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME)));
|
||||
settings.set_filterable_fields(vec![FilterableAttributesRule::Field(
|
||||
RESERVED_GEO_FIELD_NAME.to_string(),
|
||||
)]);
|
||||
settings.set_sortable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME)));
|
||||
})
|
||||
.unwrap();
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::btree_map::Entry as BEntry;
|
||||
use std::collections::hash_map::Entry as HEntry;
|
||||
use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek};
|
||||
|
||||
@ -18,8 +18,10 @@ use super::helpers::{
|
||||
ObkvsMergeAdditionsAndDeletions,
|
||||
};
|
||||
use super::{create_writer, IndexDocumentsMethod, IndexerConfig, KeepFirst};
|
||||
use crate::attribute_patterns::PatternMatch;
|
||||
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||
use crate::index::{db_name, main_key};
|
||||
use crate::update::del_add::{
|
||||
into_del_add_obkv, into_del_add_obkv_conditional_operation, DelAdd, DelAddOperation,
|
||||
@ -31,9 +33,7 @@ use crate::update::{AvailableIds, UpdateIndexingStep};
|
||||
use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
||||
use crate::vector::settings::WriteBackToDocuments;
|
||||
use crate::vector::ArroyWrapper;
|
||||
use crate::{
|
||||
is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result,
|
||||
};
|
||||
use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result};
|
||||
|
||||
pub struct TransformOutput {
|
||||
pub primary_key: String,
|
||||
@ -52,7 +52,7 @@ pub struct TransformOutput {
|
||||
/// containing all those documents.
|
||||
pub struct Transform<'a, 'i> {
|
||||
pub index: &'i Index,
|
||||
fields_ids_map: FieldsIdsMap,
|
||||
fields_ids_map: FieldIdMapWithMetadata,
|
||||
|
||||
indexer_settings: &'a IndexerConfig,
|
||||
pub index_documents_method: IndexDocumentsMethod,
|
||||
@ -84,7 +84,7 @@ pub enum Operation {
|
||||
///
|
||||
/// If new fields are present in the addition, they are added to the index field ids map.
|
||||
fn create_fields_mapping(
|
||||
index_field_map: &mut FieldsIdsMap,
|
||||
index_field_map: &mut FieldIdMapWithMetadata,
|
||||
batch_field_map: &DocumentsBatchIndex,
|
||||
) -> Result<HashMap<FieldId, FieldId>> {
|
||||
batch_field_map
|
||||
@ -141,10 +141,13 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
true,
|
||||
);
|
||||
let documents_ids = index.documents_ids(wtxn)?;
|
||||
let fields_ids_map = index.fields_ids_map(wtxn)?;
|
||||
let builder = MetadataBuilder::from_index(index, wtxn)?;
|
||||
let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder);
|
||||
|
||||
Ok(Transform {
|
||||
index,
|
||||
fields_ids_map: index.fields_ids_map(wtxn)?,
|
||||
fields_ids_map,
|
||||
indexer_settings,
|
||||
available_documents_ids: AvailableIds::new(&documents_ids),
|
||||
original_sorter,
|
||||
@ -354,7 +357,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
documents_seen: documents_count,
|
||||
});
|
||||
|
||||
self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?;
|
||||
self.index.put_fields_ids_map(wtxn, self.fields_ids_map.as_fields_ids_map())?;
|
||||
self.index.put_primary_key(wtxn, &primary_key)?;
|
||||
self.documents_count += documents_count;
|
||||
// Now that we have a valid sorter that contains the user id and the obkv we
|
||||
@ -371,7 +374,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
)]
|
||||
fn flatten_from_fields_ids_map(
|
||||
obkv: &KvReader<FieldId>,
|
||||
fields_ids_map: &mut FieldsIdsMap,
|
||||
fields_ids_map: &mut FieldIdMapWithMetadata,
|
||||
) -> Result<Option<Vec<u8>>> {
|
||||
if obkv
|
||||
.iter()
|
||||
@ -657,7 +660,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
fn rebind_existing_document(
|
||||
old_obkv: &KvReader<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
modified_faceted_fields: &HashSet<String>,
|
||||
mut injected_vectors: serde_json::Map<String, serde_json::Value>,
|
||||
old_vectors_fid: Option<FieldId>,
|
||||
original_obkv_buffer: Option<&mut Vec<u8>>,
|
||||
@ -667,23 +669,26 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) };
|
||||
|
||||
// If only a faceted field has been added, keep only this field.
|
||||
let global_facet_settings_changed = settings_diff.global_facet_settings_changed();
|
||||
let facet_fids_changed = settings_diff.facet_fids_changed();
|
||||
let necessary_faceted_field =
|
||||
|id: FieldId| -> bool {
|
||||
|
||||
let necessary_faceted_field = |id: FieldId| -> Option<DelAddOperation> {
|
||||
if facet_fids_changed {
|
||||
let field_name = settings_diff.new.fields_ids_map.name(id).unwrap();
|
||||
if global_facet_settings_changed {
|
||||
settings_diff.new.user_defined_faceted_fields.iter().any(|long| {
|
||||
is_faceted_by(long, field_name) || is_faceted_by(field_name, long)
|
||||
})
|
||||
} else if facet_fids_changed {
|
||||
modified_faceted_fields.iter().any(|long| {
|
||||
is_faceted_by(long, field_name) || is_faceted_by(field_name, long)
|
||||
})
|
||||
} else {
|
||||
false
|
||||
// if the faceted fields changed, we need to keep all the fields that are
|
||||
// faceted in the old or new settings.
|
||||
match (
|
||||
settings_diff.old.match_faceted_field(field_name),
|
||||
settings_diff.new.match_faceted_field(field_name),
|
||||
) {
|
||||
(PatternMatch::NoMatch, PatternMatch::NoMatch) => None,
|
||||
(PatternMatch::NoMatch, _) => Some(DelAddOperation::Addition),
|
||||
(_, PatternMatch::NoMatch) => Some(DelAddOperation::Deletion),
|
||||
(_, _) => Some(DelAddOperation::DeletionAndAddition),
|
||||
}
|
||||
};
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
// Always provide all fields when vectors are involved because
|
||||
// we need the fields for the prompt/templating.
|
||||
@ -734,12 +739,22 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
}
|
||||
}
|
||||
|
||||
if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors {
|
||||
if is_primary_key(id) || reindex_vectors {
|
||||
operations.insert(id, DelAddOperation::DeletionAndAddition);
|
||||
obkv_writer.insert(id, val)?;
|
||||
} else if let Some(operation) = settings_diff.reindex_searchable_id(id) {
|
||||
operations.insert(id, operation);
|
||||
obkv_writer.insert(id, val)?;
|
||||
} else {
|
||||
let facet_operation = necessary_faceted_field(id);
|
||||
let searchable_operation = settings_diff.reindex_searchable_id(id);
|
||||
let operation = facet_operation
|
||||
// TODO: replace `zip.map` with `zip_with` once stable
|
||||
.zip(searchable_operation)
|
||||
.map(|(op1, op2)| op1.merge(op2))
|
||||
.or(facet_operation)
|
||||
.or(searchable_operation);
|
||||
if let Some(operation) = operation {
|
||||
operations.insert(id, operation);
|
||||
obkv_writer.insert(id, val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
if !injected_vectors.is_empty() {
|
||||
@ -856,7 +871,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
};
|
||||
|
||||
if original_sorter.is_some() || flattened_sorter.is_some() {
|
||||
let modified_faceted_fields = settings_diff.modified_faceted_fields();
|
||||
let mut original_obkv_buffer = Vec::new();
|
||||
let mut flattened_obkv_buffer = Vec::new();
|
||||
let mut document_sorter_key_buffer = Vec::new();
|
||||
@ -897,7 +911,6 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
Self::rebind_existing_document(
|
||||
old_obkv,
|
||||
&settings_diff,
|
||||
&modified_faceted_fields,
|
||||
injected_vectors,
|
||||
old_vectors_fid,
|
||||
Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()),
|
||||
|
@ -365,7 +365,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let merger = builder.build();
|
||||
|
||||
let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size);
|
||||
indexer.execute(wtxn)?;
|
||||
indexer.execute(wtxn, &settings_diff.new)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetStringDocids(_) => {
|
||||
@ -401,7 +401,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
Some(normalized_facet_id_string_merger),
|
||||
data_size,
|
||||
);
|
||||
indexer.execute(wtxn)?;
|
||||
indexer.execute(wtxn, &settings_diff.new)?;
|
||||
is_merged_database = true;
|
||||
}
|
||||
TypedChunk::FieldIdFacetExistsDocids(_) => {
|
||||
|
Reference in New Issue
Block a user