diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index b865d0a35..358b18274 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -9,7 +9,7 @@ use heed::RoTxn; use serde_json::Value; use super::super::cache::BalancedCaches; -use super::facet_document::extract_document_facets; +use super::facet_document::{extract_document_facets, extract_merged_document_facets}; use super::FacetKind; use crate::heed_codec::facet::OrderedF64Codec; use crate::update::del_add::DelAdd; @@ -106,17 +106,19 @@ impl FacetedDocidsExtractor { return Ok(()); } - extract_document_facets( + extract_merged_document_facets( attributes_to_extract, - inner.current(rtxn, index, context.db_fields_ids_map)?, + inner.merged(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), + &mut del_add_facet_value, + cached_sorter.deref_mut(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + &mut |fid, depth, value, del_add_facet_value, cached_sorter| { Self::facet_fn_with_options( &context.doc_alloc, - cached_sorter.deref_mut(), + cached_sorter, BalancedCaches::insert_del_u32, - &mut del_add_facet_value, + del_add_facet_value, DelAddFacetValue::insert_del, docid, fid, @@ -124,19 +126,12 @@ impl FacetedDocidsExtractor { value, ) }, - )?; - - extract_document_facets( - attributes_to_extract, - inner.merged(rtxn, index, context.db_fields_ids_map)?, - inner.external_document_id(), - new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + &mut |fid, depth, value, del_add_facet_value, cached_sorter| { Self::facet_fn_with_options( &context.doc_alloc, - cached_sorter.deref_mut(), + cached_sorter, BalancedCaches::insert_add_u32, - &mut del_add_facet_value, + del_add_facet_value, DelAddFacetValue::insert_add, docid, fid, @@ -282,7 +277,7 @@ impl FacetedDocidsExtractor { } } -struct DelAddFacetValue<'doc> { +pub(crate) struct DelAddFacetValue<'doc> { strings: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, f64s: HashMap<(FieldId, BVec<'doc, u8>), DelAdd, hashbrown::DefaultHashBuilder, &'doc Bump>, } diff --git a/crates/milli/src/update/new/extract/faceted/facet_document.rs b/crates/milli/src/update/new/extract/faceted/facet_document.rs index eff529120..3bd9bccb2 100644 --- a/crates/milli/src/update/new/extract/faceted/facet_document.rs +++ b/crates/milli/src/update/new/extract/faceted/facet_document.rs @@ -1,9 +1,11 @@ +use serde_json::value::RawValue; use serde_json::Value; -use crate::update::new::document::Document; +use super::extract_facets::DelAddFacetValue; +use crate::update::new::document::{Document, MergedDocument, MergedValue}; use crate::update::new::extract::geo::extract_geo_coordinates; -use crate::update::new::extract::perm_json_p; -use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; +use crate::update::new::extract::{perm_json_p, BalancedCaches}; +use crate::{FieldId, FieldsIdsMap, GlobalFieldsIdsMap, InternalError, Result, UserError}; pub fn extract_document_facets<'doc>( attributes_to_extract: &[&str], @@ -15,58 +17,7 @@ pub fn extract_document_facets<'doc>( for res in document.iter_top_level_fields() { let (field_name, value) = res?; - let mut tokenize_field = - |name: &str, depth: perm_json_p::Depth, value: &Value| match field_id_map - .id_or_insert(name) - { - Some(field_id) => facet_fn(field_id, depth, value), - None => Err(UserError::AttributeLimitReached.into()), - }; - - // if the current field is searchable or contains a searchable attribute - let selection = perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]); - if selection != perm_json_p::Selection::Skip { - // parse json. - match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? { - Value::Object(object) => { - perm_json_p::seek_leaf_values_in_object( - &object, - Some(attributes_to_extract), - &[], // skip no attributes - field_name, - perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, - )?; - - if selection == perm_json_p::Selection::Select { - tokenize_field( - field_name, - perm_json_p::Depth::OnBaseKey, - &Value::Object(object), - )?; - } - } - Value::Array(array) => { - perm_json_p::seek_leaf_values_in_array( - &array, - Some(attributes_to_extract), - &[], // skip no attributes - field_name, - perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, - )?; - - if selection == perm_json_p::Selection::Select { - tokenize_field( - field_name, - perm_json_p::Depth::OnBaseKey, - &Value::Array(array), - )?; - } - } - value => tokenize_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, - } - } + extract_document_facet(attributes_to_extract, field_id_map, facet_fn, field_name, value)?; } if attributes_to_extract.contains(&"_geo") { @@ -85,3 +36,203 @@ pub fn extract_document_facets<'doc>( Ok(()) } + +fn extract_document_facet( + attributes_to_extract: &[&str], + field_id_map: &mut GlobalFieldsIdsMap<'_>, + facet_fn: &mut impl FnMut(u16, perm_json_p::Depth, &Value) -> std::result::Result<(), crate::Error>, + field_name: &str, + value: &serde_json::value::RawValue, +) -> Result<()> { + let mut tokenize_field = |name: &str, depth: perm_json_p::Depth, value: &Value| { + match field_id_map.id_or_insert(name) { + Some(field_id) => facet_fn(field_id, depth, value), + None => Err(UserError::AttributeLimitReached.into()), + } + }; + let selection = perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]); + if selection != perm_json_p::Selection::Skip { + // parse json. + match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? { + Value::Object(object) => { + perm_json_p::seek_leaf_values_in_object( + &object, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + perm_json_p::Depth::OnBaseKey, + &mut tokenize_field, + )?; + + if selection == perm_json_p::Selection::Select { + tokenize_field( + field_name, + perm_json_p::Depth::OnBaseKey, + &Value::Object(object), + )?; + } + } + Value::Array(array) => { + perm_json_p::seek_leaf_values_in_array( + &array, + Some(attributes_to_extract), + &[], // skip no attributes + field_name, + perm_json_p::Depth::OnBaseKey, + &mut tokenize_field, + )?; + + if selection == perm_json_p::Selection::Select { + tokenize_field( + field_name, + perm_json_p::Depth::OnBaseKey, + &Value::Array(array), + )?; + } + } + value => tokenize_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, + } + }; + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +pub fn extract_merged_document_facets<'doc, 'del_add_facet_value, 'cache>( + attributes_to_extract: &[&str], + document: MergedDocument<'doc, 'doc, 'doc, FieldsIdsMap>, + external_document_id: &str, + del_add_facet_value: &mut DelAddFacetValue<'del_add_facet_value>, + cached_sorter: &mut BalancedCaches<'cache>, + field_id_map: &mut GlobalFieldsIdsMap, + facet_fn_current: &mut impl FnMut( + FieldId, + perm_json_p::Depth, + &Value, + &mut DelAddFacetValue<'del_add_facet_value>, + &mut BalancedCaches<'cache>, + ) -> Result<()>, + facet_fn_updated: &mut impl FnMut( + FieldId, + perm_json_p::Depth, + &Value, + &mut DelAddFacetValue<'del_add_facet_value>, + &mut BalancedCaches<'cache>, + ) -> Result<()>, +) -> Result<()> { + for res in document.iter_merged_top_level_fields() { + let (field_name, value) = res?; + match value { + MergedValue::Current(value) => { + extract_document_facet( + attributes_to_extract, + field_id_map, + &mut |fid, depth, value| { + facet_fn_current(fid, depth, value, del_add_facet_value, cached_sorter) + }, + field_name, + value, + )?; + } + MergedValue::Updated(value) => { + extract_document_facet( + attributes_to_extract, + field_id_map, + &mut |fid, depth, value| { + facet_fn_updated(fid, depth, value, del_add_facet_value, cached_sorter) + }, + field_name, + value, + )?; + } + MergedValue::CurrentAndUpdated(current, updated) => { + if current.get() == updated.get() { + continue; + } + extract_document_facet( + attributes_to_extract, + field_id_map, + &mut |fid, depth, value| { + facet_fn_current(fid, depth, value, del_add_facet_value, cached_sorter) + }, + field_name, + current, + )?; + extract_document_facet( + attributes_to_extract, + field_id_map, + &mut |fid, depth, value| { + facet_fn_updated(fid, depth, value, del_add_facet_value, cached_sorter) + }, + field_name, + updated, + )?; + } + } + } + + if attributes_to_extract.contains(&"_geo") { + match document.merged_geo_field()? { + Some(MergedValue::Current(current)) => { + extract_geo_facet( + external_document_id, + current, + field_id_map, + &mut |fid, depth, value| { + facet_fn_current(fid, depth, value, del_add_facet_value, cached_sorter) + }, + )?; + } + Some(MergedValue::Updated(updated)) => { + extract_geo_facet( + external_document_id, + updated, + field_id_map, + &mut |fid, depth, value| { + facet_fn_updated(fid, depth, value, del_add_facet_value, cached_sorter) + }, + )?; + } + Some(MergedValue::CurrentAndUpdated(current, updated)) + if current.get() != updated.get() => + { + extract_geo_facet( + external_document_id, + current, + field_id_map, + &mut |fid, depth, value| { + facet_fn_current(fid, depth, value, del_add_facet_value, cached_sorter) + }, + )?; + extract_geo_facet( + external_document_id, + updated, + field_id_map, + &mut |fid, depth, value| { + facet_fn_updated(fid, depth, value, del_add_facet_value, cached_sorter) + }, + )?; + } + None | Some(MergedValue::CurrentAndUpdated(_, _)) => {} + } + } + + Ok(()) +} + +fn extract_geo_facet( + external_document_id: &str, + geo_value: &RawValue, + field_id_map: &mut GlobalFieldsIdsMap<'_>, + facet_fn: &mut impl FnMut(FieldId, perm_json_p::Depth, &Value) -> Result<()>, +) -> Result<()> { + if let Some([lat, lng]) = extract_geo_coordinates(external_document_id, geo_value)? { + let (lat_fid, lng_fid) = field_id_map + .id_or_insert("_geo.lat") + .zip(field_id_map.id_or_insert("_geo.lng")) + .ok_or(UserError::AttributeLimitReached)?; + + facet_fn(lat_fid, perm_json_p::Depth::OnBaseKey, &lat.into())?; + facet_fn(lng_fid, perm_json_p::Depth::OnBaseKey, &lng.into())?; + }; + Ok(()) +}