mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-30 23:46:28 +00:00 
			
		
		
		
	Add sanity checks for facet values
This commit is contained in:
		| @@ -79,17 +79,23 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5; | ||||
| use std::collections::BTreeSet; | ||||
| use std::fs::File; | ||||
| use std::io::BufReader; | ||||
| use std::ops::Bound; | ||||
|  | ||||
| use grenad::Merger; | ||||
| use heed::types::{Bytes, DecodeIgnore}; | ||||
| use heed::BytesDecode as _; | ||||
| use roaring::RoaringBitmap; | ||||
| use time::OffsetDateTime; | ||||
| use tracing::debug; | ||||
|  | ||||
| use self::incremental::FacetsUpdateIncremental; | ||||
| use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps}; | ||||
| use crate::facet::FacetType; | ||||
| use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; | ||||
| use crate::heed_codec::facet::{ | ||||
|     FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, | ||||
| }; | ||||
| use crate::heed_codec::BytesRefCodec; | ||||
| use crate::search::facet::get_highest_level; | ||||
| use crate::update::del_add::{DelAdd, KvReaderDelAdd}; | ||||
| use crate::{try_split_array_at, FieldId, Index, Result}; | ||||
|  | ||||
| @@ -646,3 +652,194 @@ mod comparison_bench { | ||||
|         } | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Run sanity checks on the specified fid tree | ||||
| /// | ||||
| /// 1. No "orphan" child value, any child value has a parent | ||||
| /// 2. Any docid in the child appears in the parent | ||||
| /// 3. No docid in the parent is missing from all its children | ||||
| /// 4. no group is bigger than max_group_size | ||||
| /// 5. Less than 50% of groups are bigger than group_size | ||||
| /// 6. group size matches the number of children | ||||
| /// 7. max_level is < 255 | ||||
| pub(crate) fn sanity_checks( | ||||
|     index: &Index, | ||||
|     rtxn: &heed::RoTxn, | ||||
|     field_id: FieldId, | ||||
|     facet_type: FacetType, | ||||
|     group_size: usize, | ||||
|     _min_level_size: usize, // might add a check on level size later | ||||
|     max_group_size: usize, | ||||
| ) -> Result<()> { | ||||
|     tracing::info!(%field_id, ?facet_type, "performing sanity checks"); | ||||
|     let database = match facet_type { | ||||
|         FacetType::String => { | ||||
|             index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>() | ||||
|         } | ||||
|         FacetType::Number => { | ||||
|             index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>() | ||||
|         } | ||||
|     }; | ||||
|  | ||||
|     let leaf_prefix: FacetGroupKey<&[u8]> = FacetGroupKey { field_id, level: 0, left_bound: &[] }; | ||||
|  | ||||
|     let leaf_it = database.prefix_iter(rtxn, &leaf_prefix)?; | ||||
|  | ||||
|     let max_level = get_highest_level(rtxn, database, field_id)?; | ||||
|     if max_level == u8::MAX { | ||||
|         panic!("max_level == 255"); | ||||
|     } | ||||
|  | ||||
|     for leaf in leaf_it { | ||||
|         let (leaf_facet_value, leaf_docids) = leaf?; | ||||
|         let mut current_level = 0; | ||||
|  | ||||
|         let mut current_parent_facet_value: Option<FacetGroupKey<&[u8]>> = None; | ||||
|         let mut current_parent_docids: Option<crate::heed_codec::facet::FacetGroupValue> = None; | ||||
|         loop { | ||||
|             current_level += 1; | ||||
|             if current_level >= max_level { | ||||
|                 break; | ||||
|             } | ||||
|             let parent_key_right_bound = FacetGroupKey { | ||||
|                 field_id, | ||||
|                 level: current_level, | ||||
|                 left_bound: leaf_facet_value.left_bound, | ||||
|             }; | ||||
|             let (parent_facet_value, parent_docids) = database | ||||
|                 .get_lower_than_or_equal_to(rtxn, &parent_key_right_bound)? | ||||
|                 .expect("no parent found"); | ||||
|             if parent_facet_value.level != current_level { | ||||
|                 panic!( | ||||
|                     "wrong parent level, found_level={}, expected_level={}", | ||||
|                     parent_facet_value.level, current_level | ||||
|                 ); | ||||
|             } | ||||
|             if parent_facet_value.field_id != field_id { | ||||
|                 panic!("wrong parent fid"); | ||||
|             } | ||||
|             if parent_facet_value.left_bound > leaf_facet_value.left_bound { | ||||
|                 panic!("wrong parent left bound"); | ||||
|             } | ||||
|  | ||||
|             if !leaf_docids.bitmap.is_subset(&parent_docids.bitmap) { | ||||
|                 panic!( | ||||
|                     "missing docids from leaf in parent, current_level={}, parent={}, child={}, missing={missing:?}, child_len={}, child={:?}", | ||||
|                     current_level, | ||||
|                     facet_to_string(parent_facet_value.left_bound, facet_type), | ||||
|                     facet_to_string(leaf_facet_value.left_bound, facet_type), | ||||
|                     leaf_docids.bitmap.len(), | ||||
|                     leaf_docids.bitmap.clone(), | ||||
|                     missing=leaf_docids.bitmap - parent_docids.bitmap, | ||||
|                 ) | ||||
|             } | ||||
|  | ||||
|             if let Some(current_parent_facet_value) = current_parent_facet_value { | ||||
|                 if current_parent_facet_value.field_id != parent_facet_value.field_id { | ||||
|                     panic!("wrong parent parent fid"); | ||||
|                 } | ||||
|                 if current_parent_facet_value.level + 1 != parent_facet_value.level { | ||||
|                     panic!("wrong parent parent level"); | ||||
|                 } | ||||
|                 if current_parent_facet_value.left_bound < parent_facet_value.left_bound { | ||||
|                     panic!("wrong parent parent left bound"); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             if let Some(current_parent_docids) = current_parent_docids { | ||||
|                 if !current_parent_docids.bitmap.is_subset(&parent_docids.bitmap) { | ||||
|                     panic!("missing docids from intermediate node in parent, parent_level={}, parent={}, intermediate={}, missing={missing:?}, intermediate={:?}", | ||||
|                     parent_facet_value.level, | ||||
|                     facet_to_string(parent_facet_value.left_bound, facet_type), | ||||
|                     facet_to_string(current_parent_facet_value.unwrap().left_bound, facet_type), | ||||
|                     current_parent_docids.bitmap.clone(), | ||||
|                     missing=current_parent_docids.bitmap - parent_docids.bitmap, | ||||
|                     ); | ||||
|                 } | ||||
|             } | ||||
|  | ||||
|             current_parent_facet_value = Some(parent_facet_value); | ||||
|             current_parent_docids = Some(parent_docids); | ||||
|         } | ||||
|     } | ||||
|     tracing::info!(%field_id, ?facet_type, "checked all leaves"); | ||||
|  | ||||
|     let mut current_level = max_level; | ||||
|     let mut greater_than_group = 0usize; | ||||
|     let mut total = 0usize; | ||||
|     loop { | ||||
|         if current_level == 0 { | ||||
|             break; | ||||
|         } | ||||
|         let child_level = current_level - 1; | ||||
|         tracing::info!(%field_id, ?facet_type, %current_level, "checked groups for level"); | ||||
|         let level_groups_prefix: FacetGroupKey<&[u8]> = | ||||
|             FacetGroupKey { field_id, level: current_level, left_bound: &[] }; | ||||
|         let mut level_groups_it = database.prefix_iter(rtxn, &level_groups_prefix)?.peekable(); | ||||
|  | ||||
|         'group_it: loop { | ||||
|             let Some(group) = level_groups_it.next() else { break 'group_it }; | ||||
|  | ||||
|             let (group_facet_value, group_docids) = group?; | ||||
|             let child_left_bound = group_facet_value.left_bound.to_owned(); | ||||
|             let mut expected_docids = RoaringBitmap::new(); | ||||
|             let mut expected_size = 0usize; | ||||
|             let right_bound = level_groups_it | ||||
|                 .peek() | ||||
|                 .and_then(|res| res.as_ref().ok()) | ||||
|                 .map(|(key, _)| key.left_bound); | ||||
|             let child_left_bound = FacetGroupKey { | ||||
|                 field_id, | ||||
|                 level: child_level, | ||||
|                 left_bound: child_left_bound.as_slice(), | ||||
|             }; | ||||
|             let child_left_bound = Bound::Included(&child_left_bound); | ||||
|             let child_right_bound; | ||||
|             let child_right_bound = if let Some(right_bound) = right_bound { | ||||
|                 child_right_bound = | ||||
|                     FacetGroupKey { field_id, level: child_level, left_bound: right_bound }; | ||||
|                 Bound::Excluded(&child_right_bound) | ||||
|             } else { | ||||
|                 Bound::Unbounded | ||||
|             }; | ||||
|             let children = database.range(rtxn, &(child_left_bound, child_right_bound))?; | ||||
|             for child in children { | ||||
|                 let (child_facet_value, child_docids) = child?; | ||||
|                 if child_facet_value.field_id != field_id { | ||||
|                     break; | ||||
|                 } | ||||
|                 if child_facet_value.level != child_level { | ||||
|                     break; | ||||
|                 } | ||||
|                 expected_size += 1; | ||||
|                 expected_docids |= &child_docids.bitmap; | ||||
|             } | ||||
|             assert_eq!(expected_size, group_docids.size as usize); | ||||
|             assert!(expected_size <= max_group_size); | ||||
|             assert_eq!(expected_docids, group_docids.bitmap); | ||||
|             total += 1; | ||||
|             if expected_size > group_size { | ||||
|                 greater_than_group += 1; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         current_level -= 1; | ||||
|     } | ||||
|     if greater_than_group * 2 > total { | ||||
|         panic!("too many groups have a size > group_size"); | ||||
|     } | ||||
|  | ||||
|     tracing::info!("sanity checks OK"); | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| fn facet_to_string(facet_value: &[u8], facet_type: FacetType) -> String { | ||||
|     match facet_type { | ||||
|         FacetType::String => bstr::BStr::new(facet_value).to_string(), | ||||
|         FacetType::Number => match OrderedF64Codec::bytes_decode(facet_value) { | ||||
|             Ok(value) => value.to_string(), | ||||
|             Err(e) => format!("error: {e} (bytes: {facet_value:?}"), | ||||
|         }, | ||||
|     } | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user