mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-30 23:46:28 +00:00 
			
		
		
		
	Fix some facet indexing bugs
This commit is contained in:
		
				
					committed by
					
						 Loïc Lecrenier
						Loïc Lecrenier
					
				
			
			
				
	
			
			
			
						parent
						
							68cbcdf08b
						
					
				
				
					commit
					61252248fb
				
			| @@ -139,7 +139,7 @@ mod test { | ||||
|             let max_group_size = std::cmp::max(group_size * 2, max_group_size as usize); | ||||
|             let mut options = heed::EnvOpenOptions::new(); | ||||
|             let options = options.map_size(4096 * 4 * 100); | ||||
|             let tempdir = tempfile::TempDir::new_in("databases/").unwrap(); | ||||
|             let tempdir = tempfile::TempDir::new().unwrap(); | ||||
|             let env = options.open(tempdir.path()).unwrap(); | ||||
|             let content = env.create_database(None).unwrap(); | ||||
|  | ||||
|   | ||||
| @@ -242,6 +242,15 @@ pub fn snap_facet_id_string_docids(index: &Index) -> String { | ||||
|     }); | ||||
|     snap | ||||
| } | ||||
| pub fn snap_field_id_docid_facet_strings(index: &Index) -> String { | ||||
|     let snap = make_db_snap_from_iter!(index, field_id_docid_facet_strings, |( | ||||
|         (field_id, doc_id, string), | ||||
|         other_string, | ||||
|     )| { | ||||
|         &format!("{field_id:<3} {doc_id:<4} {string:<12} {other_string}") | ||||
|     }); | ||||
|     snap | ||||
| } | ||||
| pub fn snap_documents_ids(index: &Index) -> String { | ||||
|     let rtxn = index.read_txn().unwrap(); | ||||
|     let documents_ids = index.documents_ids(&rtxn).unwrap(); | ||||
| @@ -423,6 +432,9 @@ macro_rules! full_snap_of_db { | ||||
|     ($index:ident, facet_id_string_docids) => {{ | ||||
|         $crate::snapshot_tests::snap_facet_id_string_docids(&$index) | ||||
|     }}; | ||||
|     ($index:ident, field_id_docid_facet_strings) => {{ | ||||
|         $crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index) | ||||
|     }}; | ||||
|     ($index:ident, documents_ids) => {{ | ||||
|         $crate::snapshot_tests::snap_documents_ids(&$index) | ||||
|     }}; | ||||
|   | ||||
| @@ -1,8 +1,9 @@ | ||||
| use crate::facet::FacetType; | ||||
| use crate::heed_codec::facet::new::{ | ||||
|     FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, | ||||
| }; | ||||
| use crate::search::facet::get_highest_level; | ||||
| use crate::Result; | ||||
| use crate::{Index, Result}; | ||||
| use heed::Error; | ||||
| use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn}; | ||||
| use roaring::RoaringBitmap; | ||||
| @@ -287,7 +288,7 @@ impl FacetsUpdateIncremental { | ||||
|             .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? | ||||
|             .count(); | ||||
|  | ||||
|         if size_highest_level < self.min_level_size { | ||||
|         if size_highest_level < self.group_size * self.min_level_size { | ||||
|             return Ok(()); | ||||
|         } | ||||
|  | ||||
|   | ||||
| @@ -38,12 +38,13 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | ||||
|  | ||||
|         let (document_id_bytes, normalized_value_bytes) = | ||||
|             try_split_array_at::<_, 4>(bytes).unwrap(); | ||||
|         let document_id = u32::from_be_bytes(document_id_bytes); | ||||
|  | ||||
|         let normalised_value = std::str::from_utf8(normalized_value_bytes)?; | ||||
|         let key = FacetKey { field_id, level: 0, left_bound: normalised_value }; | ||||
|         let key_bytes = FacetKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap(); | ||||
|  | ||||
|         facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?; | ||||
|         facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?; | ||||
|     } | ||||
|  | ||||
|     sorter_into_reader(facet_string_docids_sorter, indexer) | ||||
|   | ||||
| @@ -592,7 +592,7 @@ mod tests { | ||||
|     use crate::index::tests::TempIndex; | ||||
|     use crate::search::TermsMatchingStrategy; | ||||
|     use crate::update::DeleteDocuments; | ||||
|     use crate::BEU16; | ||||
|     use crate::{db_snap, BEU16}; | ||||
|  | ||||
|     #[test] | ||||
|     fn simple_document_replacement() { | ||||
| @@ -1379,6 +1379,25 @@ mod tests { | ||||
|             }) | ||||
|             .unwrap(); | ||||
|  | ||||
|         db_snap!(index, facet_id_string_docids, @r###" | ||||
|         3   0  first        1  [1, ] | ||||
|         3   0  second       1  [2, ] | ||||
|         3   0  third        1  [3, ] | ||||
|         3   0  zeroth       1  [0, ] | ||||
|         "###); | ||||
|         db_snap!(index, field_id_docid_facet_strings, @r###" | ||||
|         3   0    zeroth       zeroth | ||||
|         3   1    first        first | ||||
|         3   2    second       second | ||||
|         3   3    third        third | ||||
|         "###); | ||||
|         db_snap!(index, string_faceted_documents_ids, @r###" | ||||
|         0   [] | ||||
|         1   [] | ||||
|         2   [] | ||||
|         3   [0, 1, 2, 3, ] | ||||
|         "###); | ||||
|  | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         let hidden = index.faceted_fields(&rtxn).unwrap(); | ||||
| @@ -1399,6 +1418,15 @@ mod tests { | ||||
|             }) | ||||
|             .unwrap(); | ||||
|  | ||||
|         db_snap!(index, facet_id_string_docids, @""); | ||||
|         db_snap!(index, field_id_docid_facet_strings, @""); | ||||
|         db_snap!(index, string_faceted_documents_ids, @r###" | ||||
|         0   [] | ||||
|         1   [] | ||||
|         2   [] | ||||
|         3   [0, 1, 2, 3, ] | ||||
|         "###); | ||||
|  | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         let facets = index.faceted_fields(&rtxn).unwrap(); | ||||
| @@ -1412,6 +1440,25 @@ mod tests { | ||||
|             }) | ||||
|             .unwrap(); | ||||
|  | ||||
|         db_snap!(index, facet_id_string_docids, @r###" | ||||
|         3   0  first        1  [1, ] | ||||
|         3   0  second       1  [2, ] | ||||
|         3   0  third        1  [3, ] | ||||
|         3   0  zeroth       1  [0, ] | ||||
|         "###); | ||||
|         db_snap!(index, field_id_docid_facet_strings, @r###" | ||||
|         3   0    zeroth       zeroth | ||||
|         3   1    first        first | ||||
|         3   2    second       second | ||||
|         3   3    third        third | ||||
|         "###); | ||||
|         db_snap!(index, string_faceted_documents_ids, @r###" | ||||
|         0   [] | ||||
|         1   [] | ||||
|         2   [] | ||||
|         3   [0, 1, 2, 3, ] | ||||
|         "###); | ||||
|  | ||||
|         let rtxn = index.read_txn().unwrap(); | ||||
|  | ||||
|         let facets = index.faceted_fields(&rtxn).unwrap(); | ||||
|   | ||||
| @@ -1,4 +1,5 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::collections::HashMap; | ||||
| use std::convert::TryInto; | ||||
| use std::fs::File; | ||||
| use std::io; | ||||
| @@ -17,8 +18,8 @@ use crate::heed_codec::facet::new::{FacetKeyCodec, MyByteSlice}; | ||||
| use crate::update::index_documents::helpers::as_cloneable_grenad; | ||||
| use crate::update::FacetsUpdateIncremental; | ||||
| use crate::{ | ||||
|     lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, | ||||
|     Result, | ||||
|     lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, | ||||
|     Index, Result, | ||||
| }; | ||||
|  | ||||
| pub(crate) enum TypedChunk { | ||||
| @@ -138,14 +139,41 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { | ||||
|             append_entries_into_database( | ||||
|                 facet_id_f64_docids_iter, | ||||
|                 &index.facet_id_f64_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|             )?; | ||||
|             // merge cbo roaring bitmaps is not the correct merger because the data in the DB | ||||
|             // is FacetGroupValue and not RoaringBitmap | ||||
|             // so I need to create my own merging function | ||||
|  | ||||
|             // facet_id_string_docids is encoded as: | ||||
|             // key: FacetKeyCodec<StrRefCodec> | ||||
|             // value: CboRoaringBitmapCodec | ||||
|             // basically | ||||
|  | ||||
|             // TODO: a condition saying "if I have more than 1/50th of the DB to add, | ||||
|             // then I do it in bulk, otherwise I do it incrementally". But instead of 1/50, | ||||
|             // it is a ratio I determine empirically | ||||
|  | ||||
|             // for now I only do it incrementally, to see if things work | ||||
|             let indexer = FacetsUpdateIncremental::new( | ||||
|                 index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), | ||||
|             ); | ||||
|  | ||||
|             let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default(); | ||||
|  | ||||
|             let mut cursor = facet_id_f64_docids_iter.into_cursor()?; | ||||
|             while let Some((key, value)) = cursor.move_on_next()? { | ||||
|                 let key = | ||||
|                     FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?; | ||||
|                 let docids = | ||||
|                     CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; | ||||
|                 indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; | ||||
|                 *new_faceted_docids.entry(key.field_id).or_default() |= docids; | ||||
|             } | ||||
|             for (field_id, new_docids) in new_faceted_docids { | ||||
|                 let mut docids = index.number_faceted_documents_ids(wtxn, field_id)?; | ||||
|                 docids |= new_docids; | ||||
|                 index.put_number_faceted_documents_ids(wtxn, field_id, &docids)?; | ||||
|             } | ||||
|  | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { | ||||
| @@ -163,16 +191,24 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|             // it is a ratio I determine empirically | ||||
|  | ||||
|             // for now I only do it incrementally, to see if things work | ||||
|             let builder = FacetsUpdateIncremental::new( | ||||
|             let indexer = FacetsUpdateIncremental::new( | ||||
|                 index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), | ||||
|             ); | ||||
|             let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default(); | ||||
|  | ||||
|             let mut cursor = facet_id_string_docids.into_cursor()?; | ||||
|             while let Some((key, value)) = cursor.move_on_next()? { | ||||
|                 let key = | ||||
|                     FacetKeyCodec::<MyByteSlice>::bytes_decode(key).ok_or(heed::Error::Encoding)?; | ||||
|                 let value = | ||||
|                 let docids = | ||||
|                     CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; | ||||
|                 builder.insert(wtxn, key.field_id, key.left_bound, &value)?; | ||||
|                 indexer.insert(wtxn, key.field_id, key.left_bound, &docids)?; | ||||
|                 *new_faceted_docids.entry(key.field_id).or_default() |= docids; | ||||
|             } | ||||
|             for (field_id, new_docids) in new_faceted_docids { | ||||
|                 let mut docids = index.string_faceted_documents_ids(wtxn, field_id)?; | ||||
|                 docids |= new_docids; | ||||
|                 index.put_string_faceted_documents_ids(wtxn, field_id, &docids)?; | ||||
|             } | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user