mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Merge #712
712: Fix bulk facet indexing bug r=Kerollmops a=loiclec # Pull Request ## Related issue Fixes (partially, until merged into meilisearch) https://github.com/meilisearch/meilisearch/issues/3165 ## What does this PR do? Fixes a bug where indexing certain numbers of filterable attribute values in bulk led to corrupted facet databases. This was due to a lossy integer conversion which would ultimately prevent entire levels of the facet database to be written into LMDB. More specifically, this change was made: ```diff - if cur_writer_len as u8 >= self.min_level_size { + if cur_writer_len >= self.min_level_size as usize { ``` I also checked other comparisons to `min_level_size` and other conversions such as `x as u8` in this part of the codebase. Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
		| @@ -288,6 +288,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | |||||||
|                 for bitmap in sub_bitmaps { |                 for bitmap in sub_bitmaps { | ||||||
|                     combined_bitmap |= bitmap; |                     combined_bitmap |= bitmap; | ||||||
|                 } |                 } | ||||||
|  |                 // The conversion of sub_bitmaps.len() to a u8 will always be correct | ||||||
|  |                 // since its length is bounded by max_group_size, which is a u8. | ||||||
|                 group_sizes.push(sub_bitmaps.len() as u8); |                 group_sizes.push(sub_bitmaps.len() as u8); | ||||||
|                 left_bounds.push(left_bound); |                 left_bounds.push(left_bound); | ||||||
|  |  | ||||||
| @@ -340,7 +342,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | |||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         // if we inserted enough elements to reach the minimum level size, then we push the writer |         // if we inserted enough elements to reach the minimum level size, then we push the writer | ||||||
|         if cur_writer_len as u8 >= self.min_level_size { |         if cur_writer_len >= self.min_level_size as usize { | ||||||
|             sub_writers.push(writer_into_reader(cur_writer)?); |             sub_writers.push(writer_into_reader(cur_writer)?); | ||||||
|         } else { |         } else { | ||||||
|             // otherwise, if there are still leftover elements, we give them to the level above |             // otherwise, if there are still leftover elements, we give them to the level above | ||||||
| @@ -357,11 +359,15 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> { | |||||||
| mod tests { | mod tests { | ||||||
|     use std::iter::once; |     use std::iter::once; | ||||||
|  |  | ||||||
|  |     use big_s::S; | ||||||
|  |     use maplit::hashset; | ||||||
|     use roaring::RoaringBitmap; |     use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  |     use crate::documents::documents_batch_reader_from_objects; | ||||||
|     use crate::heed_codec::facet::OrderedF64Codec; |     use crate::heed_codec::facet::OrderedF64Codec; | ||||||
|     use crate::milli_snap; |     use crate::index::tests::TempIndex; | ||||||
|     use crate::update::facet::tests::FacetIndex; |     use crate::update::facet::tests::FacetIndex; | ||||||
|  |     use crate::{db_snap, milli_snap}; | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn insert() { |     fn insert() { | ||||||
| @@ -443,4 +449,46 @@ mod tests { | |||||||
|         test("large_group_small_min_level", 16, 2); |         test("large_group_small_min_level", 16, 2); | ||||||
|         test("odd_group_odd_min_level", 7, 3); |         test("odd_group_odd_min_level", 7, 3); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn bug_3165() { | ||||||
|  |         // Indexing a number of facet values that falls within certains ranges (e.g. 22_540 qualifies) | ||||||
|  |         // would lead to a facet DB which was missing some levels. | ||||||
|  |         // That was because before writing a level into the database, we would | ||||||
|  |         // check that its size was higher than the minimum level size using | ||||||
|  |         // a lossy integer conversion: `level_size as u8 >= min_level_size`. | ||||||
|  |         // | ||||||
|  |         // This missing level in the facet DBs would make the incremental indexer | ||||||
|  |         // (and other search algorithms) crash. | ||||||
|  |         // | ||||||
|  |         // https://github.com/meilisearch/meilisearch/issues/3165 | ||||||
|  |         let index = TempIndex::new_with_map_size(4096 * 1000 * 100); | ||||||
|  |  | ||||||
|  |         index | ||||||
|  |             .update_settings(|settings| { | ||||||
|  |                 settings.set_primary_key("id".to_owned()); | ||||||
|  |                 settings.set_filterable_fields(hashset! { S("id") }); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         let mut documents = vec![]; | ||||||
|  |         for i in 0..=22_540 { | ||||||
|  |             documents.push( | ||||||
|  |                 serde_json::json! { | ||||||
|  |                     { | ||||||
|  |                         "id": i as u64, | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |                 .as_object() | ||||||
|  |                 .unwrap() | ||||||
|  |                 .clone(), | ||||||
|  |             ); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let documents = documents_batch_reader_from_objects(documents); | ||||||
|  |         index.add_documents(documents).unwrap(); | ||||||
|  |  | ||||||
|  |         db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a"); | ||||||
|  |         db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521"); | ||||||
|  |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -436,6 +436,8 @@ impl FacetsUpdateIncrementalInner { | |||||||
|                 level: highest_level + 1, |                 level: highest_level + 1, | ||||||
|                 left_bound: first_key.unwrap().left_bound, |                 left_bound: first_key.unwrap().left_bound, | ||||||
|             }; |             }; | ||||||
|  |             // Note: nbr_leftover_elements can be casted to a u8 since it is bounded by `max_group_size` | ||||||
|  |             // when it is created above. | ||||||
|             let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values }; |             let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values }; | ||||||
|             to_add.push((key.into_owned(), value)); |             to_add.push((key.into_owned(), value)); | ||||||
|         } |         } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user