mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Merge remote-tracking branch 'origin/main' into search-refactor
Conflicts | resolution
----------|-----------
Cargo.lock | added mimalloc
Cargo.toml | took origin/main version
milli/src/search/criteria/exactness.rs | deleted after checking it was only clippy changes
milli/src/search/query_tree.rs | deleted after checking it was only clippy changes
This commit is contained in:
		| @@ -82,6 +82,8 @@ pub mod db_name { | ||||
|     pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; | ||||
|     pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids"; | ||||
|     pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids"; | ||||
|     pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids"; | ||||
|     pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids"; | ||||
|     pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; | ||||
|     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; | ||||
|     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; | ||||
| @@ -136,6 +138,10 @@ pub struct Index { | ||||
|  | ||||
|     /// Maps the facet field id and the docids for which this field exists | ||||
|     pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the facet field id and the docids for which this field is set as null | ||||
|     pub facet_id_is_null_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the facet field id and the docids for which this field is considered empty | ||||
|     pub facet_id_is_empty_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, | ||||
|  | ||||
|     /// Maps the facet field id and ranges of numbers with the docids that correspond to them. | ||||
|     pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>, | ||||
| @@ -184,6 +190,8 @@ impl Index { | ||||
|         let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?; | ||||
|         let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?; | ||||
|         let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?; | ||||
|         let facet_id_is_null_docids = env.create_database(Some(FACET_ID_IS_NULL_DOCIDS))?; | ||||
|         let facet_id_is_empty_docids = env.create_database(Some(FACET_ID_IS_EMPTY_DOCIDS))?; | ||||
|  | ||||
|         let field_id_docid_facet_f64s = env.create_database(Some(FIELD_ID_DOCID_FACET_F64S))?; | ||||
|         let field_id_docid_facet_strings = | ||||
| @@ -212,6 +220,8 @@ impl Index { | ||||
|             facet_id_f64_docids, | ||||
|             facet_id_string_docids, | ||||
|             facet_id_exists_docids, | ||||
|             facet_id_is_null_docids, | ||||
|             facet_id_is_empty_docids, | ||||
|             field_id_docid_facet_f64s, | ||||
|             field_id_docid_facet_strings, | ||||
|             documents, | ||||
| @@ -844,6 +854,30 @@ impl Index { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Retrieve all the documents which contain this field id set as null | ||||
|     pub fn null_faceted_documents_ids( | ||||
|         &self, | ||||
|         rtxn: &RoTxn, | ||||
|         field_id: FieldId, | ||||
|     ) -> heed::Result<RoaringBitmap> { | ||||
|         match self.facet_id_is_null_docids.get(rtxn, &BEU16::new(field_id))? { | ||||
|             Some(docids) => Ok(docids), | ||||
|             None => Ok(RoaringBitmap::new()), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Retrieve all the documents for which this field id is considered empty | ||||
|     pub fn empty_faceted_documents_ids( | ||||
|         &self, | ||||
|         rtxn: &RoTxn, | ||||
|         field_id: FieldId, | ||||
|     ) -> heed::Result<RoaringBitmap> { | ||||
|         match self.facet_id_is_empty_docids.get(rtxn, &BEU16::new(field_id))? { | ||||
|             Some(docids) => Ok(docids), | ||||
|             None => Ok(RoaringBitmap::new()), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     /// Retrieve all the documents which contain this field id | ||||
|     pub fn exists_faceted_documents_ids( | ||||
|         &self, | ||||
|   | ||||
| @@ -211,6 +211,14 @@ impl<'a> Filter<'a> { | ||||
|             Condition::Between { from, to } => { | ||||
|                 (Included(from.parse_finite_float()?), Included(to.parse_finite_float()?)) | ||||
|             } | ||||
|             Condition::Null => { | ||||
|                 let is_null = index.null_faceted_documents_ids(rtxn, field_id)?; | ||||
|                 return Ok(is_null); | ||||
|             } | ||||
|             Condition::Empty => { | ||||
|                 let is_empty = index.empty_faceted_documents_ids(rtxn, field_id)?; | ||||
|                 return Ok(is_empty); | ||||
|             } | ||||
|             Condition::Exists => { | ||||
|                 let exist = index.exists_faceted_documents_ids(rtxn, field_id)?; | ||||
|                 return Ok(exist); | ||||
|   | ||||
| @@ -276,6 +276,16 @@ pub fn snap_facet_id_exists_docids(index: &Index) -> String { | ||||
|         &format!("{facet_id:<3} {}", display_bitmap(&docids)) | ||||
|     }) | ||||
| } | ||||
| pub fn snap_facet_id_is_null_docids(index: &Index) -> String { | ||||
|     make_db_snap_from_iter!(index, facet_id_is_null_docids, |(facet_id, docids)| { | ||||
|         &format!("{facet_id:<3} {}", display_bitmap(&docids)) | ||||
|     }) | ||||
| } | ||||
| pub fn snap_facet_id_is_empty_docids(index: &Index) -> String { | ||||
|     make_db_snap_from_iter!(index, facet_id_is_empty_docids, |(facet_id, docids)| { | ||||
|         &format!("{facet_id:<3} {}", display_bitmap(&docids)) | ||||
|     }) | ||||
| } | ||||
| pub fn snap_facet_id_string_docids(index: &Index) -> String { | ||||
|     make_db_snap_from_iter!(index, facet_id_string_docids, |( | ||||
|         FacetGroupKey { field_id, level, left_bound }, | ||||
| @@ -503,6 +513,12 @@ macro_rules! full_snap_of_db { | ||||
|     ($index:ident, facet_id_exists_docids) => {{ | ||||
|         $crate::snapshot_tests::snap_facet_id_exists_docids(&$index) | ||||
|     }}; | ||||
|     ($index:ident, facet_id_is_null_docids) => {{ | ||||
|         $crate::snapshot_tests::snap_facet_id_is_null_docids(&$index) | ||||
|     }}; | ||||
|     ($index:ident, facet_id_is_empty_docids) => {{ | ||||
|         $crate::snapshot_tests::snap_facet_id_is_empty_docids(&$index) | ||||
|     }}; | ||||
|     ($index:ident, documents_ids) => {{ | ||||
|         $crate::snapshot_tests::snap_documents_ids(&$index) | ||||
|     }}; | ||||
|   | ||||
| @@ -36,6 +36,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|             facet_id_f64_docids, | ||||
|             facet_id_string_docids, | ||||
|             facet_id_exists_docids, | ||||
|             facet_id_is_null_docids, | ||||
|             facet_id_is_empty_docids, | ||||
|             field_id_docid_facet_f64s, | ||||
|             field_id_docid_facet_strings, | ||||
|             documents, | ||||
| @@ -90,6 +92,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | ||||
|         script_language_docids.clear(self.wtxn)?; | ||||
|         facet_id_f64_docids.clear(self.wtxn)?; | ||||
|         facet_id_exists_docids.clear(self.wtxn)?; | ||||
|         facet_id_is_null_docids.clear(self.wtxn)?; | ||||
|         facet_id_is_empty_docids.clear(self.wtxn)?; | ||||
|         facet_id_string_docids.clear(self.wtxn)?; | ||||
|         field_id_docid_facet_f64s.clear(self.wtxn)?; | ||||
|         field_id_docid_facet_strings.clear(self.wtxn)?; | ||||
|   | ||||
| @@ -247,6 +247,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|             field_id_docid_facet_strings: _, | ||||
|             script_language_docids, | ||||
|             facet_id_exists_docids, | ||||
|             facet_id_is_null_docids, | ||||
|             facet_id_is_empty_docids, | ||||
|             documents, | ||||
|         } = self.index; | ||||
|  | ||||
| @@ -445,12 +447,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | ||||
|             &self.to_delete_docids, | ||||
|         )?; | ||||
|         // We delete the documents ids that are under the facet field id values. | ||||
|         remove_docids_from_facet_id_exists_docids( | ||||
|         remove_docids_from_facet_id_docids( | ||||
|             self.wtxn, | ||||
|             facet_id_exists_docids, | ||||
|             &self.to_delete_docids, | ||||
|         )?; | ||||
|  | ||||
|         // We delete the documents ids from the facet-id-is-null database. | ||||
|         remove_docids_from_facet_id_docids( | ||||
|             self.wtxn, | ||||
|             facet_id_is_null_docids, | ||||
|             &self.to_delete_docids, | ||||
|         )?; | ||||
|  | ||||
|         // We delete the documents ids from the facet-id-is-empty database. | ||||
|         remove_docids_from_facet_id_docids( | ||||
|             self.wtxn, | ||||
|             facet_id_is_empty_docids, | ||||
|             &self.to_delete_docids, | ||||
|         )?; | ||||
|  | ||||
|         self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?; | ||||
|  | ||||
|         Ok(DetailedDocumentDeletionResult { | ||||
| @@ -577,7 +593,7 @@ fn remove_docids_from_field_id_docid_facet_value( | ||||
|     Ok(all_affected_facet_values) | ||||
| } | ||||
|  | ||||
| fn remove_docids_from_facet_id_exists_docids<'a, C>( | ||||
| fn remove_docids_from_facet_id_docids<'a, C>( | ||||
|     wtxn: &'a mut heed::RwTxn, | ||||
|     db: &heed::Database<C, CboRoaringBitmapCodec>, | ||||
|     to_remove: &RoaringBitmap, | ||||
|   | ||||
| @@ -181,7 +181,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st | ||||
|     fn inner(value: &Value, output: &mut String) -> bool { | ||||
|         use std::fmt::Write; | ||||
|         match value { | ||||
|             Value::Null => false, | ||||
|             Value::Null | Value::Object(_) => false, | ||||
|             Value::Bool(boolean) => write!(output, "{}", boolean).is_ok(), | ||||
|             Value::Number(number) => write!(output, "{}", number).is_ok(), | ||||
|             Value::String(string) => write!(output, "{}", string).is_ok(), | ||||
| @@ -196,23 +196,6 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st | ||||
|                 // check that at least one value was written | ||||
|                 count != 0 | ||||
|             } | ||||
|             Value::Object(object) => { | ||||
|                 let mut buffer = String::new(); | ||||
|                 let mut count = 0; | ||||
|                 for (key, value) in object { | ||||
|                     buffer.clear(); | ||||
|                     let _ = write!(&mut buffer, "{}: ", key); | ||||
|                     if inner(value, &mut buffer) { | ||||
|                         buffer.push_str(". "); | ||||
|                         // We write the "key: value. " pair only when | ||||
|                         // we are sure that the value can be written. | ||||
|                         output.push_str(&buffer); | ||||
|                         count += 1; | ||||
|                     } | ||||
|                 } | ||||
|                 // check that at least one value was written | ||||
|                 count != 0 | ||||
|             } | ||||
|         } | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -7,7 +7,7 @@ use std::mem::size_of; | ||||
| use heed::zerocopy::AsBytes; | ||||
| use heed::BytesEncode; | ||||
| use roaring::RoaringBitmap; | ||||
| use serde_json::Value; | ||||
| use serde_json::{from_slice, Value}; | ||||
|  | ||||
| use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters}; | ||||
| use crate::error::InternalError; | ||||
| @@ -15,6 +15,15 @@ use crate::facet::value_encoding::f64_into_bytes; | ||||
| use crate::update::index_documents::{create_writer, writer_into_reader}; | ||||
| use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH}; | ||||
|  | ||||
| /// The extracted facet values stored in grenad files by type. | ||||
| pub struct ExtractedFacetValues { | ||||
|     pub docid_fid_facet_numbers_chunk: grenad::Reader<File>, | ||||
|     pub docid_fid_facet_strings_chunk: grenad::Reader<File>, | ||||
|     pub fid_facet_is_null_docids_chunk: grenad::Reader<File>, | ||||
|     pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>, | ||||
|     pub fid_facet_exists_docids_chunk: grenad::Reader<File>, | ||||
| } | ||||
|  | ||||
| /// Extracts the facet values of each faceted field of each document. | ||||
| /// | ||||
| /// Returns the generated grenad reader containing the docid, the fid and the original value as key | ||||
| @@ -24,7 +33,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | ||||
|     obkv_documents: grenad::Reader<R>, | ||||
|     indexer: GrenadParameters, | ||||
|     faceted_fields: &HashSet<FieldId>, | ||||
| ) -> Result<(grenad::Reader<File>, grenad::Reader<File>, grenad::Reader<File>)> { | ||||
| ) -> Result<ExtractedFacetValues> { | ||||
|     let max_memory = indexer.max_memory_by_thread(); | ||||
|  | ||||
|     let mut fid_docid_facet_numbers_sorter = create_sorter( | ||||
| @@ -46,6 +55,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | ||||
|     ); | ||||
|  | ||||
|     let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); | ||||
|     let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); | ||||
|     let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new(); | ||||
|  | ||||
|     let mut key_buffer = Vec::new(); | ||||
|     let mut cursor = obkv_documents.into_cursor()?; | ||||
| @@ -69,33 +80,44 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | ||||
|                 // For the other extraction tasks, prefix the key with the field_id and the document_id | ||||
|                 key_buffer.extend_from_slice(docid_bytes); | ||||
|  | ||||
|                 let value = | ||||
|                     serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?; | ||||
|                 let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?; | ||||
|  | ||||
|                 let (numbers, strings) = extract_facet_values(&value); | ||||
|  | ||||
|                 // insert facet numbers in sorter | ||||
|                 for number in numbers { | ||||
|                     key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); | ||||
|                     if let Some(value_bytes) = f64_into_bytes(number) { | ||||
|                         key_buffer.extend_from_slice(&value_bytes); | ||||
|                         key_buffer.extend_from_slice(&number.to_be_bytes()); | ||||
|  | ||||
|                         fid_docid_facet_numbers_sorter.insert(&key_buffer, ().as_bytes())?; | ||||
|                 match extract_facet_values(&value) { | ||||
|                     FilterableValues::Null => { | ||||
|                         facet_is_null_docids.entry(field_id).or_default().insert(document); | ||||
|                     } | ||||
|                 } | ||||
|                     FilterableValues::Empty => { | ||||
|                         facet_is_empty_docids.entry(field_id).or_default().insert(document); | ||||
|                     } | ||||
|                     FilterableValues::Values { numbers, strings } => { | ||||
|                         // insert facet numbers in sorter | ||||
|                         for number in numbers { | ||||
|                             key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); | ||||
|                             if let Some(value_bytes) = f64_into_bytes(number) { | ||||
|                                 key_buffer.extend_from_slice(&value_bytes); | ||||
|                                 key_buffer.extend_from_slice(&number.to_be_bytes()); | ||||
|  | ||||
|                 // insert normalized and original facet string in sorter | ||||
|                 for (normalized, original) in strings.into_iter().filter(|(n, _)| !n.is_empty()) { | ||||
|                     let normalised_truncated_value: String = normalized | ||||
|                         .char_indices() | ||||
|                         .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) | ||||
|                         .map(|(_, c)| c) | ||||
|                         .collect(); | ||||
|                                 fid_docid_facet_numbers_sorter | ||||
|                                     .insert(&key_buffer, ().as_bytes())?; | ||||
|                             } | ||||
|                         } | ||||
|  | ||||
|                     key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); | ||||
|                     key_buffer.extend_from_slice(normalised_truncated_value.as_bytes()); | ||||
|                     fid_docid_facet_strings_sorter.insert(&key_buffer, original.as_bytes())?; | ||||
|                         // insert normalized and original facet string in sorter | ||||
|                         for (normalized, original) in | ||||
|                             strings.into_iter().filter(|(n, _)| !n.is_empty()) | ||||
|                         { | ||||
|                             let normalized_truncated_value: String = normalized | ||||
|                                 .char_indices() | ||||
|                                 .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH) | ||||
|                                 .map(|(_, c)| c) | ||||
|                                 .collect(); | ||||
|  | ||||
|                             key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>()); | ||||
|                             key_buffer.extend_from_slice(normalized_truncated_value.as_bytes()); | ||||
|                             fid_docid_facet_strings_sorter | ||||
|                                 .insert(&key_buffer, original.as_bytes())?; | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
| @@ -112,14 +134,48 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>( | ||||
|     } | ||||
|     let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?; | ||||
|  | ||||
|     Ok(( | ||||
|         sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, | ||||
|         sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, | ||||
|         facet_exists_docids_reader, | ||||
|     )) | ||||
|     let mut facet_is_null_docids_writer = create_writer( | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         tempfile::tempfile()?, | ||||
|     ); | ||||
|     for (fid, bitmap) in facet_is_null_docids.into_iter() { | ||||
|         let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); | ||||
|         facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; | ||||
|     } | ||||
|     let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?; | ||||
|  | ||||
|     let mut facet_is_empty_docids_writer = create_writer( | ||||
|         indexer.chunk_compression_type, | ||||
|         indexer.chunk_compression_level, | ||||
|         tempfile::tempfile()?, | ||||
|     ); | ||||
|     for (fid, bitmap) in facet_is_empty_docids.into_iter() { | ||||
|         let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap(); | ||||
|         facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?; | ||||
|     } | ||||
|     let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?; | ||||
|  | ||||
|     Ok(ExtractedFacetValues { | ||||
|         docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?, | ||||
|         docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?, | ||||
|         fid_facet_is_null_docids_chunk: facet_is_null_docids_reader, | ||||
|         fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader, | ||||
|         fid_facet_exists_docids_chunk: facet_exists_docids_reader, | ||||
|     }) | ||||
| } | ||||
|  | ||||
| fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) { | ||||
| /// Represent what a document field contains. | ||||
| enum FilterableValues { | ||||
|     /// Corresponds to the JSON `null` value. | ||||
|     Null, | ||||
|     /// Corresponds to either an empty string `""`, an empty array `[]`, or an empty object `{}`. | ||||
|     Empty, | ||||
|     /// Represents all the numbers and strings values found in this document field. | ||||
|     Values { numbers: Vec<f64>, strings: Vec<(String, String)> }, | ||||
| } | ||||
|  | ||||
| fn extract_facet_values(value: &Value) -> FilterableValues { | ||||
|     fn inner_extract_facet_values( | ||||
|         value: &Value, | ||||
|         can_recurse: bool, | ||||
| @@ -149,9 +205,16 @@ fn extract_facet_values(value: &Value) -> (Vec<f64>, Vec<(String, String)>) { | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     let mut facet_number_values = Vec::new(); | ||||
|     let mut facet_string_values = Vec::new(); | ||||
|     inner_extract_facet_values(value, true, &mut facet_number_values, &mut facet_string_values); | ||||
|  | ||||
|     (facet_number_values, facet_string_values) | ||||
|     match value { | ||||
|         Value::Null => FilterableValues::Null, | ||||
|         Value::String(s) if s.is_empty() => FilterableValues::Empty, | ||||
|         Value::Array(a) if a.is_empty() => FilterableValues::Empty, | ||||
|         Value::Object(o) if o.is_empty() => FilterableValues::Empty, | ||||
|         otherwise => { | ||||
|             let mut numbers = Vec::new(); | ||||
|             let mut strings = Vec::new(); | ||||
|             inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings); | ||||
|             FilterableValues::Values { numbers, strings } | ||||
|         } | ||||
|     } | ||||
| } | ||||
|   | ||||
| @@ -19,7 +19,7 @@ use rayon::prelude::*; | ||||
| use self::extract_docid_word_positions::extract_docid_word_positions; | ||||
| use self::extract_facet_number_docids::extract_facet_number_docids; | ||||
| use self::extract_facet_string_docids::extract_facet_string_docids; | ||||
| use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; | ||||
| use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, ExtractedFacetValues}; | ||||
| use self::extract_fid_word_count_docids::extract_fid_word_count_docids; | ||||
| use self::extract_geo_points::extract_geo_points; | ||||
| use self::extract_word_docids::extract_word_docids; | ||||
| @@ -57,28 +57,35 @@ pub(crate) fn data_from_obkv_documents( | ||||
|         .collect::<Result<()>>()?; | ||||
|  | ||||
|     #[allow(clippy::type_complexity)] | ||||
|     let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))> = flattened_obkv_chunks | ||||
|         .par_bridge() | ||||
|         .map(|flattened_obkv_chunks| { | ||||
|             send_and_extract_flattened_documents_data( | ||||
|                 flattened_obkv_chunks, | ||||
|                 indexer, | ||||
|                 lmdb_writer_sx.clone(), | ||||
|                 &searchable_fields, | ||||
|                 &faceted_fields, | ||||
|                 primary_key_id, | ||||
|                 geo_fields_ids, | ||||
|                 &stop_words, | ||||
|                 max_positions_per_attributes, | ||||
|             ) | ||||
|         }) | ||||
|         .collect(); | ||||
|     let result: Result<(Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, (Vec<_>, Vec<_>)))))> = | ||||
|         flattened_obkv_chunks | ||||
|             .par_bridge() | ||||
|             .map(|flattened_obkv_chunks| { | ||||
|                 send_and_extract_flattened_documents_data( | ||||
|                     flattened_obkv_chunks, | ||||
|                     indexer, | ||||
|                     lmdb_writer_sx.clone(), | ||||
|                     &searchable_fields, | ||||
|                     &faceted_fields, | ||||
|                     primary_key_id, | ||||
|                     geo_fields_ids, | ||||
|                     &stop_words, | ||||
|                     max_positions_per_attributes, | ||||
|                 ) | ||||
|             }) | ||||
|             .collect(); | ||||
|  | ||||
|     let ( | ||||
|         docid_word_positions_chunks, | ||||
|         ( | ||||
|             docid_fid_facet_numbers_chunks, | ||||
|             (docid_fid_facet_strings_chunks, facet_exists_docids_chunks), | ||||
|             ( | ||||
|                 docid_fid_facet_strings_chunks, | ||||
|                 ( | ||||
|                     facet_is_null_docids_chunks, | ||||
|                     (facet_is_empty_docids_chunks, facet_exists_docids_chunks), | ||||
|                 ), | ||||
|             ), | ||||
|         ), | ||||
|     ) = result?; | ||||
|  | ||||
| @@ -98,6 +105,38 @@ pub(crate) fn data_from_obkv_documents( | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     // merge facet_is_null_docids and send them as a typed chunk | ||||
|     { | ||||
|         let lmdb_writer_sx = lmdb_writer_sx.clone(); | ||||
|         rayon::spawn(move || { | ||||
|             debug!("merge {} database", "facet-id-is-null-docids"); | ||||
|             match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { | ||||
|                 Ok(reader) => { | ||||
|                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader))); | ||||
|                 } | ||||
|                 Err(e) => { | ||||
|                     let _ = lmdb_writer_sx.send(Err(e)); | ||||
|                 } | ||||
|             } | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     // merge facet_is_empty_docids and send them as a typed chunk | ||||
|     { | ||||
|         let lmdb_writer_sx = lmdb_writer_sx.clone(); | ||||
|         rayon::spawn(move || { | ||||
|             debug!("merge {} database", "facet-id-is-empty-docids"); | ||||
|             match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) { | ||||
|                 Ok(reader) => { | ||||
|                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader))); | ||||
|                 } | ||||
|                 Err(e) => { | ||||
|                     let _ = lmdb_writer_sx.send(Err(e)); | ||||
|                 } | ||||
|             } | ||||
|         }); | ||||
|     } | ||||
|  | ||||
|     spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>( | ||||
|         docid_word_positions_chunks.clone(), | ||||
|         indexer, | ||||
| @@ -246,7 +285,10 @@ fn send_and_extract_flattened_documents_data( | ||||
|     grenad::Reader<CursorClonableMmap>, | ||||
|     ( | ||||
|         grenad::Reader<CursorClonableMmap>, | ||||
|         (grenad::Reader<CursorClonableMmap>, grenad::Reader<File>), | ||||
|         ( | ||||
|             grenad::Reader<CursorClonableMmap>, | ||||
|             (grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)), | ||||
|         ), | ||||
|     ), | ||||
| )> { | ||||
|     let flattened_documents_chunk = | ||||
| @@ -292,11 +334,13 @@ fn send_and_extract_flattened_documents_data( | ||||
|                 Ok(docid_word_positions_chunk) | ||||
|             }, | ||||
|             || { | ||||
|                 let ( | ||||
|                 let ExtractedFacetValues { | ||||
|                     docid_fid_facet_numbers_chunk, | ||||
|                     docid_fid_facet_strings_chunk, | ||||
|                     fid_facet_is_null_docids_chunk, | ||||
|                     fid_facet_is_empty_docids_chunk, | ||||
|                     fid_facet_exists_docids_chunk, | ||||
|                 ) = extract_fid_docid_facet_values( | ||||
|                 } = extract_fid_docid_facet_values( | ||||
|                     flattened_documents_chunk.clone(), | ||||
|                     indexer, | ||||
|                     faceted_fields, | ||||
| @@ -320,7 +364,13 @@ fn send_and_extract_flattened_documents_data( | ||||
|  | ||||
|                 Ok(( | ||||
|                     docid_fid_facet_numbers_chunk, | ||||
|                     (docid_fid_facet_strings_chunk, fid_facet_exists_docids_chunk), | ||||
|                     ( | ||||
|                         docid_fid_facet_strings_chunk, | ||||
|                         ( | ||||
|                             fid_facet_is_null_docids_chunk, | ||||
|                             (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk), | ||||
|                         ), | ||||
|                     ), | ||||
|                 )) | ||||
|             }, | ||||
|         ); | ||||
|   | ||||
| @@ -1779,6 +1779,187 @@ mod tests { | ||||
|         check_ok(&index); | ||||
|     } | ||||
|  | ||||
|     #[test] | ||||
|     fn index_documents_check_is_null_database() { | ||||
|         let content = || { | ||||
|             documents!([ | ||||
|                 { | ||||
|                     "id": 0, | ||||
|                     "colour": null, | ||||
|                 }, | ||||
|                 { | ||||
|                     "id": 1, | ||||
|                     "colour": [null], // must not be returned | ||||
|                 }, | ||||
|                 { | ||||
|                     "id": 6, | ||||
|                     "colour": { | ||||
|                         "green": null | ||||
|                     } | ||||
|                 }, | ||||
|                 { | ||||
|                     "id": 7, | ||||
|                     "colour": { | ||||
|                         "green": { | ||||
|                             "blue": null | ||||
|                         } | ||||
|                     } | ||||
|                 }, | ||||
|                 { | ||||
|                     "id": 8, | ||||
|                     "colour": 0, | ||||
|                 }, | ||||
|                 { | ||||
|                     "id": 9, | ||||
|                     "colour": [] | ||||
|                 }, | ||||
|                 { | ||||
|                     "id": 10, | ||||
|                     "colour": {} | ||||
|                 }, | ||||
|                 { | ||||
|                     "id": 12, | ||||
|                     "colour": [1] | ||||
|                 }, | ||||
|                 { | ||||
|                     "id": 13 | ||||
|                 }, | ||||
|                 { | ||||
|                     "id": 14, | ||||
|                     "colour": { | ||||
|                         "green": 1 | ||||
|                     } | ||||
|                 }, | ||||
|                 { | ||||
|                     "id": 15, | ||||
|                     "colour": { | ||||
|                         "green": { | ||||
|                             "blue": [] | ||||
|                         } | ||||
|                     } | ||||
|                 } | ||||
|             ]) | ||||
|         }; | ||||
|  | ||||
|         let check_ok = |index: &Index| { | ||||
|             let rtxn = index.read_txn().unwrap(); | ||||
|             let facets = index.faceted_fields(&rtxn).unwrap(); | ||||
|             assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); | ||||
|  | ||||
|             let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); | ||||
|             let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); | ||||
|             let colour_blue_id = | ||||
|                 index.fields_ids_map(&rtxn).unwrap().id("colour.green.blue").unwrap(); | ||||
|  | ||||
|             let bitmap_null_colour = | ||||
|                 index.facet_id_is_null_docids.get(&rtxn, &BEU16::new(colour_id)).unwrap().unwrap(); | ||||
|             assert_eq!(bitmap_null_colour.into_iter().collect::<Vec<_>>(), vec![0]); | ||||
|  | ||||
|             let bitmap_colour_green = index | ||||
|                 .facet_id_is_null_docids | ||||
|                 .get(&rtxn, &BEU16::new(colour_green_id)) | ||||
|                 .unwrap() | ||||
|                 .unwrap(); | ||||
|             assert_eq!(bitmap_colour_green.into_iter().collect::<Vec<_>>(), vec![2]); | ||||
|  | ||||
|             let bitmap_colour_blue = index | ||||
|                 .facet_id_is_null_docids | ||||
|                 .get(&rtxn, &BEU16::new(colour_blue_id)) | ||||
|                 .unwrap() | ||||
|                 .unwrap(); | ||||
|             assert_eq!(bitmap_colour_blue.into_iter().collect::<Vec<_>>(), vec![3]); | ||||
|         }; | ||||
|  | ||||
|         let faceted_fields = hashset!(S("colour")); | ||||
|  | ||||
|         let index = TempIndex::new(); | ||||
|         index.add_documents(content()).unwrap(); | ||||
|         index | ||||
|             .update_settings(|settings| { | ||||
|                 settings.set_filterable_fields(faceted_fields.clone()); | ||||
|             }) | ||||
|             .unwrap(); | ||||
|         check_ok(&index); | ||||
|  | ||||
|         let index = TempIndex::new(); | ||||
|         index | ||||
|             .update_settings(|settings| { | ||||
|                 settings.set_filterable_fields(faceted_fields.clone()); | ||||
|             }) | ||||
|             .unwrap(); | ||||
|         index.add_documents(content()).unwrap(); | ||||
|         check_ok(&index); | ||||
|     } | ||||
|  | ||||
/// Verifies the content of the `facet_id_is_empty_docids` database for a
/// filterable `tags` field and its nested `tags.green` / `tags.green.blue`
/// sub-fields.
///
/// In this fixture only `[]`, `{}` and `""` are expected to be recorded as
/// empty; `null`, `[null]`, `[""]` and a missing field are not. The same
/// expectations are checked for both orderings of "add documents" and
/// "declare `tags` as filterable".
#[test]
fn index_documents_check_is_empty_database() {
    // Produces a fresh copy of the fixture on each call, since every scenario
    // below indexes it into its own index.
    let make_documents = || {
        documents!([
            {"id": 0, "tags": null },
            {"id": 1, "tags": [null] },
            {"id": 2, "tags": [] },
            {"id": 3, "tags": ["hello","world"] },
            {"id": 4, "tags": [""] },
            {"id": 5 },
            {"id": 6, "tags": {} },
            {"id": 7, "tags": {"green": "cool"} },
            {"id": 8, "tags": {"green": ""} },
            {"id": 9, "tags": "" },
            {"id": 10, "tags": { "green": null } },
            {"id": 11, "tags": { "green": { "blue": null } } },
            {"id": 12, "tags": { "green": { "blue": [] } } }
        ])
    };

    // Asserts the faceted fields and the is-empty bitmaps of the given index.
    let assert_is_empty_database = |index: &Index| {
        let rtxn = index.read_txn().unwrap();

        let faceted = index.faceted_fields(&rtxn).unwrap();
        assert_eq!(faceted, hashset!(S("tags"), S("tags.green"), S("tags.green.blue")));

        // Resolve every field id up front so a missing field is reported
        // before any bitmap is inspected.
        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
        let tags_id = fields_ids_map.id("tags").unwrap();
        let green_id = fields_ids_map.id("tags.green").unwrap();
        let blue_id = fields_ids_map.id("tags.green.blue").unwrap();

        // Reads the is-empty bitmap stored for a field id as a list of docids.
        let empty_docids = |field_id: u16| {
            index
                .facet_id_is_empty_docids
                .get(&rtxn, &BEU16::new(field_id))
                .unwrap()
                .unwrap()
                .into_iter()
                .collect::<Vec<_>>()
        };

        assert_eq!(empty_docids(tags_id), vec![2, 6, 9]);
        assert_eq!(empty_docids(green_id), vec![8]);
        assert_eq!(empty_docids(blue_id), vec![12]);
    };

    let filterable = hashset!(S("tags"));

    // Declares `tags` as the only filterable field of the given index.
    let make_tags_filterable = |index: &TempIndex| {
        index
            .update_settings(|settings| {
                settings.set_filterable_fields(filterable.clone());
            })
            .unwrap();
    };

    // Scenario 1: the documents are indexed first, the settings come after.
    let documents_first = TempIndex::new();
    documents_first.add_documents(make_documents()).unwrap();
    make_tags_filterable(&documents_first);
    assert_is_empty_database(&documents_first);

    // Scenario 2: the settings are applied first, the documents come after.
    let settings_first = TempIndex::new();
    make_tags_filterable(&settings_first);
    settings_first.add_documents(make_documents()).unwrap();
    assert_is_empty_database(&settings_first);
}
|  | ||||
|     #[test] | ||||
|     fn primary_key_must_not_contain_floats() { | ||||
|         let index = TempIndex::new_with_map_size(4096 * 100); | ||||
|   | ||||
| @@ -40,6 +40,8 @@ pub(crate) enum TypedChunk { | ||||
|     FieldIdFacetStringDocids(grenad::Reader<File>), | ||||
|     FieldIdFacetNumberDocids(grenad::Reader<File>), | ||||
|     FieldIdFacetExistsDocids(grenad::Reader<File>), | ||||
|     FieldIdFacetIsNullDocids(grenad::Reader<File>), | ||||
|     FieldIdFacetIsEmptyDocids(grenad::Reader<File>), | ||||
|     GeoPoints(grenad::Reader<File>), | ||||
|     ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>), | ||||
| } | ||||
| @@ -173,6 +175,28 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::FieldIdFacetIsNullDocids(facet_id_is_null_docids) => { | ||||
|             append_entries_into_database( | ||||
|                 facet_id_is_null_docids, | ||||
|                 &index.facet_id_is_null_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::FieldIdFacetIsEmptyDocids(facet_id_is_empty_docids) => { | ||||
|             append_entries_into_database( | ||||
|                 facet_id_is_empty_docids, | ||||
|                 &index.facet_id_is_empty_docids, | ||||
|                 wtxn, | ||||
|                 index_is_empty, | ||||
|                 |value, _buffer| Ok(value), | ||||
|                 merge_cbo_roaring_bitmaps, | ||||
|             )?; | ||||
|             is_merged_database = true; | ||||
|         } | ||||
|         TypedChunk::WordPairProximityDocids(word_pair_proximity_docids_iter) => { | ||||
|             append_entries_into_database( | ||||
|                 word_pair_proximity_docids_iter, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user