Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-31 07:56:28 +00:00)

Merge #240

240: Field distribution r=Kerollmops a=irevoire

closes #199
closes #198

Co-authored-by: Tamo <tamo@meilisearch.com>

milli/src/index.rs

```diff
@@ -26,7 +26,7 @@ pub mod main_key {
     pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
     pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
     pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
-    pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution";
+    pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution";
     pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
     pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
     pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
@@ -290,28 +290,28 @@ impl Index {
             .unwrap_or_default())
     }
 
-    /* fields distribution */
+    /* field distribution */
 
-    /// Writes the fields distribution which associates every field name with
+    /// Writes the field distribution which associates every field name with
     /// the number of times it occurs in the documents.
-    pub(crate) fn put_fields_distribution(
+    pub(crate) fn put_field_distribution(
         &self,
         wtxn: &mut RwTxn,
         distribution: &FieldsDistribution,
     ) -> heed::Result<()> {
         self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(
             wtxn,
-            main_key::FIELDS_DISTRIBUTION_KEY,
+            main_key::FIELD_DISTRIBUTION_KEY,
             distribution,
         )
     }
 
-    /// Returns the fields distribution which associates every field name with
+    /// Returns the field distribution which associates every field name with
     /// the number of times it occurs in the documents.
-    pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
+    pub fn field_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
         Ok(self
             .main
-            .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)?
+            .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELD_DISTRIBUTION_KEY)?
             .unwrap_or_default())
     }
 
@@ -791,7 +791,7 @@ pub(crate) mod tests {
     use std::ops::Deref;
 
     use heed::EnvOpenOptions;
-    use maplit::hashmap;
+    use maplit::btreemap;
     use tempfile::TempDir;
 
     use crate::update::{IndexDocuments, UpdateFormat};
@@ -823,7 +823,7 @@ pub(crate) mod tests {
     }
 
     #[test]
-    fn initial_fields_distribution() {
+    fn initial_field_distribution() {
         let path = tempfile::tempdir().unwrap();
         let mut options = EnvOpenOptions::new();
         options.map_size(10 * 1024 * 1024); // 10 MB
@@ -842,14 +842,57 @@
 
         let rtxn = index.read_txn().unwrap();
 
-        let fields_distribution = index.fields_distribution(&rtxn).unwrap();
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
         assert_eq!(
-            fields_distribution,
-            hashmap! {
+            field_distribution,
+            btreemap! {
                 "id".to_string() => 2,
                 "name".to_string() => 2,
                 "age".to_string() => 1,
             }
         );
+
+        // we add all the documents a second time. we are supposed to get the same
+        // field_distribution in the end
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "age".to_string() => 1,
+            }
+        );
+
+        // then we update a document by removing one field and another by adding one field
+        let content = &br#"[
+            { "id": 1, "name": "kevin", "has_dog": true },
+            { "id": 2, "name": "bob" }
+        ]"#[..];
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "has_dog".to_string() => 1,
+            }
+        );
     }
 }
```

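The rename keeps the documented contract: the field distribution maps every field name to the number of times it occurs across the documents of the index, and the extended test checks both that re-indexing the same documents leaves the counts untouched and that replacing documents updates them. As a minimal standalone sketch of that counting (assuming documents shaped like the ones the test indexes; milli itself maintains the map incrementally rather than recomputing it from scratch):

```rust
use std::collections::BTreeMap;

use serde_json::{json, Value};

/// Counts, for every field name, in how many documents it occurs.
/// Since JSON object keys are unique, this equals the total number of
/// occurrences that the doc comments above describe.
fn field_distribution(documents: &[Value]) -> BTreeMap<String, u64> {
    let mut distribution = BTreeMap::new();
    for document in documents {
        if let Value::Object(fields) = document {
            for field_name in fields.keys() {
                *distribution.entry(field_name.clone()).or_default() += 1;
            }
        }
    }
    distribution
}

fn main() {
    // Hypothetical documents matching the counts the test expects.
    let documents = vec![
        json!({ "id": 1, "name": "kevin", "age": 23 }),
        json!({ "id": 2, "name": "bob" }),
    ];
    // Prints {"age": 1, "id": 2, "name": 2}.
    println!("{:?}", field_distribution(&documents));
}
```
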
milli/src/lib.rs

```diff
@@ -14,7 +14,7 @@ pub mod tree_level;
 pub mod update;
 
 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::hash::BuildHasherDefault;
 use std::result::Result as StdResult;
 
@@ -22,7 +22,9 @@ use fxhash::{FxHasher32, FxHasher64};
 use serde_json::{Map, Value};
 
 pub use self::criterion::{default_criteria, Criterion};
-pub use self::error::{Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError};
+pub use self::error::{
+    Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
+};
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{
@@ -48,7 +50,7 @@ pub type Attribute = u32;
 pub type DocumentId = u32;
 pub type FieldId = u8;
 pub type Position = u32;
-pub type FieldsDistribution = HashMap<String, u64>;
+pub type FieldsDistribution = BTreeMap<String, u64>;
 
 type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
```

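Switching `FieldsDistribution` from `HashMap` to `BTreeMap` is also what lets the test compare against a `btreemap!` literal. A plausible motivation, not spelled out in the commit message, is determinism: a `BTreeMap` iterates, and therefore serializes and displays, its keys in sorted order on every run, which a `HashMap` does not guarantee. A small sketch of that property:

```rust
use std::collections::BTreeMap;

fn main() {
    // Insertion order is deliberately unsorted.
    let mut distribution: BTreeMap<String, u64> = BTreeMap::new();
    distribution.insert("name".to_string(), 2);
    distribution.insert("id".to_string(), 2);
    distribution.insert("age".to_string(), 1);

    // Iteration (and thus any serialization that walks the map) is always
    // in sorted key order, run after run.
    let keys: Vec<&str> = distribution.keys().map(String::as_str).collect();
    assert_eq!(keys, ["age", "id", "name"]);
}
```
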
milli/src/update/clear_documents.rs

```diff
@@ -47,7 +47,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
         self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
-        self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?;
+        self.index.put_field_distribution(self.wtxn, &FieldsDistribution::default())?;
 
         // We clean all the faceted documents ids.
         let empty = RoaringBitmap::default();
@@ -113,7 +113,7 @@ mod tests {
         assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
         assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
         assert!(index.documents_ids(&rtxn).unwrap().is_empty());
-        assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
+        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
 
         assert!(index.word_docids.is_empty(&rtxn).unwrap());
         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
```

milli/src/update/delete_documents.rs

```diff
@@ -1,4 +1,4 @@
-use std::collections::hash_map::Entry;
+use std::collections::btree_map::Entry;
 use std::collections::HashMap;
 
 use chrono::Utc;
@@ -147,7 +147,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
         }
 
-        let mut fields_distribution = self.index.fields_distribution(self.wtxn)?;
+        let mut field_distribution = self.index.field_distribution(self.wtxn)?;
 
         // We use pre-calculated number of fields occurrences that needs to be deleted
         // to reflect deleted documents.
@@ -155,7 +155,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         // Otherwise, insert new number of occurrences (current_count - count_diff).
         for (field_id, count_diff) in fields_ids_distribution_diff {
             let field_name = fields_ids_map.name(field_id).unwrap();
-            if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) {
+            if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) {
                 match entry.get().checked_sub(count_diff) {
                     Some(0) | None => entry.remove(),
                     Some(count) => entry.insert(count),
@@ -163,7 +163,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
         }
 
-        self.index.put_fields_distribution(self.wtxn, &fields_distribution)?;
+        self.index.put_field_distribution(self.wtxn, &field_distribution)?;
 
         // We create the FST map of the external ids that we must delete.
         external_ids.sort_unstable();
@@ -479,7 +479,7 @@ mod tests {
 
         let rtxn = index.read_txn().unwrap();
 
-        assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
+        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
     }
 
     #[test]
```

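The deletion path decrements each field's count by a pre-calculated diff and drops the entry once the count reaches zero; `checked_sub` guards the subtraction so that an inconsistent stored count can never underflow. The same pattern extracted into a standalone sketch (the `decrement` helper is illustrative, not part of milli's API):

```rust
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;

/// Decrements `field`'s count by `count_diff`, removing the entry when no
/// occurrence remains, mirroring the `Entry::Occupied` arm above.
fn decrement(distribution: &mut BTreeMap<String, u64>, field: &str, count_diff: u64) {
    if let Entry::Occupied(mut entry) = distribution.entry(field.to_string()) {
        match entry.get().checked_sub(count_diff) {
            // Zero occurrences left, or an underflow that would mean the
            // stored count was already wrong: remove the entry entirely.
            Some(0) | None => {
                entry.remove();
            }
            Some(count) => {
                entry.insert(count);
            }
        }
    }
}

fn main() {
    let mut distribution =
        BTreeMap::from([("age".to_string(), 1u64), ("id".to_string(), 2u64)]);
    decrement(&mut distribution, "age", 1); // last occurrence: entry removed
    decrement(&mut distribution, "id", 1); // one occurrence remains
    assert_eq!(distribution, BTreeMap::from([("id".to_string(), 1)]));
}
```
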
milli/src/update/index_documents/mod.rs

```diff
@@ -378,7 +378,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         let TransformOutput {
             primary_key,
             fields_ids_map,
-            fields_distribution,
+            field_distribution,
             external_documents_ids,
             new_documents_ids,
             replaced_documents_ids,
@@ -594,8 +594,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         // We write the fields ids map into the main database
         self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
 
-        // We write the fields distribution into the main database
-        self.index.put_fields_distribution(self.wtxn, &fields_distribution)?;
+        // We write the field distribution into the main database
+        self.index.put_field_distribution(self.wtxn, &field_distribution)?;
 
         // We write the primary key field id into the main database
         self.index.put_primary_key(self.wtxn, &primary_key)?;
```

milli/src/update/index_documents/transform.rs

```diff
@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::btree_map::Entry;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::iter::Peekable;
@@ -25,7 +26,7 @@ const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
 pub struct TransformOutput {
     pub primary_key: String,
     pub fields_ids_map: FieldsIdsMap,
-    pub fields_distribution: FieldsDistribution,
+    pub field_distribution: FieldsDistribution,
     pub external_documents_ids: ExternalDocumentsIds<'static>,
     pub new_documents_ids: RoaringBitmap,
     pub replaced_documents_ids: RoaringBitmap,
@@ -127,7 +128,7 @@ impl Transform<'_, '_> {
             return Ok(TransformOutput {
                 primary_key,
                 fields_ids_map,
-                fields_distribution: self.index.fields_distribution(self.rtxn)?,
+                field_distribution: self.index.field_distribution(self.rtxn)?,
                 external_documents_ids: ExternalDocumentsIds::default(),
                 new_documents_ids: RoaringBitmap::new(),
                 replaced_documents_ids: RoaringBitmap::new(),
@@ -385,7 +386,7 @@ impl Transform<'_, '_> {
         Error: From<E>,
     {
         let documents_ids = self.index.documents_ids(self.rtxn)?;
-        let mut fields_distribution = self.index.fields_distribution(self.rtxn)?;
+        let mut field_distribution = self.index.field_distribution(self.rtxn)?;
         let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
 
         // Once we have sort and deduplicated the documents we write them into a final file.
@@ -419,18 +420,32 @@ impl Transform<'_, '_> {
                     // we use it and insert it in the list of replaced documents.
                     replaced_documents_ids.insert(docid);
 
+                    let key = BEU32::new(docid);
+                    let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
+                        InternalError::DatabaseMissingEntry {
+                            db_name: db_name::DOCUMENTS,
+                            key: None,
+                        },
+                    )?;
+
+                    // we remove all the fields that were already counted
+                    for (field_id, _) in base_obkv.iter() {
+                        let field_name = fields_ids_map.name(field_id).unwrap();
+                        if let Entry::Occupied(mut entry) =
+                            field_distribution.entry(field_name.to_string())
+                        {
+                            match entry.get().checked_sub(1) {
+                                Some(0) | None => entry.remove(),
+                                Some(count) => entry.insert(count),
+                            };
+                        }
+                    }
+
                     // Depending on the update indexing method we will merge
                     // the document update with the current document or not.
                     match self.index_documents_method {
                         IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
                         IndexDocumentsMethod::UpdateDocuments => {
-                            let key = BEU32::new(docid);
-                            let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
-                                InternalError::DatabaseMissingEntry {
-                                    db_name: db_name::DOCUMENTS,
-                                    key: None,
-                                },
-                            )?;
                             let update_obkv = obkv::KvReader::new(update_obkv);
                             merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
                             (docid, obkv_buffer.as_slice())
@@ -455,7 +470,7 @@ impl Transform<'_, '_> {
             let reader = obkv::KvReader::new(obkv);
             for (field_id, _) in reader.iter() {
                 let field_name = fields_ids_map.name(field_id).unwrap();
-                *fields_distribution.entry(field_name.to_string()).or_default() += 1;
+                *field_distribution.entry(field_name.to_string()).or_default() += 1;
             }
         }
 
@@ -485,7 +500,7 @@ impl Transform<'_, '_> {
         Ok(TransformOutput {
             primary_key,
             fields_ids_map,
-            fields_distribution,
+            field_distribution,
             external_documents_ids: external_documents_ids.into_static(),
             new_documents_ids,
             replaced_documents_ids,
@@ -503,7 +518,7 @@ impl Transform<'_, '_> {
         old_fields_ids_map: FieldsIdsMap,
         new_fields_ids_map: FieldsIdsMap,
     ) -> Result<TransformOutput> {
-        let fields_distribution = self.index.fields_distribution(self.rtxn)?;
+        let field_distribution = self.index.field_distribution(self.rtxn)?;
         let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
         let documents_ids = self.index.documents_ids(self.rtxn)?;
         let documents_count = documents_ids.len() as usize;
@@ -540,7 +555,7 @@ impl Transform<'_, '_> {
         Ok(TransformOutput {
             primary_key,
             fields_ids_map: new_fields_ids_map,
-            fields_distribution,
+            field_distribution,
             external_documents_ids: external_documents_ids.into_static(),
             new_documents_ids: documents_ids,
             replaced_documents_ids: RoaringBitmap::default(),
```

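The core behavioral fix sits in `Transform`: when an incoming document replaces an existing one, the fields of the stored base version are now un-counted up front, and the later loop counts the fields of whichever version ends up being written, so neither `ReplaceDocuments` nor `UpdateDocuments` counts a document twice. This is what makes the second and third assertions of `initial_field_distribution` pass. A standalone sketch of the two passes (the `replace_document` helper and its string field lists are illustrative; milli operates on obkv readers and field ids instead):

```rust
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;

type FieldsDistribution = BTreeMap<String, u64>;

/// Applies a document replacement to the distribution: pass 1 un-counts the
/// fields of the stored version, pass 2 counts the fields of the final version.
fn replace_document(
    distribution: &mut FieldsDistribution,
    old_fields: &[&str],
    new_fields: &[&str],
) {
    // Pass 1: the old version disappears, so each of its fields loses one count.
    for field in old_fields {
        if let Entry::Occupied(mut entry) = distribution.entry((*field).to_string()) {
            match entry.get().checked_sub(1) {
                Some(0) | None => {
                    entry.remove();
                }
                Some(count) => {
                    entry.insert(count);
                }
            }
        }
    }
    // Pass 2: count the fields of the document as it will actually be stored.
    for field in new_fields {
        *distribution.entry((*field).to_string()).or_default() += 1;
    }
}

fn main() {
    // Distribution after indexing { id, name, age } and { id, name }, as in the test.
    let mut distribution = FieldsDistribution::from([
        ("id".to_string(), 2),
        ("name".to_string(), 2),
        ("age".to_string(), 1),
    ]);
    // { "id": 1, "name": "kevin", "age": 23 } becomes
    // { "id": 1, "name": "kevin", "has_dog": true }.
    replace_document(&mut distribution, &["id", "name", "age"], &["id", "name", "has_dog"]);
    assert_eq!(
        distribution,
        FieldsDistribution::from([
            ("id".to_string(), 2),
            ("name".to_string(), 2),
            ("has_dog".to_string(), 1),
        ])
    );
}
```
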