Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-31 07:56:28 +00:00)

Merge #240

240: Field distribution r=Kerollmops a=irevoire

closes #199
closes #198

Co-authored-by: Tamo <tamo@meilisearch.com>

milli/src/index.rs

```diff
@@ -26,7 +26,7 @@ pub mod main_key {
     pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
     pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
     pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
-    pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution";
+    pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution";
     pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
     pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
     pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
@@ -290,28 +290,28 @@ impl Index {
             .unwrap_or_default())
     }
 
-    /* fields distribution */
+    /* field distribution */
 
-    /// Writes the fields distribution which associates every field name with
+    /// Writes the field distribution which associates every field name with
     /// the number of times it occurs in the documents.
-    pub(crate) fn put_fields_distribution(
+    pub(crate) fn put_field_distribution(
         &self,
         wtxn: &mut RwTxn,
         distribution: &FieldsDistribution,
     ) -> heed::Result<()> {
         self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(
             wtxn,
-            main_key::FIELDS_DISTRIBUTION_KEY,
+            main_key::FIELD_DISTRIBUTION_KEY,
             distribution,
         )
     }
 
-    /// Returns the fields distribution which associates every field name with
+    /// Returns the field distribution which associates every field name with
     /// the number of times it occurs in the documents.
-    pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
+    pub fn field_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
         Ok(self
             .main
-            .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)?
+            .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELD_DISTRIBUTION_KEY)?
             .unwrap_or_default())
     }
 
@@ -791,7 +791,7 @@ pub(crate) mod tests {
     use std::ops::Deref;
 
     use heed::EnvOpenOptions;
-    use maplit::hashmap;
+    use maplit::btreemap;
     use tempfile::TempDir;
 
     use crate::update::{IndexDocuments, UpdateFormat};
@@ -823,7 +823,7 @@ pub(crate) mod tests {
     }
 
     #[test]
-    fn initial_fields_distribution() {
+    fn initial_field_distribution() {
         let path = tempfile::tempdir().unwrap();
         let mut options = EnvOpenOptions::new();
         options.map_size(10 * 1024 * 1024); // 10 MB
@@ -842,14 +842,57 @@
 
         let rtxn = index.read_txn().unwrap();
 
-        let fields_distribution = index.fields_distribution(&rtxn).unwrap();
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
         assert_eq!(
-            fields_distribution,
-            hashmap! {
+            field_distribution,
+            btreemap! {
                 "id".to_string() => 2,
                 "name".to_string() => 2,
                 "age".to_string() => 1,
             }
         );
+
+        // we add all the documents a second time. we are supposed to get the same
+        // field_distribution in the end
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "age".to_string() => 1,
+            }
+        );
+
+        // then we update a document by removing one field and another by adding one field
+        let content = &br#"[
+            { "id": 1, "name": "kevin", "has_dog": true },
+            { "id": 2, "name": "bob" }
+        ]"#[..];
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "has_dog".to_string() => 1,
+            }
+        );
     }
 }
```

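The rename keeps the documented contract: the field distribution maps every field name to the number of times it occurs across the documents of the index, and the extended test checks both that re-indexing the same documents leaves the counts untouched and that replacing documents updates them. As a minimal standalone sketch of that counting (assuming documents shaped like the ones the test indexes; milli itself maintains the map incrementally rather than recomputing it from scratch):

```rust
use std::collections::BTreeMap;

use serde_json::{json, Value};

/// Counts, for every field name, in how many documents it occurs.
/// Since JSON object keys are unique, this equals the total number of
/// occurrences that the doc comments above describe.
fn field_distribution(documents: &[Value]) -> BTreeMap<String, u64> {
    let mut distribution = BTreeMap::new();
    for document in documents {
        if let Value::Object(fields) = document {
            for field_name in fields.keys() {
                *distribution.entry(field_name.clone()).or_default() += 1;
            }
        }
    }
    distribution
}

fn main() {
    // Hypothetical documents matching the counts the test expects.
    let documents = vec![
        json!({ "id": 1, "name": "kevin", "age": 23 }),
        json!({ "id": 2, "name": "bob" }),
    ];
    // Prints {"age": 1, "id": 2, "name": 2}.
    println!("{:?}", field_distribution(&documents));
}
```
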
milli/src/lib.rs

```diff
@@ -14,7 +14,7 @@ pub mod tree_level;
 pub mod update;
 
 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::hash::BuildHasherDefault;
 use std::result::Result as StdResult;
 
@@ -22,7 +22,9 @@ use fxhash::{FxHasher32, FxHasher64};
 use serde_json::{Map, Value};
 
 pub use self::criterion::{default_criteria, Criterion};
-pub use self::error::{Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError};
+pub use self::error::{
+    Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
+};
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{
@@ -48,7 +50,7 @@ pub type Attribute = u32;
 pub type DocumentId = u32;
 pub type FieldId = u8;
 pub type Position = u32;
-pub type FieldsDistribution = HashMap<String, u64>;
+pub type FieldsDistribution = BTreeMap<String, u64>;
 
 type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
```

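Switching `FieldsDistribution` from `HashMap` to `BTreeMap` is also what lets the test compare against a `btreemap!` literal. A plausible motivation, not spelled out in the commit message, is determinism: a `BTreeMap` iterates, and therefore serializes and displays, its keys in sorted order on every run, which a `HashMap` does not guarantee. A small sketch of that property:

```rust
use std::collections::BTreeMap;

fn main() {
    // Insertion order is deliberately unsorted.
    let mut distribution: BTreeMap<String, u64> = BTreeMap::new();
    distribution.insert("name".to_string(), 2);
    distribution.insert("id".to_string(), 2);
    distribution.insert("age".to_string(), 1);

    // Iteration (and thus any serialization that walks the map) is always
    // in sorted key order, run after run.
    let keys: Vec<&str> = distribution.keys().map(String::as_str).collect();
    assert_eq!(keys, ["age", "id", "name"]);
}
```
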
milli/src/update/clear_documents.rs

```diff
@@ -47,7 +47,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
         self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
-        self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?;
+        self.index.put_field_distribution(self.wtxn, &FieldsDistribution::default())?;
 
         // We clean all the faceted documents ids.
         let empty = RoaringBitmap::default();
@@ -113,7 +113,7 @@ mod tests {
         assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
         assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
         assert!(index.documents_ids(&rtxn).unwrap().is_empty());
-        assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
+        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
 
         assert!(index.word_docids.is_empty(&rtxn).unwrap());
         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
```

milli/src/update/delete_documents.rs

```diff
@@ -1,4 +1,4 @@
-use std::collections::hash_map::Entry;
+use std::collections::btree_map::Entry;
 use std::collections::HashMap;
 
 use chrono::Utc;
@@ -147,7 +147,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
         }
 
-        let mut fields_distribution = self.index.fields_distribution(self.wtxn)?;
+        let mut field_distribution = self.index.field_distribution(self.wtxn)?;
 
         // We use pre-calculated number of fields occurrences that needs to be deleted
         // to reflect deleted documents.
@@ -155,7 +155,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         // Otherwise, insert new number of occurrences (current_count - count_diff).
         for (field_id, count_diff) in fields_ids_distribution_diff {
             let field_name = fields_ids_map.name(field_id).unwrap();
-            if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) {
+            if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) {
                 match entry.get().checked_sub(count_diff) {
                     Some(0) | None => entry.remove(),
                     Some(count) => entry.insert(count),
@@ -163,7 +163,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
         }
 
-        self.index.put_fields_distribution(self.wtxn, &fields_distribution)?;
+        self.index.put_field_distribution(self.wtxn, &field_distribution)?;
 
         // We create the FST map of the external ids that we must delete.
         external_ids.sort_unstable();
@@ -479,7 +479,7 @@ mod tests {
 
         let rtxn = index.read_txn().unwrap();
 
-        assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
+        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
     }
 
     #[test]
```

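The deletion path decrements each field's count by a pre-calculated diff and drops the entry once the count reaches zero; `checked_sub` guards the subtraction so that an inconsistent stored count can never underflow. The same pattern extracted into a standalone sketch (the `decrement` helper is illustrative, not part of milli's API):

```rust
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;

/// Decrements `field`'s count by `count_diff`, removing the entry when no
/// occurrence remains, mirroring the `Entry::Occupied` arm above.
fn decrement(distribution: &mut BTreeMap<String, u64>, field: &str, count_diff: u64) {
    if let Entry::Occupied(mut entry) = distribution.entry(field.to_string()) {
        match entry.get().checked_sub(count_diff) {
            // Zero occurrences left, or an underflow that would mean the
            // stored count was already wrong: remove the entry entirely.
            Some(0) | None => {
                entry.remove();
            }
            Some(count) => {
                entry.insert(count);
            }
        }
    }
}

fn main() {
    let mut distribution =
        BTreeMap::from([("age".to_string(), 1u64), ("id".to_string(), 2u64)]);
    decrement(&mut distribution, "age", 1); // last occurrence: entry removed
    decrement(&mut distribution, "id", 1); // one occurrence remains
    assert_eq!(distribution, BTreeMap::from([("id".to_string(), 1)]));
}
```
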
milli/src/update/index_documents/mod.rs

```diff
@@ -378,7 +378,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         let TransformOutput {
             primary_key,
             fields_ids_map,
-            fields_distribution,
+            field_distribution,
             external_documents_ids,
             new_documents_ids,
             replaced_documents_ids,
@@ -594,8 +594,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         // We write the fields ids map into the main database
         self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;
 
-        // We write the fields distribution into the main database
-        self.index.put_fields_distribution(self.wtxn, &fields_distribution)?;
+        // We write the field distribution into the main database
+        self.index.put_field_distribution(self.wtxn, &field_distribution)?;
 
         // We write the primary key field id into the main database
         self.index.put_primary_key(self.wtxn, &primary_key)?;
```

milli/src/update/index_documents/transform.rs

```diff
@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::btree_map::Entry;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::iter::Peekable;
@@ -25,7 +26,7 @@ const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
 pub struct TransformOutput {
     pub primary_key: String,
     pub fields_ids_map: FieldsIdsMap,
-    pub fields_distribution: FieldsDistribution,
+    pub field_distribution: FieldsDistribution,
     pub external_documents_ids: ExternalDocumentsIds<'static>,
     pub new_documents_ids: RoaringBitmap,
     pub replaced_documents_ids: RoaringBitmap,
@@ -127,7 +128,7 @@ impl Transform<'_, '_> {
             return Ok(TransformOutput {
                 primary_key,
                 fields_ids_map,
-                fields_distribution: self.index.fields_distribution(self.rtxn)?,
+                field_distribution: self.index.field_distribution(self.rtxn)?,
                 external_documents_ids: ExternalDocumentsIds::default(),
                 new_documents_ids: RoaringBitmap::new(),
                 replaced_documents_ids: RoaringBitmap::new(),
@@ -385,7 +386,7 @@ impl Transform<'_, '_> {
         Error: From<E>,
     {
         let documents_ids = self.index.documents_ids(self.rtxn)?;
-        let mut fields_distribution = self.index.fields_distribution(self.rtxn)?;
+        let mut field_distribution = self.index.field_distribution(self.rtxn)?;
         let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
 
         // Once we have sort and deduplicated the documents we write them into a final file.
@@ -419,18 +420,32 @@ impl Transform<'_, '_> {
                     // we use it and insert it in the list of replaced documents.
                     replaced_documents_ids.insert(docid);
 
+                    let key = BEU32::new(docid);
+                    let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
+                        InternalError::DatabaseMissingEntry {
+                            db_name: db_name::DOCUMENTS,
+                            key: None,
+                        },
+                    )?;
+
+                    // we remove all the fields that were already counted
+                    for (field_id, _) in base_obkv.iter() {
+                        let field_name = fields_ids_map.name(field_id).unwrap();
+                        if let Entry::Occupied(mut entry) =
+                            field_distribution.entry(field_name.to_string())
+                        {
+                            match entry.get().checked_sub(1) {
+                                Some(0) | None => entry.remove(),
+                                Some(count) => entry.insert(count),
+                            };
+                        }
+                    }
+
                     // Depending on the update indexing method we will merge
                     // the document update with the current document or not.
                     match self.index_documents_method {
                         IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
                         IndexDocumentsMethod::UpdateDocuments => {
-                            let key = BEU32::new(docid);
-                            let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
-                                InternalError::DatabaseMissingEntry {
-                                    db_name: db_name::DOCUMENTS,
-                                    key: None,
-                                },
-                            )?;
                             let update_obkv = obkv::KvReader::new(update_obkv);
                             merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
                             (docid, obkv_buffer.as_slice())
@@ -455,7 +470,7 @@ impl Transform<'_, '_> {
             let reader = obkv::KvReader::new(obkv);
             for (field_id, _) in reader.iter() {
                 let field_name = fields_ids_map.name(field_id).unwrap();
-                *fields_distribution.entry(field_name.to_string()).or_default() += 1;
+                *field_distribution.entry(field_name.to_string()).or_default() += 1;
             }
         }
 
@@ -485,7 +500,7 @@ impl Transform<'_, '_> {
         Ok(TransformOutput {
             primary_key,
             fields_ids_map,
-            fields_distribution,
+            field_distribution,
             external_documents_ids: external_documents_ids.into_static(),
             new_documents_ids,
             replaced_documents_ids,
@@ -503,7 +518,7 @@ impl Transform<'_, '_> {
         old_fields_ids_map: FieldsIdsMap,
         new_fields_ids_map: FieldsIdsMap,
     ) -> Result<TransformOutput> {
-        let fields_distribution = self.index.fields_distribution(self.rtxn)?;
+        let field_distribution = self.index.field_distribution(self.rtxn)?;
         let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
         let documents_ids = self.index.documents_ids(self.rtxn)?;
         let documents_count = documents_ids.len() as usize;
@@ -540,7 +555,7 @@ impl Transform<'_, '_> {
         Ok(TransformOutput {
             primary_key,
             fields_ids_map: new_fields_ids_map,
-            fields_distribution,
+            field_distribution,
             external_documents_ids: external_documents_ids.into_static(),
             new_documents_ids: documents_ids,
             replaced_documents_ids: RoaringBitmap::default(),
```

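The core behavioral fix sits in `Transform`: when an incoming document replaces an existing one, the fields of the stored base version are now un-counted up front, and the later loop counts the fields of whichever version ends up being written, so neither `ReplaceDocuments` nor `UpdateDocuments` counts a document twice. This is what makes the second and third assertions of `initial_field_distribution` pass. A standalone sketch of the two passes (the `replace_document` helper and its string field lists are illustrative; milli operates on obkv readers and field ids instead):

```rust
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;

type FieldsDistribution = BTreeMap<String, u64>;

/// Applies a document replacement to the distribution: pass 1 un-counts the
/// fields of the stored version, pass 2 counts the fields of the final version.
fn replace_document(
    distribution: &mut FieldsDistribution,
    old_fields: &[&str],
    new_fields: &[&str],
) {
    // Pass 1: the old version disappears, so each of its fields loses one count.
    for field in old_fields {
        if let Entry::Occupied(mut entry) = distribution.entry((*field).to_string()) {
            match entry.get().checked_sub(1) {
                Some(0) | None => {
                    entry.remove();
                }
                Some(count) => {
                    entry.insert(count);
                }
            }
        }
    }
    // Pass 2: count the fields of the document as it will actually be stored.
    for field in new_fields {
        *distribution.entry((*field).to_string()).or_default() += 1;
    }
}

fn main() {
    // Distribution after indexing { id, name, age } and { id, name }, as in the test.
    let mut distribution = FieldsDistribution::from([
        ("id".to_string(), 2),
        ("name".to_string(), 2),
        ("age".to_string(), 1),
    ]);
    // { "id": 1, "name": "kevin", "age": 23 } becomes
    // { "id": 1, "name": "kevin", "has_dog": true }.
    replace_document(&mut distribution, &["id", "name", "age"], &["id", "name", "has_dog"]);
    assert_eq!(
        distribution,
        FieldsDistribution::from([
            ("id".to_string(), 2),
            ("name".to_string(), 2),
            ("has_dog".to_string(), 1),
        ])
    );
}
```
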