Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-10-30 23:46:28 +00:00)

	feat(index): update fields distribution in clear & delete operations
- fixes after review
- bump the version of the tokenizer
- implement a first version of the stop_words:
  - the front must provide a BTreeSet containing the stop words
  - the stop_words are set to None if an empty set is provided
- add the stop-words in the http-ui interface
- use maplit in the tests and remove all the useless drop(rtxn) at the end of all tests
- integrate the stop_words in the query tree: remove the stop_words from the query tree except if the word was a prefix or a typo
- more fixes after review
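The stop-words rule above is easy to state in code: the caller always supplies a BTreeSet, and an empty set is stored as None so later stages can skip the check entirely. A minimal sketch of that normalization (the helper name is invented for illustration and is not part of this diff):

    use std::collections::BTreeSet;

    // Hypothetical helper illustrating the rule from the commit message:
    // an empty set provided by the front means "no stop words" (None).
    fn normalize_stop_words(words: BTreeSet<String>) -> Option<BTreeSet<String>> {
        if words.is_empty() { None } else { Some(words) }
    }

    fn main() {
        assert_eq!(normalize_stop_words(BTreeSet::new()), None);

        let mut words = BTreeSet::new();
        words.insert("the".to_string());
        assert!(normalize_stop_words(words).is_some());
    }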
Cargo.lock (generated, 6 lines changed):

@@ -1520,8 +1520,7 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
 [[package]]
 name = "pest"
 version = "2.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
+source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
 dependencies = [
  "ucd-trie",
 ]
@@ -1529,7 +1528,8 @@ dependencies = [
 [[package]]
 name = "pest"
 version = "2.1.3"
-source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
 dependencies = [
  "ucd-trie",
 ]

ExternalDocumentsIds:

@@ -19,6 +19,11 @@ impl<'a> ExternalDocumentsIds<'a> {
         }
     }
 
+    /// Returns `true` if hard and soft external documents lists are empty.
+    pub fn is_empty(&self) -> bool {
+        self.hard.is_empty() && self.soft.is_empty()
+    }
+
     pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
         let external_id = external_id.as_ref();
         match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
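The new helper reports empty only when both layers are, consistent with get, which consults the soft layer before falling back to the hard one. A rough standalone sketch of the two-layer check using plain fst sets (the real struct wraps fst maps from external id to internal id; this construction is illustrative only):

    use fst::Set;

    // Illustrative two-layer emptiness check: a compacted "hard" layer plus
    // a recently-updated "soft" layer count as empty only if both are.
    fn is_empty(hard: &Set<Vec<u8>>, soft: &Set<Vec<u8>>) -> bool {
        hard.is_empty() && soft.is_empty()
    }

    fn main() -> Result<(), fst::Error> {
        let hard = Set::from_iter(vec!["doc-1"])?;
        let soft = Set::default();
        assert!(!is_empty(&hard, &soft));
        assert!(is_empty(&Set::default(), &Set::default()));
        Ok(())
    }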

Index:

@@ -10,7 +10,7 @@ use chrono::{Utc, DateTime};
 
 use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
-use crate::{default_criteria, Criterion, Search, FacetDistribution};
+use crate::{default_criteria, Criterion, Search, FacetDistribution, FieldsDistribution};
 use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
 use crate::{
     RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec,
@@ -34,8 +34,6 @@ pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
 const CREATED_AT_KEY: &str = "created-at";
 const UPDATED_AT_KEY: &str = "updated-at";
 
-pub type FieldsDistribution = HashMap<String, u64>;
-
 #[derive(Clone)]
 pub struct Index {
     /// The LMDB environment which this index is associated with.
@@ -209,14 +207,14 @@ impl Index {
 
     /* fields distribution */
 
-    /// Writes the fields distribution which associate the field with the number of times
-    /// it occurs in the obkv documents.
+    /// Writes the fields distribution which associates every field name with
+    /// the number of times it occurs in the documents.
     pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> {
-        self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, FIELDS_DISTRIBUTION_KEY, &distribution)
+        self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, FIELDS_DISTRIBUTION_KEY, distribution)
     }
 
-    /// Returns the fields distribution which associate the field with the number of times
-    /// it occurs in the obkv documents.
+    /// Returns the fields distribution which associates every field name with
+    /// the number of times it occurs in the documents.
     pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
         Ok(self.main.get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, FIELDS_DISTRIBUTION_KEY)?.unwrap_or_default())
     }
@@ -472,35 +470,29 @@ mod tests {
     use crate::Index;
     use crate::update::{IndexDocuments, UpdateFormat};
 
-    fn prepare_index() -> Index {
+    #[test]
+    fn initial_fields_distribution() {
         let path = tempfile::tempdir().unwrap();
         let mut options = EnvOpenOptions::new();
         options.map_size(10 * 1024 * 1024); // 10 MB
         let index = Index::new(options, &path).unwrap();
 
         let mut wtxn = index.write_txn().unwrap();
-        let content = &br#"
-        { "name": "kevin" }
-        { "name": "bob", "age": 20 }
-        "#[..];
+        let content = &br#"[
+            { "name": "kevin" },
+            { "name": "bob", "age": 20 }
+        ]"#[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
-        builder.update_format(UpdateFormat::JsonStream);
+        builder.update_format(UpdateFormat::Json);
         builder.execute(content, |_, _| ()).unwrap();
         wtxn.commit().unwrap();
 
-        index
-    }
-
-    #[test]
-    fn initial_fields_distribution() {
-        let index = prepare_index();
-
         let rtxn = index.read_txn().unwrap();
 
         let fields_distribution = index.fields_distribution(&rtxn).unwrap();
         assert_eq!(fields_distribution, hashmap!{
+            "name".to_string() => 2,
             "age".to_string() => 1,
-            "name".to_string() => 2
         });
     }
 }
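Note that fields_distribution falls back to an empty map when the key has never been written, so callers need no special case for a fresh index. A minimal sketch of reading it back, assuming an Index opened as in the test above:

    // Sketch only: `index` is an already-opened Index from this crate.
    fn print_fields_distribution(index: &crate::Index) -> heed::Result<()> {
        let rtxn = index.read_txn()?;
        // Each entry maps a field name to how many times it occurs
        // across all indexed documents.
        for (field, count) in index.fields_distribution(&rtxn)? {
            println!("field `{}` occurs {} time(s)", field, count);
        }
        Ok(())
    }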

Crate root type aliases:

@@ -41,6 +41,7 @@ pub type Attribute = u32;
 pub type DocumentId = u32;
 pub type FieldId = u8;
 pub type Position = u32;
+pub type FieldsDistribution = HashMap<String, u64>;
 
 type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result<Vec<u8>>;
 
ClearDocuments:

@@ -1,6 +1,6 @@
 use chrono::Utc;
 use roaring::RoaringBitmap;
-use crate::{ExternalDocumentsIds, Index};
+use crate::{ExternalDocumentsIds, Index, FieldsDistribution};
 
 pub struct ClearDocuments<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -42,6 +42,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
         self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
+        self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?;
 
         // We clean all the faceted documents ids.
         for (field_id, _) in faceted_fields {
@@ -61,3 +62,54 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         Ok(number_of_documents)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use heed::EnvOpenOptions;
+
+    use crate::update::{IndexDocuments, UpdateFormat};
+    use super::*;
+
+    #[test]
+    fn clear_documents() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &br#"[
+            { "id": 0, "name": "kevin", "age": 20 },
+            { "id": 1, "name": "kevina" },
+            { "id": 2, "name": "benoit", "country": "France" }
+        ]"#[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+
+        // Clear all documents from the database.
+        let builder = ClearDocuments::new(&mut wtxn, &index, 1);
+        assert_eq!(builder.execute().unwrap(), 3);
+
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 4);
+
+        assert!(index.words_fst(&rtxn).unwrap().is_empty());
+        assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
+        assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
+        assert!(index.documents_ids(&rtxn).unwrap().is_empty());
+        assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
+
+        assert!(index.word_docids.is_empty(&rtxn).unwrap());
+        assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
+        assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
+        assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
+        assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
+        assert!(index.facet_field_id_value_docids.is_empty(&rtxn).unwrap());
+        assert!(index.field_id_docid_facet_values.is_empty(&rtxn).unwrap());
+        assert!(index.documents.is_empty(&rtxn).unwrap());
+    }
+}

DeleteDocuments:

@@ -1,3 +1,6 @@
+use std::collections::HashMap;
+use std::collections::hash_map::Entry;
+
 use anyhow::anyhow;
 use chrono::Utc;
 use fst::IntoStreamer;
@@ -90,6 +93,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             documents,
         } = self.index;
 
+        // Number of times each field occurs in the documents being deleted.
+        let mut fields_ids_distribution_diff = HashMap::new();
+
        // Retrieve the words and the external documents ids contained in the documents.
         let mut words = Vec::new();
         let mut external_ids = Vec::new();
@@ -100,6 +106,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             let key = BEU32::new(docid);
             let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
             if let Some((_key, obkv)) = iter.next().transpose()? {
+                for (field_id, _) in obkv.iter() {
+                    *fields_ids_distribution_diff.entry(field_id).or_default() += 1;
+                }
+
                 if let Some(content) = obkv.get(id_field) {
                     let external_id = match serde_json::from_slice(content).unwrap() {
                         Value::String(string) => SmallString32::from(string.as_str()),
@@ -112,7 +122,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
             drop(iter);
 
-            // We iterate througt the words positions of the document id,
+            // We iterate through the words positions of the document id,
             // retrieve the word and delete the positions.
             let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
             while let Some(result) = iter.next() {
@@ -123,6 +133,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
         }
 
+        let mut fields_distribution = self.index.fields_distribution(self.wtxn)?;
+
+        // Subtract the pre-computed per-field occurrence counts of the
+        // deleted documents from the stored distribution. If every
+        // occurrence of a field is removed, delete its entry; otherwise
+        // store the decremented count (current_count - count_diff).
+        for (field_id, count_diff) in fields_ids_distribution_diff {
+            let field_name = fields_ids_map.name(field_id).unwrap();
+            if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) {
+                match entry.get().checked_sub(count_diff) {
+                    Some(0) | None => entry.remove(),
+                    Some(count) => entry.insert(count)
+                };
+            }
+        }
+
+        self.index.put_fields_distribution(self.wtxn, &fields_distribution)?;
+
         // We create the FST map of the external ids that we must delete.
         external_ids.sort_unstable();
         let external_ids_to_delete = fst::Set::from_iter(external_ids.iter().map(AsRef::as_ref))?;
@@ -347,5 +375,9 @@ mod tests {
         builder.execute().unwrap();
 
         wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
     }
 }
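The decrement logic is the heart of this hunk: checked_sub guards against underflow, and an occupied entry is removed outright once a field's count reaches zero. The same pattern isolated on plain HashMaps (function and variable names invented for illustration):

    use std::collections::HashMap;
    use std::collections::hash_map::Entry;

    // Subtracts per-field counts gathered from deleted documents out of the
    // stored distribution, dropping entries that reach zero (or would underflow).
    fn apply_deletion_diff(distribution: &mut HashMap<String, u64>, diff: &HashMap<String, u64>) {
        for (field, count_diff) in diff {
            if let Entry::Occupied(mut entry) = distribution.entry(field.clone()) {
                match entry.get().checked_sub(*count_diff) {
                    Some(0) | None => { entry.remove(); }
                    Some(count) => { entry.insert(count); }
                }
            }
        }
    }

    fn main() {
        let mut distribution = HashMap::from([
            ("name".to_string(), 3),
            ("age".to_string(), 1),
        ]);
        let diff = HashMap::from([("age".to_string(), 1)]);
        apply_deletion_diff(&mut distribution, &diff);
        assert_eq!(distribution.get("name"), Some(&3));
        assert!(!distribution.contains_key("age"));
    }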

Transform:

@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::HashMap;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::iter::Peekable;
@@ -10,11 +11,10 @@ use log::info;
 use roaring::RoaringBitmap;
 use serde_json::{Map, Value};
 
-use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId};
+use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution};
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use super::merge_function::merge_two_obkvs;
 use super::{create_writer, create_sorter, IndexDocumentsMethod};
-use crate::index::FieldsDistribution;
 
 const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
 
@@ -137,6 +137,8 @@ impl Transform<'_, '_> {
         let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
         let mut documents_count = 0;
 
+        let mut fields_ids_distribution = HashMap::new();
+
         for result in documents {
             let document = result?;
 
@@ -151,9 +153,9 @@ impl Transform<'_, '_> {
 
             // We prepare the fields ids map with the documents keys.
             for (key, _value) in &document {
-                fields_ids_map.insert(&key).context("field id limit reached")?;
+                let field_id = fields_ids_map.insert(&key).context("field id limit reached")?;
 
-                *fields_distribution.entry(key.to_owned()).or_default() += 1;
+                *fields_ids_distribution.entry(field_id).or_insert(0) += 1;
             }
 
             // We retrieve the user id from the document based on the primary key name,
@@ -196,6 +198,11 @@ impl Transform<'_, '_> {
             documents_count += 1;
         }
 
+        for (field_id, count) in fields_ids_distribution {
+            let field_name = fields_ids_map.name(field_id).unwrap();
+            *fields_distribution.entry(field_name.to_string()).or_default() += count;
+        }
+
         progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
             documents_seen: documents_count,
         });
@@ -277,6 +284,8 @@ impl Transform<'_, '_> {
         let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
         let mut documents_count = 0;
 
+        let mut fields_ids_distribution = HashMap::new();
+
         let mut record = csv::StringRecord::new();
         while csv.read_record(&mut record)? {
             obkv_buffer.clear();
@@ -316,9 +325,7 @@ impl Transform<'_, '_> {
                 serde_json::to_writer(&mut json_buffer, &field)?;
                 writer.insert(*field_id, &json_buffer)?;
 
-                let field_name = fields_ids_map.name(*field_id).unwrap();
-
-                *fields_distribution.entry(field_name.to_string()).or_default() += 1;
+                *fields_ids_distribution.entry(*field_id).or_insert(0) += 1;
             }
 
             // We use the extracted/generated user id as the key for this document.
@@ -326,6 +333,11 @@ impl Transform<'_, '_> {
             documents_count += 1;
         }
 
+        for (field_id, count) in fields_ids_distribution {
+            let field_name = fields_ids_map.name(field_id).unwrap();
+            *fields_distribution.entry(field_name.to_string()).or_default() += count;
+        }
+
         progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
             documents_seen: documents_count,
         });
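Both transform paths now count occurrences per numeric FieldId inside the hot document loop and resolve ids to field names only once afterwards, which avoids allocating a String per field per document. A simplified sketch of that two-phase aggregation (the slice of names stands in for FieldsIdsMap):

    use std::collections::HashMap;

    // Phase 1 happens elsewhere: occurrences are counted keyed by the cheap
    // numeric field id. Phase 2, below: resolve each id to its name once and
    // merge into the name-keyed distribution, as the diff does after the loop.
    fn merge_counts(
        field_names: &[&str],                    // stand-in for FieldsIdsMap
        per_id_counts: HashMap<u8, u64>,         // counted during the loop
        distribution: &mut HashMap<String, u64>, // FieldsDistribution
    ) {
        for (field_id, count) in per_id_counts {
            let name = field_names[field_id as usize];
            *distribution.entry(name.to_string()).or_default() += count;
        }
    }

    fn main() {
        let names = ["id", "name", "age"];
        let counts = HashMap::from([(1u8, 2), (2u8, 1)]);
        let mut distribution = HashMap::new();
        merge_counts(&names, counts, &mut distribution);
        assert_eq!(distribution.get("name"), Some(&2));
        assert_eq!(distribution.get("age"), Some(&1));
    }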