mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Merge #607
607: Better threshold r=Kerollmops a=irevoire # Pull Request ## What does this PR do? Fixes #570 This PR tries to improve the threshold used to trigger the real deletion of documents. The deletion is now triggered in two cases: - 10% of the total available space is used by soft deleted documents - 90% of the total available space is used. In this context, « total available space » means the `map_size` of lmdb. And the size used by the soft deleted documents is actually an estimation. We can't determine precisely the size used by one document, so what we do is: take the total space used, divide it by the number of documents + soft deleted documents to estimate the size of one average document. Then multiply the size of one avg document by the number of soft deleted documents. -------- <img width="808" alt="image" src="https://user-images.githubusercontent.com/7032172/185083075-92cf379e-8ae1-4bfc-9ca6-93b54e6ab4e9.png"> Here we can see we have a ~10GB drift in the end between the space used by the soft deleted and the real space used by the documents. Personally I don’t think that's a big issue because once the red line reaches 90GB everything will be freed but now you know. If you have an idea on how to improve this estimation I would love to hear it. It looks like the difference is linear so maybe we could simply multiply the current estimation by two? Co-authored-by: Irevoire <tamo@meilisearch.com>
This commit is contained in:
		| @@ -5,7 +5,7 @@ use std::fs::{create_dir_all, remove_dir_all}; | |||||||
| use std::path::Path; | use std::path::Path; | ||||||
|  |  | ||||||
| use criterion::{criterion_group, criterion_main, Criterion}; | use criterion::{criterion_group, criterion_main, Criterion}; | ||||||
| use heed::{EnvOpenOptions, RwTxn}; | use milli::heed::{EnvOpenOptions, RwTxn}; | ||||||
| use milli::update::{ | use milli::update::{ | ||||||
|     DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, |     DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, | ||||||
| }; | }; | ||||||
|   | |||||||
| @@ -6,8 +6,8 @@ use std::num::ParseFloatError; | |||||||
| use std::path::Path; | use std::path::Path; | ||||||
|  |  | ||||||
| use criterion::BenchmarkId; | use criterion::BenchmarkId; | ||||||
| use heed::EnvOpenOptions; |  | ||||||
| use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||||
|  | use milli::heed::EnvOpenOptions; | ||||||
| use milli::update::{ | use milli::update::{ | ||||||
|     IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, |     IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, | ||||||
| }; | }; | ||||||
|   | |||||||
| @@ -11,7 +11,6 @@ byte-unit = { version = "4.0.14", features = ["serde"] } | |||||||
| color-eyre = "0.6.1" | color-eyre = "0.6.1" | ||||||
| csv = "1.1.6" | csv = "1.1.6" | ||||||
| eyre = "0.6.7" | eyre = "0.6.7" | ||||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } |  | ||||||
| indicatif = "0.16.2" | indicatif = "0.16.2" | ||||||
| milli = { path = "../milli" } | milli = { path = "../milli" } | ||||||
| mimalloc = { version = "0.1.29", default-features = false } | mimalloc = { version = "0.1.29", default-features = false } | ||||||
|   | |||||||
| @@ -13,7 +13,7 @@ use milli::update::UpdateIndexingStep::{ | |||||||
|     ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, |     ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, | ||||||
| }; | }; | ||||||
| use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; | use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; | ||||||
| use milli::{Index, Object}; | use milli::{heed, Index, Object}; | ||||||
| use structopt::StructOpt; | use structopt::StructOpt; | ||||||
|  |  | ||||||
| #[global_allocator] | #[global_allocator] | ||||||
|   | |||||||
| @@ -9,7 +9,6 @@ publish = false | |||||||
| [dependencies] | [dependencies] | ||||||
| anyhow = "1.0.56" | anyhow = "1.0.56" | ||||||
| byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } | byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } | ||||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } |  | ||||||
| milli = { path = "../milli" } | milli = { path = "../milli" } | ||||||
| mimalloc = { version = "0.1.29", default-features = false } | mimalloc = { version = "0.1.29", default-features = false } | ||||||
| stderrlog = "0.5.1" | stderrlog = "0.5.1" | ||||||
|   | |||||||
| @@ -1,7 +1,7 @@ | |||||||
| use std::path::PathBuf; | use std::path::PathBuf; | ||||||
|  |  | ||||||
| use byte_unit::Byte; | use byte_unit::Byte; | ||||||
| use heed::{CompactionOption, Env, EnvOpenOptions}; | use milli::heed::{CompactionOption, Env, EnvOpenOptions}; | ||||||
| use structopt::StructOpt; | use structopt::StructOpt; | ||||||
| use Command::*; | use Command::*; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -10,7 +10,6 @@ publish = false | |||||||
| anyhow = "1.0.56" | anyhow = "1.0.56" | ||||||
| byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } | byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } | ||||||
| crossbeam-channel = "0.5.2" | crossbeam-channel = "0.5.2" | ||||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } |  | ||||||
| memmap2 = "0.5.3" | memmap2 = "0.5.3" | ||||||
| milli = { path = "../milli" } | milli = { path = "../milli" } | ||||||
| mimalloc = { version = "0.1.29", default-features = false } | mimalloc = { version = "0.1.29", default-features = false } | ||||||
|   | |||||||
| @@ -17,8 +17,8 @@ use byte_unit::Byte; | |||||||
| use either::Either; | use either::Either; | ||||||
| use flate2::read::GzDecoder; | use flate2::read::GzDecoder; | ||||||
| use futures::{stream, FutureExt, StreamExt}; | use futures::{stream, FutureExt, StreamExt}; | ||||||
| use heed::EnvOpenOptions; |  | ||||||
| use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; | ||||||
|  | use milli::heed::EnvOpenOptions; | ||||||
| use milli::tokenizer::TokenizerBuilder; | use milli::tokenizer::TokenizerBuilder; | ||||||
| use milli::update::UpdateIndexingStep::*; | use milli::update::UpdateIndexingStep::*; | ||||||
| use milli::update::{ | use milli::update::{ | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ use std::sync::Arc; | |||||||
| use crossbeam_channel::Sender; | use crossbeam_channel::Sender; | ||||||
| use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson}; | use heed::types::{ByteSlice, DecodeIgnore, OwnedType, SerdeJson}; | ||||||
| use heed::{Database, Env, EnvOpenOptions}; | use heed::{Database, Env, EnvOpenOptions}; | ||||||
|  | use milli::heed; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
|  |  | ||||||
| pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>; | pub type BEU64 = heed::zerocopy::U64<heed::byteorder::BE>; | ||||||
|   | |||||||
| @@ -9,7 +9,6 @@ publish = false | |||||||
| anyhow = "1.0.56" | anyhow = "1.0.56" | ||||||
| byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } | byte-unit = { version = "4.0.14", default-features = false, features = ["std"] } | ||||||
| csv = "1.1.6" | csv = "1.1.6" | ||||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1" } |  | ||||||
| milli = { path = "../milli" } | milli = { path = "../milli" } | ||||||
| mimalloc = { version = "0.1.29", default-features = false } | mimalloc = { version = "0.1.29", default-features = false } | ||||||
| roaring = "0.9.0" | roaring = "0.9.0" | ||||||
|   | |||||||
| @@ -7,7 +7,7 @@ use byte_unit::Byte; | |||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use milli::facet::FacetType; | use milli::facet::FacetType; | ||||||
| use milli::index::db_name::*; | use milli::index::db_name::*; | ||||||
| use milli::{FieldId, Index}; | use milli::{heed, FieldId, Index}; | ||||||
| use structopt::StructOpt; | use structopt::StructOpt; | ||||||
| use Command::*; | use Command::*; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -18,7 +18,7 @@ fst = "0.4.7" | |||||||
| fxhash = "0.2.1" | fxhash = "0.2.1" | ||||||
| geoutils = "0.4.1" | geoutils = "0.4.1" | ||||||
| grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } | grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } | ||||||
| heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.3", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||||
| json-depth-checker = { path = "../json-depth-checker" } | json-depth-checker = { path = "../json-depth-checker" } | ||||||
| levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } | levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } | ||||||
| memmap2 = "0.5.3" | memmap2 = "0.5.3" | ||||||
|   | |||||||
| @@ -116,6 +116,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco | |||||||
|         } |         } | ||||||
|     )] |     )] | ||||||
|     InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> }, |     InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> }, | ||||||
|  |     #[error("{}", HeedError::BadOpenOptions)] | ||||||
|  |     InvalidLmdbOpenOptions, | ||||||
|     #[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")] |     #[error("The sort ranking rule must be specified in the ranking rules settings to use the sort parameter at search time.")] | ||||||
|     SortRankingRuleMissing, |     SortRankingRuleMissing, | ||||||
|     #[error("The database file is in an invalid state.")] |     #[error("The database file is in an invalid state.")] | ||||||
| @@ -244,6 +246,7 @@ impl From<HeedError> for Error { | |||||||
|             HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })), |             HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })), | ||||||
|             HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), |             HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping), | ||||||
|             HeedError::DatabaseClosing => InternalError(DatabaseClosing), |             HeedError::DatabaseClosing => InternalError(DatabaseClosing), | ||||||
|  |             HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -223,6 +223,16 @@ impl Index { | |||||||
|         self.env.path() |         self.env.path() | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     /// Returns the size used by the index without the cached pages. | ||||||
|  |     pub fn used_size(&self) -> Result<u64> { | ||||||
|  |         Ok(self.env.non_free_pages_size()?) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Returns the real size used by the index. | ||||||
|  |     pub fn on_disk_size(&self) -> Result<u64> { | ||||||
|  |         Ok(self.env.real_disk_size()?) | ||||||
|  |     } | ||||||
|  |  | ||||||
|     pub fn copy_to_path<P: AsRef<Path>>(&self, path: P, option: CompactionOption) -> Result<File> { |     pub fn copy_to_path<P: AsRef<Path>>(&self, path: P, option: CompactionOption) -> Result<File> { | ||||||
|         self.env.copy_to_path(path, option).map_err(Into::into) |         self.env.copy_to_path(path, option).map_err(Into::into) | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -20,10 +20,6 @@ use crate::{ | |||||||
|     RoaringBitmapCodec, SmallString32, BEU32, |     RoaringBitmapCodec, SmallString32, BEU32, | ||||||
| }; | }; | ||||||
|  |  | ||||||
| /// The threshold we use to determine after which number of documents we want to clear the |  | ||||||
| /// soft-deleted database and delete documents for real. |  | ||||||
| const DELETE_DOCUMENTS_THRESHOLD: u64 = 10_000; |  | ||||||
|  |  | ||||||
| pub struct DeleteDocuments<'t, 'u, 'i> { | pub struct DeleteDocuments<'t, 'u, 'i> { | ||||||
|     wtxn: &'t mut heed::RwTxn<'i, 'u>, |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|     index: &'i Index, |     index: &'i Index, | ||||||
| @@ -129,7 +125,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|  |  | ||||||
|         // if we have less documents to delete than the threshold we simply save them in |         // if we have less documents to delete than the threshold we simply save them in | ||||||
|         // the `soft_deleted_documents_ids` bitmap and early exit. |         // the `soft_deleted_documents_ids` bitmap and early exit. | ||||||
|         if soft_deleted_docids.len() < DELETE_DOCUMENTS_THRESHOLD { |         let size_used = self.index.used_size()?; | ||||||
|  |         let map_size = self.index.env.map_size()? as u64; | ||||||
|  |         let nb_documents = self.index.number_of_documents(&self.wtxn)?; | ||||||
|  |         let nb_soft_deleted = soft_deleted_docids.len(); | ||||||
|  |  | ||||||
|  |         let percentage_available = 100 - (size_used * 100 / map_size); | ||||||
|  |         let estimated_document_size = size_used / (nb_documents + nb_soft_deleted); | ||||||
|  |         let estimated_size_used_by_soft_deleted = estimated_document_size * nb_soft_deleted; | ||||||
|  |         let percentage_used_by_soft_deleted_documents = | ||||||
|  |             estimated_size_used_by_soft_deleted * 100 / map_size; | ||||||
|  |  | ||||||
|  |         // if we have more than 10% of disk space available and the soft deleted | ||||||
|  |         // documents use less than 10% of the total space available, | ||||||
|  |         // we skip the deletion. Eg. | ||||||
|  |         // - With 100Go of disk and 20Go used including 5Go of soft-deleted documents | ||||||
|  |         //   We don’t delete anything. | ||||||
|  |         // - With 100Go of disk and 95Go used including 1mo of soft-deleted documents | ||||||
|  |         //   We run the deletion. | ||||||
|  |         // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents | ||||||
|  |         //   We run the deletion. | ||||||
|  |         if percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { | ||||||
|             self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; |             self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; | ||||||
|             return Ok(DocumentDeletionResult { |             return Ok(DocumentDeletionResult { | ||||||
|                 deleted_documents: self.to_delete_docids.len(), |                 deleted_documents: self.to_delete_docids.len(), | ||||||
|   | |||||||
| @@ -278,27 +278,30 @@ where | |||||||
|         let stop_words = self.index.stop_words(self.wtxn)?; |         let stop_words = self.index.stop_words(self.wtxn)?; | ||||||
|         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; |         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; | ||||||
|  |  | ||||||
|         // Run extraction pipeline in parallel. |         let pool_params = GrenadParameters { | ||||||
|         pool.install(|| { |  | ||||||
|             let params = GrenadParameters { |  | ||||||
|             chunk_compression_type: self.indexer_config.chunk_compression_type, |             chunk_compression_type: self.indexer_config.chunk_compression_type, | ||||||
|             chunk_compression_level: self.indexer_config.chunk_compression_level, |             chunk_compression_level: self.indexer_config.chunk_compression_level, | ||||||
|             max_memory: self.indexer_config.max_memory, |             max_memory: self.indexer_config.max_memory, | ||||||
|             max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. |             max_nb_chunks: self.indexer_config.max_nb_chunks, // default value, may be chosen. | ||||||
|         }; |         }; | ||||||
|  |         let documents_chunk_size = | ||||||
|  |             self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4); // 4MiB | ||||||
|  |         let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; | ||||||
|  |  | ||||||
|  |         // Run extraction pipeline in parallel. | ||||||
|  |         pool.install(|| { | ||||||
|             // split obkv file into several chunks |             // split obkv file into several chunks | ||||||
|             let original_chunk_iter = grenad_obkv_into_chunks( |             let original_chunk_iter = grenad_obkv_into_chunks( | ||||||
|                 original_documents, |                 original_documents, | ||||||
|                 params.clone(), |                 pool_params.clone(), | ||||||
|                 self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB |                 documents_chunk_size, | ||||||
|             ); |             ); | ||||||
|  |  | ||||||
|             // split obkv file into several chunks |             // split obkv file into several chunks | ||||||
|             let flattened_chunk_iter = grenad_obkv_into_chunks( |             let flattened_chunk_iter = grenad_obkv_into_chunks( | ||||||
|                 flattened_documents, |                 flattened_documents, | ||||||
|                 params.clone(), |                 pool_params.clone(), | ||||||
|                 self.indexer_config.documents_chunk_size.unwrap_or(1024 * 1024 * 4), // 4MiB |                 documents_chunk_size, | ||||||
|             ); |             ); | ||||||
|  |  | ||||||
|             let result = original_chunk_iter |             let result = original_chunk_iter | ||||||
| @@ -308,14 +311,14 @@ where | |||||||
|                     extract::data_from_obkv_documents( |                     extract::data_from_obkv_documents( | ||||||
|                         original_chunk, |                         original_chunk, | ||||||
|                         flattened_chunk, |                         flattened_chunk, | ||||||
|                         params, |                         pool_params, | ||||||
|                         lmdb_writer_sx.clone(), |                         lmdb_writer_sx.clone(), | ||||||
|                         searchable_fields, |                         searchable_fields, | ||||||
|                         faceted_fields, |                         faceted_fields, | ||||||
|                         primary_key_id, |                         primary_key_id, | ||||||
|                         geo_fields_ids, |                         geo_fields_ids, | ||||||
|                         stop_words, |                         stop_words, | ||||||
|                         self.indexer_config.max_positions_per_attributes, |                         max_positions_per_attributes, | ||||||
|                         exact_attributes, |                         exact_attributes, | ||||||
|                     ) |                     ) | ||||||
|                 }); |                 }); | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user