mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Replace the arc cache by a simple linked hash map
This commit is contained in:
		
							
								
								
									
										18
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										18
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -12,14 +12,6 @@ version = "1.0.31" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f" | checksum = "85bb70cc08ec97ca5450e6eba421deeea5f172c0fc61f78b5357b2a8e8be195f" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "arc-cache" |  | ||||||
| version = "0.2.4" |  | ||||||
| source = "git+https://github.com/Kerollmops/rust-arc-cache.git?rev=56530f2#56530f2d219823f8f88dc03851f8fe057bd72564" |  | ||||||
| dependencies = [ |  | ||||||
|  "xlru-cache", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "arc-swap" | name = "arc-swap" | ||||||
| version = "0.4.6" | version = "0.4.6" | ||||||
| @@ -957,7 +949,6 @@ name = "milli" | |||||||
| version = "0.1.0" | version = "0.1.0" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "arc-cache", |  | ||||||
|  "askama", |  "askama", | ||||||
|  "askama_warp", |  "askama_warp", | ||||||
|  "bstr", |  "bstr", | ||||||
| @@ -971,6 +962,7 @@ dependencies = [ | |||||||
|  "itertools", |  "itertools", | ||||||
|  "jemallocator", |  "jemallocator", | ||||||
|  "levenshtein_automata", |  "levenshtein_automata", | ||||||
|  |  "linked-hash-map", | ||||||
|  "log 0.4.11", |  "log 0.4.11", | ||||||
|  "memmap", |  "memmap", | ||||||
|  "near-proximity", |  "near-proximity", | ||||||
| @@ -2356,14 +2348,6 @@ dependencies = [ | |||||||
|  "winapi-build", |  "winapi-build", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "xlru-cache" |  | ||||||
| version = "0.1.2" |  | ||||||
| source = "git+https://github.com/Kerollmops/rust-xlru-cache.git?rev=3c90f49#3c90f49e11758ee0cc4ff145b2606ba143188b77" |  | ||||||
| dependencies = [ |  | ||||||
|  "linked-hash-map", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "zerocopy" | name = "zerocopy" | ||||||
| version = "0.3.0" | version = "0.3.0" | ||||||
|   | |||||||
| @@ -7,7 +7,6 @@ default-run = "indexer" | |||||||
|  |  | ||||||
| [dependencies] | [dependencies] | ||||||
| anyhow = "1.0.28" | anyhow = "1.0.28" | ||||||
| arc-cache = { git = "https://github.com/Kerollmops/rust-arc-cache.git", rev = "56530f2" } |  | ||||||
| bstr = "0.2.13" | bstr = "0.2.13" | ||||||
| byteorder = "1.3.4" | byteorder = "1.3.4" | ||||||
| csv = "1.1.3" | csv = "1.1.3" | ||||||
| @@ -17,6 +16,7 @@ fxhash = "0.2.1" | |||||||
| heed = { version = "0.8.1", default-features = false, features = ["lmdb"] } | heed = { version = "0.8.1", default-features = false, features = ["lmdb"] } | ||||||
| jemallocator = "0.3.2" | jemallocator = "0.3.2" | ||||||
| levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } | levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } | ||||||
|  | linked-hash-map = "0.5.3" | ||||||
| memmap = "0.7.0" | memmap = "0.7.0" | ||||||
| near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } | near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } | ||||||
| once_cell = "1.4.0" | once_cell = "1.4.0" | ||||||
|   | |||||||
| @@ -8,12 +8,12 @@ use std::{iter, thread}; | |||||||
| use std::time::Instant; | use std::time::Instant; | ||||||
|  |  | ||||||
| use anyhow::Context; | use anyhow::Context; | ||||||
| use arc_cache::ArcCache; |  | ||||||
| use bstr::ByteSlice as _; | use bstr::ByteSlice as _; | ||||||
| use csv::StringRecord; | use csv::StringRecord; | ||||||
| use flate2::read::GzDecoder; | use flate2::read::GzDecoder; | ||||||
| use fst::IntoStreamer; | use fst::IntoStreamer; | ||||||
| use heed::{EnvOpenOptions, BytesEncode, types::*}; | use heed::{EnvOpenOptions, BytesEncode, types::*}; | ||||||
|  | use linked_hash_map::LinkedHashMap; | ||||||
| use log::{debug, info}; | use log::{debug, info}; | ||||||
| use memmap::Mmap; | use memmap::Mmap; | ||||||
| use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType}; | use oxidized_mtbl::{Reader, Writer, Merger, Sorter, CompressionType}; | ||||||
| @@ -89,9 +89,10 @@ struct IndexerOpt { | |||||||
|     #[structopt(long, default_value = "1610612736")] // 1.5 GB |     #[structopt(long, default_value = "1610612736")] // 1.5 GB | ||||||
|     max_memory: usize, |     max_memory: usize, | ||||||
|  |  | ||||||
|     /// Size of the ARC cache when indexing. |     /// Size of the linked hash map cache when indexing. | ||||||
|     #[structopt(long, default_value = "43690")] |     /// The bigger it is, the faster the indexing is but the more memory it takes. | ||||||
|     arc_cache_size: usize, |     #[structopt(long, default_value = "4096")] | ||||||
|  |     linked_hash_map_size: usize, | ||||||
|  |  | ||||||
|     /// The name of the compression algorithm to use when compressing intermediate |     /// The name of the compression algorithm to use when compressing intermediate | ||||||
|     /// chunks during indexing documents. |     /// chunks during indexing documents. | ||||||
| @@ -159,7 +160,7 @@ fn compute_words_pair_proximities( | |||||||
| type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>; | type MergeFn = fn(&[u8], &[Vec<u8>]) -> Result<Vec<u8>, ()>; | ||||||
|  |  | ||||||
| struct Store { | struct Store { | ||||||
|     word_docids: ArcCache<SmallVec32<u8>, RoaringBitmap>, |     word_docids: LinkedHashMap<SmallVec32<u8>, RoaringBitmap>, | ||||||
|     documents_ids: RoaringBitmap, |     documents_ids: RoaringBitmap, | ||||||
|     sorter: Sorter<MergeFn>, |     sorter: Sorter<MergeFn>, | ||||||
|     documents_sorter: Sorter<MergeFn>, |     documents_sorter: Sorter<MergeFn>, | ||||||
| @@ -169,7 +170,7 @@ struct Store { | |||||||
|  |  | ||||||
| impl Store { | impl Store { | ||||||
|     pub fn new( |     pub fn new( | ||||||
|         arc_cache_size: usize, |         linked_hash_map_size: usize, | ||||||
|         max_nb_chunks: Option<usize>, |         max_nb_chunks: Option<usize>, | ||||||
|         max_memory: Option<usize>, |         max_memory: Option<usize>, | ||||||
|         chunk_compression_type: CompressionType, |         chunk_compression_type: CompressionType, | ||||||
| @@ -195,7 +196,8 @@ impl Store { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         Store { |         Store { | ||||||
|             word_docids: ArcCache::new(arc_cache_size), |             // We overflow by one before popping the LRU element. | ||||||
|  |             word_docids: LinkedHashMap::with_capacity(linked_hash_map_size + 1), | ||||||
|             documents_ids: RoaringBitmap::new(), |             documents_ids: RoaringBitmap::new(), | ||||||
|             sorter: builder.build(), |             sorter: builder.build(), | ||||||
|             documents_sorter: documents_builder.build(), |             documents_sorter: documents_builder.build(), | ||||||
| @@ -207,9 +209,21 @@ impl Store { | |||||||
|     // Save the documents ids under the position and word we have seen it. |     // Save the documents ids under the position and word we have seen it. | ||||||
|     fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> anyhow::Result<()> { |     fn insert_word_docid(&mut self, word: &str, id: DocumentId) -> anyhow::Result<()> { | ||||||
|         let word_vec = SmallVec32::from(word.as_bytes()); |         let word_vec = SmallVec32::from(word.as_bytes()); | ||||||
|         let ids = RoaringBitmap::from_iter(Some(id)); |         // if get_refresh finds the element it is assured to be at the end of the linked hash map. | ||||||
|         let (_, lrus) = self.word_docids.insert(word_vec, ids, |old, new| old.union_with(&new)); |         match self.word_docids.get_refresh(&word_vec) { | ||||||
|         Self::write_word_docids(&mut self.sorter, lrus)?; |             Some(old) => { old.insert(id); }, | ||||||
|  |             None => { | ||||||
|  |                 // A newly inserted element is appended at the end of the linked hash map. | ||||||
|  |                 self.word_docids.insert(word_vec, RoaringBitmap::from_iter(Some(id))); | ||||||
|  |                 // If the word docids just reached its capacity we must make sure to remove | ||||||
|  |                 // one element, this way next time we insert we don't grow the capacity. | ||||||
|  |                 if self.word_docids.len() == self.word_docids.capacity() { | ||||||
|  |                     // Removing the front element is equivalent to removing the LRU element. | ||||||
|  |                     let lru = self.word_docids.pop_front(); | ||||||
|  |                     Self::write_word_docids(&mut self.sorter, lru)?; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -600,7 +614,7 @@ fn main() -> anyhow::Result<()> { | |||||||
|     let index = Index::new(&env)?; |     let index = Index::new(&env)?; | ||||||
|  |  | ||||||
|     let num_threads = rayon::current_num_threads(); |     let num_threads = rayon::current_num_threads(); | ||||||
|     let arc_cache_size = opt.indexer.arc_cache_size; |     let linked_hash_map_size = opt.indexer.linked_hash_map_size; | ||||||
|     let max_nb_chunks = opt.indexer.max_nb_chunks; |     let max_nb_chunks = opt.indexer.max_nb_chunks; | ||||||
|     let max_memory = opt.indexer.max_memory; |     let max_memory = opt.indexer.max_memory; | ||||||
|     let chunk_compression_type = compression_type_from_str(&opt.indexer.chunk_compression_type); |     let chunk_compression_type = compression_type_from_str(&opt.indexer.chunk_compression_type); | ||||||
| @@ -611,7 +625,7 @@ fn main() -> anyhow::Result<()> { | |||||||
|         .enumerate() |         .enumerate() | ||||||
|         .map(|(i, rdr)| { |         .map(|(i, rdr)| { | ||||||
|             Store::new( |             Store::new( | ||||||
|                 arc_cache_size, |                 linked_hash_map_size, | ||||||
|                 max_nb_chunks, |                 max_nb_chunks, | ||||||
|                 Some(max_memory), |                 Some(max_memory), | ||||||
|                 chunk_compression_type, |                 chunk_compression_type, | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user