mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 16:06:31 +00:00 
			
		
		
		
	Merge #308
308: Implement a better parallel indexer r=Kerollmops a=ManyTheFish

Rewrite the indexer:
- enhance memory consumption control
- optimize parallelism using rayon and crossbeam channel
- factorize the different parts and make new DB implementation easier
- optimize and fix prefix databases

Co-authored-by: many <maxime@meilisearch.com>
This commit is contained in:
		| @@ -207,6 +207,24 @@ enum Command { | ||||
|         word2: String, | ||||
|     }, | ||||
|  | ||||
|     /// Outputs a CSV with the proximities between the specified word and word prefix, | ||||
|     /// along with the documents ids where these relations appear. | ||||
|     /// | ||||
|     /// `word1`, `prefix` define the word/prefix pair specified *in this specific order*. | ||||
|     /// `proximity` defines the proximity between the word and the prefix. | ||||
|     /// `documents_ids` defines the documents ids where the relation appears. | ||||
|     WordPrefixPairProximitiesDocids { | ||||
|         /// Display the whole documents ids in details. | ||||
|         #[structopt(long)] | ||||
|         full_display: bool, | ||||
|  | ||||
|         /// First word of the word pair. | ||||
|         word1: String, | ||||
|  | ||||
|         /// Prefix of the second word of the pair. | ||||
|         prefix: String, | ||||
|     }, | ||||
|  | ||||
|     /// Outputs the words FST to standard output. | ||||
|     /// | ||||
|     /// One can use the FST binary helper to dissect and analyze it, | ||||
| @@ -282,6 +300,9 @@ fn main() -> anyhow::Result<()> { | ||||
|         WordPairProximitiesDocids { full_display, word1, word2 } => { | ||||
|             word_pair_proximities_docids(&index, &rtxn, !full_display, word1, word2) | ||||
|         } | ||||
|         WordPrefixPairProximitiesDocids { full_display, word1, prefix } => { | ||||
|             word_prefix_pair_proximities_docids(&index, &rtxn, !full_display, word1, prefix) | ||||
|         } | ||||
|         ExportWordsFst => export_words_fst(&index, &rtxn), | ||||
|         ExportWordsPrefixFst => export_words_prefix_fst(&index, &rtxn), | ||||
|         ExportDocuments { internal_documents_ids } => { | ||||
| @@ -1131,3 +1152,46 @@ fn word_pair_proximities_docids( | ||||
|  | ||||
|     Ok(wtr.flush()?) | ||||
| } | ||||
|  | ||||
| fn word_prefix_pair_proximities_docids( | ||||
|     index: &Index, | ||||
|     rtxn: &heed::RoTxn, | ||||
|     debug: bool, | ||||
|     word1: String, | ||||
|     word_prefix: String, | ||||
| ) -> anyhow::Result<()> { | ||||
|     use heed::types::ByteSlice; | ||||
|     use milli::RoaringBitmapCodec; | ||||
|  | ||||
|     let stdout = io::stdout(); | ||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||
|     wtr.write_record(&["word1", "word_prefix", "proximity", "documents_ids"])?; | ||||
|  | ||||
|     // Create the prefix key with only the pair of words. | ||||
|     let mut prefix = Vec::with_capacity(word1.len() + word_prefix.len() + 1); | ||||
|     prefix.extend_from_slice(word1.as_bytes()); | ||||
|     prefix.push(0); | ||||
|     prefix.extend_from_slice(word_prefix.as_bytes()); | ||||
|  | ||||
|     let db = index.word_prefix_pair_proximity_docids.as_polymorph(); | ||||
|     let iter = db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &prefix)?; | ||||
|     for result in iter { | ||||
|         let (key, docids) = result?; | ||||
|  | ||||
|         // Keep only keys made of the requested pair plus one trailing proximity byte: | ||||
|         // a longer key means the stored prefix merely starts with the requested prefix. | ||||
|         if key.len() != prefix.len() + 1 { | ||||
|             continue; | ||||
|         } | ||||
|  | ||||
|         let proximity = key.last().unwrap(); | ||||
|         let docids = if debug { | ||||
|             format!("{:?}", docids) | ||||
|         } else { | ||||
|             format!("{:?}", docids.iter().collect::<Vec<_>>()) | ||||
|         }; | ||||
|         wtr.write_record(&[&word1, &word_prefix, &proximity.to_string(), &docids])?; | ||||
|     } | ||||
|  | ||||
|     Ok(wtr.flush()?) | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user