mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 13:06:27 +00:00 
			
		
		
		
	Introduce two new infos subcommands
This commit is contained in:
		| @@ -52,6 +52,12 @@ enum Command { | ||||
|         limit: usize, | ||||
|     }, | ||||
|  | ||||
|     /// Outputs the total size of all the docid-word-positions keys and values. | ||||
|     TotalDocidWordPositionsSize, | ||||
|  | ||||
|     /// Outputs the average number of *different* words by document. | ||||
|     AverageNumberOfWordsByDoc, | ||||
|  | ||||
|     /// Outputs the words FST to disk. | ||||
|     /// | ||||
|     /// One can use the FST binary helper to dissect and analyze it, | ||||
| @@ -84,6 +90,8 @@ fn main() -> anyhow::Result<()> { | ||||
|     match opt.command { | ||||
|         MostCommonWords { limit } => most_common_words(&index, &rtxn, limit), | ||||
|         BiggestValues { limit } => biggest_value_sizes(&index, &rtxn, limit), | ||||
|         TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), | ||||
|         AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), | ||||
|         ExportWordsFst { output } => export_words_fst(&index, &rtxn, output), | ||||
|     } | ||||
| } | ||||
| @@ -181,3 +189,64 @@ fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyho | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| fn total_docid_word_positions_size(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { | ||||
|     use heed::types::ByteSlice; | ||||
|  | ||||
|     let mut total_key_size = 0; | ||||
|     let mut total_val_size = 0; | ||||
|     let mut count = 0; | ||||
|  | ||||
|     let iter = index.docid_word_positions.as_polymorph().iter::<_, ByteSlice, ByteSlice>(rtxn)?; | ||||
|     for result in iter { | ||||
|         let (key, val) = result?; | ||||
|         total_key_size += key.len(); | ||||
|         total_val_size += val.len(); | ||||
|         count += 1; | ||||
|     } | ||||
|  | ||||
|     println!("number of keys: {}", count); | ||||
|     println!("total key size: {}", total_key_size); | ||||
|     println!("total value size: {}", total_val_size); | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| fn average_number_of_words_by_doc(index: &Index, rtxn: &heed::RoTxn) -> anyhow::Result<()> { | ||||
|     use heed::types::DecodeIgnore; | ||||
|     use milli::{DocumentId, BEU32StrCodec}; | ||||
|  | ||||
|     let mut words_counts = Vec::new(); | ||||
|     let mut count = 0; | ||||
|     let mut prev = None as Option<(DocumentId, u32)>; | ||||
|  | ||||
|     let iter = index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, DecodeIgnore>(rtxn)?; | ||||
|     for result in iter { | ||||
|         let ((docid, _word), ()) = result?; | ||||
|  | ||||
|         match prev.as_mut() { | ||||
|             Some((prev_docid, prev_count)) if docid == *prev_docid => { | ||||
|                 *prev_count += 1; | ||||
|             }, | ||||
|             Some((prev_docid, prev_count)) => { | ||||
|                 words_counts.push(*prev_count); | ||||
|                 *prev_docid = docid; | ||||
|                 *prev_count = 0; | ||||
|                 count += 1; | ||||
|             }, | ||||
|             None => prev = Some((docid, 1)), | ||||
|         } | ||||
|     } | ||||
|  | ||||
|     if let Some((_, prev_count)) = prev.take() { | ||||
|         words_counts.push(prev_count); | ||||
|         count += 1; | ||||
|     } | ||||
|  | ||||
|     let words_count = words_counts.into_iter().map(|c| c as usize).sum::<usize>() as f64; | ||||
|     let count = count as f64; | ||||
|  | ||||
|     println!("average number of different words by document: {}", words_count / count); | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user