mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 21:16:28 +00:00 
			
		
		
		
	Introduce the StrBEU32Codec heed codec
This commit is contained in:
		| @@ -118,10 +118,7 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: | ||||
|     for result in index.word_position_docids.iter(rtxn)? { | ||||
|         if limit == 0 { break } | ||||
|  | ||||
|         let (bytes, postings) = result?; | ||||
|         let (word, _position) = bytes.split_at(bytes.len() - 4); | ||||
|         let word = str::from_utf8(word)?; | ||||
|  | ||||
|         let ((word, _position), postings) = result?; | ||||
|         match prev.as_mut() { | ||||
|             Some((prev_word, freq, docids)) if prev_word == word => { | ||||
|                 *freq += postings.len(); | ||||
| @@ -153,6 +150,9 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: | ||||
| } | ||||
|  | ||||
| fn words_frequencies(index: &Index, rtxn: &heed::RoTxn, words: Vec<String>) -> anyhow::Result<()> { | ||||
|     use heed::BytesDecode; | ||||
|     use heed::types::ByteSlice; | ||||
|     use milli::heed_codec::{RoaringBitmapCodec, StrBEU32Codec}; | ||||
|     use roaring::RoaringBitmap; | ||||
|  | ||||
|     let stdout = io::stdout(); | ||||
| @@ -162,13 +162,14 @@ fn words_frequencies(index: &Index, rtxn: &heed::RoTxn, words: Vec<String>) -> a | ||||
|     for word in words { | ||||
|         let mut document_frequency = RoaringBitmap::new(); | ||||
|         let mut frequency = 0; | ||||
|         for result in index.word_position_docids.prefix_iter(rtxn, word.as_bytes())? { | ||||
|         let db = index.word_position_docids.as_polymorph(); | ||||
|         for result in db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, word.as_bytes())? { | ||||
|             let (bytes, postings) = result?; | ||||
|             let (w, _position) = bytes.split_at(bytes.len() - 4); | ||||
|             let (w, _position) = StrBEU32Codec::bytes_decode(bytes).unwrap(); | ||||
|  | ||||
|             // if the word is not exactly the word we requested then it means | ||||
|             // we found a word that *starts with* the requested word and we must stop. | ||||
|             if word.as_bytes() != w { break } | ||||
|             if word != w { break } | ||||
|  | ||||
|             document_frequency.union_with(&postings); | ||||
|             frequency += postings.len(); | ||||
| @@ -182,8 +183,9 @@ fn words_frequencies(index: &Index, rtxn: &heed::RoTxn, words: Vec<String>) -> a | ||||
| fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { | ||||
|     use std::cmp::Reverse; | ||||
|     use std::collections::BinaryHeap; | ||||
|     use std::convert::TryInto; | ||||
|     use heed::BytesDecode; | ||||
|     use heed::types::{Str, ByteSlice}; | ||||
|     use milli::heed_codec::StrBEU32Codec; | ||||
|  | ||||
|     let main_name = "main"; | ||||
|     let word_positions_name = "word_positions"; | ||||
| @@ -206,10 +208,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | ||||
|  | ||||
|         for result in index.word_position_docids.as_polymorph().iter::<_, ByteSlice, ByteSlice>(rtxn)? { | ||||
|             let (key_bytes, value) = result?; | ||||
|             let (word, position) = key_bytes.split_at(key_bytes.len() - 4); | ||||
|             let word = str::from_utf8(word)?; | ||||
|             let position = position.try_into().map(u32::from_be_bytes)?; | ||||
|  | ||||
|             let (word, position) = StrBEU32Codec::bytes_decode(key_bytes).unwrap(); | ||||
|             let key = format!("{} {}", word, position); | ||||
|             heap.push(Reverse((value.len(), key, word_position_docids_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
| @@ -217,10 +216,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | ||||
|  | ||||
|         for result in index.word_attribute_docids.as_polymorph().iter::<_, ByteSlice, ByteSlice>(rtxn)? { | ||||
|             let (key_bytes, value) = result?; | ||||
|             let (word, attribute) = key_bytes.split_at(key_bytes.len() - 4); | ||||
|             let word = str::from_utf8(word)?; | ||||
|             let attribute = attribute.try_into().map(u32::from_be_bytes)?; | ||||
|  | ||||
|             let (word, attribute) = StrBEU32Codec::bytes_decode(key_bytes).unwrap(); | ||||
|             let key = format!("{} {}", word, attribute); | ||||
|             heap.push(Reverse((value.len(), key, word_attribute_docids_name))); | ||||
|             if heap.len() > limit { heap.pop(); } | ||||
| @@ -239,7 +235,9 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho | ||||
| } | ||||
|  | ||||
| fn word_position_doc_ids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<String>) -> anyhow::Result<()> { | ||||
|     use std::convert::TryInto; | ||||
|     use heed::BytesDecode; | ||||
|     use heed::types::ByteSlice; | ||||
|     use milli::heed_codec::{RoaringBitmapCodec, StrBEU32Codec}; | ||||
|  | ||||
|     let stdout = io::stdout(); | ||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||
| @@ -247,14 +245,14 @@ fn word_position_doc_ids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: | ||||
|  | ||||
|     let mut non_debug = Vec::new(); | ||||
|     for word in words { | ||||
|         for result in index.word_position_docids.prefix_iter(rtxn, word.as_bytes())? { | ||||
|         let db = index.word_position_docids.as_polymorph(); | ||||
|         for result in db.prefix_iter::<_, ByteSlice, RoaringBitmapCodec>(rtxn, word.as_bytes())? { | ||||
|             let (bytes, postings) = result?; | ||||
|             let (w, position) = bytes.split_at(bytes.len() - 4); | ||||
|             let position = position.try_into().map(u32::from_be_bytes)?; | ||||
|             let (w, position) = StrBEU32Codec::bytes_decode(bytes).unwrap(); | ||||
|  | ||||
|             // if the word is not exactly the word we requested then it means | ||||
|             // we found a word that *starts with* the requested word and we must stop. | ||||
|             if word.as_bytes() != w { break } | ||||
|             if word != w { break } | ||||
|  | ||||
|             let postings_string = if debug { | ||||
|                 format!("{:?}", postings) | ||||
|   | ||||
| @@ -1,3 +1,5 @@ | ||||
| mod roaring_bitmap_codec; | ||||
| mod str_beu32_codec; | ||||
|  | ||||
| pub use self::roaring_bitmap_codec::RoaringBitmapCodec; | ||||
| pub use self::str_beu32_codec::StrBEU32Codec; | ||||
|   | ||||
							
								
								
									
										28
									
								
								src/heed_codec/str_beu32_codec.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										28
									
								
								src/heed_codec/str_beu32_codec.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,28 @@ | ||||
| use std::borrow::Cow; | ||||
| use std::convert::TryInto; | ||||
| use std::str; | ||||
|  | ||||
| pub struct StrBEU32Codec; | ||||
|  | ||||
| impl<'a> heed::BytesDecode<'a> for StrBEU32Codec { | ||||
|     type DItem = (&'a str, u32); | ||||
|  | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let str_len = bytes.len().checked_sub(4)?; | ||||
|         let (str_bytes, n_bytes) = bytes.split_at(str_len); | ||||
|         let s = str::from_utf8(str_bytes).ok()?; | ||||
|         let n = n_bytes.try_into().map(u32::from_be_bytes).ok()?; | ||||
|         Some((s, n)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl<'a> heed::BytesEncode<'a> for StrBEU32Codec { | ||||
|     type EItem = (&'a str, u32); | ||||
|  | ||||
|     fn bytes_encode((s, n): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let mut bytes = Vec::with_capacity(s.len() + 4); | ||||
|         bytes.extend_from_slice(s.as_bytes()); | ||||
|         bytes.extend_from_slice(&n.to_be_bytes()); | ||||
|         Some(Cow::Owned(bytes)) | ||||
|     } | ||||
| } | ||||
							
								
								
									
										10
									
								
								src/lib.rs
									
									
									
									
									
								
							
							
						
						
									
										10
									
								
								src/lib.rs
									
									
									
									
									
								
							| @@ -1,9 +1,9 @@ | ||||
| mod criterion; | ||||
| mod heed_codec; | ||||
| mod node; | ||||
| mod query_tokens; | ||||
| mod search; | ||||
| mod transitive_arc; | ||||
| pub mod heed_codec; | ||||
| pub mod lexer; | ||||
|  | ||||
| use std::collections::HashMap; | ||||
| @@ -21,7 +21,7 @@ use oxidized_mtbl as omtbl; | ||||
|  | ||||
| pub use self::search::{Search, SearchResult}; | ||||
| pub use self::criterion::{Criterion, default_criteria}; | ||||
| use self::heed_codec::RoaringBitmapCodec; | ||||
| use self::heed_codec::{RoaringBitmapCodec, StrBEU32Codec}; | ||||
| use self::transitive_arc::TransitiveArc; | ||||
|  | ||||
| pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; | ||||
| @@ -44,10 +44,10 @@ pub struct Index { | ||||
|     pub word_positions: Database<Str, RoaringBitmapCodec>, | ||||
|     pub prefix_word_positions: Database<Str, RoaringBitmapCodec>, | ||||
|     /// Maps a word at a position (u32) and all the documents ids where it appears. | ||||
|     pub word_position_docids: Database<ByteSlice, RoaringBitmapCodec>, | ||||
|     pub prefix_word_position_docids: Database<ByteSlice, RoaringBitmapCodec>, | ||||
|     pub word_position_docids: Database<StrBEU32Codec, RoaringBitmapCodec>, | ||||
|     pub prefix_word_position_docids: Database<StrBEU32Codec, RoaringBitmapCodec>, | ||||
|     /// Maps a word and an attribute (u32) to all the documents ids that it appears in. | ||||
|     pub word_attribute_docids: Database<ByteSlice, RoaringBitmapCodec>, | ||||
|     pub word_attribute_docids: Database<StrBEU32Codec, RoaringBitmapCodec>, | ||||
|     /// The MTBL store that contains the documents content. | ||||
|     documents: omtbl::Reader<TransitiveArc<Mmap>>, | ||||
| } | ||||
|   | ||||
| @@ -138,14 +138,10 @@ impl<'a> Search<'a> { | ||||
|         let number_of_attributes = index.number_of_attributes(rtxn)?.map_or(0, |n| n as u32); | ||||
|  | ||||
|         for (i, derived_words) in derived_words.iter().enumerate() { | ||||
|  | ||||
|             let mut union_docids = RoaringBitmap::new(); | ||||
|             for (word, _distance, _positions) in derived_words { | ||||
|                 for attr in 0..number_of_attributes { | ||||
|  | ||||
|                     let mut key = word.clone().into_bytes(); | ||||
|                     key.extend_from_slice(&attr.to_be_bytes()); | ||||
|                     if let Some(docids) = index.word_attribute_docids.get(rtxn, &key)? { | ||||
|                     if let Some(docids) = index.word_attribute_docids.get(rtxn, &(word, attr))? { | ||||
|                         union_docids.union_with(&docids); | ||||
|                     } | ||||
|                 } | ||||
| @@ -172,9 +168,7 @@ impl<'a> Search<'a> { | ||||
|         let mut union_docids = RoaringBitmap::new(); | ||||
|         for (word, _distance, positions) in words { | ||||
|             if positions.contains(position) { | ||||
|                 let mut key = word.clone().into_bytes(); | ||||
|                 key.extend_from_slice(&position.to_be_bytes()); | ||||
|                 if let Some(docids) = index.word_position_docids.get(rtxn, &key)? { | ||||
|                 if let Some(docids) = index.word_position_docids.get(rtxn, &(word, position))? { | ||||
|                     union_docids.union_with(&docids); | ||||
|                 } | ||||
|             } | ||||
| @@ -192,9 +186,7 @@ impl<'a> Search<'a> { | ||||
|     { | ||||
|         let mut union_docids = RoaringBitmap::new(); | ||||
|         for (word, _distance, _positions) in words { | ||||
|             let mut key = word.clone().into_bytes(); | ||||
|             key.extend_from_slice(&attribute.to_be_bytes()); | ||||
|             if let Some(docids) = index.word_attribute_docids.get(rtxn, &key)? { | ||||
|             if let Some(docids) = index.word_attribute_docids.get(rtxn, &(word, attribute))? { | ||||
|                 union_docids.union_with(&docids); | ||||
|             } | ||||
|         } | ||||
|   | ||||
		Reference in New Issue
	
	Block a user