mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 13:06:27 +00:00 
			
		
		
		
	Fix bug in encoding of word_position_docids and word_fid_docids
This commit is contained in:
		| @@ -45,11 +45,12 @@ impl<'a> heed::BytesDecode<'a> for StrBEU16Codec { | ||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||
|         let footer_len = size_of::<u16>(); | ||||
|  | ||||
|         if bytes.len() < footer_len { | ||||
|         if bytes.len() < footer_len + 1 { | ||||
|             return None; | ||||
|         } | ||||
|  | ||||
|         let (word, bytes) = bytes.split_at(bytes.len() - footer_len); | ||||
|         let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len); | ||||
|         let (_, word) = word_plus_nul_byte.split_last()?; | ||||
|         let word = str::from_utf8(word).ok()?; | ||||
|         let pos = bytes.try_into().map(u16::from_be_bytes).ok()?; | ||||
|  | ||||
| @@ -63,8 +64,9 @@ impl<'a> heed::BytesEncode<'a> for StrBEU16Codec { | ||||
|     fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> { | ||||
|         let pos = pos.to_be_bytes(); | ||||
|  | ||||
|         let mut bytes = Vec::with_capacity(word.len() + pos.len()); | ||||
|         let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len()); | ||||
|         bytes.extend_from_slice(word.as_bytes()); | ||||
|         bytes.push(0); | ||||
|         bytes.extend_from_slice(&pos[..]); | ||||
|  | ||||
|         Some(Cow::Owned(bytes)) | ||||
|   | ||||
| @@ -126,9 +126,9 @@ pub struct Index { | ||||
|  | ||||
|     /// Maps the field id and the word count with the docids that corresponds to it. | ||||
|     pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the position of a word prefix with all the docids where this prefix appears. | ||||
|     /// Maps the word prefix and a position with all the docids where the prefix appears at the position. | ||||
|     pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||
|     /// Maps the word and the field id with the docids that corresponds to it. | ||||
|     /// Maps the word prefix and a field id with all the docids where the prefix appears inside the field | ||||
|     pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||
|  | ||||
|     /// Maps the script and language with all the docids that corresponds to it. | ||||
|   | ||||
| @@ -261,22 +261,6 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|         .transpose() | ||||
|     } | ||||
|  | ||||
|     pub fn get_db_word_position_docids( | ||||
|         &mut self, | ||||
|         word: Interned<String>, | ||||
|         position: u16, | ||||
|     ) -> Result<Option<RoaringBitmap>> { | ||||
|         DatabaseCache::get_value( | ||||
|             self.txn, | ||||
|             (word, position), | ||||
|             &(self.word_interner.get(word).as_str(), position), | ||||
|             &mut self.db_cache.word_position_docids, | ||||
|             self.index.word_position_docids.remap_data_type::<ByteSlice>(), | ||||
|         )? | ||||
|         .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) | ||||
|         .transpose() | ||||
|     } | ||||
|  | ||||
|     pub fn get_db_word_fid_docids( | ||||
|         &mut self, | ||||
|         word: Interned<String>, | ||||
| @@ -361,6 +345,22 @@ impl<'ctx> SearchContext<'ctx> { | ||||
|         Ok(fids) | ||||
|     } | ||||
|  | ||||
|     pub fn get_db_word_position_docids( | ||||
|         &mut self, | ||||
|         word: Interned<String>, | ||||
|         position: u16, | ||||
|     ) -> Result<Option<RoaringBitmap>> { | ||||
|         DatabaseCache::get_value( | ||||
|             self.txn, | ||||
|             (word, position), | ||||
|             &(self.word_interner.get(word).as_str(), position), | ||||
|             &mut self.db_cache.word_position_docids, | ||||
|             self.index.word_position_docids.remap_data_type::<ByteSlice>(), | ||||
|         )? | ||||
|         .map(|bytes| CboRoaringBitmapCodec::bytes_decode(bytes).ok_or(heed::Error::Decoding.into())) | ||||
|         .transpose() | ||||
|     } | ||||
|  | ||||
|     pub fn get_db_word_prefix_position_docids( | ||||
|         &mut self, | ||||
|         word_prefix: Interned<String>, | ||||
|   | ||||
| @@ -1,4 +1,6 @@ | ||||
| use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; | ||||
| use crate::{ | ||||
|     db_snap, index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy, | ||||
| }; | ||||
|  | ||||
| fn create_index() -> TempIndex { | ||||
|     let index = TempIndex::new(); | ||||
| @@ -6,7 +8,7 @@ fn create_index() -> TempIndex { | ||||
|     index | ||||
|         .update_settings(|s| { | ||||
|             s.set_primary_key("id".to_owned()); | ||||
|             s.set_searchable_fields(vec!["text".to_owned()]); | ||||
|             s.set_searchable_fields(vec!["text".to_owned(), "other".to_owned()]); | ||||
|             s.set_criteria(vec![Criterion::Attribute]); | ||||
|         }) | ||||
|         .unwrap(); | ||||
| @@ -33,20 +35,84 @@ fn create_index() -> TempIndex { | ||||
|                 "id": 4, | ||||
|                 "text": "the quick brown fox", | ||||
|             }, | ||||
|             { | ||||
|                 "id": 5, | ||||
|                 "text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 the quick brown fox", | ||||
|             }, | ||||
|             { | ||||
|                 "id": 6, | ||||
|                 "text": "quick a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 brown", | ||||
|             }, | ||||
|             { | ||||
|                 "id": 7, | ||||
|                 "text": "a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a  | ||||
|                 a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a   | ||||
|                 quick brown", | ||||
|             }, | ||||
|         ])) | ||||
|         .unwrap(); | ||||
|     index | ||||
| } | ||||
|  | ||||
| #[test] | ||||
| fn test_attribute_fid_simple() { | ||||
| fn test_attribute_position_simple() { | ||||
|     let index = create_index(); | ||||
|  | ||||
|     db_snap!(index, word_position_docids, @"fe86911166fa4c0903c512fd86ec65e4"); | ||||
|  | ||||
|     let txn = index.read_txn().unwrap(); | ||||
|  | ||||
|     let mut s = Search::new(&txn, &index); | ||||
|     s.terms_matching_strategy(TermsMatchingStrategy::All); | ||||
|     s.query("quick brown"); | ||||
|     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0, 6, 7, 5]"); | ||||
| } | ||||
| #[test] | ||||
| fn test_attribute_position_repeated() { | ||||
|     let index = create_index(); | ||||
|  | ||||
|     let txn = index.read_txn().unwrap(); | ||||
|  | ||||
|     let mut s = Search::new(&txn, &index); | ||||
|     s.terms_matching_strategy(TermsMatchingStrategy::All); | ||||
|     s.query("the quick brown fox"); | ||||
|     s.query("a a a a a"); | ||||
|     let SearchResult { documents_ids, .. } = s.execute().unwrap(); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 2, 1, 0]"); | ||||
|     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 7, 6]"); | ||||
| } | ||||
|   | ||||
| @@ -36,6 +36,7 @@ pub fn extract_word_fid_docids<R: io::Read + io::Seek>( | ||||
|         for position in read_u32_ne_bytes(value) { | ||||
|             key_buffer.clear(); | ||||
|             key_buffer.extend_from_slice(word_bytes); | ||||
|             key_buffer.push(0); | ||||
|             let (fid, _) = relative_from_absolute_position(position); | ||||
|             key_buffer.extend_from_slice(&fid.to_be_bytes()); | ||||
|             word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; | ||||
|   | ||||
| @@ -39,6 +39,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>( | ||||
|         for position in read_u32_ne_bytes(value) { | ||||
|             key_buffer.clear(); | ||||
|             key_buffer.extend_from_slice(word_bytes); | ||||
|             key_buffer.push(0); | ||||
|             let (_, position) = relative_from_absolute_position(position); | ||||
|             let position = bucketed_position(position); | ||||
|             key_buffer.extend_from_slice(&position.to_be_bytes()); | ||||
|   | ||||
		Reference in New Issue
	
	Block a user