mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 07:56:28 +00:00 
			
		
		
		
	Merge #300
300: Fix prefix level position docids database r=curquiza a=ManyTheFish The prefix search was inverted when we generated the DB. Instead of searching if word had a prefix in prefix fst, we were searching if the word was a prefix of a prefix contained in the prefix fst. The indexer, now, iterate over prefix contained in the fst and search them by prefix in the word-level-position-docids database, aggregating matches in a sorter. Fix #299 Co-authored-by: many <maxime@meilisearch.com>
This commit is contained in:
		| @@ -3,16 +3,16 @@ use std::fs::File; | ||||
| use std::num::NonZeroU32; | ||||
| use std::{cmp, str}; | ||||
|  | ||||
| use fst::automaton::{self, Automaton}; | ||||
| use fst::{IntoStreamer, Streamer}; | ||||
| use fst::Streamer; | ||||
| use grenad::{CompressionType, FileFuse, Reader, Writer}; | ||||
| use heed::types::{ByteSlice, DecodeIgnore, Str}; | ||||
| use heed::{BytesEncode, Error}; | ||||
| use log::debug; | ||||
| use roaring::RoaringBitmap; | ||||
|  | ||||
| use crate::error::InternalError; | ||||
| use crate::error::{InternalError, SerializationError}; | ||||
| use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; | ||||
| use crate::index::main_key::WORDS_PREFIXES_FST_KEY; | ||||
| use crate::update::index_documents::{ | ||||
|     cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database, | ||||
|     write_into_lmdb_database, writer_into_reader, WriteMethod, | ||||
| @@ -102,13 +102,22 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> { | ||||
|         // in the prefix FST previously constructed. | ||||
|         let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; | ||||
|         let db = self.index.word_level_position_docids.remap_data_type::<ByteSlice>(); | ||||
|         for result in db.iter(self.wtxn)? { | ||||
|             let ((word, level, left, right), data) = result?; | ||||
|             if level == TreeLevel::min_value() { | ||||
|                 let automaton = automaton::Str::new(word).starts_with(); | ||||
|                 let mut matching_prefixes = prefix_fst.search(automaton).into_stream(); | ||||
|                 while let Some(prefix) = matching_prefixes.next() { | ||||
|                     let prefix = str::from_utf8(prefix)?; | ||||
|         // iter over all prefixes in the prefix fst. | ||||
|         let mut word_stream = prefix_fst.stream(); | ||||
|         while let Some(prefix_bytes) = word_stream.next() { | ||||
|             let prefix = str::from_utf8(prefix_bytes).map_err(|_| { | ||||
|                 SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } | ||||
|             })?; | ||||
|  | ||||
|             // iter over all lines of the DB where the key is prefixed by the current prefix. | ||||
|             let mut iter = db | ||||
|                 .remap_key_type::<ByteSlice>() | ||||
|                 .prefix_iter(self.wtxn, &prefix_bytes)? | ||||
|                 .remap_key_type::<StrLevelPositionCodec>(); | ||||
|             while let Some(((_word, level, left, right), data)) = iter.next().transpose()? { | ||||
|                 // if level is 0, we push the line in the sorter | ||||
|                 // replacing the complete word by the prefix. | ||||
|                 if level == TreeLevel::min_value() { | ||||
|                     let key = (prefix, level, left, right); | ||||
|                     let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap(); | ||||
|                     word_prefix_level_positions_docids_sorter.insert(bytes, data)?; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user