Fix prefix level position docids database

The prefix search was inverted when we generated the DB.
Instead of checking whether a word had one of its prefixes in the prefix FST,
we were checking whether the word was itself a prefix of an entry in the prefix FST.
The indexer now iterates over the prefixes contained in the FST
and searches for each of them by prefix in the word-level-position-docids database,
aggregating the matches in a sorter.

Fix #299
This commit is contained in:
many
2021-08-03 17:26:39 +02:00
parent 77de82aaa4
commit cdeb07f0fd

View File

@@ -3,16 +3,16 @@ use std::fs::File;
use std::num::NonZeroU32; use std::num::NonZeroU32;
use std::{cmp, str}; use std::{cmp, str};
use fst::automaton::{self, Automaton}; use fst::Streamer;
use fst::{IntoStreamer, Streamer};
use grenad::{CompressionType, FileFuse, Reader, Writer}; use grenad::{CompressionType, FileFuse, Reader, Writer};
use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::types::{ByteSlice, DecodeIgnore, Str};
use heed::{BytesEncode, Error}; use heed::{BytesEncode, Error};
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::error::InternalError; use crate::error::{InternalError, SerializationError};
use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec}; use crate::heed_codec::{CboRoaringBitmapCodec, StrLevelPositionCodec};
use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
use crate::update::index_documents::{ use crate::update::index_documents::{
cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database, cbo_roaring_bitmap_merge, create_sorter, create_writer, sorter_into_lmdb_database,
write_into_lmdb_database, writer_into_reader, WriteMethod, write_into_lmdb_database, writer_into_reader, WriteMethod,
@@ -102,13 +102,22 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
// in the prefix FST previously constructed. // in the prefix FST previously constructed.
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?; let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
let db = self.index.word_level_position_docids.remap_data_type::<ByteSlice>(); let db = self.index.word_level_position_docids.remap_data_type::<ByteSlice>();
for result in db.iter(self.wtxn)? { // iter over all prefixes in the prefix fst.
let ((word, level, left, right), data) = result?; let mut word_stream = prefix_fst.stream();
while let Some(prefix_bytes) = word_stream.next() {
let prefix = str::from_utf8(prefix_bytes).map_err(|_| {
SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) }
})?;
// iter over all lines of the DB where the key is prefixed by the current prefix.
let mut iter = db
.remap_key_type::<ByteSlice>()
.prefix_iter(self.wtxn, &prefix_bytes)?
.remap_key_type::<StrLevelPositionCodec>();
while let Some(((_word, level, left, right), data)) = iter.next().transpose()? {
// if level is 0, we push the line in the sorter
// replacing the complete word by the prefix.
if level == TreeLevel::min_value() { if level == TreeLevel::min_value() {
let automaton = automaton::Str::new(word).starts_with();
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
while let Some(prefix) = matching_prefixes.next() {
let prefix = str::from_utf8(prefix)?;
let key = (prefix, level, left, right); let key = (prefix, level, left, right);
let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap(); let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap();
word_prefix_level_positions_docids_sorter.insert(bytes, data)?; word_prefix_level_positions_docids_sorter.insert(bytes, data)?;