Fix a bug where prefixes were never deleted

This commit is contained in:
Kerollmops
2025-10-03 10:50:05 +02:00
parent ea92c64fdc
commit 851694e323

View File

@@ -4,7 +4,7 @@ use std::io::{BufReader, BufWriter, Read, Seek, Write};
use std::iter; use std::iter;
use hashbrown::HashMap; use hashbrown::HashMap;
use heed::types::Bytes; use heed::types::{Bytes, DecodeIgnore};
use heed::{BytesDecode, Database, Error, RoTxn, RwTxn}; use heed::{BytesDecode, Database, Error, RoTxn, RwTxn};
use rayon::iter::{IndexedParallelIterator as _, IntoParallelIterator, ParallelIterator as _}; use rayon::iter::{IndexedParallelIterator as _, IntoParallelIterator, ParallelIterator as _};
use roaring::MultiOps; use roaring::MultiOps;
@@ -225,37 +225,33 @@ impl<'i> WordPrefixIntegerDocids<'i> {
continue; continue;
} }
let mut bitmaps_bytes_at_positions = HashMap::new(); let mut bitmap_bytes_at_positions = HashMap::new();
let mut prev_pos = None;
for result in self for result in self
.database .database
.remap_data_type::<Bytes>()
.prefix_iter(&rtxn, prefix.as_bytes())? .prefix_iter(&rtxn, prefix.as_bytes())?
.remap_types::<StrBEU16Codec, Bytes>()
{ {
let (key, bitmap_bytes) = result?; let ((_word, pos), bitmap_bytes) = result?;
let (_word, pos) = bitmap_bytes_at_positions
StrBEU16Codec::bytes_decode(key).map_err(Error::Decoding)?;
// Track positions with no corresponding bitmap bytes,
// these means that the prefix no longer exists in the database
// and must, therefore, be removed from the index.
if let Some(prev_pos) = prev_pos {
// A range with the left bigger than the right bound results in an empty range.
for pos in prev_pos..pos {
// They are represented by an empty set of bitmaps.
bitmaps_bytes_at_positions.entry(pos).or_insert_with(Vec::new);
}
}
bitmaps_bytes_at_positions
.entry(pos) .entry(pos)
.or_insert_with(Vec::new) .or_insert_with(Vec::new)
.push(bitmap_bytes); .push(bitmap_bytes);
prev_pos = Some(pos);
} }
for (pos, bitmaps_bytes) in bitmaps_bytes_at_positions { // We track positions with no corresponding bitmap bytes,
// these means that the prefix no longer exists in the database
// and must, therefore, be removed from the index.
for result in self
.prefix_database
.prefix_iter(&rtxn, prefix.as_bytes())?
.remap_types::<StrBEU16Codec, DecodeIgnore>()
{
let ((_word, pos), ()) = result?;
// They are represented by an empty set of bitmaps.
bitmap_bytes_at_positions.entry(pos).or_insert_with(Vec::new);
}
for (pos, bitmaps_bytes) in bitmap_bytes_at_positions {
if bitmaps_bytes.is_empty() { if bitmaps_bytes.is_empty() {
indexes.push(PrefixIntegerEntry { indexes.push(PrefixIntegerEntry {
prefix, prefix,