Compute prefix databases

We are now computing the prefix FST and a prefix delta in the Merger thread,
after all the databases are written, the main thread will recompute the prefix databases based on the prefix delta without needing any grenad temporary file anymore
This commit is contained in:
ManyTheFish
2024-10-01 09:56:49 +02:00
parent 64589278ac
commit bb7a503e5d
7 changed files with 313 additions and 77 deletions

View File

@ -2,50 +2,37 @@ use std::{fs::File, io::BufWriter};
use fst::{Set, SetBuilder, Streamer};
use memmap2::Mmap;
use std::collections::HashSet;
use tempfile::tempfile;
use crate::{update::del_add::DelAdd, Result, SmallString32};
use crate::{update::del_add::DelAdd, Prefix, Result};
pub struct WordFstBuilder<'a> {
stream: Option<fst::set::Stream<'a>>,
word_fst_builder: SetBuilder<BufWriter<File>>,
/// TODO: Replace the full memory allocation
prefix_fst_builders: Vec<SetBuilder<Vec<u8>>>,
max_prefix_length: usize,
last_word: Option<Vec<u8>>,
current_prefix: Vec<SmallString32>,
current_prefix_count: Vec<u64>,
prefix_count_threshold: u64,
prefix_fst_builder: Option<PrefixFstBuilder>,
inserted_words: usize,
registered_words: usize,
base_set_length: usize,
}
impl<'a> WordFstBuilder<'a> {
pub fn new(
words_fst: &'a Set<std::borrow::Cow<'a, [u8]>>,
max_prefix_length: usize,
) -> Result<Self> {
let mut prefix_fst_builders = Vec::new();
for _ in 0..max_prefix_length {
prefix_fst_builders.push(SetBuilder::memory());
}
pub fn new(words_fst: &'a Set<std::borrow::Cow<'a, [u8]>>) -> Result<Self> {
Ok(Self {
stream: Some(words_fst.stream()),
word_fst_builder: SetBuilder::new(BufWriter::new(tempfile()?))?,
prefix_fst_builders,
max_prefix_length,
prefix_fst_builder: None,
last_word: None,
current_prefix: vec![SmallString32::new(); max_prefix_length],
current_prefix_count: vec![0; max_prefix_length],
prefix_count_threshold: 100,
inserted_words: 0,
registered_words: 0,
base_set_length: words_fst.len(),
})
}
pub fn with_prefix_settings(&mut self, prefix_settings: PrefixSettings) -> &Self {
self.prefix_fst_builder = PrefixFstBuilder::new(prefix_settings);
self
}
pub fn register_word(&mut self, deladd: DelAdd, right: &[u8]) -> Result<()> {
if deladd == DelAdd::Addition {
self.registered_words += 1;
@ -85,7 +72,7 @@ impl<'a> WordFstBuilder<'a> {
// If we reach this point, it means that the stream is empty
// and we need to insert the incoming word
self.insert_word(right)?;
self.insert_word(right, deladd, true)?;
self.stream = Some(stream);
}
@ -104,26 +91,18 @@ impl<'a> WordFstBuilder<'a> {
match left.cmp(right) {
std::cmp::Ordering::Less => {
// We need to insert the last word from the current fst
self.insert_word(left)?;
self.insert_word(left, DelAdd::Addition, false)?;
left_inserted = true;
}
std::cmp::Ordering::Equal => {
// Addition: We insert the word
// Deletion: We delete the word by not inserting it
if deladd == DelAdd::Addition {
self.insert_word(right)?;
}
self.insert_word(right, deladd, true)?;
left_inserted = true;
right_inserted = true;
}
std::cmp::Ordering::Greater => {
// Addition: We insert the word and keep the last word
// Deletion: We keep the current word until the left word to delete is greater or equal
if deladd == DelAdd::Addition {
self.insert_word(right)?;
}
self.insert_word(right, deladd, true)?;
right_inserted = true;
}
@ -132,14 +111,111 @@ impl<'a> WordFstBuilder<'a> {
Ok((left_inserted, right_inserted))
}
fn insert_word(&mut self, bytes: &[u8]) -> Result<()> {
self.inserted_words += 1;
self.word_fst_builder.insert(bytes)?;
fn insert_word(&mut self, bytes: &[u8], deladd: DelAdd, is_modified: bool) -> Result<()> {
// Addition: We insert the word
// Deletion: We delete the word by not inserting it
if deladd == DelAdd::Addition {
self.inserted_words += 1;
self.word_fst_builder.insert(bytes)?;
}
if let Some(prefix_fst_builder) = self.prefix_fst_builder.as_mut() {
prefix_fst_builder.insert_word(bytes, deladd, is_modified)?;
}
Ok(())
}
fn drain_stream(&mut self) -> Result<()> {
if let Some(mut stream) = self.stream.take() {
while let Some(current) = stream.next() {
self.insert_word(current, DelAdd::Addition, false)?;
}
}
Ok(())
}
pub fn build(
mut self,
index: &crate::Index,
rtxn: &heed::RoTxn,
) -> Result<(Mmap, Option<PrefixData>)> {
self.drain_stream()?;
/// TODO: ugly unwrap
let words_fst_file = self.word_fst_builder.into_inner()?.into_inner().unwrap();
let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? };
let prefix_data = self
.prefix_fst_builder
.map(|prefix_fst_builder| prefix_fst_builder.build(index, rtxn))
.transpose()?;
Ok((words_fst_mmap, prefix_data))
}
}
#[derive(Debug)]
pub struct PrefixSettings {
pub prefix_count_threshold: u64,
pub max_prefix_length: usize,
pub compute_prefixes: bool,
}
pub struct PrefixData {
pub prefixes_fst_mmap: Mmap,
pub prefix_delta: PrefixDelta,
}
#[derive(Debug)]
pub struct PrefixDelta {
pub modified: HashSet<Prefix>,
pub deleted: HashSet<Prefix>,
}
struct PrefixFstBuilder {
prefix_count_threshold: u64,
max_prefix_length: usize,
/// TODO: Replace the full memory allocation
prefix_fst_builders: Vec<SetBuilder<Vec<u8>>>,
current_prefix: Vec<Prefix>,
current_prefix_count: Vec<u64>,
modified_prefixes: HashSet<Prefix>,
current_prefix_is_modified: Vec<bool>,
}
impl PrefixFstBuilder {
pub fn new(prefix_settings: PrefixSettings) -> Option<Self> {
let PrefixSettings { prefix_count_threshold, max_prefix_length, compute_prefixes } =
prefix_settings;
if !compute_prefixes {
return None;
}
let mut prefix_fst_builders = Vec::new();
for _ in 0..max_prefix_length {
prefix_fst_builders.push(SetBuilder::memory());
}
Some(Self {
prefix_count_threshold,
max_prefix_length,
prefix_fst_builders,
current_prefix: vec![Prefix::new(); max_prefix_length],
current_prefix_count: vec![0; max_prefix_length],
modified_prefixes: HashSet::new(),
current_prefix_is_modified: vec![false; max_prefix_length],
})
}
fn insert_word(&mut self, bytes: &[u8], deladd: DelAdd, is_modified: bool) -> Result<()> {
for n in 0..self.max_prefix_length {
let current_prefix = &mut self.current_prefix[n];
let current_prefix_count = &mut self.current_prefix_count[n];
let builder = &mut self.prefix_fst_builders[n];
let current_prefix_is_modified = &mut self.current_prefix_is_modified[n];
// We try to get the first n bytes out of this string but we only want
// to split at valid characters bounds. If we try to split in the middle of
@ -153,43 +229,36 @@ impl<'a> WordFstBuilder<'a> {
// This is the first iteration of the loop,
// or the current word doesn't starts with the current prefix.
if *current_prefix_count == 0 || prefix != current_prefix.as_str() {
*current_prefix = SmallString32::from(prefix);
*current_prefix = Prefix::from(prefix);
*current_prefix_count = 0;
*current_prefix_is_modified = false;
}
*current_prefix_count += 1;
*current_prefix_is_modified |= is_modified;
if deladd == DelAdd::Addition {
*current_prefix_count += 1;
}
// There is enough words corresponding to this prefix to add it to the cache.
/// TODO: (LEGACY) Replace this by `==` to avoid inserting several times the same prefix?
if *current_prefix_count >= self.prefix_count_threshold {
if *current_prefix_count == self.prefix_count_threshold {
builder.insert(prefix)?;
if *current_prefix_is_modified {
self.modified_prefixes.insert(current_prefix.clone());
}
}
}
Ok(())
}
fn drain_stream(&mut self) -> Result<()> {
if let Some(mut stream) = self.stream.take() {
while let Some(current) = stream.next() {
self.insert_word(current)?;
}
}
Ok(())
}
pub fn build(mut self) -> Result<(Mmap, Mmap)> {
self.drain_stream()?;
/// TODO: ugly unwrap
let words_fst_file = self.word_fst_builder.into_inner()?.into_inner().unwrap();
let words_fst_mmap = unsafe { Mmap::map(&words_fst_file)? };
fn build(self, index: &crate::Index, rtxn: &heed::RoTxn) -> Result<PrefixData> {
// We merge all of the previously computed prefixes into on final set.
let mut prefix_fsts = Vec::new();
for builder in self.prefix_fst_builders {
prefix_fsts.push(builder.into_set());
for builder in self.prefix_fst_builders.into_iter() {
let prefix_fst = builder.into_set();
prefix_fsts.push(prefix_fst);
}
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
let mut builder = SetBuilder::new(BufWriter::new(tempfile()?))?;
@ -197,14 +266,22 @@ impl<'a> WordFstBuilder<'a> {
/// TODO: ugly unwrap
let prefix_fst_file = builder.into_inner()?.into_inner().unwrap();
let prefix_fst_mmap = unsafe { Mmap::map(&prefix_fst_file)? };
let new_prefix_fst = Set::new(&prefix_fst_mmap)?;
let old_prefix_fst = index.words_prefixes_fst(rtxn)?;
let mut deleted_prefixes = HashSet::new();
{
let mut deleted_prefixes_stream = old_prefix_fst.op().add(&new_prefix_fst).difference();
while let Some(prefix) = deleted_prefixes_stream.next() {
deleted_prefixes.insert(Prefix::from(std::str::from_utf8(prefix)?));
}
}
eprintln!("================================================");
eprintln!(
"inserted words: {}, registered words: {}, base set len: {}",
self.inserted_words, self.registered_words, self.base_set_length
);
eprintln!("================================================");
Ok((words_fst_mmap, prefix_fst_mmap))
Ok(PrefixData {
prefixes_fst_mmap: prefix_fst_mmap,
prefix_delta: PrefixDelta {
modified: self.modified_prefixes,
deleted: deleted_prefixes,
},
})
}
}