feat(index): update fields distribution in clear & delete operations

fixes after review

bump the version of the tokenizer

implement a first version of the stop_words

The front must provide a BTreeSet containing the stop words.
The stop_words are set to None if an empty set is provided.
add the stop-words to the http-ui interface
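
For illustration, a minimal sketch of the front-facing contract described above, assuming a milli-style settings update builder with a set_stop_words method (the builder construction is not shown in this diff, so `builder` is a placeholder):

    use std::collections::BTreeSet;

    // Hypothetical usage: the front hands a BTreeSet of stop words to the
    // settings update. An empty set clears the setting (stored as None).
    let mut stop_words = BTreeSet::new();
    stop_words.insert("the".to_string());
    stop_words.insert("of".to_string());

    // `builder` stands in for the settings update builder.
    builder.set_stop_words(stop_words);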

Use maplit in the tests
and remove the useless drop(rtxn) calls at the end of all tests
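
For reference, maplit's literal macros let a test state its expected map inline; a sketch under the assumption that fields_distribution returns a HashMap<String, u64> (the expected values here are made up):

    use maplit::hashmap;

    // Build the expected distribution in one expression instead of
    // inserting entries one by one.
    let expected = hashmap! {
        "id".to_string() => 2,
        "name".to_string() => 2,
    };
    assert_eq!(index.fields_distribution(&rtxn).unwrap(), expected);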

Integrate the stop_words into the querytree

remove the stop_words from the querytree, unless the word is a prefix or a typo
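
A rough sketch of that pruning rule, using simplified stand-ins for the real query-tree types (the struct below is invented for illustration):

    use std::collections::BTreeSet;

    // Invented stand-in for a query-tree leaf.
    struct QueryWord {
        word: String,
        prefix: bool,
        typo: bool,
    }

    // A stop word is dropped from the tree unless it is queried as a
    // prefix or matched through a typo, as described above.
    fn keep_in_query_tree(leaf: &QueryWord, stop_words: &BTreeSet<String>) -> bool {
        !stop_words.contains(&leaf.word) || leaf.prefix || leaf.typo
    }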

more fixes after review
Author: Alexey Shekhirin
Date: 2021-04-01 10:07:16 +03:00
Parent: 27c7ab6e00
Commit: 2658c5c545
7 changed files with 128 additions and 34 deletions


@@ -1,3 +1,6 @@
+use std::collections::HashMap;
+use std::collections::hash_map::Entry;
+
 use anyhow::anyhow;
 use chrono::Utc;
 use fst::IntoStreamer;
@@ -90,6 +93,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             documents,
         } = self.index;
 
+        // Number of fields for each document that has been deleted.
+        let mut fields_ids_distribution_diff = HashMap::new();
+
         // Retrieve the words and the external documents ids contained in the documents.
         let mut words = Vec::new();
         let mut external_ids = Vec::new();
@@ -100,6 +106,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             let key = BEU32::new(docid);
             let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
             if let Some((_key, obkv)) = iter.next().transpose()? {
+                for (field_id, _) in obkv.iter() {
+                    *fields_ids_distribution_diff.entry(field_id).or_default() += 1;
+                }
+
                 if let Some(content) = obkv.get(id_field) {
                     let external_id = match serde_json::from_slice(content).unwrap() {
                         Value::String(string) => SmallString32::from(string.as_str()),
@@ -112,7 +122,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
             drop(iter);
-            // We iterate througt the words positions of the document id,
+            // We iterate through the words positions of the document id,
             // retrieve the word and delete the positions.
             let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
             while let Some(result) = iter.next() {
@@ -123,6 +133,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
         }
 
+        let mut fields_distribution = self.index.fields_distribution(self.wtxn)?;
+
+        // We use the pre-calculated numbers of field occurrences that need to be
+        // subtracted to reflect the deleted documents.
+        // If all occurrences of a field are removed, delete its entry from the
+        // distribution; otherwise, insert the new number of occurrences
+        // (current_count - count_diff).
+        for (field_id, count_diff) in fields_ids_distribution_diff {
+            let field_name = fields_ids_map.name(field_id).unwrap();
+            if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) {
+                match entry.get().checked_sub(count_diff) {
+                    Some(0) | None => entry.remove(),
+                    Some(count) => entry.insert(count),
+                };
+            }
+        }
+
+        self.index.put_fields_distribution(self.wtxn, &fields_distribution)?;
+
         // We create the FST map of the external ids that we must delete.
         external_ids.sort_unstable();
         let external_ids_to_delete = fst::Set::from_iter(external_ids.iter().map(AsRef::as_ref))?;
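
Outside the diff, the decrement-or-remove step above can be exercised on its own; a self-contained sketch with standard-library types only (the field names are invented):

    use std::collections::HashMap;
    use std::collections::hash_map::Entry;

    fn apply_deletions(distribution: &mut HashMap<String, u64>, diff: &HashMap<String, u64>) {
        for (field, count_diff) in diff {
            if let Entry::Occupied(mut entry) = distribution.entry(field.clone()) {
                match entry.get().checked_sub(*count_diff) {
                    // Reaching zero (or underflowing) removes the field entirely.
                    Some(0) | None => { entry.remove(); }
                    Some(count) => { entry.insert(count); }
                }
            }
        }
    }

    fn main() {
        let mut distribution = HashMap::from([("name".to_string(), 2), ("age".to_string(), 1)]);
        let diff = HashMap::from([("name".to_string(), 1), ("age".to_string(), 1)]);
        apply_deletions(&mut distribution, &diff);
        assert_eq!(distribution.get("name"), Some(&1));
        assert!(!distribution.contains_key("age"));
    }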
@@ -347,5 +375,9 @@ mod tests {
         builder.execute().unwrap();
         wtxn.commit().unwrap();
 
+        let rtxn = index.read_txn().unwrap();
+
+        assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
     }
 }