Differentiate short words as prefix or exact matches

This commit is contained in:
Clément Renault
2020-01-16 12:01:51 +01:00
parent 9809ded23d
commit 70d4f47f37
2 changed files with 46 additions and 23 deletions

View File

@@ -195,11 +195,16 @@ pub fn apply_documents_addition<'a, 'b>(
pplc_store.clear(writer)?;
for prefix_len in 1..=2 {
// compute prefixes and store those in the PrefixPostingsListsCache.
// compute prefixes and store those in the PrefixPostingsListsCache store.
let mut previous_prefix: Option<([u8; 4], Vec<_>)> = None;
let mut stream = words_fst.into_stream();
while let Some(input) = stream.next() {
if input.len() < prefix_len { continue }
// We skip the prefixes that are shorter than the current length
// we want to cache (<). We must ignore the input when it is exactly the
// same word as the prefix because if we match exactly on it we need
// to consider it as an exact match and not as a prefix (=).
if input.len() <= prefix_len { continue }
if let Some(postings_list) = postings_lists_store.postings_list(writer, input)?.map(|p| p.matches.into_owned()) {
let prefix = &input[..prefix_len];
@@ -208,38 +213,33 @@ pub fn apply_documents_addition<'a, 'b>(
arr_prefix[..prefix_len].copy_from_slice(prefix);
match previous_prefix {
Some((ref mut prev_prefix, ref mut prev_postings_list)) if *prev_prefix != arr_prefix => {
prev_postings_list.sort_unstable();
prev_postings_list.dedup();
Some((ref mut prev_prefix, ref mut prev_pl)) if *prev_prefix != arr_prefix => {
prev_pl.sort_unstable();
prev_pl.dedup();
if let Ok(prefix) = std::str::from_utf8(&prev_prefix[..prefix_len]) {
debug!("writing the prefix of {:?} of length {}",
prefix, prev_postings_list.len());
debug!("writing the prefix of {:?} of length {}", prefix, prev_pl.len());
}
let pls = Set::new_unchecked(&prev_postings_list);
let pls = Set::new_unchecked(&prev_pl);
pplc_store.put_prefix_postings_list(writer, *prev_prefix, &pls)?;
*prev_prefix = arr_prefix;
prev_postings_list.clear();
prev_postings_list.extend_from_slice(&postings_list);
},
Some((_, ref mut prev_postings_list)) => {
prev_postings_list.extend_from_slice(&postings_list);
},
None => {
previous_prefix = Some((arr_prefix, postings_list.to_vec()));
prev_pl.clear();
prev_pl.extend_from_slice(&postings_list);
},
Some((_, ref mut prev_pl)) => prev_pl.extend_from_slice(&postings_list),
None => previous_prefix = Some((arr_prefix, postings_list.to_vec())),
}
}
}
// write the last prefix postings lists
if let Some((prev_prefix, mut prev_postings_list)) = previous_prefix.take() {
prev_postings_list.sort_unstable();
prev_postings_list.dedup();
if let Some((prev_prefix, mut prev_pl)) = previous_prefix.take() {
prev_pl.sort_unstable();
prev_pl.dedup();
let pls = Set::new_unchecked(&prev_postings_list);
let pls = Set::new_unchecked(&prev_pl);
pplc_store.put_prefix_postings_list(writer, prev_prefix, &pls)?;
}
}