word prox: Remove the interim BVec again

This commit is contained in:
Louis Dureuil
2024-11-25 15:05:50 +01:00
parent dd76eaaaec
commit 378c1df811

View File

@ -42,8 +42,6 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
let rtxn = &context.rtxn;
let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc);
let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
let mut add_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
let new_fields_ids_map = &mut *new_fields_ids_map;
@ -65,7 +63,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
new_fields_ids_map,
&mut word_positions,
&mut |(w1, w2), prox| {
del_word_pair_proximity.push(((w1, w2), prox));
let key = build_key(prox, &w1, &w2, &mut key_buffer);
cached_sorter.insert_del_u32(key, docid)
},
)?;
}
@ -77,7 +76,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
new_fields_ids_map,
&mut word_positions,
&mut |(w1, w2), prox| {
del_word_pair_proximity.push(((w1, w2), prox));
let key = build_key(prox, &w1, &w2, &mut key_buffer);
cached_sorter.insert_del_u32(key, docid)
},
)?;
let document = inner.merged(rtxn, index, context.db_fields_ids_map)?;
@ -87,7 +87,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
new_fields_ids_map,
&mut word_positions,
&mut |(w1, w2), prox| {
add_word_pair_proximity.push(((w1, w2), prox));
let key = build_key(prox, &w1, &w2, &mut key_buffer);
cached_sorter.insert_add_u32(key, docid)
},
)?;
}
@ -99,25 +100,13 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
new_fields_ids_map,
&mut word_positions,
&mut |(w1, w2), prox| {
add_word_pair_proximity.push(((w1, w2), prox));
let key = build_key(prox, &w1, &w2, &mut key_buffer);
cached_sorter.insert_add_u32(key, docid)
},
)?;
}
}
del_word_pair_proximity.sort_unstable();
del_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
for ((w1, w2), prox) in del_word_pair_proximity.iter() {
let key = build_key(*prox, w1, w2, &mut key_buffer);
cached_sorter.insert_del_u32(key, docid)?;
}
add_word_pair_proximity.sort_unstable();
add_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
for ((w1, w2), prox) in add_word_pair_proximity.iter() {
let key = build_key(*prox, w1, w2, &mut key_buffer);
cached_sorter.insert_add_u32(key, docid)?;
}
Ok(())
}
}
@ -138,24 +127,26 @@ fn build_key<'a>(
fn word_positions_into_word_pair_proximity(
word_positions: &mut VecDeque<(Rc<str>, u16)>,
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
) {
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8) -> Result<()>,
) -> Result<()> {
let (head_word, head_position) = word_positions.pop_front().unwrap();
for (word, position) in word_positions.iter() {
let prox = index_proximity(head_position as u32, *position as u32) as u8;
if prox > 0 && prox < MAX_DISTANCE as u8 {
word_pair_proximity((head_word.clone(), word.clone()), prox);
word_pair_proximity((head_word.clone(), word.clone()), prox)?;
}
}
Ok(())
}
fn drain_word_positions(
word_positions: &mut VecDeque<(Rc<str>, u16)>,
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
) {
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8) -> Result<()>,
) -> Result<()> {
while !word_positions.is_empty() {
word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
}
Ok(())
}
fn process_document_tokens<'doc>(
@ -163,20 +154,20 @@ fn process_document_tokens<'doc>(
document_tokenizer: &DocumentTokenizer,
fields_ids_map: &mut GlobalFieldsIdsMap,
word_positions: &mut VecDeque<(Rc<str>, u16)>,
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8) -> Result<()>,
) -> Result<()> {
let mut field_id = None;
let mut token_fn = |_fname: &str, fid: FieldId, pos: u16, word: &str| {
if field_id != Some(fid) {
field_id = Some(fid);
drain_word_positions(word_positions, word_pair_proximity);
drain_word_positions(word_positions, word_pair_proximity)?;
}
// drain the proximity window until the head word is considered close to the word we are inserting.
while word_positions
.front()
.map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
{
word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
}
// insert the new word.
@ -185,6 +176,6 @@ fn process_document_tokens<'doc>(
};
document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
drain_word_positions(word_positions, word_pair_proximity);
drain_word_positions(word_positions, word_pair_proximity)?;
Ok(())
}