word prox: Remove again the interim BVec

Author: Louis Dureuil
Date:   2024-11-25 15:05:50 +01:00
parent  dd76eaaaec
commit  378c1df811


@@ -42,8 +42,6 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
         let rtxn = &context.rtxn;
 
         let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc);
-        let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
-        let mut add_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc);
 
         let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield();
         let new_fields_ids_map = &mut *new_fields_ids_map;
@@ -65,7 +63,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
                     new_fields_ids_map,
                     &mut word_positions,
                     &mut |(w1, w2), prox| {
-                        del_word_pair_proximity.push(((w1, w2), prox));
+                        let key = build_key(prox, &w1, &w2, &mut key_buffer);
+                        cached_sorter.insert_del_u32(key, docid)
                     },
                 )?;
             }
@@ -77,7 +76,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
                     new_fields_ids_map,
                     &mut word_positions,
                     &mut |(w1, w2), prox| {
-                        del_word_pair_proximity.push(((w1, w2), prox));
+                        let key = build_key(prox, &w1, &w2, &mut key_buffer);
+                        cached_sorter.insert_del_u32(key, docid)
                     },
                 )?;
                 let document = inner.merged(rtxn, index, context.db_fields_ids_map)?;
@@ -87,7 +87,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
                     new_fields_ids_map,
                     &mut word_positions,
                     &mut |(w1, w2), prox| {
-                        add_word_pair_proximity.push(((w1, w2), prox));
+                        let key = build_key(prox, &w1, &w2, &mut key_buffer);
+                        cached_sorter.insert_add_u32(key, docid)
                     },
                 )?;
             }
@@ -99,25 +100,13 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
                     new_fields_ids_map,
                     &mut word_positions,
                     &mut |(w1, w2), prox| {
-                        add_word_pair_proximity.push(((w1, w2), prox));
+                        let key = build_key(prox, &w1, &w2, &mut key_buffer);
+                        cached_sorter.insert_add_u32(key, docid)
                     },
                 )?;
             }
         }
-        del_word_pair_proximity.sort_unstable();
-        del_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
-        for ((w1, w2), prox) in del_word_pair_proximity.iter() {
-            let key = build_key(*prox, w1, w2, &mut key_buffer);
-            cached_sorter.insert_del_u32(key, docid)?;
-        }
-
-        add_word_pair_proximity.sort_unstable();
-        add_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
-        for ((w1, w2), prox) in add_word_pair_proximity.iter() {
-            let key = build_key(*prox, w1, w2, &mut key_buffer);
-            cached_sorter.insert_add_u32(key, docid)?;
-        }
 
         Ok(())
     }
 }
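
Context for the change: before this commit, the extractor staged every (w1, w2, prox) pair in two interim bump-allocated vectors (the "BVec"s of the commit title), then sorted and deduplicated them before flushing into cached_sorter. The commit writes each pair into the sorter straight from the callback instead, presumably because the cached sorter already tolerates duplicate insertions; that is also why the callbacks now return the sorter's Result rather than (). A minimal sketch of the two insertion patterns; ToySorter, its dedup-on-insert behavior, the simplified build_key layout, and the String error type are illustrative assumptions, not the actual BalancedCaches API:

use std::collections::HashSet;

// Hypothetical stand-in for the real cached sorter. It is assumed to
// absorb duplicate (key, docid) insertions; that assumption is what
// makes the interim sort + dedup buffers removable.
struct ToySorter {
    seen: HashSet<(Vec<u8>, u32)>,
}

impl ToySorter {
    fn insert_del_u32(&mut self, key: &[u8], docid: u32) -> Result<(), String> {
        // HashSet::insert ignores duplicates, so double insertions are harmless.
        self.seen.insert((key.to_vec(), docid));
        Ok(())
    }
}

// Simplified key layout: proximity byte, w1, a 0 separator, w2.
fn build_key(prox: u8, w1: &str, w2: &str) -> Vec<u8> {
    let mut key = vec![prox];
    key.extend_from_slice(w1.as_bytes());
    key.push(0);
    key.extend_from_slice(w2.as_bytes());
    key
}

fn main() -> Result<(), String> {
    let docid = 42;
    let mut sorter = ToySorter { seen: HashSet::new() };

    // Old pattern: stage the pairs, sort, dedup, then insert each unique pair.
    let mut del_pairs = vec![(("the", "cat"), 1u8), (("the", "cat"), 1u8)];
    del_pairs.sort_unstable();
    del_pairs.dedup_by(|(k1, _), (k2, _)| k1 == k2);
    for ((w1, w2), prox) in &del_pairs {
        sorter.insert_del_u32(&build_key(*prox, w1, w2), docid)?;
    }

    // New pattern: insert straight from the callback, no interim buffer.
    let mut on_pair = |(w1, w2): (&str, &str), prox: u8| -> Result<(), String> {
        sorter.insert_del_u32(&build_key(prox, w1, w2), docid)
    };
    on_pair(("the", "cat"), 1)?;
    on_pair(("the", "cat"), 1)?; // the duplicate is absorbed by the sorter

    assert_eq!(sorter.seen.len(), 1);
    Ok(())
}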
@@ -138,24 +127,26 @@ fn build_key<'a>(
 
 fn word_positions_into_word_pair_proximity(
     word_positions: &mut VecDeque<(Rc<str>, u16)>,
-    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
-) {
+    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8) -> Result<()>,
+) -> Result<()> {
     let (head_word, head_position) = word_positions.pop_front().unwrap();
     for (word, position) in word_positions.iter() {
         let prox = index_proximity(head_position as u32, *position as u32) as u8;
         if prox > 0 && prox < MAX_DISTANCE as u8 {
-            word_pair_proximity((head_word.clone(), word.clone()), prox);
+            word_pair_proximity((head_word.clone(), word.clone()), prox)?;
         }
     }
+    Ok(())
 }
 
 fn drain_word_positions(
     word_positions: &mut VecDeque<(Rc<str>, u16)>,
-    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
-) {
+    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8) -> Result<()>,
+) -> Result<()> {
     while !word_positions.is_empty() {
-        word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
+        word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
     }
+    Ok(())
 }
 
 fn process_document_tokens<'doc>(
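
The signature changes above are the mechanical consequence of inserting from inside the callback: a write to the sorter can fail, so the FnMut bound gains a -> Result<()> and every call site propagates failure with ?. A self-contained sketch of this fallible-callback pattern; the function names and the String-based Result are placeholders, not milli's types:

// Stand-ins for milli's error plumbing; only the shape matters here.
type Result<T> = std::result::Result<T, String>;

// A helper that walks adjacent pairs and forwards the callback's failure
// with `?`, mirroring how word_positions_into_word_pair_proximity now behaves.
fn for_each_adjacent_pair(
    items: &[u32],
    on_pair: &mut impl FnMut(u32, u32) -> Result<()>,
) -> Result<()> {
    for w in items.windows(2) {
        // The first failing callback aborts the whole traversal.
        on_pair(w[0], w[1])?;
    }
    Ok(())
}

fn main() -> Result<()> {
    let mut sum: u32 = 0;
    for_each_adjacent_pair(&[1, 2, 3], &mut |a, b| {
        // A fallible "sink", playing the role of cached_sorter.insert_del_u32.
        sum = sum.checked_add(a + b).ok_or_else(|| "overflow".to_string())?;
        Ok(())
    })?;
    assert_eq!(sum, 8);
    Ok(())
}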
@@ -163,20 +154,20 @@ fn process_document_tokens<'doc>(
     document_tokenizer: &DocumentTokenizer,
     fields_ids_map: &mut GlobalFieldsIdsMap,
     word_positions: &mut VecDeque<(Rc<str>, u16)>,
-    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
+    word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8) -> Result<()>,
 ) -> Result<()> {
     let mut field_id = None;
     let mut token_fn = |_fname: &str, fid: FieldId, pos: u16, word: &str| {
         if field_id != Some(fid) {
             field_id = Some(fid);
-            drain_word_positions(word_positions, word_pair_proximity);
+            drain_word_positions(word_positions, word_pair_proximity)?;
         }
 
         // drain the proximity window until the head word is considered close to the word we are inserting.
         while word_positions
             .front()
             .map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
         {
-            word_positions_into_word_pair_proximity(word_positions, word_pair_proximity);
+            word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
         }
 
        // insert the new word.
@@ -185,6 +176,6 @@ fn process_document_tokens<'doc>(
     };
 
     document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
-    drain_word_positions(word_positions, word_pair_proximity);
+    drain_word_positions(word_positions, word_pair_proximity)?;
     Ok(())
 }
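
For readers new to this extractor: word_positions is a sliding proximity window. Each incoming token first evicts head words that are too far away (index_proximity >= MAX_DISTANCE) or that belong to a previous field, and every evicted head is paired with the words still in the window. A standalone sketch of that windowing logic; the plain positional distance and the MAX_DISTANCE value are stand-ins for milli's definitions:

use std::collections::VecDeque;

const MAX_DISTANCE: u32 = 4; // illustrative; milli defines its own value

// Stand-in for index_proximity: plain positional distance.
fn proximity(a: u32, b: u32) -> u32 {
    a.abs_diff(b)
}

// Pop the head word and pair it with every word still in the window,
// like word_positions_into_word_pair_proximity above.
fn pop_head_into_pairs(
    window: &mut VecDeque<(String, u32)>,
    pairs: &mut Vec<(String, String, u32)>,
) {
    let (head, head_pos) = window.pop_front().unwrap();
    for (word, pos) in window.iter() {
        let prox = proximity(head_pos, *pos);
        if prox > 0 && prox < MAX_DISTANCE {
            pairs.push((head.clone(), word.clone(), prox));
        }
    }
}

fn main() {
    let tokens = [("the", 0), ("quick", 1), ("brown", 2), ("fox", 6)];
    let mut window: VecDeque<(String, u32)> = VecDeque::new();
    let mut pairs = Vec::new();

    for (word, pos) in tokens {
        // Drain until the head is close enough to the incoming token.
        while window.front().map_or(false, |(_w, p)| proximity(*p, pos) >= MAX_DISTANCE) {
            pop_head_into_pairs(&mut window, &mut pairs);
        }
        window.push_back((word.to_string(), pos));
    }
    // End of document: flush whatever remains in the window.
    while !window.is_empty() {
        pop_head_into_pairs(&mut window, &mut pairs);
    }

    // "the" pairs with "quick" and "brown" but not with "fox" (distance 6).
    for (w1, w2, prox) in &pairs {
        println!("({w1}, {w2}) prox {prox}");
    }
}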