diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index dcd9e3a78..db64e1132 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -42,8 +42,6 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { let rtxn = &context.rtxn; let mut key_buffer = bumpalo::collections::Vec::new_in(doc_alloc); - let mut del_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc); - let mut add_word_pair_proximity = bumpalo::collections::Vec::new_in(doc_alloc); let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let new_fields_ids_map = &mut *new_fields_ids_map; @@ -65,7 +63,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { new_fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { - del_word_pair_proximity.push(((w1, w2), prox)); + let key = build_key(prox, &w1, &w2, &mut key_buffer); + cached_sorter.insert_del_u32(key, docid) }, )?; } @@ -77,7 +76,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { new_fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { - del_word_pair_proximity.push(((w1, w2), prox)); + let key = build_key(prox, &w1, &w2, &mut key_buffer); + cached_sorter.insert_del_u32(key, docid) }, )?; let document = inner.merged(rtxn, index, context.db_fields_ids_map)?; @@ -87,7 +87,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { new_fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { - add_word_pair_proximity.push(((w1, w2), prox)); + let key = build_key(prox, &w1, &w2, &mut key_buffer); + cached_sorter.insert_add_u32(key, docid) }, )?; } @@ -99,25 +100,13 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { new_fields_ids_map, &mut word_positions, &mut |(w1, w2), prox| { - add_word_pair_proximity.push(((w1, w2), prox)); + let key = build_key(prox, &w1, &w2, &mut key_buffer); + cached_sorter.insert_add_u32(key, docid) }, )?; } } - del_word_pair_proximity.sort_unstable(); - del_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2); - for ((w1, w2), prox) in del_word_pair_proximity.iter() { - let key = build_key(*prox, w1, w2, &mut key_buffer); - cached_sorter.insert_del_u32(key, docid)?; - } - - add_word_pair_proximity.sort_unstable(); - add_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2); - for ((w1, w2), prox) in add_word_pair_proximity.iter() { - let key = build_key(*prox, w1, w2, &mut key_buffer); - cached_sorter.insert_add_u32(key, docid)?; - } Ok(()) } } @@ -138,24 +127,26 @@ fn build_key<'a>( fn word_positions_into_word_pair_proximity( word_positions: &mut VecDeque<(Rc, u16)>, - word_pair_proximity: &mut impl FnMut((Rc, Rc), u8), -) { + word_pair_proximity: &mut impl FnMut((Rc, Rc), u8) -> Result<()>, +) -> Result<()> { let (head_word, head_position) = word_positions.pop_front().unwrap(); for (word, position) in word_positions.iter() { let prox = index_proximity(head_position as u32, *position as u32) as u8; if prox > 0 && prox < MAX_DISTANCE as u8 { - word_pair_proximity((head_word.clone(), word.clone()), prox); + word_pair_proximity((head_word.clone(), word.clone()), prox)?; } } + Ok(()) } fn drain_word_positions( word_positions: &mut VecDeque<(Rc, u16)>, - word_pair_proximity: &mut impl FnMut((Rc, Rc), u8), -) { + word_pair_proximity: &mut impl FnMut((Rc, Rc), u8) -> Result<()>, +) -> Result<()> { while !word_positions.is_empty() { - word_positions_into_word_pair_proximity(word_positions, word_pair_proximity); + word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?; } + Ok(()) } fn process_document_tokens<'doc>( @@ -163,20 +154,20 @@ fn process_document_tokens<'doc>( document_tokenizer: &DocumentTokenizer, fields_ids_map: &mut GlobalFieldsIdsMap, word_positions: &mut VecDeque<(Rc, u16)>, - word_pair_proximity: &mut impl FnMut((Rc, Rc), u8), + word_pair_proximity: &mut impl FnMut((Rc, Rc), u8) -> Result<()>, ) -> Result<()> { let mut field_id = None; let mut token_fn = |_fname: &str, fid: FieldId, pos: u16, word: &str| { if field_id != Some(fid) { field_id = Some(fid); - drain_word_positions(word_positions, word_pair_proximity); + drain_word_positions(word_positions, word_pair_proximity)?; } // drain the proximity window until the head word is considered close to the word we are inserting. while word_positions .front() .map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE) { - word_positions_into_word_pair_proximity(word_positions, word_pair_proximity); + word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?; } // insert the new word. @@ -185,6 +176,6 @@ fn process_document_tokens<'doc>( }; document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?; - drain_word_positions(word_positions, word_pair_proximity); + drain_word_positions(word_positions, word_pair_proximity)?; Ok(()) }