mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-24 20:46:27 +00:00 
			
		
		
		
	Use HashMap instead of BTreeMap in the word pair proximity (wpp) extractor
This commit is contained in:
		| @@ -1,4 +1,4 @@ | ||||
| use std::collections::{BTreeMap, VecDeque}; | ||||
| use std::collections::{HashMap, VecDeque}; | ||||
|  | ||||
| use heed::RoTxn; | ||||
| use itertools::merge_join_by; | ||||
| @@ -35,10 +35,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | ||||
|         cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>, | ||||
|         document_change: DocumentChange, | ||||
|     ) -> Result<()> { | ||||
|         /// TODO: mutualize those buffers | ||||
|         let mut key_buffer = Vec::new(); | ||||
|         let mut add_word_pair_proximity = BTreeMap::new(); | ||||
|         let mut del_word_pair_proximity = BTreeMap::new(); | ||||
|         let mut word_pair_proximity = HashMap::new(); | ||||
|         let mut word_positions: VecDeque<(String, u16)> = | ||||
|             VecDeque::with_capacity(MAX_DISTANCE as usize); | ||||
|  | ||||
| @@ -51,7 +49,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | ||||
|                     document_tokenizer, | ||||
|                     fields_ids_map, | ||||
|                     &mut word_positions, | ||||
|                     &mut del_word_pair_proximity, | ||||
|                     &mut |(w1, w2), prox| { | ||||
|                         word_pair_proximity | ||||
|                             .entry((w1, w2)) | ||||
|                             .and_modify(|(del_p, _add_p)| { | ||||
|                                 *del_p = std::cmp::min(*del_p, prox); | ||||
|                             }) | ||||
|                             .or_insert((prox, 0)); | ||||
|                     }, | ||||
|                 )?; | ||||
|             } | ||||
|             DocumentChange::Update(inner) => { | ||||
| @@ -61,7 +66,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | ||||
|                     document_tokenizer, | ||||
|                     fields_ids_map, | ||||
|                     &mut word_positions, | ||||
|                     &mut del_word_pair_proximity, | ||||
|                     &mut |(w1, w2), prox| { | ||||
|                         word_pair_proximity | ||||
|                             .entry((w1, w2)) | ||||
|                             .and_modify(|(del_p, _add_p)| { | ||||
|                                 *del_p = std::cmp::min(*del_p, prox); | ||||
|                             }) | ||||
|                             .or_insert((prox, 0)); | ||||
|                     }, | ||||
|                 )?; | ||||
|                 let document = inner.new(); | ||||
|                 process_document_tokens( | ||||
| @@ -69,7 +81,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | ||||
|                     document_tokenizer, | ||||
|                     fields_ids_map, | ||||
|                     &mut word_positions, | ||||
|                     &mut add_word_pair_proximity, | ||||
|                     &mut |(w1, w2), prox| { | ||||
|                         word_pair_proximity | ||||
|                             .entry((w1, w2)) | ||||
|                             .and_modify(|(_del_p, add_p)| { | ||||
|                                 *add_p = std::cmp::min(*add_p, prox); | ||||
|                             }) | ||||
|                             .or_insert((0, prox)); | ||||
|                     }, | ||||
|                 )?; | ||||
|             } | ||||
|             DocumentChange::Insertion(inner) => { | ||||
| @@ -79,36 +98,24 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | ||||
|                     document_tokenizer, | ||||
|                     fields_ids_map, | ||||
|                     &mut word_positions, | ||||
|                     &mut add_word_pair_proximity, | ||||
|                     &mut |(w1, w2), prox| { | ||||
|                         word_pair_proximity | ||||
|                             .entry((w1, w2)) | ||||
|                             .and_modify(|(_del_p, add_p)| { | ||||
|                                 *add_p = std::cmp::min(*add_p, prox); | ||||
|                             }) | ||||
|                             .or_insert((0, prox)); | ||||
|                     }, | ||||
|                 )?; | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         use itertools::EitherOrBoth::*; | ||||
|         for eob in | ||||
|             merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| { | ||||
|                 d.cmp(a) | ||||
|             }) | ||||
|         { | ||||
|             match eob { | ||||
|                 Left(((w1, w2), prox)) => { | ||||
|                     let key = build_key(*prox, w1, w2, &mut key_buffer); | ||||
|         for ((w1, w2), (del_p, add_p)) in word_pair_proximity.iter() { | ||||
|             let key = build_key(*del_p, w1, w2, &mut key_buffer); | ||||
|             cached_sorter.insert_del_u32(key, docid)?; | ||||
|                 } | ||||
|                 Right(((w1, w2), prox)) => { | ||||
|                     let key = build_key(*prox, w1, w2, &mut key_buffer); | ||||
|             let key = build_key(*add_p, w1, w2, &mut key_buffer); | ||||
|             cached_sorter.insert_add_u32(key, docid)?; | ||||
|         } | ||||
|                 Both(((w1, w2), del_prox), (_, add_prox)) => { | ||||
|                     if del_prox != add_prox { | ||||
|                         let key = build_key(*del_prox, w1, w2, &mut key_buffer); | ||||
|                         cached_sorter.insert_del_u32(key, docid)?; | ||||
|                         let key = build_key(*add_prox, w1, w2, &mut key_buffer); | ||||
|                         cached_sorter.insert_add_u32(key, docid)?; | ||||
|                     } | ||||
|                 } | ||||
|             } | ||||
|         } | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
| @@ -125,18 +132,19 @@ fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec<u8>) -> & | ||||
|  | ||||
| fn word_positions_into_word_pair_proximity( | ||||
|     word_positions: &mut VecDeque<(String, u16)>, | ||||
|     word_pair_proximity: &mut BTreeMap<(String, String), u8>, | ||||
|     word_pair_proximity: &mut dyn FnMut((String, String), u8), | ||||
| ) -> Result<()> { | ||||
|     let (head_word, head_position) = word_positions.pop_front().unwrap(); | ||||
|     for (word, position) in word_positions.iter() { | ||||
|         let prox = index_proximity(head_position as u32, *position as u32) as u8; | ||||
|         if prox > 0 && prox < MAX_DISTANCE as u8 { | ||||
|             word_pair_proximity | ||||
|                 .entry((head_word.clone(), word.clone())) | ||||
|                 .and_modify(|p| { | ||||
|                     *p = std::cmp::min(*p, prox); | ||||
|                 }) | ||||
|                 .or_insert(prox); | ||||
|             word_pair_proximity((head_word.clone(), word.clone()), prox); | ||||
|             // word_pair_proximity | ||||
|             //     .entry((head_word.clone(), word.clone())) | ||||
|             //     .and_modify(|p| { | ||||
|             //         *p = std::cmp::min(*p, prox); | ||||
|             //     }) | ||||
|             //     .or_insert(prox); | ||||
|         } | ||||
|     } | ||||
|     Ok(()) | ||||
| @@ -147,7 +155,7 @@ fn process_document_tokens( | ||||
|     document_tokenizer: &DocumentTokenizer, | ||||
|     fields_ids_map: &mut GlobalFieldsIdsMap, | ||||
|     word_positions: &mut VecDeque<(String, u16)>, | ||||
|     word_pair_proximity: &mut BTreeMap<(String, String), u8>, | ||||
|     word_pair_proximity: &mut dyn FnMut((String, String), u8), | ||||
| ) -> Result<()> { | ||||
|     let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| { | ||||
|         // drain the proximity window until the head word is considered close to the word we are inserting. | ||||
|   | ||||
		Reference in New Issue
	
	Block a user