mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-24 20:46:27 +00:00 
			
		
		
		
	Use a HashMap instead of BTreeMaps in the word pair proximity (wpp) extractor
This commit is contained in:
		| @@ -1,4 +1,4 @@ | |||||||
| use std::collections::{BTreeMap, VecDeque}; | use std::collections::{HashMap, VecDeque}; | ||||||
|  |  | ||||||
| use heed::RoTxn; | use heed::RoTxn; | ||||||
| use itertools::merge_join_by; | use itertools::merge_join_by; | ||||||
| @@ -35,10 +35,8 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | |||||||
|         cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>, |         cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>, | ||||||
|         document_change: DocumentChange, |         document_change: DocumentChange, | ||||||
|     ) -> Result<()> { |     ) -> Result<()> { | ||||||
|         /// TODO: mutualize those buffers |  | ||||||
|         let mut key_buffer = Vec::new(); |         let mut key_buffer = Vec::new(); | ||||||
|         let mut add_word_pair_proximity = BTreeMap::new(); |         let mut word_pair_proximity = HashMap::new(); | ||||||
|         let mut del_word_pair_proximity = BTreeMap::new(); |  | ||||||
|         let mut word_positions: VecDeque<(String, u16)> = |         let mut word_positions: VecDeque<(String, u16)> = | ||||||
|             VecDeque::with_capacity(MAX_DISTANCE as usize); |             VecDeque::with_capacity(MAX_DISTANCE as usize); | ||||||
|  |  | ||||||
| @@ -51,7 +49,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | |||||||
|                     document_tokenizer, |                     document_tokenizer, | ||||||
|                     fields_ids_map, |                     fields_ids_map, | ||||||
|                     &mut word_positions, |                     &mut word_positions, | ||||||
|                     &mut del_word_pair_proximity, |                     &mut |(w1, w2), prox| { | ||||||
|  |                         word_pair_proximity | ||||||
|  |                             .entry((w1, w2)) | ||||||
|  |                             .and_modify(|(del_p, _add_p)| { | ||||||
|  |                                 *del_p = std::cmp::min(*del_p, prox); | ||||||
|  |                             }) | ||||||
|  |                             .or_insert((prox, 0)); | ||||||
|  |                     }, | ||||||
|                 )?; |                 )?; | ||||||
|             } |             } | ||||||
|             DocumentChange::Update(inner) => { |             DocumentChange::Update(inner) => { | ||||||
| @@ -61,7 +66,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | |||||||
|                     document_tokenizer, |                     document_tokenizer, | ||||||
|                     fields_ids_map, |                     fields_ids_map, | ||||||
|                     &mut word_positions, |                     &mut word_positions, | ||||||
|                     &mut del_word_pair_proximity, |                     &mut |(w1, w2), prox| { | ||||||
|  |                         word_pair_proximity | ||||||
|  |                             .entry((w1, w2)) | ||||||
|  |                             .and_modify(|(del_p, _add_p)| { | ||||||
|  |                                 *del_p = std::cmp::min(*del_p, prox); | ||||||
|  |                             }) | ||||||
|  |                             .or_insert((prox, 0)); | ||||||
|  |                     }, | ||||||
|                 )?; |                 )?; | ||||||
|                 let document = inner.new(); |                 let document = inner.new(); | ||||||
|                 process_document_tokens( |                 process_document_tokens( | ||||||
| @@ -69,7 +81,14 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | |||||||
|                     document_tokenizer, |                     document_tokenizer, | ||||||
|                     fields_ids_map, |                     fields_ids_map, | ||||||
|                     &mut word_positions, |                     &mut word_positions, | ||||||
|                     &mut add_word_pair_proximity, |                     &mut |(w1, w2), prox| { | ||||||
|  |                         word_pair_proximity | ||||||
|  |                             .entry((w1, w2)) | ||||||
|  |                             .and_modify(|(_del_p, add_p)| { | ||||||
|  |                                 *add_p = std::cmp::min(*add_p, prox); | ||||||
|  |                             }) | ||||||
|  |                             .or_insert((0, prox)); | ||||||
|  |                     }, | ||||||
|                 )?; |                 )?; | ||||||
|             } |             } | ||||||
|             DocumentChange::Insertion(inner) => { |             DocumentChange::Insertion(inner) => { | ||||||
| @@ -79,35 +98,23 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { | |||||||
|                     document_tokenizer, |                     document_tokenizer, | ||||||
|                     fields_ids_map, |                     fields_ids_map, | ||||||
|                     &mut word_positions, |                     &mut word_positions, | ||||||
|                     &mut add_word_pair_proximity, |                     &mut |(w1, w2), prox| { | ||||||
|  |                         word_pair_proximity | ||||||
|  |                             .entry((w1, w2)) | ||||||
|  |                             .and_modify(|(_del_p, add_p)| { | ||||||
|  |                                 *add_p = std::cmp::min(*add_p, prox); | ||||||
|  |                             }) | ||||||
|  |                             .or_insert((0, prox)); | ||||||
|  |                     }, | ||||||
|                 )?; |                 )?; | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         use itertools::EitherOrBoth::*; |         for ((w1, w2), (del_p, add_p)) in word_pair_proximity.iter() { | ||||||
|         for eob in |             let key = build_key(*del_p, w1, w2, &mut key_buffer); | ||||||
|             merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| { |             cached_sorter.insert_del_u32(key, docid)?; | ||||||
|                 d.cmp(a) |             let key = build_key(*add_p, w1, w2, &mut key_buffer); | ||||||
|             }) |             cached_sorter.insert_add_u32(key, docid)?; | ||||||
|         { |  | ||||||
|             match eob { |  | ||||||
|                 Left(((w1, w2), prox)) => { |  | ||||||
|                     let key = build_key(*prox, w1, w2, &mut key_buffer); |  | ||||||
|                     cached_sorter.insert_del_u32(key, docid)?; |  | ||||||
|                 } |  | ||||||
|                 Right(((w1, w2), prox)) => { |  | ||||||
|                     let key = build_key(*prox, w1, w2, &mut key_buffer); |  | ||||||
|                     cached_sorter.insert_add_u32(key, docid)?; |  | ||||||
|                 } |  | ||||||
|                 Both(((w1, w2), del_prox), (_, add_prox)) => { |  | ||||||
|                     if del_prox != add_prox { |  | ||||||
|                         let key = build_key(*del_prox, w1, w2, &mut key_buffer); |  | ||||||
|                         cached_sorter.insert_del_u32(key, docid)?; |  | ||||||
|                         let key = build_key(*add_prox, w1, w2, &mut key_buffer); |  | ||||||
|                         cached_sorter.insert_add_u32(key, docid)?; |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
| @@ -125,18 +132,19 @@ fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec<u8>) -> & | |||||||
|  |  | ||||||
| fn word_positions_into_word_pair_proximity( | fn word_positions_into_word_pair_proximity( | ||||||
|     word_positions: &mut VecDeque<(String, u16)>, |     word_positions: &mut VecDeque<(String, u16)>, | ||||||
|     word_pair_proximity: &mut BTreeMap<(String, String), u8>, |     word_pair_proximity: &mut dyn FnMut((String, String), u8), | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     let (head_word, head_position) = word_positions.pop_front().unwrap(); |     let (head_word, head_position) = word_positions.pop_front().unwrap(); | ||||||
|     for (word, position) in word_positions.iter() { |     for (word, position) in word_positions.iter() { | ||||||
|         let prox = index_proximity(head_position as u32, *position as u32) as u8; |         let prox = index_proximity(head_position as u32, *position as u32) as u8; | ||||||
|         if prox > 0 && prox < MAX_DISTANCE as u8 { |         if prox > 0 && prox < MAX_DISTANCE as u8 { | ||||||
|             word_pair_proximity |             word_pair_proximity((head_word.clone(), word.clone()), prox); | ||||||
|                 .entry((head_word.clone(), word.clone())) |             // word_pair_proximity | ||||||
|                 .and_modify(|p| { |             //     .entry((head_word.clone(), word.clone())) | ||||||
|                     *p = std::cmp::min(*p, prox); |             //     .and_modify(|p| { | ||||||
|                 }) |             //         *p = std::cmp::min(*p, prox); | ||||||
|                 .or_insert(prox); |             //     }) | ||||||
|  |             //     .or_insert(prox); | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|     Ok(()) |     Ok(()) | ||||||
| @@ -147,7 +155,7 @@ fn process_document_tokens( | |||||||
|     document_tokenizer: &DocumentTokenizer, |     document_tokenizer: &DocumentTokenizer, | ||||||
|     fields_ids_map: &mut GlobalFieldsIdsMap, |     fields_ids_map: &mut GlobalFieldsIdsMap, | ||||||
|     word_positions: &mut VecDeque<(String, u16)>, |     word_positions: &mut VecDeque<(String, u16)>, | ||||||
|     word_pair_proximity: &mut BTreeMap<(String, String), u8>, |     word_pair_proximity: &mut dyn FnMut((String, String), u8), | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| { |     let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| { | ||||||
|         // drain the proximity window until the head word is considered close to the word we are inserting. |         // drain the proximity window until the head word is considered close to the word we are inserting. | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user