Mirror of https://github.com/meilisearch/meilisearch.git
Commit bdeb47305e (parent 19b2326f3d), committed by Loïc Lecrenier
	Change encoding of word_pair_proximity DB to (proximity, word1, word2)
Same for word_prefix_pair_proximity
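Concretely, a key that used to be laid out as `word1 \0 word2 \0 proximity` is now laid out as `proximity word1 \0 word2`: the single proximity byte moves to the front, and the trailing word no longer needs its own terminator since it runs to the end of the key. A minimal standalone sketch of the new layout (hypothetical helper names, not the actual milli codec shown in the diff below):

```rust
// Hypothetical helpers illustrating the new key layout: [proximity][word1]\0[word2].
fn encode_key(word1: &str, word2: &str, proximity: u8) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(1 + word1.len() + 1 + word2.len());
    bytes.push(proximity); // the proximity byte now comes first
    bytes.extend_from_slice(word1.as_bytes());
    bytes.push(0); // NUL separator between the two words
    bytes.extend_from_slice(word2.as_bytes()); // word2 runs to the end of the key
    bytes
}

fn decode_key(bytes: &[u8]) -> Option<(&str, &str, u8)> {
    let (proximity, rest) = bytes.split_first()?;
    let sep = rest.iter().position(|b| *b == 0)?;
    let (w1, w2) = (&rest[..sep], &rest[sep + 1..]);
    Some((std::str::from_utf8(w1).ok()?, std::str::from_utf8(w2).ok()?, *proximity))
}
```

The diff applies this layout to the `StrStrU8Codec` and `UncheckedStrStrU8Codec` codecs, to the key construction during extraction, and to the word-prefix-pair-proximity update.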
			| @@ -7,12 +7,11 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec { | |||||||
|     type DItem = (&'a str, &'a str, u8); |     type DItem = (&'a str, &'a str, u8); | ||||||
|  |  | ||||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { |     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||||
|         let (n, bytes) = bytes.split_last()?; |         let (n, bytes) = bytes.split_first()?; | ||||||
|         let s1_end = bytes.iter().position(|b| *b == 0)?; |         let s1_end = bytes.iter().position(|b| *b == 0)?; | ||||||
|         let (s1_bytes, rest) = bytes.split_at(s1_end); |         let (s1_bytes, rest) = bytes.split_at(s1_end); | ||||||
|         let rest = &rest[1..]; |         let s2_bytes = &rest[1..]; | ||||||
|         let s1 = str::from_utf8(s1_bytes).ok()?; |         let s1 = str::from_utf8(s1_bytes).ok()?; | ||||||
|         let (_, s2_bytes) = rest.split_last()?; |  | ||||||
|         let s2 = str::from_utf8(s2_bytes).ok()?; |         let s2 = str::from_utf8(s2_bytes).ok()?; | ||||||
|         Some((s1, s2, *n)) |         Some((s1, s2, *n)) | ||||||
|     } |     } | ||||||
| @@ -22,12 +21,11 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec { | |||||||
|     type EItem = (&'a str, &'a str, u8); |     type EItem = (&'a str, &'a str, u8); | ||||||
|  |  | ||||||
|     fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> { |     fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> { | ||||||
|         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1); |         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); | ||||||
|  |         bytes.push(*n); | ||||||
|         bytes.extend_from_slice(s1.as_bytes()); |         bytes.extend_from_slice(s1.as_bytes()); | ||||||
|         bytes.push(0); |         bytes.push(0); | ||||||
|         bytes.extend_from_slice(s2.as_bytes()); |         bytes.extend_from_slice(s2.as_bytes()); | ||||||
|         bytes.push(0); |  | ||||||
|         bytes.push(*n); |  | ||||||
|         Some(Cow::Owned(bytes)) |         Some(Cow::Owned(bytes)) | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -37,11 +35,10 @@ impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec { | |||||||
|     type DItem = (&'a [u8], &'a [u8], u8); |     type DItem = (&'a [u8], &'a [u8], u8); | ||||||
|  |  | ||||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { |     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||||
|         let (n, bytes) = bytes.split_last()?; |         let (n, bytes) = bytes.split_first()?; | ||||||
|         let s1_end = bytes.iter().position(|b| *b == 0)?; |         let s1_end = bytes.iter().position(|b| *b == 0)?; | ||||||
|         let (s1_bytes, rest) = bytes.split_at(s1_end); |         let (s1_bytes, rest) = bytes.split_at(s1_end); | ||||||
|         let rest = &rest[1..]; |         let s2_bytes = &rest[1..]; | ||||||
|         let (_, s2_bytes) = rest.split_last()?; |  | ||||||
|         Some((s1_bytes, s2_bytes, *n)) |         Some((s1_bytes, s2_bytes, *n)) | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -50,12 +47,11 @@ impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec { | |||||||
|     type EItem = (&'a [u8], &'a [u8], u8); |     type EItem = (&'a [u8], &'a [u8], u8); | ||||||
|  |  | ||||||
|     fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> { |     fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> { | ||||||
|         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1); |         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); | ||||||
|  |         bytes.push(*n); | ||||||
|         bytes.extend_from_slice(s1); |         bytes.extend_from_slice(s1); | ||||||
|         bytes.push(0); |         bytes.push(0); | ||||||
|         bytes.extend_from_slice(s2); |         bytes.extend_from_slice(s2); | ||||||
|         bytes.push(0); |  | ||||||
|         bytes.push(*n); |  | ||||||
|         Some(Cow::Owned(bytes)) |         Some(Cow::Owned(bytes)) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
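Because LMDB compares raw keys byte-wise by default, moving the proximity byte to the front changes the iteration order of the whole database: all pairs at proximity 1 come before all pairs at proximity 2, and so on, and within a proximity the pairs are ordered by `word1` and then `word2`. A self-contained sketch of that effect, with a plain `Vec` sort standing in for the database's key comparison (illustrative helper, not the milli codec):

```rust
// Byte-wise sorting of the new keys groups entries by proximity first,
// then by word1, then by word2 (illustrative helper, not the milli codec).
fn key(proximity: u8, word1: &str, word2: &str) -> Vec<u8> {
    let mut k = vec![proximity];
    k.extend_from_slice(word1.as_bytes());
    k.push(0);
    k.extend_from_slice(word2.as_bytes());
    k
}

fn main() {
    let mut keys = vec![
        key(2, "good", "dog"),
        key(1, "good", "doggo"),
        key(1, "good", "door"),
        key(2, "horror", "cathedral"),
    ];
    keys.sort(); // the same ordering LMDB applies to raw keys by default
    // All proximity-1 keys now precede all proximity-2 keys:
    assert_eq!(keys.iter().map(|k| k[0]).collect::<Vec<_>>(), vec![1u8, 1, 2, 2]);
}
```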
| @@ -194,7 +194,7 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { | |||||||
|         (word1, prefix, proximity), |         (word1, prefix, proximity), | ||||||
|         b, |         b, | ||||||
|     )| { |     )| { | ||||||
|         &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) |         &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) | ||||||
|     }); |     }); | ||||||
|     snap |     snap | ||||||
| } | } | ||||||
|   | |||||||
| @@ -151,11 +151,10 @@ fn document_word_positions_into_sorter<'b>( | |||||||
|     let mut key_buffer = Vec::new(); |     let mut key_buffer = Vec::new(); | ||||||
|     for ((w1, w2), prox) in word_pair_proximity { |     for ((w1, w2), prox) in word_pair_proximity { | ||||||
|         key_buffer.clear(); |         key_buffer.clear(); | ||||||
|  |         key_buffer.push(prox as u8); | ||||||
|         key_buffer.extend_from_slice(w1.as_bytes()); |         key_buffer.extend_from_slice(w1.as_bytes()); | ||||||
|         key_buffer.push(0); |         key_buffer.push(0); | ||||||
|         key_buffer.extend_from_slice(w2.as_bytes()); |         key_buffer.extend_from_slice(w2.as_bytes()); | ||||||
|         key_buffer.push(0); |  | ||||||
|         key_buffer.push(prox as u8); |  | ||||||
|  |  | ||||||
|         word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; |         word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -1,46 +1,46 @@ | |||||||
| --- | --- | ||||||
| source: milli/src/update/word_prefix_pair_proximity_docids.rs | source: milli/src/update/word_prefix_pair_proximity_docids.rs | ||||||
| --- | --- | ||||||
| 5                a    1  [101, ] | 1  5                a    [101, ] | ||||||
| 5                a    2  [101, ] | 1  amazing          a    [100, ] | ||||||
| 5                b    4  [101, ] | 1  an               a    [100, ] | ||||||
| 5                be   4  [101, ] | 1  and              b    [100, ] | ||||||
| am               a    3  [101, ] | 1  and              be   [100, ] | ||||||
| amazing          a    1  [100, ] | 1  at               a    [100, ] | ||||||
| amazing          a    2  [100, ] | 1  rings            a    [101, ] | ||||||
| amazing          a    3  [100, ] | 1  the              b    [101, ] | ||||||
| amazing          b    2  [100, ] | 1  the              be   [101, ] | ||||||
| amazing          be   2  [100, ] | 2  5                a    [101, ] | ||||||
| an               a    1  [100, ] | 2  amazing          a    [100, ] | ||||||
| an               a    2  [100, ] | 2  amazing          b    [100, ] | ||||||
| an               b    3  [100, ] | 2  amazing          be   [100, ] | ||||||
| an               be   3  [100, ] | 2  an               a    [100, ] | ||||||
| and              a    2  [100, ] | 2  and              a    [100, ] | ||||||
| and              a    3  [100, ] | 2  at               a    [100, 101, ] | ||||||
| and              a    4  [100, ] | 2  beautiful        a    [100, ] | ||||||
| and              b    1  [100, ] | 2  bell             a    [101, ] | ||||||
| and              be   1  [100, ] | 2  house            b    [100, ] | ||||||
| at               a    1  [100, ] | 2  house            be   [100, ] | ||||||
| at               a    2  [100, 101, ] | 2  rings            b    [101, ] | ||||||
| at               a    3  [100, ] | 2  rings            be   [101, ] | ||||||
| at               b    3  [101, ] | 3  am               a    [101, ] | ||||||
| at               b    4  [100, ] | 3  amazing          a    [100, ] | ||||||
| at               be   3  [101, ] | 3  an               b    [100, ] | ||||||
| at               be   4  [100, ] | 3  an               be   [100, ] | ||||||
| beautiful        a    2  [100, ] | 3  and              a    [100, ] | ||||||
| beautiful        a    3  [100, ] | 3  at               a    [100, ] | ||||||
| beautiful        a    4  [100, ] | 3  at               b    [101, ] | ||||||
| bell             a    2  [101, ] | 3  at               be   [101, ] | ||||||
| bell             a    4  [101, ] | 3  beautiful        a    [100, ] | ||||||
| house            a    3  [100, ] | 3  house            a    [100, ] | ||||||
| house            a    4  [100, ] | 3  rings            a    [101, ] | ||||||
| house            b    2  [100, ] | 3  the              a    [101, ] | ||||||
| house            be   2  [100, ] | 4  5                b    [101, ] | ||||||
| rings            a    1  [101, ] | 4  5                be   [101, ] | ||||||
| rings            a    3  [101, ] | 4  and              a    [100, ] | ||||||
| rings            b    2  [101, ] | 4  at               b    [100, ] | ||||||
| rings            be   2  [101, ] | 4  at               be   [100, ] | ||||||
| the              a    3  [101, ] | 4  beautiful        a    [100, ] | ||||||
| the              b    1  [101, ] | 4  bell             a    [101, ] | ||||||
| the              be   1  [101, ] | 4  house            a    [100, ] | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| --- | --- | ||||||
| source: milli/src/update/word_prefix_pair_proximity_docids.rs | source: milli/src/update/word_prefix_pair_proximity_docids.rs | ||||||
| --- | --- | ||||||
| 5ed4bf83317b10962a55ade353427bdd | fb88e49fd666886731b62baef8f44995 | ||||||
|   | |||||||
| @@ -1,7 +1,7 @@ | |||||||
| /*! | /*! | ||||||
|  ## What is WordPrefixPairProximityDocids? |  ## What is WordPrefixPairProximityDocids? | ||||||
| The word-prefix-pair-proximity-docids database is a database whose keys are of | The word-prefix-pair-proximity-docids database is a database whose keys are of | ||||||
| the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of | the form `(proximity, word, prefix)` and the values are roaring bitmaps of | ||||||
| the documents which contain `word` followed by another word starting with | the documents which contain `word` followed by another word starting with | ||||||
| `prefix` at a distance of `proximity`. | `prefix` at a distance of `proximity`. | ||||||
|  |  | ||||||
| @@ -23,127 +23,100 @@ dog | |||||||
| Note that only prefixes which correspond to more than a certain number of | Note that only prefixes which correspond to more than a certain number of | ||||||
| different words from the database are included in this list. | different words from the database are included in this list. | ||||||
|  |  | ||||||
| * a sorted list of word pairs and the distance between them (i.e. proximity), | * a sorted list of proximities and word pairs (the proximity is the distance between the two words), | ||||||
| * associated with a roaring bitmap, such as: | associated with a roaring bitmap, such as: | ||||||
| ```text | ```text | ||||||
| good dog   3         -> docids1: [2, 5, 6] | 1 good doggo         -> docids1: [8] | ||||||
| good doggo 1         -> docids2: [8] | 1 good door          -> docids2: [7, 19, 20] | ||||||
| good dogma 1         -> docids3: [7, 19, 20] | 1 good ghost         -> docids3: [1] | ||||||
| good ghost 2         -> docids4: [1] | 2 good dog           -> docids4: [2, 5, 6] | ||||||
| horror cathedral 4   -> docids5: [1, 2] | 2 horror cathedral   -> docids5: [1, 2] | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| I illustrate a simplified version of the algorithm to create the word-prefix | I illustrate a simplified version of the algorithm to create the word-prefix | ||||||
| pair-proximity database below: | pair-proximity database below: | ||||||
|  |  | ||||||
| 1. **Outer loop:** First, we iterate over each word pair and its proximity: | 1. **Outer loop:** First, we iterate over each proximity and word pair: | ||||||
| ```text | ```text | ||||||
|  | proximity: 1 | ||||||
| word1    : good | word1    : good | ||||||
| word2    : dog | word2    : doggo | ||||||
| proximity: 3 |  | ||||||
| ``` | ``` | ||||||
| 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are | 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are | ||||||
| in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) | in the list of sorted prefixes. And we insert the key `prefix` | ||||||
| and the value (`docids`) to a sorted map which we call the “batch”. For example, | and the value (`docids`) to a sorted map which we call the “batch”. For example, | ||||||
| at the end of the first inner loop, we may have: | at the end of the first inner loop, we may have: | ||||||
| ```text | ```text | ||||||
| Outer loop 1: | Outer loop 1: | ||||||
| ------------------------------ | ------------------------------ | ||||||
|  | proximity: 1 | ||||||
| word1    : good | word1    : good | ||||||
| word2    : dog | word2    : doggo | ||||||
| proximity: 3 |  | ||||||
| docids   : docids1 | docids   : docids1 | ||||||
|  |  | ||||||
| prefixes: [d, do, dog] | prefixes: [d, do, dog] | ||||||
|  |  | ||||||
| batch: [ | batch: [ | ||||||
|     (d, 3)   -> [docids1] |     d,   -> [docids1] | ||||||
|     (do, 3)  -> [docids1] |     do   -> [docids1] | ||||||
|     (dog, 3) -> [docids1] |     dog  -> [docids1] | ||||||
| ] | ] | ||||||
| ``` | ``` | ||||||
| 3. For illustration purpose, let's run through a second iteration of the outer loop: | 3. For illustration purpose, let's run through a second iteration of the outer loop: | ||||||
| ```text | ```text | ||||||
| Outer loop 2: | Outer loop 2: | ||||||
| ------------------------------ | ------------------------------ | ||||||
| word1    : good |  | ||||||
| word2    : doggo |  | ||||||
| proximity: 1 | proximity: 1 | ||||||
|  | word1    : good | ||||||
|  | word2    : door | ||||||
| docids   : docids2 | docids   : docids2 | ||||||
|  |  | ||||||
| prefixes: [d, do, dog] | prefixes: [d, do, doo] | ||||||
|  |  | ||||||
| batch: [ | batch: [ | ||||||
|     (d, 1)   -> [docids2] |     d   -> [docids1, docids2] | ||||||
|     (d, 3)   -> [docids1] |     do  -> [docids1, docids2] | ||||||
|     (do, 1)  -> [docids2] |     dog -> [docids1] | ||||||
|     (do, 3)  -> [docids1] |     doo -> [docids2] | ||||||
|     (dog, 1) -> [docids2] |  | ||||||
|     (dog, 3) -> [docids1] |  | ||||||
| ] |  | ||||||
| ``` |  | ||||||
| Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some |  | ||||||
| of the elements inserted in the second iteration of the outer loop appear |  | ||||||
| *before* elements from the first iteration. |  | ||||||
|  |  | ||||||
| 4. And a third: |  | ||||||
| ```text |  | ||||||
| Outer loop 3: |  | ||||||
| ------------------------------ |  | ||||||
| word1    : good |  | ||||||
| word2    : dogma |  | ||||||
| proximity: 1 |  | ||||||
| docids   : docids3 |  | ||||||
|  |  | ||||||
| prefixes: [d, do, dog] |  | ||||||
|  |  | ||||||
| batch: [ |  | ||||||
|     (d, 1)   -> [docids2, docids3] |  | ||||||
|     (d, 3)   -> [docids1] |  | ||||||
|     (do, 1)  -> [docids2, docids3] |  | ||||||
|     (do, 3)  -> [docids1] |  | ||||||
|     (dog, 1) -> [docids2, docids3] |  | ||||||
|     (dog, 3) -> [docids1] |  | ||||||
| ] | ] | ||||||
| ``` | ``` | ||||||
| Notice that there were some conflicts which were resolved by merging the | Notice that there were some conflicts which were resolved by merging the | ||||||
| conflicting values together. | conflicting values together. Also, an additional prefix was added at the | ||||||
|  | end of the batch. | ||||||
|  |  | ||||||
| 5. On the fourth iteration of the outer loop, we have: | 4. On the third iteration of the outer loop, we have: | ||||||
| ```text | ```text | ||||||
| Outer loop 4: | Outer loop 4: | ||||||
| ------------------------------ | ------------------------------ | ||||||
|  | proximity: 1 | ||||||
| word1    : good | word1    : good | ||||||
| word2    : ghost | word2    : ghost | ||||||
| proximity: 2 |  | ||||||
| ``` | ``` | ||||||
| Because `word2` begins with a different letter than the previous `word2`, | Because `word2` begins with a different letter than the previous `word2`, | ||||||
| we know that: | we know that all the prefixes of `word2` are greater than the prefixes of the previous word2 | ||||||
|  |  | ||||||
| 1. All the prefixes of `word2` are greater than the prefixes of the previous word2 |  | ||||||
| 2. And therefore, every instance of (`word2`, `prefix`) will be greater than |  | ||||||
| any element in the batch. |  | ||||||
|  |  | ||||||
| Therefore, we know that we can insert every element from the batch into the | Therefore, we know that we can insert every element from the batch into the | ||||||
| database before proceeding any further. This operation is called | database before proceeding any further. This operation is called | ||||||
| “flushing the batch”. Flushing the batch should also be done whenever `word1` | “flushing the batch”. Flushing the batch should also be done whenever: | ||||||
| is different than the previous `word1`. | * `proximity` is different than the previous `proximity`. | ||||||
|  | * `word1` is different than the previous `word1`. | ||||||
|  | * `word2` starts with a different letter than the previous word2 | ||||||
|  |  | ||||||
| 6. **Flushing the batch:** to flush the batch, we look at the `word1` and | 6. **Flushing the batch:** to flush the batch, we iterate over its elements: | ||||||
| iterate over the elements of the batch in sorted order: |  | ||||||
| ```text | ```text | ||||||
| Flushing Batch loop 1: | Flushing Batch loop 1: | ||||||
| ------------------------------ | ------------------------------ | ||||||
| word1    : good | proximity  : 1 | ||||||
| word2    : d | word1      : good | ||||||
| proximity: 1 | prefix     : d | ||||||
|  |  | ||||||
| docids   : [docids2, docids3] | docids   : [docids2, docids3] | ||||||
| ``` | ``` | ||||||
| We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using | We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using | ||||||
| `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a | `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a | ||||||
| roaring bitmap of all the document ids where `word1` is followed by `prefix` | roaring bitmap of all the document ids where `word1` is followed by `prefix` | ||||||
| at a distance of `proximity`. | at a distance of `proximity`. | ||||||
| Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` | Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids` | ||||||
| into the database. | into the database. | ||||||
|  |  | ||||||
| 7. That's it! ... except... | 7. That's it! ... except... | ||||||
| @@ -184,8 +157,8 @@ Note, also, that since we read data from the database when iterating over | |||||||
| `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- | `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- | ||||||
| docids from the batch directly into the database (we would have a concurrent | docids from the batch directly into the database (we would have a concurrent | ||||||
| reader and writer). Therefore, when calling the algorithm on | reader and writer). Therefore, when calling the algorithm on | ||||||
| (`new_prefixes`, `word_pairs_db`), we insert the computed | `(new_prefixes, word_pairs_db)`, we insert the computed | ||||||
| ((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad | `((proximity, word, prefix), docids)` elements in an intermediary grenad | ||||||
| Writer instead of the DB. At the end of the outer loop, we finally read from | Writer instead of the DB. At the end of the outer loop, we finally read from | ||||||
| the grenad and insert its elements in the database. | the grenad and insert its elements in the database. | ||||||
|  |  | ||||||
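For reference, the loop described in the documentation above can be condensed into a short, purely in-memory sketch. It collapses the batching and flushing machinery into a single `BTreeMap`, so it yields the same `(proximity, word1, prefix) -> docids` entries but without the bounded memory use and sorted writes the real batch provides, and it uses plain `Vec<u32>` docid lists instead of serialized roaring bitmaps; all names are illustrative:

```rust
use std::collections::BTreeMap;

// In-memory sketch of the word-prefix-pair-proximity computation described above.
// `word_pairs` is assumed to be sorted by (proximity, word1, word2), which is
// exactly the order the new key encoding gives when iterating the database.
fn word_prefix_pairs(
    word_pairs: &[(u8, &str, &str, Vec<u32>)],
    prefixes: &[&str],
    max_proximity: u8,
) -> BTreeMap<(u8, String, String), Vec<u32>> {
    let mut out: BTreeMap<(u8, String, String), Vec<u32>> = BTreeMap::new();
    for (proximity, word1, word2, docids) in word_pairs {
        if *proximity > max_proximity {
            // Entries are sorted by proximity first, so every remaining entry is
            // also over the threshold: we can stop instead of skipping.
            break;
        }
        for prefix in prefixes.iter().copied().filter(|p| word2.starts_with(*p)) {
            out.entry((*proximity, word1.to_string(), prefix.to_string()))
                .or_default()
                .extend_from_slice(docids);
        }
    }
    out
}

fn main() {
    let pairs = vec![
        (1, "good", "doggo", vec![8]),
        (1, "good", "door", vec![7, 19, 20]),
        (2, "good", "dog", vec![2, 5, 6]),
    ];
    let result = word_prefix_pairs(&pairs, &["d", "do", "dog", "doo"], 2);
    // e.g. (1, "good", "do") merges the docids of both "doggo" and "door":
    assert_eq!(result[&(1, "good".to_string(), "do".to_string())], vec![8, 7, 19, 20]);
}
```

Note the `break` on `max_proximity`: since entries now arrive sorted by proximity first, everything after the first over-threshold entry is also over the threshold, which is why the code below switches from `continue` to `break`.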
| @@ -406,7 +379,7 @@ fn execute_on_word_pairs_and_prefixes<I>( | |||||||
|     while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { |     while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { | ||||||
|         // skip this iteration if the proximity is over the threshold |         // skip this iteration if the proximity is over the threshold | ||||||
|         if proximity > max_proximity { |         if proximity > max_proximity { | ||||||
|             continue; |             break; | ||||||
|         }; |         }; | ||||||
|         let word2_start_different_than_prev = word2[0] != prev_word2_start; |         let word2_start_different_than_prev = word2[0] != prev_word2_start; | ||||||
|         // if there were no potential prefixes for the previous word2 based on its first letter, |         // if there were no potential prefixes for the previous word2 based on its first letter, | ||||||
| @@ -416,16 +389,21 @@ fn execute_on_word_pairs_and_prefixes<I>( | |||||||
|             continue; |             continue; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         // if word1 is different than the previous word1 OR if the start of word2 is different |         // if the proximity is different to the previous one, OR | ||||||
|         // than the previous start of word2, then we'll need to flush the batch |         // if word1 is different than the previous word1, OR | ||||||
|  |         // if the start of word2 is different than the previous start of word2, | ||||||
|  |         // THEN we'll need to flush the batch | ||||||
|  |         let prox_different_than_prev = proximity != batch.proximity; | ||||||
|         let word1_different_than_prev = word1 != batch.word1; |         let word1_different_than_prev = word1 != batch.word1; | ||||||
|         if word1_different_than_prev || word2_start_different_than_prev { |         if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev | ||||||
|  |         { | ||||||
|             batch.flush(&mut merge_buffer, &mut insert)?; |             batch.flush(&mut merge_buffer, &mut insert)?; | ||||||
|             // don't forget to reset the value of batch.word1 and prev_word2_start |             // don't forget to reset the value of batch.word1 and prev_word2_start | ||||||
|             if word1_different_than_prev { |             if word1_different_than_prev { | ||||||
|                 prefix_search_start.0 = 0; |                 prefix_search_start.0 = 0; | ||||||
|                 batch.word1.clear(); |                 batch.word1.clear(); | ||||||
|                 batch.word1.extend_from_slice(word1); |                 batch.word1.extend_from_slice(word1); | ||||||
|  |                 batch.proximity = proximity; | ||||||
|             } |             } | ||||||
|             if word2_start_different_than_prev { |             if word2_start_different_than_prev { | ||||||
|                 // word2_start_different_than_prev == true |                 // word2_start_different_than_prev == true | ||||||
| @@ -437,74 +415,70 @@ fn execute_on_word_pairs_and_prefixes<I>( | |||||||
|  |  | ||||||
|         if !empty_prefixes { |         if !empty_prefixes { | ||||||
|             // All conditions are satisfied, we can now insert each new prefix of word2 into the batch |             // All conditions are satisfied, we can now insert each new prefix of word2 into the batch | ||||||
|  |             prefix_buffer.clear(); | ||||||
|             prefixes.for_each_prefix_of( |             prefixes.for_each_prefix_of( | ||||||
|                 word2, |                 word2, | ||||||
|                 &mut prefix_buffer, |                 &mut prefix_buffer, | ||||||
|                 &prefix_search_start, |                 &prefix_search_start, | ||||||
|                 |prefix_buffer| { |                 |prefix_buffer| { | ||||||
|                     let prefix_len = prefix_buffer.len(); |  | ||||||
|                     prefix_buffer.push(0); |  | ||||||
|                     prefix_buffer.push(proximity); |  | ||||||
|                     batch.insert(&prefix_buffer, data.to_vec()); |                     batch.insert(&prefix_buffer, data.to_vec()); | ||||||
|                     prefix_buffer.truncate(prefix_len); |  | ||||||
|                 }, |                 }, | ||||||
|             ); |             ); | ||||||
|             prefix_buffer.clear(); |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|     batch.flush(&mut merge_buffer, &mut insert)?; |     batch.flush(&mut merge_buffer, &mut insert)?; | ||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
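The extra flush trigger follows from the same ordering argument: once the proximity changes, every `(proximity, word1, prefix)` key currently in the batch sorts before any key the new proximity can produce, so the batch can (and must) be written out first. A tiny sketch of the combined condition (illustrative names; the real code tracks this state on the batch struct and in local flags):

```rust
// Illustrative predicate for the three flush triggers listed in the doc comment:
// the proximity changed, word1 changed, or word2 starts with a different letter.
fn must_flush(
    prev_proximity: u8,
    prev_word1: &[u8],
    prev_word2_start: u8,
    proximity: u8,
    word1: &[u8],
    word2_start: u8,
) -> bool {
    proximity != prev_proximity || word1 != prev_word1 || word2_start != prev_word2_start
}

fn main() {
    // Same word1 and same first letter of word2, but the proximity went from 1 to 2:
    // the batch must be written out before accepting entries for the new proximity.
    assert!(must_flush(1, b"good", b'd', 2, b"good", b'd'));
    assert!(!must_flush(1, b"good", b'd', 1, b"good", b'd'));
}
```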
| /** | /** | ||||||
| A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). | A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps). | ||||||
| The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. | The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. | ||||||
|  |  | ||||||
| It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. | It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently. | ||||||
|  |  | ||||||
| The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content | The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content | ||||||
| can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: | can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: | ||||||
| - key   : (word1, prefix, proximity) as bytes | - key   : (proximity, word1, prefix) as bytes | ||||||
| - value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes | - value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes | ||||||
| */ | */ | ||||||
| #[derive(Default)] | #[derive(Default)] | ||||||
| struct PrefixAndProximityBatch { | struct PrefixAndProximityBatch { | ||||||
|  |     proximity: u8, | ||||||
|     word1: Vec<u8>, |     word1: Vec<u8>, | ||||||
|     batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>, |     batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl PrefixAndProximityBatch { | impl PrefixAndProximityBatch { | ||||||
|     /// Insert the new key and value into the batch |     /// Insert the new key and value into the batch | ||||||
|  |     /// | ||||||
|  |     /// The key must either exist in the batch or be greater than all existing keys | ||||||
|     fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) { |     fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) { | ||||||
|         match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { |         match self.batch.iter_mut().find(|el| el.0 == new_key) { | ||||||
|             Ok(position) => { |             Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)), | ||||||
|                 self.batch[position].1.push(Cow::Owned(new_value)); |             None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])), | ||||||
|             } |  | ||||||
|             Err(position) => { |  | ||||||
|                 self.batch.insert(position, (new_key.to_vec(), vec![Cow::Owned(new_value)])); |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Empties the batch, calling `insert` on each element. |     /// Empties the batch, calling `insert` on each element. | ||||||
|     /// |     /// | ||||||
|     /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. |     /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap. | ||||||
|     fn flush( |     fn flush( | ||||||
|         &mut self, |         &mut self, | ||||||
|         merge_buffer: &mut Vec<u8>, |         merge_buffer: &mut Vec<u8>, | ||||||
|         insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, |         insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, | ||||||
|     ) -> Result<()> { |     ) -> Result<()> { | ||||||
|         let PrefixAndProximityBatch { word1, batch } = self; |         let PrefixAndProximityBatch { proximity, word1, batch } = self; | ||||||
|         if batch.is_empty() { |         if batch.is_empty() { | ||||||
|             return Ok(()); |             return Ok(()); | ||||||
|         } |         } | ||||||
|         merge_buffer.clear(); |         merge_buffer.clear(); | ||||||
|  |  | ||||||
|         let mut buffer = Vec::with_capacity(word1.len() + 1 + 6 + 1); |         let mut buffer = Vec::with_capacity(word1.len() + 1 + 6); | ||||||
|  |         buffer.push(*proximity); | ||||||
|         buffer.extend_from_slice(word1); |         buffer.extend_from_slice(word1); | ||||||
|         buffer.push(0); |         buffer.push(0); | ||||||
|  |  | ||||||
|         for (key, mergeable_data) in batch.drain(..) { |         for (key, mergeable_data) in batch.drain(..) { | ||||||
|             buffer.truncate(word1.len() + 1); |             buffer.truncate(1 + word1.len() + 1); | ||||||
|             buffer.extend_from_slice(key.as_slice()); |             buffer.extend_from_slice(key.as_slice()); | ||||||
|  |  | ||||||
|             let data = if mergeable_data.len() > 1 { |             let data = if mergeable_data.len() > 1 { | ||||||
| @@ -884,51 +858,33 @@ mod tests { | |||||||
|         CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); |         CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); | ||||||
|  |  | ||||||
|         let word_pairs = [ |         let word_pairs = [ | ||||||
|             // 1, 3:  (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456) |  | ||||||
|             (("healthy", "arbre", 2), &serialised_bitmap123), |  | ||||||
|             //          not inserted because 3 > max_proximity |  | ||||||
|             (("healthy", "arbre", 3), &serialised_bitmap456), |  | ||||||
|             // 0, 2:  (healthy arb 1) and (healthy arbre 1) with (bitmap123) |  | ||||||
|             (("healthy", "arbres", 1), &serialised_bitmap123), |             (("healthy", "arbres", 1), &serialised_bitmap123), | ||||||
|             // 1, 3: |  | ||||||
|             (("healthy", "arbres", 2), &serialised_bitmap456), |  | ||||||
|             //          not be inserted because 3 > max_proximity |  | ||||||
|             (("healthy", "arbres", 3), &serialised_bitmap789), |  | ||||||
|             //          not inserted because no prefixes for boat |  | ||||||
|             (("healthy", "boat", 1), &serialised_bitmap123), |             (("healthy", "boat", 1), &serialised_bitmap123), | ||||||
|             //          not inserted because no prefixes for ca |  | ||||||
|             (("healthy", "ca", 1), &serialised_bitmap123), |             (("healthy", "ca", 1), &serialised_bitmap123), | ||||||
|             // 4: (healthy cat 1) with (bitmap456 + bitmap123) |  | ||||||
|             (("healthy", "cats", 1), &serialised_bitmap456), |             (("healthy", "cats", 1), &serialised_bitmap456), | ||||||
|             // 5: (healthy cat 2) with (bitmap789 + bitmap_ranges) |  | ||||||
|             (("healthy", "cats", 2), &serialised_bitmap789), |  | ||||||
|             // 4 + 6: (healthy catto 1) with (bitmap123) |  | ||||||
|             (("healthy", "cattos", 1), &serialised_bitmap123), |             (("healthy", "cattos", 1), &serialised_bitmap123), | ||||||
|             // 5 + 7: (healthy catto 2) with (bitmap_ranges) |  | ||||||
|             (("healthy", "cattos", 2), &serialised_bitmap_ranges), |  | ||||||
|             // 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges) |  | ||||||
|             (("jittery", "cat", 1), &serialised_bitmap123), |             (("jittery", "cat", 1), &serialised_bitmap123), | ||||||
|             // 8: |  | ||||||
|             (("jittery", "cata", 1), &serialised_bitmap456), |             (("jittery", "cata", 1), &serialised_bitmap456), | ||||||
|             // 8: |  | ||||||
|             (("jittery", "catb", 1), &serialised_bitmap789), |             (("jittery", "catb", 1), &serialised_bitmap789), | ||||||
|             // 8: |  | ||||||
|             (("jittery", "catc", 1), &serialised_bitmap_ranges), |             (("jittery", "catc", 1), &serialised_bitmap_ranges), | ||||||
|  |             (("healthy", "arbre", 2), &serialised_bitmap123), | ||||||
|  |             (("healthy", "arbres", 2), &serialised_bitmap456), | ||||||
|  |             (("healthy", "cats", 2), &serialised_bitmap789), | ||||||
|  |             (("healthy", "cattos", 2), &serialised_bitmap_ranges), | ||||||
|  |             (("healthy", "arbre", 3), &serialised_bitmap456), | ||||||
|  |             (("healthy", "arbres", 3), &serialised_bitmap789), | ||||||
|         ]; |         ]; | ||||||
|  |  | ||||||
|         let expected_result = [ |         let expected_result = [ | ||||||
|             // first batch: |  | ||||||
|             (("healthy", "arb", 1), bitmap123.clone()), |             (("healthy", "arb", 1), bitmap123.clone()), | ||||||
|             (("healthy", "arb", 2), &bitmap123 | &bitmap456), |  | ||||||
|             (("healthy", "arbre", 1), bitmap123.clone()), |             (("healthy", "arbre", 1), bitmap123.clone()), | ||||||
|             (("healthy", "arbre", 2), &bitmap123 | &bitmap456), |  | ||||||
|             // second batch: |  | ||||||
|             (("healthy", "cat", 1), &bitmap456 | &bitmap123), |             (("healthy", "cat", 1), &bitmap456 | &bitmap123), | ||||||
|             (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), |  | ||||||
|             (("healthy", "catto", 1), bitmap123.clone()), |             (("healthy", "catto", 1), bitmap123.clone()), | ||||||
|             (("healthy", "catto", 2), bitmap_ranges.clone()), |  | ||||||
|             // third batch |  | ||||||
|             (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), |             (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), | ||||||
|  |             (("healthy", "arb", 2), &bitmap123 | &bitmap456), | ||||||
|  |             (("healthy", "arbre", 2), &bitmap123 | &bitmap456), | ||||||
|  |             (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), | ||||||
|  |             (("healthy", "catto", 2), bitmap_ranges.clone()), | ||||||
|         ]; |         ]; | ||||||
|  |  | ||||||
|         let mut result = vec![]; |         let mut result = vec![]; | ||||||
|   | |||||||