Mirror of https://github.com/meilisearch/meilisearch.git
Commit bdeb47305e (parent 19b2326f3d), committed by Loïc Lecrenier
	Change encoding of word_pair_proximity DB to (proximity, word1, word2)
Same for word_prefix_pair_proximity
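Concretely, a key that used to be laid out as `word1 \0 word2 \0 proximity` is now laid out as `proximity word1 \0 word2`: the single proximity byte moves to the front, and the trailing word no longer needs its own terminator since it runs to the end of the key. A minimal standalone sketch of the new layout (hypothetical helper names, not the actual milli codec shown in the diff below):

```rust
// Hypothetical helpers illustrating the new key layout: [proximity][word1]\0[word2].
fn encode_key(word1: &str, word2: &str, proximity: u8) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(1 + word1.len() + 1 + word2.len());
    bytes.push(proximity); // the proximity byte now comes first
    bytes.extend_from_slice(word1.as_bytes());
    bytes.push(0); // NUL separator between the two words
    bytes.extend_from_slice(word2.as_bytes()); // word2 runs to the end of the key
    bytes
}

fn decode_key(bytes: &[u8]) -> Option<(&str, &str, u8)> {
    let (proximity, rest) = bytes.split_first()?;
    let sep = rest.iter().position(|b| *b == 0)?;
    let (w1, w2) = (&rest[..sep], &rest[sep + 1..]);
    Some((std::str::from_utf8(w1).ok()?, std::str::from_utf8(w2).ok()?, *proximity))
}
```

The diff applies this layout to the `StrStrU8Codec` and `UncheckedStrStrU8Codec` codecs, to the key construction during extraction, and to the word-prefix-pair-proximity update.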
			| @@ -7,12 +7,11 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec { | |||||||
|     type DItem = (&'a str, &'a str, u8); |     type DItem = (&'a str, &'a str, u8); | ||||||
|  |  | ||||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { |     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||||
|         let (n, bytes) = bytes.split_last()?; |         let (n, bytes) = bytes.split_first()?; | ||||||
|         let s1_end = bytes.iter().position(|b| *b == 0)?; |         let s1_end = bytes.iter().position(|b| *b == 0)?; | ||||||
|         let (s1_bytes, rest) = bytes.split_at(s1_end); |         let (s1_bytes, rest) = bytes.split_at(s1_end); | ||||||
|         let rest = &rest[1..]; |         let s2_bytes = &rest[1..]; | ||||||
|         let s1 = str::from_utf8(s1_bytes).ok()?; |         let s1 = str::from_utf8(s1_bytes).ok()?; | ||||||
|         let (_, s2_bytes) = rest.split_last()?; |  | ||||||
|         let s2 = str::from_utf8(s2_bytes).ok()?; |         let s2 = str::from_utf8(s2_bytes).ok()?; | ||||||
|         Some((s1, s2, *n)) |         Some((s1, s2, *n)) | ||||||
|     } |     } | ||||||
| @@ -22,12 +21,11 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec { | |||||||
|     type EItem = (&'a str, &'a str, u8); |     type EItem = (&'a str, &'a str, u8); | ||||||
|  |  | ||||||
|     fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> { |     fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> { | ||||||
|         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1); |         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); | ||||||
|  |         bytes.push(*n); | ||||||
|         bytes.extend_from_slice(s1.as_bytes()); |         bytes.extend_from_slice(s1.as_bytes()); | ||||||
|         bytes.push(0); |         bytes.push(0); | ||||||
|         bytes.extend_from_slice(s2.as_bytes()); |         bytes.extend_from_slice(s2.as_bytes()); | ||||||
|         bytes.push(0); |  | ||||||
|         bytes.push(*n); |  | ||||||
|         Some(Cow::Owned(bytes)) |         Some(Cow::Owned(bytes)) | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -37,11 +35,10 @@ impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec { | |||||||
|     type DItem = (&'a [u8], &'a [u8], u8); |     type DItem = (&'a [u8], &'a [u8], u8); | ||||||
|  |  | ||||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { |     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||||
|         let (n, bytes) = bytes.split_last()?; |         let (n, bytes) = bytes.split_first()?; | ||||||
|         let s1_end = bytes.iter().position(|b| *b == 0)?; |         let s1_end = bytes.iter().position(|b| *b == 0)?; | ||||||
|         let (s1_bytes, rest) = bytes.split_at(s1_end); |         let (s1_bytes, rest) = bytes.split_at(s1_end); | ||||||
|         let rest = &rest[1..]; |         let s2_bytes = &rest[1..]; | ||||||
|         let (_, s2_bytes) = rest.split_last()?; |  | ||||||
|         Some((s1_bytes, s2_bytes, *n)) |         Some((s1_bytes, s2_bytes, *n)) | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -50,12 +47,11 @@ impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec { | |||||||
|     type EItem = (&'a [u8], &'a [u8], u8); |     type EItem = (&'a [u8], &'a [u8], u8); | ||||||
|  |  | ||||||
|     fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> { |     fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> { | ||||||
|         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1); |         let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); | ||||||
|  |         bytes.push(*n); | ||||||
|         bytes.extend_from_slice(s1); |         bytes.extend_from_slice(s1); | ||||||
|         bytes.push(0); |         bytes.push(0); | ||||||
|         bytes.extend_from_slice(s2); |         bytes.extend_from_slice(s2); | ||||||
|         bytes.push(0); |  | ||||||
|         bytes.push(*n); |  | ||||||
|         Some(Cow::Owned(bytes)) |         Some(Cow::Owned(bytes)) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
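Because LMDB compares raw keys byte-wise by default, moving the proximity byte to the front changes the iteration order of the whole database: all pairs at proximity 1 come before all pairs at proximity 2, and so on, and within a proximity the pairs are ordered by `word1` and then `word2`. A self-contained sketch of that effect, with a plain `Vec` sort standing in for the database's key comparison (illustrative helper, not the milli codec):

```rust
// Byte-wise sorting of the new keys groups entries by proximity first,
// then by word1, then by word2 (illustrative helper, not the milli codec).
fn key(proximity: u8, word1: &str, word2: &str) -> Vec<u8> {
    let mut k = vec![proximity];
    k.extend_from_slice(word1.as_bytes());
    k.push(0);
    k.extend_from_slice(word2.as_bytes());
    k
}

fn main() {
    let mut keys = vec![
        key(2, "good", "dog"),
        key(1, "good", "doggo"),
        key(1, "good", "door"),
        key(2, "horror", "cathedral"),
    ];
    keys.sort(); // the same ordering LMDB applies to raw keys by default
    // All proximity-1 keys now precede all proximity-2 keys:
    assert_eq!(keys.iter().map(|k| k[0]).collect::<Vec<_>>(), vec![1u8, 1, 2, 2]);
}
```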
| @@ -194,7 +194,7 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { | |||||||
|         (word1, prefix, proximity), |         (word1, prefix, proximity), | ||||||
|         b, |         b, | ||||||
|     )| { |     )| { | ||||||
|         &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) |         &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) | ||||||
|     }); |     }); | ||||||
|     snap |     snap | ||||||
| } | } | ||||||
|   | |||||||
| @@ -151,11 +151,10 @@ fn document_word_positions_into_sorter<'b>( | |||||||
|     let mut key_buffer = Vec::new(); |     let mut key_buffer = Vec::new(); | ||||||
|     for ((w1, w2), prox) in word_pair_proximity { |     for ((w1, w2), prox) in word_pair_proximity { | ||||||
|         key_buffer.clear(); |         key_buffer.clear(); | ||||||
|  |         key_buffer.push(prox as u8); | ||||||
|         key_buffer.extend_from_slice(w1.as_bytes()); |         key_buffer.extend_from_slice(w1.as_bytes()); | ||||||
|         key_buffer.push(0); |         key_buffer.push(0); | ||||||
|         key_buffer.extend_from_slice(w2.as_bytes()); |         key_buffer.extend_from_slice(w2.as_bytes()); | ||||||
|         key_buffer.push(0); |  | ||||||
|         key_buffer.push(prox as u8); |  | ||||||
|  |  | ||||||
|         word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; |         word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -1,46 +1,46 @@ | |||||||
| --- | --- | ||||||
| source: milli/src/update/word_prefix_pair_proximity_docids.rs | source: milli/src/update/word_prefix_pair_proximity_docids.rs | ||||||
| --- | --- | ||||||
| 5                a    1  [101, ] | 1  5                a    [101, ] | ||||||
| 5                a    2  [101, ] | 1  amazing          a    [100, ] | ||||||
| 5                b    4  [101, ] | 1  an               a    [100, ] | ||||||
| 5                be   4  [101, ] | 1  and              b    [100, ] | ||||||
| am               a    3  [101, ] | 1  and              be   [100, ] | ||||||
| amazing          a    1  [100, ] | 1  at               a    [100, ] | ||||||
| amazing          a    2  [100, ] | 1  rings            a    [101, ] | ||||||
| amazing          a    3  [100, ] | 1  the              b    [101, ] | ||||||
| amazing          b    2  [100, ] | 1  the              be   [101, ] | ||||||
| amazing          be   2  [100, ] | 2  5                a    [101, ] | ||||||
| an               a    1  [100, ] | 2  amazing          a    [100, ] | ||||||
| an               a    2  [100, ] | 2  amazing          b    [100, ] | ||||||
| an               b    3  [100, ] | 2  amazing          be   [100, ] | ||||||
| an               be   3  [100, ] | 2  an               a    [100, ] | ||||||
| and              a    2  [100, ] | 2  and              a    [100, ] | ||||||
| and              a    3  [100, ] | 2  at               a    [100, 101, ] | ||||||
| and              a    4  [100, ] | 2  beautiful        a    [100, ] | ||||||
| and              b    1  [100, ] | 2  bell             a    [101, ] | ||||||
| and              be   1  [100, ] | 2  house            b    [100, ] | ||||||
| at               a    1  [100, ] | 2  house            be   [100, ] | ||||||
| at               a    2  [100, 101, ] | 2  rings            b    [101, ] | ||||||
| at               a    3  [100, ] | 2  rings            be   [101, ] | ||||||
| at               b    3  [101, ] | 3  am               a    [101, ] | ||||||
| at               b    4  [100, ] | 3  amazing          a    [100, ] | ||||||
| at               be   3  [101, ] | 3  an               b    [100, ] | ||||||
| at               be   4  [100, ] | 3  an               be   [100, ] | ||||||
| beautiful        a    2  [100, ] | 3  and              a    [100, ] | ||||||
| beautiful        a    3  [100, ] | 3  at               a    [100, ] | ||||||
| beautiful        a    4  [100, ] | 3  at               b    [101, ] | ||||||
| bell             a    2  [101, ] | 3  at               be   [101, ] | ||||||
| bell             a    4  [101, ] | 3  beautiful        a    [100, ] | ||||||
| house            a    3  [100, ] | 3  house            a    [100, ] | ||||||
| house            a    4  [100, ] | 3  rings            a    [101, ] | ||||||
| house            b    2  [100, ] | 3  the              a    [101, ] | ||||||
| house            be   2  [100, ] | 4  5                b    [101, ] | ||||||
| rings            a    1  [101, ] | 4  5                be   [101, ] | ||||||
| rings            a    3  [101, ] | 4  and              a    [100, ] | ||||||
| rings            b    2  [101, ] | 4  at               b    [100, ] | ||||||
| rings            be   2  [101, ] | 4  at               be   [100, ] | ||||||
| the              a    3  [101, ] | 4  beautiful        a    [100, ] | ||||||
| the              b    1  [101, ] | 4  bell             a    [101, ] | ||||||
| the              be   1  [101, ] | 4  house            a    [100, ] | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| --- | --- | ||||||
| source: milli/src/update/word_prefix_pair_proximity_docids.rs | source: milli/src/update/word_prefix_pair_proximity_docids.rs | ||||||
| --- | --- | ||||||
| 5ed4bf83317b10962a55ade353427bdd | fb88e49fd666886731b62baef8f44995 | ||||||
|   | |||||||
| @@ -1,7 +1,7 @@ | |||||||
| /*! | /*! | ||||||
|  ## What is WordPrefixPairProximityDocids? |  ## What is WordPrefixPairProximityDocids? | ||||||
| The word-prefix-pair-proximity-docids database is a database whose keys are of | The word-prefix-pair-proximity-docids database is a database whose keys are of | ||||||
| the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of | the form `(proximity, word, prefix)` and the values are roaring bitmaps of | ||||||
| the documents which contain `word` followed by another word starting with | the documents which contain `word` followed by another word starting with | ||||||
| `prefix` at a distance of `proximity`. | `prefix` at a distance of `proximity`. | ||||||
|  |  | ||||||
| @@ -23,127 +23,100 @@ dog | |||||||
| Note that only prefixes which correspond to more than a certain number of | Note that only prefixes which correspond to more than a certain number of | ||||||
| different words from the database are included in this list. | different words from the database are included in this list. | ||||||
|  |  | ||||||
| * a sorted list of word pairs and the distance between them (i.e. proximity), | * a sorted list of proximities and word pairs (the proximity is the distance between the two words), | ||||||
| * associated with a roaring bitmap, such as: | associated with a roaring bitmap, such as: | ||||||
| ```text | ```text | ||||||
| good dog   3         -> docids1: [2, 5, 6] | 1 good doggo         -> docids1: [8] | ||||||
| good doggo 1         -> docids2: [8] | 1 good door          -> docids2: [7, 19, 20] | ||||||
| good dogma 1         -> docids3: [7, 19, 20] | 1 good ghost         -> docids3: [1] | ||||||
| good ghost 2         -> docids4: [1] | 2 good dog           -> docids4: [2, 5, 6] | ||||||
| horror cathedral 4   -> docids5: [1, 2] | 2 horror cathedral   -> docids5: [1, 2] | ||||||
| ``` | ``` | ||||||
|  |  | ||||||
| I illustrate a simplified version of the algorithm to create the word-prefix | I illustrate a simplified version of the algorithm to create the word-prefix | ||||||
| pair-proximity database below: | pair-proximity database below: | ||||||
|  |  | ||||||
| 1. **Outer loop:** First, we iterate over each word pair and its proximity: | 1. **Outer loop:** First, we iterate over each proximity and word pair: | ||||||
| ```text | ```text | ||||||
|  | proximity: 1 | ||||||
| word1    : good | word1    : good | ||||||
| word2    : dog | word2    : doggo | ||||||
| proximity: 3 |  | ||||||
| ``` | ``` | ||||||
| 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are | 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are | ||||||
| in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) | in the list of sorted prefixes. And we insert the key `prefix` | ||||||
| and the value (`docids`) to a sorted map which we call the “batch”. For example, | and the value (`docids`) to a sorted map which we call the “batch”. For example, | ||||||
| at the end of the first inner loop, we may have: | at the end of the first inner loop, we may have: | ||||||
| ```text | ```text | ||||||
| Outer loop 1: | Outer loop 1: | ||||||
| ------------------------------ | ------------------------------ | ||||||
|  | proximity: 1 | ||||||
| word1    : good | word1    : good | ||||||
| word2    : dog | word2    : doggo | ||||||
| proximity: 3 |  | ||||||
| docids   : docids1 | docids   : docids1 | ||||||
|  |  | ||||||
| prefixes: [d, do, dog] | prefixes: [d, do, dog] | ||||||
|  |  | ||||||
| batch: [ | batch: [ | ||||||
|     (d, 3)   -> [docids1] |     d,   -> [docids1] | ||||||
|     (do, 3)  -> [docids1] |     do   -> [docids1] | ||||||
|     (dog, 3) -> [docids1] |     dog  -> [docids1] | ||||||
| ] | ] | ||||||
| ``` | ``` | ||||||
| 3. For illustration purpose, let's run through a second iteration of the outer loop: | 3. For illustration purpose, let's run through a second iteration of the outer loop: | ||||||
| ```text | ```text | ||||||
| Outer loop 2: | Outer loop 2: | ||||||
| ------------------------------ | ------------------------------ | ||||||
| word1    : good |  | ||||||
| word2    : doggo |  | ||||||
| proximity: 1 | proximity: 1 | ||||||
|  | word1    : good | ||||||
|  | word2    : door | ||||||
| docids   : docids2 | docids   : docids2 | ||||||
|  |  | ||||||
| prefixes: [d, do, dog] | prefixes: [d, do, doo] | ||||||
|  |  | ||||||
| batch: [ | batch: [ | ||||||
|     (d, 1)   -> [docids2] |     d   -> [docids1, docids2] | ||||||
|     (d, 3)   -> [docids1] |     do  -> [docids1, docids2] | ||||||
|     (do, 1)  -> [docids2] |     dog -> [docids1] | ||||||
|     (do, 3)  -> [docids1] |     doo -> [docids2] | ||||||
|     (dog, 1) -> [docids2] |  | ||||||
|     (dog, 3) -> [docids1] |  | ||||||
| ] |  | ||||||
| ``` |  | ||||||
| Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some |  | ||||||
| of the elements inserted in the second iteration of the outer loop appear |  | ||||||
| *before* elements from the first iteration. |  | ||||||
|  |  | ||||||
| 4. And a third: |  | ||||||
| ```text |  | ||||||
| Outer loop 3: |  | ||||||
| ------------------------------ |  | ||||||
| word1    : good |  | ||||||
| word2    : dogma |  | ||||||
| proximity: 1 |  | ||||||
| docids   : docids3 |  | ||||||
|  |  | ||||||
| prefixes: [d, do, dog] |  | ||||||
|  |  | ||||||
| batch: [ |  | ||||||
|     (d, 1)   -> [docids2, docids3] |  | ||||||
|     (d, 3)   -> [docids1] |  | ||||||
|     (do, 1)  -> [docids2, docids3] |  | ||||||
|     (do, 3)  -> [docids1] |  | ||||||
|     (dog, 1) -> [docids2, docids3] |  | ||||||
|     (dog, 3) -> [docids1] |  | ||||||
| ] | ] | ||||||
| ``` | ``` | ||||||
| Notice that there were some conflicts which were resolved by merging the | Notice that there were some conflicts which were resolved by merging the | ||||||
| conflicting values together. | conflicting values together. Also, an additional prefix was added at the | ||||||
|  | end of the batch. | ||||||
|  |  | ||||||
| 5. On the fourth iteration of the outer loop, we have: | 4. On the third iteration of the outer loop, we have: | ||||||
| ```text | ```text | ||||||
| Outer loop 4: | Outer loop 4: | ||||||
| ------------------------------ | ------------------------------ | ||||||
|  | proximity: 1 | ||||||
| word1    : good | word1    : good | ||||||
| word2    : ghost | word2    : ghost | ||||||
| proximity: 2 |  | ||||||
| ``` | ``` | ||||||
| Because `word2` begins with a different letter than the previous `word2`, | Because `word2` begins with a different letter than the previous `word2`, | ||||||
| we know that: | we know that all the prefixes of `word2` are greater than the prefixes of the previous word2 | ||||||
|  |  | ||||||
| 1. All the prefixes of `word2` are greater than the prefixes of the previous word2 |  | ||||||
| 2. And therefore, every instance of (`word2`, `prefix`) will be greater than |  | ||||||
| any element in the batch. |  | ||||||
|  |  | ||||||
| Therefore, we know that we can insert every element from the batch into the | Therefore, we know that we can insert every element from the batch into the | ||||||
| database before proceeding any further. This operation is called | database before proceeding any further. This operation is called | ||||||
| “flushing the batch”. Flushing the batch should also be done whenever `word1` | “flushing the batch”. Flushing the batch should also be done whenever: | ||||||
| is different than the previous `word1`. | * `proximity` is different than the previous `proximity`. | ||||||
|  | * `word1` is different than the previous `word1`. | ||||||
|  | * `word2` starts with a different letter than the previous word2 | ||||||
|  |  | ||||||
| 6. **Flushing the batch:** to flush the batch, we look at the `word1` and | 6. **Flushing the batch:** to flush the batch, we iterate over its elements: | ||||||
| iterate over the elements of the batch in sorted order: |  | ||||||
| ```text | ```text | ||||||
| Flushing Batch loop 1: | Flushing Batch loop 1: | ||||||
| ------------------------------ | ------------------------------ | ||||||
| word1    : good | proximity  : 1 | ||||||
| word2    : d | word1      : good | ||||||
| proximity: 1 | prefix     : d | ||||||
|  |  | ||||||
| docids   : [docids2, docids3] | docids   : [docids2, docids3] | ||||||
| ``` | ``` | ||||||
| We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using | We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using | ||||||
| `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a | `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a | ||||||
| roaring bitmap of all the document ids where `word1` is followed by `prefix` | roaring bitmap of all the document ids where `word1` is followed by `prefix` | ||||||
| at a distance of `proximity`. | at a distance of `proximity`. | ||||||
| Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` | Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids` | ||||||
| into the database. | into the database. | ||||||
|  |  | ||||||
| 7. That's it! ... except... | 7. That's it! ... except... | ||||||
| @@ -184,8 +157,8 @@ Note, also, that since we read data from the database when iterating over | |||||||
| `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- | `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- | ||||||
| docids from the batch directly into the database (we would have a concurrent | docids from the batch directly into the database (we would have a concurrent | ||||||
| reader and writer). Therefore, when calling the algorithm on | reader and writer). Therefore, when calling the algorithm on | ||||||
| (`new_prefixes`, `word_pairs_db`), we insert the computed | `(new_prefixes, word_pairs_db)`, we insert the computed | ||||||
| ((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad | `((proximity, word, prefix), docids)` elements in an intermediary grenad | ||||||
| Writer instead of the DB. At the end of the outer loop, we finally read from | Writer instead of the DB. At the end of the outer loop, we finally read from | ||||||
| the grenad and insert its elements in the database. | the grenad and insert its elements in the database. | ||||||
|  |  | ||||||
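For reference, the loop described in the documentation above can be condensed into a short, purely in-memory sketch. It collapses the batching and flushing machinery into a single `BTreeMap`, so it yields the same `(proximity, word1, prefix) -> docids` entries but without the bounded memory use and sorted writes the real batch provides, and it uses plain `Vec<u32>` docid lists instead of serialized roaring bitmaps; all names are illustrative:

```rust
use std::collections::BTreeMap;

// In-memory sketch of the word-prefix-pair-proximity computation described above.
// `word_pairs` is assumed to be sorted by (proximity, word1, word2), which is
// exactly the order the new key encoding gives when iterating the database.
fn word_prefix_pairs(
    word_pairs: &[(u8, &str, &str, Vec<u32>)],
    prefixes: &[&str],
    max_proximity: u8,
) -> BTreeMap<(u8, String, String), Vec<u32>> {
    let mut out: BTreeMap<(u8, String, String), Vec<u32>> = BTreeMap::new();
    for (proximity, word1, word2, docids) in word_pairs {
        if *proximity > max_proximity {
            // Entries are sorted by proximity first, so every remaining entry is
            // also over the threshold: we can stop instead of skipping.
            break;
        }
        for prefix in prefixes.iter().copied().filter(|p| word2.starts_with(*p)) {
            out.entry((*proximity, word1.to_string(), prefix.to_string()))
                .or_default()
                .extend_from_slice(docids);
        }
    }
    out
}

fn main() {
    let pairs = vec![
        (1, "good", "doggo", vec![8]),
        (1, "good", "door", vec![7, 19, 20]),
        (2, "good", "dog", vec![2, 5, 6]),
    ];
    let result = word_prefix_pairs(&pairs, &["d", "do", "dog", "doo"], 2);
    // e.g. (1, "good", "do") merges the docids of both "doggo" and "door":
    assert_eq!(result[&(1, "good".to_string(), "do".to_string())], vec![8, 7, 19, 20]);
}
```

Note the `break` on `max_proximity`: since entries now arrive sorted by proximity first, everything after the first over-threshold entry is also over the threshold, which is why the code below switches from `continue` to `break`.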
| @@ -406,7 +379,7 @@ fn execute_on_word_pairs_and_prefixes<I>( | |||||||
|     while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { |     while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { | ||||||
|         // skip this iteration if the proximity is over the threshold |         // skip this iteration if the proximity is over the threshold | ||||||
|         if proximity > max_proximity { |         if proximity > max_proximity { | ||||||
|             continue; |             break; | ||||||
|         }; |         }; | ||||||
|         let word2_start_different_than_prev = word2[0] != prev_word2_start; |         let word2_start_different_than_prev = word2[0] != prev_word2_start; | ||||||
|         // if there were no potential prefixes for the previous word2 based on its first letter, |         // if there were no potential prefixes for the previous word2 based on its first letter, | ||||||
| @@ -416,16 +389,21 @@ fn execute_on_word_pairs_and_prefixes<I>( | |||||||
|             continue; |             continue; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         // if word1 is different than the previous word1 OR if the start of word2 is different |         // if the proximity is different to the previous one, OR | ||||||
|         // than the previous start of word2, then we'll need to flush the batch |         // if word1 is different than the previous word1, OR | ||||||
|  |         // if the start of word2 is different than the previous start of word2, | ||||||
|  |         // THEN we'll need to flush the batch | ||||||
|  |         let prox_different_than_prev = proximity != batch.proximity; | ||||||
|         let word1_different_than_prev = word1 != batch.word1; |         let word1_different_than_prev = word1 != batch.word1; | ||||||
|         if word1_different_than_prev || word2_start_different_than_prev { |         if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev | ||||||
|  |         { | ||||||
|             batch.flush(&mut merge_buffer, &mut insert)?; |             batch.flush(&mut merge_buffer, &mut insert)?; | ||||||
|             // don't forget to reset the value of batch.word1 and prev_word2_start |             // don't forget to reset the value of batch.word1 and prev_word2_start | ||||||
|             if word1_different_than_prev { |             if word1_different_than_prev { | ||||||
|                 prefix_search_start.0 = 0; |                 prefix_search_start.0 = 0; | ||||||
|                 batch.word1.clear(); |                 batch.word1.clear(); | ||||||
|                 batch.word1.extend_from_slice(word1); |                 batch.word1.extend_from_slice(word1); | ||||||
|  |                 batch.proximity = proximity; | ||||||
|             } |             } | ||||||
|             if word2_start_different_than_prev { |             if word2_start_different_than_prev { | ||||||
|                 // word2_start_different_than_prev == true |                 // word2_start_different_than_prev == true | ||||||
| @@ -437,74 +415,70 @@ fn execute_on_word_pairs_and_prefixes<I>( | |||||||
|  |  | ||||||
|         if !empty_prefixes { |         if !empty_prefixes { | ||||||
|             // All conditions are satisfied, we can now insert each new prefix of word2 into the batch |             // All conditions are satisfied, we can now insert each new prefix of word2 into the batch | ||||||
|  |             prefix_buffer.clear(); | ||||||
|             prefixes.for_each_prefix_of( |             prefixes.for_each_prefix_of( | ||||||
|                 word2, |                 word2, | ||||||
|                 &mut prefix_buffer, |                 &mut prefix_buffer, | ||||||
|                 &prefix_search_start, |                 &prefix_search_start, | ||||||
|                 |prefix_buffer| { |                 |prefix_buffer| { | ||||||
|                     let prefix_len = prefix_buffer.len(); |  | ||||||
|                     prefix_buffer.push(0); |  | ||||||
|                     prefix_buffer.push(proximity); |  | ||||||
|                     batch.insert(&prefix_buffer, data.to_vec()); |                     batch.insert(&prefix_buffer, data.to_vec()); | ||||||
|                     prefix_buffer.truncate(prefix_len); |  | ||||||
|                 }, |                 }, | ||||||
|             ); |             ); | ||||||
|             prefix_buffer.clear(); |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|     batch.flush(&mut merge_buffer, &mut insert)?; |     batch.flush(&mut merge_buffer, &mut insert)?; | ||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
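The extra flush trigger follows from the same ordering argument: once the proximity changes, every `(proximity, word1, prefix)` key currently in the batch sorts before any key the new proximity can produce, so the batch can (and must) be written out first. A tiny sketch of the combined condition (illustrative names; the real code tracks this state on the batch struct and in local flags):

```rust
// Illustrative predicate for the three flush triggers listed in the doc comment:
// the proximity changed, word1 changed, or word2 starts with a different letter.
fn must_flush(
    prev_proximity: u8,
    prev_word1: &[u8],
    prev_word2_start: u8,
    proximity: u8,
    word1: &[u8],
    word2_start: u8,
) -> bool {
    proximity != prev_proximity || word1 != prev_word1 || word2_start != prev_word2_start
}

fn main() {
    // Same word1 and same first letter of word2, but the proximity went from 1 to 2:
    // the batch must be written out before accepting entries for the new proximity.
    assert!(must_flush(1, b"good", b'd', 2, b"good", b'd'));
    assert!(!must_flush(1, b"good", b'd', 1, b"good", b'd'));
}
```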
| /** | /** | ||||||
| A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). | A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps). | ||||||
| The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. | The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. | ||||||
|  |  | ||||||
| It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. | It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently. | ||||||
|  |  | ||||||
| The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content | The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content | ||||||
| can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: | can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: | ||||||
| - key   : (word1, prefix, proximity) as bytes | - key   : (proximity, word1, prefix) as bytes | ||||||
| - value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes | - value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes | ||||||
| */ | */ | ||||||
| #[derive(Default)] | #[derive(Default)] | ||||||
| struct PrefixAndProximityBatch { | struct PrefixAndProximityBatch { | ||||||
|  |     proximity: u8, | ||||||
|     word1: Vec<u8>, |     word1: Vec<u8>, | ||||||
|     batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>, |     batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl PrefixAndProximityBatch { | impl PrefixAndProximityBatch { | ||||||
|     /// Insert the new key and value into the batch |     /// Insert the new key and value into the batch | ||||||
|  |     /// | ||||||
|  |     /// The key must either exist in the batch or be greater than all existing keys | ||||||
|     fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) { |     fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) { | ||||||
|         match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { |         match self.batch.iter_mut().find(|el| el.0 == new_key) { | ||||||
|             Ok(position) => { |             Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)), | ||||||
|                 self.batch[position].1.push(Cow::Owned(new_value)); |             None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])), | ||||||
|             } |  | ||||||
|             Err(position) => { |  | ||||||
|                 self.batch.insert(position, (new_key.to_vec(), vec![Cow::Owned(new_value)])); |  | ||||||
|             } |  | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Empties the batch, calling `insert` on each element. |     /// Empties the batch, calling `insert` on each element. | ||||||
|     /// |     /// | ||||||
|     /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. |     /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap. | ||||||
|     fn flush( |     fn flush( | ||||||
|         &mut self, |         &mut self, | ||||||
|         merge_buffer: &mut Vec<u8>, |         merge_buffer: &mut Vec<u8>, | ||||||
|         insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, |         insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, | ||||||
|     ) -> Result<()> { |     ) -> Result<()> { | ||||||
|         let PrefixAndProximityBatch { word1, batch } = self; |         let PrefixAndProximityBatch { proximity, word1, batch } = self; | ||||||
|         if batch.is_empty() { |         if batch.is_empty() { | ||||||
|             return Ok(()); |             return Ok(()); | ||||||
|         } |         } | ||||||
|         merge_buffer.clear(); |         merge_buffer.clear(); | ||||||
|  |  | ||||||
|         let mut buffer = Vec::with_capacity(word1.len() + 1 + 6 + 1); |         let mut buffer = Vec::with_capacity(word1.len() + 1 + 6); | ||||||
|  |         buffer.push(*proximity); | ||||||
|         buffer.extend_from_slice(word1); |         buffer.extend_from_slice(word1); | ||||||
|         buffer.push(0); |         buffer.push(0); | ||||||
|  |  | ||||||
|         for (key, mergeable_data) in batch.drain(..) { |         for (key, mergeable_data) in batch.drain(..) { | ||||||
|             buffer.truncate(word1.len() + 1); |             buffer.truncate(1 + word1.len() + 1); | ||||||
|             buffer.extend_from_slice(key.as_slice()); |             buffer.extend_from_slice(key.as_slice()); | ||||||
|  |  | ||||||
|             let data = if mergeable_data.len() > 1 { |             let data = if mergeable_data.len() > 1 { | ||||||
| @@ -884,51 +858,33 @@ mod tests { | |||||||
|         CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); |         CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); | ||||||
|  |  | ||||||
|         let word_pairs = [ |         let word_pairs = [ | ||||||
|             // 1, 3:  (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456) |  | ||||||
|             (("healthy", "arbre", 2), &serialised_bitmap123), |  | ||||||
|             //          not inserted because 3 > max_proximity |  | ||||||
|             (("healthy", "arbre", 3), &serialised_bitmap456), |  | ||||||
|             // 0, 2:  (healthy arb 1) and (healthy arbre 1) with (bitmap123) |  | ||||||
|             (("healthy", "arbres", 1), &serialised_bitmap123), |             (("healthy", "arbres", 1), &serialised_bitmap123), | ||||||
|             // 1, 3: |  | ||||||
|             (("healthy", "arbres", 2), &serialised_bitmap456), |  | ||||||
|             //          not be inserted because 3 > max_proximity |  | ||||||
|             (("healthy", "arbres", 3), &serialised_bitmap789), |  | ||||||
|             //          not inserted because no prefixes for boat |  | ||||||
|             (("healthy", "boat", 1), &serialised_bitmap123), |             (("healthy", "boat", 1), &serialised_bitmap123), | ||||||
|             //          not inserted because no prefixes for ca |  | ||||||
|             (("healthy", "ca", 1), &serialised_bitmap123), |             (("healthy", "ca", 1), &serialised_bitmap123), | ||||||
|             // 4: (healthy cat 1) with (bitmap456 + bitmap123) |  | ||||||
|             (("healthy", "cats", 1), &serialised_bitmap456), |             (("healthy", "cats", 1), &serialised_bitmap456), | ||||||
|             // 5: (healthy cat 2) with (bitmap789 + bitmap_ranges) |  | ||||||
|             (("healthy", "cats", 2), &serialised_bitmap789), |  | ||||||
|             // 4 + 6: (healthy catto 1) with (bitmap123) |  | ||||||
|             (("healthy", "cattos", 1), &serialised_bitmap123), |             (("healthy", "cattos", 1), &serialised_bitmap123), | ||||||
|             // 5 + 7: (healthy catto 2) with (bitmap_ranges) |  | ||||||
|             (("healthy", "cattos", 2), &serialised_bitmap_ranges), |  | ||||||
|             // 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges) |  | ||||||
|             (("jittery", "cat", 1), &serialised_bitmap123), |             (("jittery", "cat", 1), &serialised_bitmap123), | ||||||
|             // 8: |  | ||||||
|             (("jittery", "cata", 1), &serialised_bitmap456), |             (("jittery", "cata", 1), &serialised_bitmap456), | ||||||
|             // 8: |  | ||||||
|             (("jittery", "catb", 1), &serialised_bitmap789), |             (("jittery", "catb", 1), &serialised_bitmap789), | ||||||
|             // 8: |  | ||||||
|             (("jittery", "catc", 1), &serialised_bitmap_ranges), |             (("jittery", "catc", 1), &serialised_bitmap_ranges), | ||||||
|  |             (("healthy", "arbre", 2), &serialised_bitmap123), | ||||||
|  |             (("healthy", "arbres", 2), &serialised_bitmap456), | ||||||
|  |             (("healthy", "cats", 2), &serialised_bitmap789), | ||||||
|  |             (("healthy", "cattos", 2), &serialised_bitmap_ranges), | ||||||
|  |             (("healthy", "arbre", 3), &serialised_bitmap456), | ||||||
|  |             (("healthy", "arbres", 3), &serialised_bitmap789), | ||||||
|         ]; |         ]; | ||||||
|  |  | ||||||
|         let expected_result = [ |         let expected_result = [ | ||||||
|             // first batch: |  | ||||||
|             (("healthy", "arb", 1), bitmap123.clone()), |             (("healthy", "arb", 1), bitmap123.clone()), | ||||||
|             (("healthy", "arb", 2), &bitmap123 | &bitmap456), |  | ||||||
|             (("healthy", "arbre", 1), bitmap123.clone()), |             (("healthy", "arbre", 1), bitmap123.clone()), | ||||||
|             (("healthy", "arbre", 2), &bitmap123 | &bitmap456), |  | ||||||
|             // second batch: |  | ||||||
|             (("healthy", "cat", 1), &bitmap456 | &bitmap123), |             (("healthy", "cat", 1), &bitmap456 | &bitmap123), | ||||||
|             (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), |  | ||||||
|             (("healthy", "catto", 1), bitmap123.clone()), |             (("healthy", "catto", 1), bitmap123.clone()), | ||||||
|             (("healthy", "catto", 2), bitmap_ranges.clone()), |  | ||||||
|             // third batch |  | ||||||
|             (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), |             (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), | ||||||
|  |             (("healthy", "arb", 2), &bitmap123 | &bitmap456), | ||||||
|  |             (("healthy", "arbre", 2), &bitmap123 | &bitmap456), | ||||||
|  |             (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), | ||||||
|  |             (("healthy", "catto", 2), bitmap_ranges.clone()), | ||||||
|         ]; |         ]; | ||||||
|  |  | ||||||
|         let mut result = vec![]; |         let mut result = vec![]; | ||||||
|   | |||||||