Compare commits

2 Commits

Author       SHA1        Message                                                                        Date
ManyTheFish  42bbfebf70  Remove proximity database, forcing us to remove phrase search and splitwords  2023-10-03 16:58:26 +02:00
ManyTheFish  5637978fe4  Don't compute proximity database anymore                                       2023-10-03 15:34:01 +02:00
18 changed files with 518 additions and 640 deletions

View File

@@ -74,4 +74,4 @@ jobs:
echo "${{ steps.file.outputs.basename }}.json has just been pushed."
echo 'How to compare this benchmark with another one?'
echo ' - Check the available files with: ./benchmarks/scripts/list.sh'
echo " - Run the following command: ./benchmaks/scripts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"
echo " - Run the following command: ./benchmaks/scipts/compare.sh <file-to-compare-with> ${{ steps.file.outputs.basename }}.json"

View File

@@ -57,10 +57,10 @@ jobs:
echo "date=$commit_date" >> $GITHUB_OUTPUT
- name: Set up QEMU
-uses: docker/setup-qemu-action@v3
+uses: docker/setup-qemu-action@v2
- name: Set up Docker Buildx
-uses: docker/setup-buildx-action@v3
+uses: docker/setup-buildx-action@v2
- name: Login to Docker Hub
uses: docker/login-action@v2
@@ -70,7 +70,7 @@ jobs:
- name: Docker meta
id: meta
-uses: docker/metadata-action@v5
+uses: docker/metadata-action@v4
with:
images: getmeili/meilisearch
# Prevent `latest` from being updated for each new tag pushed.
@@ -83,7 +83,7 @@ jobs:
type=raw,value=latest,enable=${{ steps.check-tag-format.outputs.stable == 'true' && steps.check-tag-format.outputs.latest == 'true' }}
- name: Build and push
-uses: docker/build-push-action@v5
+uses: docker/build-push-action@v4
with:
push: true
platforms: linux/amd64,linux/arm64

View File

@@ -1,81 +0,0 @@
name: Benchmarks (PR)
on: issue_comment
permissions:
issues: write
env:
GH_TOKEN: ${{ secrets.MEILI_BOT_GH_PAT }}
jobs:
run-benchmarks-on-comment:
name: Run and upload benchmarks
runs-on: benchmarks
timeout-minutes: 4320 # 72h
steps:
- uses: actions/checkout@v3
- uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: stable
override: true
- name: Check for Command
id: command
uses: xt0rted/slash-command-action@v2
with:
command: benchmark
reaction-type: "eyes"
# Set variables
- name: Set current branch name
shell: bash
run: echo "name=$(echo ${GITHUB_REF#refs/heads/})" >> $GITHUB_OUTPUT
id: current_branch
- name: Set normalized current branch name # Replace `/` by `_` in branch name to avoid issues when pushing to S3
shell: bash
run: echo "name=$(echo ${GITHUB_REF#refs/heads/} | tr '/' '_')" >> $GITHUB_OUTPUT
id: normalized_current_branch
- name: Set shorter commit SHA
shell: bash
run: echo "short=$(echo $GITHUB_SHA | cut -c1-8)" >> $GITHUB_OUTPUT
id: commit_sha
- name: Set file basename with format "dataset_branch_commitSHA"
shell: bash
run: echo "basename=$(echo ${{ steps.command.outputs.command-arguments }}_${{ steps.normalized_current_branch.outputs.name }}_${{ steps.commit_sha.outputs.short }})" >> $GITHUB_OUTPUT
id: file
# Run benchmarks
- name: Run benchmarks - Dataset ${{ steps.command.outputs.command-arguments }} - Branch ${{ steps.current_branch.outputs.name }} - Commit ${{ steps.commit_sha.outputs.short }}
run: |
cd benchmarks
cargo bench --bench ${{ steps.command.outputs.command-arguments }} -- --save-baseline ${{ steps.file.outputs.basename }}
# Generate critcmp files
- name: Install critcmp
uses: taiki-e/install-action@v2
with:
tool: critcmp
- name: Export critcmp file
run: |
critcmp --export ${{ steps.file.outputs.basename }} > ${{ steps.file.outputs.basename }}.json
# Upload benchmarks
- name: Upload ${{ steps.file.outputs.basename }}.json to DO Spaces # DigitalOcean Spaces = S3
uses: BetaHuhn/do-spaces-action@v2
with:
access_key: ${{ secrets.DO_SPACES_ACCESS_KEY }}
secret_key: ${{ secrets.DO_SPACES_SECRET_KEY }}
space_name: ${{ secrets.DO_SPACES_SPACE_NAME }}
space_region: ${{ secrets.DO_SPACES_SPACE_REGION }}
source: ${{ steps.file.outputs.basename }}.json
out_dir: critcmp_results
# Compute the diff of the benchmarks and send a message on the GitHub PR
- name: Compute and send a message in the PR
run: |
export base=$(git rev-parse $(git cherry main | head -n 1 | cut -c 3-)~ | cut -c -8)
echo 'Here are your benchmarks diff 👊' >> body.txt
echo '```' >> body.txt
./benchmarks/scripts/compare.sh $base ${{ steps.file.outputs.basename }}.json >> body.txt
echo '```' >> body.txt
gh pr comment ${GITHUB_REF#refs/heads/} --body-file body.txt

Cargo.lock (generated), 898 changed lines

File diff suppressed because it is too large.

View File

@@ -21,7 +21,7 @@ serde_json = { version = "1.0.95", features = ["preserve_order"] }
criterion = { version = "0.5.1", features = ["html_reports"] }
rand = "0.8.5"
rand_chacha = "0.3.1"
-roaring = { path = "../../roaring-rs" }
+roaring = "0.10.1"
[build-dependencies]
anyhow = "1.0.70"

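(Note: the same dependency change, from a local ../../roaring-rs checkout to the published roaring 0.10.1 crate, repeats in each workspace member below.)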
View File

@@ -19,7 +19,7 @@ meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
once_cell = "1.17.1"
regex = "1.7.3"
-roaring = { path = "../../roaring-rs", features = ["serde"] }
+roaring = { version = "0.10.1", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] }
tar = "0.4.38"

View File

@@ -23,7 +23,7 @@ meilisearch-auth = { path = "../meilisearch-auth" }
meilisearch-types = { path = "../meilisearch-types" }
page_size = "0.5.0"
puffin = "0.16.0"
-roaring = { path = "../../roaring-rs", features = ["serde"] }
+roaring = { version = "0.10.1", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] }
synchronoise = "1.0.1"

View File

@@ -17,7 +17,7 @@ hmac = "0.12.1"
maplit = "1.0.2"
meilisearch-types = { path = "../meilisearch-types" }
rand = "0.8.5"
-roaring = { path = "../../roaring-rs", features = ["serde"] }
+roaring = { version = "0.10.1", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] }
sha2 = "0.10.6"

View File

@@ -23,7 +23,7 @@ flate2 = "1.0.25"
fst = "0.4.7"
memmap2 = "0.7.1"
milli = { path = "../milli" }
-roaring = { path = "../../roaring-rs", features = ["serde"] }
+roaring = { version = "0.10.1", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] }
serde-cs = "0.2.4"
serde_json = "1.0.95"

View File

@@ -42,7 +42,7 @@ once_cell = "1.17.1"
ordered-float = "3.6.0"
rand_pcg = { version = "0.3.1", features = ["serde1"] }
rayon = "1.7.0"
-roaring = { path = "../../roaring-rs" }
+roaring = "0.10.1"
rstar = { version = "0.11.0", features = ["serde"] }
serde = { version = "1.0.160", features = ["derive"] }
serde_json = { version = "1.0.95", features = ["preserve_order"] }

View File

@@ -1,5 +1,4 @@
use std::borrow::Cow;
-use std::convert::TryInto;
use std::io;
use std::mem::size_of;
@@ -57,30 +56,22 @@ impl CboRoaringBitmapCodec {
}
/// Merge serialized CboRoaringBitmaps in a buffer.
/// The buffer must be empty before calling the function.
///
/// If the merged values' length is under the threshold, the values are
/// serialized directly into the buffer; otherwise a RoaringBitmap is created
/// from the values and serialized into the buffer.
pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
debug_assert!(buffer.is_empty());
let mut roaring = RoaringBitmap::new();
let mut vec = Vec::new();
for bytes in slices {
if bytes.len() <= THRESHOLD * size_of::<u32>() {
-debug_assert!(bytes.len() % size_of::<u32>() == 0);
-vec.reserve(bytes.len() / size_of::<u32>());
-for bytes in bytes.chunks_exact(size_of::<u32>()) {
-// unwrap can't happens since we ensured that everything
-// was a multiple of size_of<u32>.
-let v = u32::from_ne_bytes(bytes.try_into().unwrap());
-vec.push(v);
+let mut reader = bytes.as_ref();
+while let Ok(integer) = reader.read_u32::<NativeEndian>() {
+vec.push(integer);
}
} else {
-roaring.union_with_serialized_unchecked(bytes.as_ref())?;
+roaring |= RoaringBitmap::deserialize_unchecked_from(bytes.as_ref())?;
}
}
@@ -94,7 +85,7 @@ impl CboRoaringBitmapCodec {
}
} else {
// We can unwrap safely because the vector was sorted above.
-let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
+let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
roaring.serialize_into(buffer)?;
}
} else {
@@ -195,11 +186,8 @@ mod tests {
let medium_data: Vec<_> =
medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
-// TODO: used for profiling purpose, get rids of it once the function is optimized
-for _ in 0..100000 {
-buffer.clear();
-CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap();
-}
+buffer.clear();
+CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap();
let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap();

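Aside: the change above swaps a manual chunks_exact/from_ne_bytes loop for byteorder's reader API. A minimal, standalone sketch of the two equivalent decodings (assuming only the byteorder crate; function names are illustrative):

use byteorder::{NativeEndian, ReadBytesExt};

// Old style: decode packed native-endian u32s chunk by chunk.
fn decode_chunks(bytes: &[u8]) -> Vec<u32> {
    bytes
        .chunks_exact(std::mem::size_of::<u32>())
        .map(|chunk| u32::from_ne_bytes(chunk.try_into().unwrap()))
        .collect()
}

// New style: read u32s from the slice until it is exhausted.
fn decode_reader(mut reader: &[u8]) -> Vec<u32> {
    let mut integers = Vec::new();
    while let Ok(integer) = reader.read_u32::<NativeEndian>() {
        integers.push(integer);
    }
    integers
}

For a payload whose length is a multiple of four, both return the same integers; the reader version simply drops the explicit length assertion and the try_into bookkeeping.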
View File

@@ -1,6 +1,5 @@
#![cfg_attr(all(test, fuzzing), feature(no_coverage))]
#![allow(clippy::type_complexity)]
-#![feature(test)]
#[cfg(test)]
#[global_allocator]

View File

@@ -12,7 +12,8 @@ use super::Word;
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
use crate::{
-CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
+CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec,
+RoaringBitmapLenCodec, SearchContext,
};
/// A cache storing pointers to values in the LMDB databases.
@@ -259,6 +260,7 @@ impl<'ctx> SearchContext<'ctx> {
word2: Interned<String>,
proximity: u8,
) -> Result<Option<RoaringBitmap>> {
+unreachable!();
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
(proximity, word1, word2),
@@ -278,6 +280,7 @@ impl<'ctx> SearchContext<'ctx> {
word2: Interned<String>,
proximity: u8,
) -> Result<Option<u64>> {
+unreachable!();
DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>(
self.txn,
(proximity, word1, word2),
@@ -291,12 +294,23 @@ impl<'ctx> SearchContext<'ctx> {
)
}
+pub fn get_db_word_docids_len(&mut self, word: Interned<String>) -> Result<Option<u64>> {
+DatabaseCache::get_value::<_, _, RoaringBitmapLenCodec>(
+self.txn,
+word,
+self.word_interner.get(word).as_str(),
+&mut self.db_cache.word_docids,
+self.index.word_docids.remap_data_type::<ByteSlice>(),
+)
+}
pub fn get_db_word_prefix_pair_proximity_docids(
&mut self,
word1: Interned<String>,
prefix2: Interned<String>,
proximity: u8,
) -> Result<Option<RoaringBitmap>> {
+unreachable!();
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
(proximity, word1, prefix2),
@@ -315,6 +329,7 @@ impl<'ctx> SearchContext<'ctx> {
right: Interned<String>,
proximity: u8,
) -> Result<Option<RoaringBitmap>> {
+unreachable!();
DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
self.txn,
(proximity, left_prefix, right),

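Two things happen in this file: the proximity getters are stubbed with unreachable!() so any surviving caller fails loudly, and a new get_db_word_docids_len accessor exposes how many documents contain a word. A rough standalone sketch of what that length lookup computes (hypothetical helper; the real RoaringBitmapLenCodec decodes only the bitmap headers rather than materializing the whole bitmap):

use roaring::RoaringBitmap;

// Hypothetical stand-in for the RoaringBitmapLenCodec path: the number of
// documents a word occurs in, read from its serialized word_docids bitmap.
fn word_docids_len(serialized: &[u8]) -> std::io::Result<u64> {
    let bitmap = RoaringBitmap::deserialize_unchecked_from(serialized)?;
    Ok(bitmap.len())
}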
View File

@@ -295,11 +295,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
ranking_rules.push(Box::new(Typo::new(None)));
}
crate::Criterion::Proximity => {
-if proximity {
-continue;
-}
-proximity = true;
-ranking_rules.push(Box::new(Proximity::new(None)));
+// if proximity {
+continue;
+// }
+// proximity = true;
+// ranking_rules.push(Box::new(Proximity::new(None)));
}
crate::Criterion::Attribute => {
if attribute {

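The net effect of the edited match arm: the Proximity criterion is now skipped outright instead of registering a Proximity ranking rule. A self-contained sketch with a hypothetical, trimmed-down Criterion enum:

// Trimmed-down model of the criterion dispatch (hypothetical enum and rule
// names): Proximity falls through without contributing a ranking rule.
enum Criterion { Words, Typo, Proximity, Attribute }

fn rule_names(criteria: &[Criterion]) -> Vec<&'static str> {
    let mut rules = Vec::new();
    for criterion in criteria {
        match criterion {
            Criterion::Words => rules.push("words"),
            Criterion::Typo => rules.push("typo"),
            Criterion::Proximity => continue, // proximity database removed
            Criterion::Attribute => rules.push("attribute"),
        }
    }
    rules
}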
View File

@@ -265,11 +265,11 @@ pub fn partially_initialized_term_from_word(
}
fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> {
-if let Some((l, r)) = split_best_frequency(ctx, word)? {
-Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
-} else {
-Ok(None)
-}
+// if let Some((l, r)) = split_best_frequency(ctx, word)? {
+// Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
+// } else {
+Ok(None)
+// }
}
impl Interned<QueryTerm> {
@@ -416,11 +416,20 @@ fn split_best_frequency(
let left = ctx.word_interner.insert(left.to_owned());
let right = ctx.word_interner.insert(right.to_owned());
-if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
+if let (Some(l_freq), Some(r_freq)) =
+(ctx.get_db_word_docids_len(left)?, ctx.get_db_word_docids_len(right)?)
+{
+let frequency = l_freq.min(r_freq);
+if best.map_or(true, |(old, _, _)| frequency > old) {
+best = Some((frequency, left, right));
+}
+}
+// if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? {
+// if best.map_or(true, |(old, _, _)| frequency > old) {
+// best = Some((frequency, left, right));
+// }
+// }
}
Ok(best.map(|(_, left, right)| (left, right)))

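Since split_best_frequency can no longer count how often the two halves of a word occur next to each other, it now scores candidate splits by the rarer half instead. A one-function sketch of that substitution (hypothetical signature; the inputs are per-word document counts):

// With the proximity database gone, the pair frequency is approximated by the
// rarer of the two halves: an upper bound on how many documents can contain
// both words at all, let alone adjacently.
fn approximate_split_frequency(left_freq: Option<u64>, right_freq: Option<u64>) -> Option<u64> {
    match (left_freq, right_freq) {
        (Some(l), Some(r)) => Some(l.min(r)),
        _ => None,
    }
}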
View File

@@ -82,41 +82,41 @@ pub fn located_query_terms_from_tokens(
position = position.wrapping_add(7);
}
-phrase = 'phrase: {
-let phrase = phrase.take();
+// phrase = 'phrase: {
+// let phrase = phrase.take();
-// If we have a hard separator inside a phrase, we immediately start a new phrase
-let phrase = if separator_kind == SeparatorKind::Hard {
-if let Some(phrase) = phrase {
-if let Some(located_query_term) = phrase.build(ctx) {
-located_terms.push(located_query_term)
-}
-Some(PhraseBuilder::empty())
-} else {
-None
-}
-} else {
-phrase
-};
+// // If we have a hard separator inside a phrase, we immediately start a new phrase
+// let phrase = if separator_kind == SeparatorKind::Hard {
+// if let Some(phrase) = phrase {
+// if let Some(located_query_term) = phrase.build(ctx) {
+// located_terms.push(located_query_term)
+// }
+// Some(PhraseBuilder::empty())
+// } else {
+// None
+// }
+// } else {
+// phrase
+// };
-// We close and start a new phrase depending on the number of double quotes
-let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
-if quote_count == 0 {
-break 'phrase phrase;
-}
+// // We close and start a new phrase depending on the number of double quotes
+// let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
+// if quote_count == 0 {
+// break 'phrase phrase;
+// }
-// Consume the closing quote and the phrase
-if let Some(phrase) = phrase {
-// Per the check above, quote_count > 0
-quote_count -= 1;
-if let Some(located_query_term) = phrase.build(ctx) {
-located_terms.push(located_query_term)
-}
-}
+// // Consume the closing quote and the phrase
+// if let Some(phrase) = phrase {
+// // Per the check above, quote_count > 0
+// quote_count -= 1;
+// if let Some(located_query_term) = phrase.build(ctx) {
+// located_terms.push(located_query_term)
+// }
+// }
-// Start new phrase if the token ends with an opening quote
-(quote_count % 2 == 1).then_some(PhraseBuilder::empty())
-};
+// // Start new phrase if the token ends with an opening quote
+// (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
+// };
}
_ => (),
}

View File

@@ -351,5 +351,5 @@ fn test_redacted() {
.map(|scores| score_details::ScoreDetails::to_json_map(scores.iter()))
.collect();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 4, 5, 22, 23, 13, 1, 3, 12, 21, 11, 20, 6, 7, 8, 9, 10, 14, 15]");
-// insta::assert_json_snapshot!(document_scores_json);
+insta::assert_json_snapshot!(document_scores_json);
}

View File

@@ -152,15 +152,15 @@ pub(crate) fn data_from_obkv_documents(
});
}
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(),
indexer,
lmdb_writer_sx.clone(),
extract_word_pair_proximity_docids,
merge_cbo_roaring_bitmaps,
TypedChunk::WordPairProximityDocids,
"word-pair-proximity-docids",
);
// spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
// docid_word_positions_chunks.clone(),
// indexer,
// lmdb_writer_sx.clone(),
// extract_word_pair_proximity_docids,
// merge_cbo_roaring_bitmaps,
// TypedChunk::WordPairProximityDocids,
// "word-pair-proximity-docids",
// );
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
docid_word_positions_chunks.clone(),