fix the search cutoff and add a test

2025-11-07 03:16:28 +00:00 · 2024-03-14 17:34:46 +01:00
parent b72495eb58
commit b8cda6c300
9 changed files with 590 additions and 55 deletions
--- a/milli/src/search/hybrid.rs
+++ b/milli/src/search/hybrid.rs
@@ -132,7 +132,7 @@ impl<'a> Search<'a> {
            index: self.index,
            distribution_shift: self.distribution_shift,
            embedder_name: self.embedder_name.clone(),
-            time_budget: self.time_budget,
+            time_budget: self.time_budget.clone(),
        };

        let vector_query = search.vector.take();
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -195,7 +195,7 @@ impl<'a> Search<'a> {
                self.limit,
                self.distribution_shift,
                embedder_name,
-                self.time_budget,
+                self.time_budget.clone(),
            )?,
            None => execute_search(
                &mut ctx,
@@ -211,7 +211,7 @@ impl<'a> Search<'a> {
                Some(self.words_limit),
                &mut DefaultSearchLogger,
                &mut DefaultSearchLogger,
-                self.time_budget,
+                self.time_budget.clone(),
            )?,
        };

--- a/milli/src/search/new/bucket_sort.rs
+++ b/milli/src/search/new/bucket_sort.rs
@@ -161,11 +161,21 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(

    while valid_docids.len() < length {
        if time_budget.exceeded() {
-            let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
-            maybe_add_to_results!(bucket);
+            loop {
+                let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
+                ranking_rule_scores.push(ScoreDetails::Skipped);
+                maybe_add_to_results!(bucket);
+                ranking_rule_scores.pop();
+
+                if cur_ranking_rule_index == 0 {
+                    break;
+                }
+
+                back!();
+            }

            return Ok(BucketSortOutput {
-                scores: vec![Default::default(); valid_docids.len()],
+                scores: valid_scores,
                docids: valid_docids,
                all_candidates,
                degraded: true,
--- a/milli/src/search/new/tests/cutoff.rs
+++ b/milli/src/search/new/tests/cutoff.rs
@@ -0,0 +1,419 @@
+//! This module tests the search cutoff and ensures a few things:
+//! 1. A basic test works and marks the search as degraded
+//! 2. A test that ensures the filters are effectively applied even with a cutoff of 0
+//! 3. A test that ensures the cutoff works well with the ranking scores
+
+use std::time::Duration;
+
+use big_s::S;
+use maplit::hashset;
+use meili_snap::snapshot;
+
+use crate::index::tests::TempIndex;
+use crate::{Criterion, Filter, Search, TimeBudget};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_filterable_fields(hashset! { S("id") });
+            s.set_criteria(vec![Criterion::Words, Criterion::Typo]);
+        })
+        .unwrap();
+
+    // Reverse the ID / insertion order so we can better tell what was actually sorted apart from what merely kept the insertion order
+    index
+        .add_documents(documents!([
+            {
+                "id": 4,
+                "text": "hella puppo kefir",
+            },
+            {
+                "id": 3,
+                "text": "hella puppy kefir",
+            },
+            {
+                "id": 2,
+                "text": "hello",
+            },
+            {
+                "id": 1,
+                "text": "hello puppy",
+            },
+            {
+                "id": 0,
+                "text": "hello puppy kefir",
+            },
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn basic_degraded_search() {
+    let index = create_index();
+    let rtxn = index.read_txn().unwrap();
+
+    let mut search = Search::new(&rtxn, &index);
+    search.query("hello puppy kefir");
+    search.limit(3);
+    search.time_budget(TimeBudget::new(Duration::from_millis(0)));
+
+    let result = search.execute().unwrap();
+    assert!(result.degraded);
+}
+
+#[test]
+fn degraded_search_cannot_skip_filter() {
+    let index = create_index();
+    let rtxn = index.read_txn().unwrap();
+
+    let mut search = Search::new(&rtxn, &index);
+    search.query("hello puppy kefir");
+    search.limit(100);
+    search.time_budget(TimeBudget::new(Duration::from_millis(0)));
+    let filter_condition = Filter::from_str("id > 2").unwrap().unwrap();
+    search.filter(filter_condition);
+
+    let result = search.execute().unwrap();
+    assert!(result.degraded);
+    snapshot!(format!("{:?}\n{:?}", result.candidates, result.documents_ids), @r###"
+    RoaringBitmap<[0, 1]>
+    [0, 1]
+    "###);
+}
+
+#[test]
+fn degraded_search_and_score_details() {
+    let index = create_index();
+    let rtxn = index.read_txn().unwrap();
+
+    let mut search = Search::new(&rtxn, &index);
+    search.query("hello puppy kefir");
+    search.limit(4);
+    search.time_budget(TimeBudget::max());
+
+    let result = search.execute().unwrap();
+    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
+    [
+        4,
+        1,
+        0,
+        3,
+    ]
+    [
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Typo(
+                Typo {
+                    typo_count: 0,
+                    max_typo_count: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Typo(
+                Typo {
+                    typo_count: 1,
+                    max_typo_count: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 2,
+                    max_matching_words: 3,
+                },
+            ),
+        ],
+    ]
+    "###);
+
+    // Do ONE loop iteration. Not much can be deduced: almost every document matched the first bucket of the words ranking rule.
+    search.time_budget(TimeBudget::max().with_stop_after(1));
+
+    let result = search.execute().unwrap();
+    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
+    [
+        0,
+        1,
+        4,
+        2,
+    ]
+    [
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Skipped,
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Skipped,
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Skipped,
+        ],
+        [
+            Skipped,
+        ],
+    ]
+    "###);
+
+    // Do TWO loop iterations. The first document should be entirely sorted
+    search.time_budget(TimeBudget::max().with_stop_after(2));
+
+    let result = search.execute().unwrap();
+    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
+    [
+        4,
+        0,
+        1,
+        2,
+    ]
+    [
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Typo(
+                Typo {
+                    typo_count: 0,
+                    max_typo_count: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Skipped,
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Skipped,
+        ],
+        [
+            Skipped,
+        ],
+    ]
+    "###);
+
+    // Do THREE loop iterations. The second document should be entirely sorted as well
+    search.time_budget(TimeBudget::max().with_stop_after(3));
+
+    let result = search.execute().unwrap();
+    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
+    [
+        4,
+        1,
+        0,
+        2,
+    ]
+    [
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Typo(
+                Typo {
+                    typo_count: 0,
+                    max_typo_count: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Typo(
+                Typo {
+                    typo_count: 1,
+                    max_typo_count: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Skipped,
+        ],
+        [
+            Skipped,
+        ],
+    ]
+    "###);
+
+    // Do FOUR loop iterations. The third document should be entirely sorted as well.
+    // The words bucket has still not progressed, thus the last document doesn't have any info yet.
+    search.time_budget(TimeBudget::max().with_stop_after(4));
+
+    let result = search.execute().unwrap();
+    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
+    [
+        4,
+        1,
+        0,
+        2,
+    ]
+    [
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Typo(
+                Typo {
+                    typo_count: 0,
+                    max_typo_count: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Typo(
+                Typo {
+                    typo_count: 1,
+                    max_typo_count: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+        ],
+        [
+            Skipped,
+        ],
+    ]
+    "###);
+
+    // After FIVE loop iterations, the words ranking rule gave us a new bucket.
+    // Since we reached the limit we were able to exit early without checking the typo ranking rule.
+    search.time_budget(TimeBudget::max().with_stop_after(5));
+
+    let result = search.execute().unwrap();
+    snapshot!(format!("{:#?}\n{:#?}", result.documents_ids, result.document_scores), @r###"
+    [
+        4,
+        1,
+        0,
+        3,
+    ]
+    [
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Typo(
+                Typo {
+                    typo_count: 0,
+                    max_typo_count: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+            Typo(
+                Typo {
+                    typo_count: 1,
+                    max_typo_count: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 3,
+                    max_matching_words: 3,
+                },
+            ),
+        ],
+        [
+            Words(
+                Words {
+                    matching_words: 2,
+                    max_matching_words: 3,
+                },
+            ),
+        ],
+    ]
+    "###);
+}
--- a/milli/src/search/new/tests/mod.rs
+++ b/milli/src/search/new/tests/mod.rs
@@ -1,5 +1,6 @@
 pub mod attribute_fid;
 pub mod attribute_position;
+pub mod cutoff;
 pub mod distinct;
 pub mod exactness;
 pub mod geo_sort;