Compare commits

...

4 Commits

Author SHA1 Message Date
8f65605845 TMP: remove optimization where later ranking rules are not applied on buckets of a single document 2023-05-30 11:12:28 +02:00
0a7817a002 Merge #3786
3786: Consistently use wrapping add to avoid overflow in debug when query s… r=dureuill a=dureuill

# Pull Request

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/3785

## What does this PR do?
- Some of the code paths would erroneously use the default addition operator, whose semantics are "overflow is an error, checked at runtime in debug builds", instead of the intended "overflow is expected" semantics that this code relies on (it uses `u16::MAX` as a sentinel). This PR makes it so the wrapping add operator is used everywhere (see the short sketch below).
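  A minimal, hypothetical sketch (not code from this repository) of the two behaviours described above, using the same `u16::MAX` sentinel:

  ```rust
  fn main() {
      // Sentinel value meaning "no word has been seen yet".
      let position: u16 = u16::MAX;

      // With the default `+` operator, this would panic in a debug build
      // ("attempt to add with overflow") and wrap silently in release:
      // let next = position + 1;

      // `wrapping_add` makes the wrap-around explicit and behaves the same
      // way in debug and release builds: u16::MAX wraps to 0.
      let next = position.wrapping_add(1);
      assert_eq!(next, 0);
  }
  ```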

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2023-05-29 12:39:54 +00:00
1dfc4038ab Add test that fails before PR and passes now 2023-05-29 11:58:26 +02:00
73198179f1 Consistently use wrapping add to avoid overflow in debug when query starts with a separator 2023-05-29 11:54:12 +02:00
3 changed files with 41 additions and 16 deletions

@@ -116,16 +116,15 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     }
 
     while valid_docids.len() < length {
-        // The universe for this bucket is zero or one element, so we don't need to sort
-        // anything, just extend the results and go back to the parent ranking rule.
-        if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 {
-            let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
-            maybe_add_to_results!(bucket);
+        // The universe for this bucket is zero element, so we don't need to sort
+        // anything, just go back to the parent ranking rule.
+        if ranking_rule_universes[cur_ranking_rule_index].is_empty() {
             back!();
             continue;
         }
 
-        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? else {
+        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])?
+        else {
             back!();
             continue;
         };

@@ -181,9 +181,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
         logger: &mut dyn SearchLogger<QueryGraph>,
         universe: &RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
-        // If universe.len() <= 1, the bucket sort algorithm
-        // should not have called this function.
-        assert!(universe.len() > 1);
         // Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
         // should never happen
         let mut state = self.state.take().unwrap();

@@ -77,13 +77,9 @@ pub fn located_query_terms_from_tokens(
                 }
             }
             TokenKind::Separator(separator_kind) => {
-                match separator_kind {
-                    SeparatorKind::Hard => {
-                        position += 1;
-                    }
-                    SeparatorKind::Soft => {
-                        position += 0;
-                    }
+                // add penalty for hard separators
+                if let SeparatorKind::Hard = separator_kind {
+                    position = position.wrapping_add(1);
                 }
 
                 phrase = 'phrase: {
@@ -288,3 +284,36 @@ impl PhraseBuilder {
         })
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use charabia::TokenizerBuilder;
+
+    use super::*;
+    use crate::index::tests::TempIndex;
+
+    fn temp_index_with_documents() -> TempIndex {
+        let temp_index = TempIndex::new();
+        temp_index
+            .add_documents(documents!([
+                { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
+                { "id": 2, "name": "Westfália" },
+                { "id": 3, "name": "Ŵôřlḑôle" },
+            ]))
+            .unwrap();
+        temp_index
+    }
+
+    #[test]
+    fn start_with_hard_separator() -> Result<()> {
+        let tokenizer = TokenizerBuilder::new().build();
+        let tokens = tokenizer.tokenize(".");
+        let index = temp_index_with_documents();
+        let rtxn = index.read_txn()?;
+        let mut ctx = SearchContext::new(&index, &rtxn);
+        // panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
+        let located_query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None)?;
+        assert!(located_query_terms.is_empty());
+        Ok(())
+    }
+}