Make all search tests pass, fix distinctAttribute bug

2025-11-09 04:16:28 +00:00 · 2023-04-24 12:11:25 +02:00
parent a7a0891210
commit d1fdbb63da
17 changed files with 465 additions and 327 deletions
--- a/milli/src/search/new/bucket_sort.rs
+++ b/milli/src/search/new/bucket_sort.rs
@@ -88,7 +88,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
        };
    }

-    let mut all_candidates = RoaringBitmap::new();
+    let mut all_candidates = universe.clone();
    let mut valid_docids = vec![];
    let mut cur_offset = 0usize;

@@ -162,8 +162,6 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
        )?;
    }

-    all_candidates |= &ranking_rule_universes[0];
-
    Ok(BucketSortOutput { docids: valid_docids, all_candidates })
 }

@@ -193,12 +191,14 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
            apply_distinct_rule(ctx, distinct_fid, &candidates)?;
        for universe in ranking_rule_universes.iter_mut() {
            *universe -= &excluded;
+            *all_candidates -= &excluded;
        }
        remaining
    } else {
        candidates.clone()
    };
    *all_candidates |= &candidates;
+
    // if the candidates are empty, there is nothing to do;
    if candidates.is_empty() {
        return Ok(());
@@ -216,8 +216,8 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
            );
        } else {
            // otherwise, skip some of the documents and add some of the rest, in order of ids
-            let all_candidates = candidates.iter().collect::<Vec<_>>();
-            let (skipped_candidates, candidates) = all_candidates.split_at(from - *cur_offset);
+            let candidates_vec = candidates.iter().collect::<Vec<_>>();
+            let (skipped_candidates, candidates) = candidates_vec.split_at(from - *cur_offset);

            logger.skip_bucket_ranking_rule(
                cur_ranking_rule_index,
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@@ -243,7 +243,7 @@ pub(crate) mod tests {
        let temp_index = TempIndex::new();
        temp_index
            .add_documents(documents!([
-                { "id": 1, "name": "split this world westfali westfalia the" },
+                { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
            ]))
            .unwrap();
        temp_index
@@ -305,7 +305,7 @@ pub(crate) mod tests {
                    ..Default::default()
                })
                .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+            None
        );
        assert_eq!(
            matching_words
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -599,7 +599,7 @@ mod tests {
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
-            @"<em>Ŵôřlḑ</em>ôle"
+            @"<em>Ŵôřlḑôle</em>"
        );

        // Text containing unicode match.
@@ -621,7 +621,7 @@ mod tests {
        // no crop should return complete text with highlighted matches.
        insta::assert_snapshot!(
            matcher.format(format_options),
-            @"<em>Westfáli</em>a"
+            @"<em>Westfália</em>"
        );
    }

--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -184,11 +184,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
    for rr in settings_ranking_rules {
        // Add Words before any of: typo, proximity, attribute, exactness
        match rr {
-            crate::Criterion::Typo
-            | crate::Criterion::Attribute
-            | crate::Criterion::Proximity
-            // TODO: no exactness
-            | crate::Criterion::Exactness => {
+            crate::Criterion::Typo | crate::Criterion::Attribute | crate::Criterion::Proximity => {
                if !words {
                    ranking_rules.push(Box::new(Words::new(terms_matching_strategy)));
                    words = true;
@@ -339,6 +335,8 @@ pub fn execute_search(

        check_sort_criteria(ctx, sort_criteria.as_ref())?;

+        // TODO: if the exactness criterion is the first one, then
+        // use a different strategy to find the universe (union of any term)
        universe = resolve_maximally_reduced_query_graph(
            ctx,
            &universe,
--- a/milli/src/search/new/ranking_rule_graph/position/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs
@@ -56,8 +56,13 @@ impl RankingRuleGraphTrait for PositionGraph {
        }

        for phrase in term.term_subset.all_phrases(ctx)? {
-            for &word in phrase.words(ctx).iter().flatten() {
-                let positions = ctx.get_db_word_positions(word)?;
+            // Only check the position of the first word in the phrase.
+            // This is not correct, but it is the best we can do, since
+            // it is difficult/impossible to know the expected position
+            // of a word in a phrase.
+            // There is probably a more correct way to do it, though.
+            if let Some(word) = phrase.words(ctx).iter().flatten().next() {
+                let positions = ctx.get_db_word_positions(*word)?;
                all_positions.extend(positions);
            }
        }
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -79,11 +79,6 @@ pub fn compute_docids(
        //
        // This is an optimisation to avoid checking for an excessive number of
        // pairs.
-        // WAIT, NO.
-        // This should only be done once per node.
-        // Here, we'll potentially do is.. 16 times?
-        // Maybe we should do it at edge-build time instead.
-        // Same for the future attribute ranking rule.
        let right_derivs = first_word_of_term_iter(ctx, &right_term.term_subset)?;
        if right_derivs.len() > 1 {
            let universe = &universe;
@@ -190,11 +185,6 @@ fn compute_non_prefix_edges(
    docids: &mut RoaringBitmap,
    universe: &RoaringBitmap,
 ) -> Result<()> {
-    let mut used_left_phrases = BTreeSet::new();
-    let mut used_right_phrases = BTreeSet::new();
-    let mut used_left_words = BTreeSet::new();
-    let mut used_right_words = BTreeSet::new();
-
    let mut universe = universe.clone();

    for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
@@ -204,25 +194,19 @@ fn compute_non_prefix_edges(
            return Ok(());
        }
    }
-    if let Some(left_phrase) = left_phrase {
-        used_left_phrases.insert(left_phrase);
-    }
-    if let Some(right_phrase) = right_phrase {
-        used_right_phrases.insert(right_phrase);
-    }

    if let Some(new_docids) =
        ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)?
    {
        let new_docids = &universe & new_docids;
        if !new_docids.is_empty() {
-            used_left_words.insert(word1);
-            used_right_words.insert(word2);
            *docids |= new_docids;
        }
    }
    if backward_proximity >= 1
-            // no swapping when either term is a phrase
+            // TODO: for now, we don't do any swapping when either term is a phrase
+            // but maybe we should. We'd need to look at the first/last word of the phrase
+            // depending on the context.
            && left_phrase.is_none() && right_phrase.is_none()
    {
        if let Some(new_docids) =
@@ -230,8 +214,6 @@ fn compute_non_prefix_edges(
        {
            let new_docids = &universe & new_docids;
            if !new_docids.is_empty() {
-                used_left_words.insert(word2);
-                used_right_words.insert(word1);
                *docids |= new_docids;
            }
        }
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -69,11 +69,16 @@ pub fn compute_query_term_subset_docids_within_field_id(
    }

    for phrase in term.all_phrases(ctx)? {
-        for &word in phrase.words(ctx).iter().flatten() {
-            if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word, fid)? {
-                docids |= word_fid_docids;
+        let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone();
+        // There may be false positives when resolving a phrase, so we're not
+        // guaranteed that all of its words are within a single fid.
+        // TODO: fix this?
+        if let Some(word) = phrase.words(ctx).iter().flatten().next() {
+            if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
+                phrase_docids &= word_fid_docids;
            }
        }
+        docids |= phrase_docids;
    }

    if let Some(word_prefix) = term.use_prefix_db(ctx) {
@@ -104,11 +109,16 @@ pub fn compute_query_term_subset_docids_within_position(
    }

    for phrase in term.all_phrases(ctx)? {
-        for &word in phrase.words(ctx).iter().flatten() {
-            if let Some(word_position_docids) = ctx.get_db_word_position_docids(word, position)? {
-                docids |= word_position_docids;
+        let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone();
+        // It's difficult to know the expected position of the words in the phrase,
+        // so instead we just check the first one.
+        // TODO: fix this?
+        if let Some(word) = phrase.words(ctx).iter().flatten().next() {
+            if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
+                phrase_docids &= word_position_docids;
            }
        }
+        docids |= phrase_docids;
    }

    if let Some(word_prefix) = term.use_prefix_db(ctx) {