4693: Introduce distinct attributes at search time r=irevoire a=Kerollmops

This PR fixes #4611.

### To Do
- [x] Remove the `distinguishableAttributes` settings (not even a commit about that).
- [x] Use the `filterableAttributes` to be able to use the `distinct` parameter at search.
- [x] Work on the errors and make tests.

Co-authored-by: Clément Renault <clement@meilisearch.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
meili-bors[bot]
2024-06-18 07:45:03 +00:00
committed by GitHub
17 changed files with 315 additions and 14 deletions

View File

@ -22,6 +22,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ctx: &mut SearchContext<'ctx>,
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
query: &Q,
distinct: Option<&str>,
universe: &RoaringBitmap,
from: usize,
length: usize,
@ -34,7 +35,12 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
logger.ranking_rules(&ranking_rules);
logger.initial_universe(universe);
let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? {
let distinct_field = match distinct {
Some(distinct) => Some(distinct),
None => ctx.index.distinct_field(ctx.txn)?,
};
let distinct_fid = if let Some(field) = distinct_field {
ctx.index.fields_ids_map(ctx.txn)?.id(field)
} else {
None

View File

@ -516,6 +516,7 @@ mod tests {
false,
universe,
&None,
&None,
crate::search::new::GeoSortStrategy::default(),
0,
100,

View File

@ -568,6 +568,7 @@ pub fn execute_vector_search(
scoring_strategy: ScoringStrategy,
universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
geo_strategy: geo_sort::Strategy,
from: usize,
length: usize,
@ -598,6 +599,7 @@ pub fn execute_vector_search(
ctx,
ranking_rules,
&PlaceholderQuery,
distinct.as_deref(),
&universe,
from,
length,
@ -627,6 +629,7 @@ pub fn execute_search(
exhaustive_number_hits: bool,
mut universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
geo_strategy: geo_sort::Strategy,
from: usize,
length: usize,
@ -717,6 +720,7 @@ pub fn execute_search(
ctx,
ranking_rules,
&graph,
distinct.as_deref(),
&universe,
from,
length,
@ -732,6 +736,7 @@ pub fn execute_search(
ctx,
ranking_rules,
&PlaceholderQuery,
distinct.as_deref(),
&universe,
from,
length,
@ -748,7 +753,12 @@ pub fn execute_search(
// The candidates is the universe unless the exhaustive number of hits
// is requested and a distinct attribute is set.
if exhaustive_number_hits {
if let Some(f) = ctx.index.distinct_field(ctx.txn)? {
let distinct_field = match distinct.as_deref() {
Some(distinct) => Some(distinct),
None => ctx.index.distinct_field(ctx.txn)?,
};
if let Some(f) = distinct_field {
if let Some(distinct_fid) = fields_ids_map.id(f) {
all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
}

View File

@ -205,8 +205,18 @@ fn create_index() -> TempIndex {
index
}
fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec<String> {
let vs = collect_field_values(index, txn, index.distinct_field(txn).unwrap().unwrap(), docids);
fn verify_distinct(
index: &Index,
txn: &RoTxn,
distinct: Option<&str>,
docids: &[u32],
) -> Vec<String> {
let vs = collect_field_values(
index,
txn,
distinct.or_else(|| index.distinct_field(txn).unwrap()).unwrap(),
docids,
);
let mut unique = HashSet::new();
for v in vs.iter() {
@ -223,12 +233,49 @@ fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec<String> {
fn test_distinct_placeholder_no_ranking_rules() {
let index = create_index();
// Set the letter as filterable and unset the distinct attribute.
index
.update_settings(|s| {
s.set_filterable_fields(hashset! { S("letter") });
s.reset_distinct_field();
})
.unwrap();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.distinct(S("letter"));
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
let distinct_values = verify_distinct(&index, &txn, Some("letter"), &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"A\"",
"\"B\"",
"\"C\"",
"\"D\"",
"\"E\"",
"\"F\"",
"\"G\"",
"\"H\"",
"\"I\"",
"__does_not_exist__",
"__does_not_exist__",
"__does_not_exist__",
]
"###);
}
#[test]
fn test_distinct_at_search_placeholder_no_ranking_rules() {
let index = create_index();
let txn = index.read_txn().unwrap();
let s = Search::new(&txn, &index);
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"A\"",
@ -263,7 +310,7 @@ fn test_distinct_placeholder_sort() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"E\"",
@ -303,7 +350,7 @@ fn test_distinct_placeholder_sort() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]");
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"I\"",
@ -346,7 +393,7 @@ fn test_distinct_placeholder_sort() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"I\"",
@ -399,7 +446,7 @@ fn test_distinct_words() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"A\"",
@ -453,7 +500,7 @@ fn test_distinct_sort_words() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"I\"",
@ -549,7 +596,7 @@ fn test_distinct_typo() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]");
let distinct_values = verify_distinct(&index, &txn, &documents_ids);
let distinct_values = verify_distinct(&index, &txn, None, &documents_ids);
insta::assert_debug_snapshot!(distinct_values, @r###"
[
"\"B\"",