Merge pull request #5725 from meilisearch/fix-threshold-overcounting-bug

Fix Total Hits being wrong when rankingScoreThreshold is used
This commit is contained in:
Louis Dureuil
2025-07-16 07:15:24 +00:00
committed by GitHub
7 changed files with 92 additions and 1 deletions

View File

@ -1051,6 +1051,7 @@ pub fn prepare_search<'t>(
.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS);
search.exhaustive_number_hits(is_finite_pagination);
search.max_total_hits(Some(max_total_hits));
search.scoring_strategy(
if query.show_ranking_score
|| query.show_ranking_score_details

View File

@ -1,6 +1,7 @@
use super::shared_index_with_documents;
use crate::common::Server;
use crate::json;
use meili_snap::{json_string, snapshot};
#[actix_rt::test]
async fn default_search_should_return_estimated_total_hit() {
@ -133,3 +134,61 @@ async fn ensure_placeholder_search_hit_count_valid() {
.await;
}
}
#[actix_rt::test]
// Regression test: with finite pagination (`page` / `hitsPerPage`) and a
// `rankingScoreThreshold`, the reported `totalHits` / `totalPages` must only
// count the documents that pass the threshold.
//
// The test indexes two documents, measures the ranking score of the second hit
// for the query "doc", then searches again with a threshold slightly above that
// score and snapshots the whole response.
async fn test_issue_5274() {
let server = Server::new_shared();
let index = server.unique_index();
let documents = json!([
{
"id": 1,
"title": "Document 1",
"content": "This is the first."
},
{
"id": 2,
"title": "Document 2",
"content": "This is the second doc."
}
]);
// Index both documents and wait for the indexing task to succeed before searching.
let (task, _code) = index.add_documents(documents, None).await;
server.wait_task(task.uid()).await.succeeded();
// Find out the lowest ranking score among the documents.
// Both documents are requested (`hitsPerPage: 2`) with their scores exposed; the
// second hit is taken as the lowest-scoring one.
// NOTE(review): this assumes hits are returned in descending ranking-score order — confirm.
let (rep, _status) = index
.search_post(json!({"q": "doc", "page": 1, "hitsPerPage": 2, "showRankingScore": true}))
.await;
let hits = rep["hits"].as_array().expect("Missing hits array");
let second_hit = hits.get(1).expect("Missing second hit");
let ranking_score = second_hit
.get("_rankingScore")
.expect("Missing _rankingScore field")
.as_f64()
.expect("Expected _rankingScore to be a f64");
// Search again with a ranking score threshold just above that score and expect
// a single hit: the threshold is the measured score plus a small margin (0.0001).
let (rep, _status) = index
.search_post(json!({"q": "doc", "page": 1, "hitsPerPage": 1, "rankingScoreThreshold": ranking_score + 0.0001}))
.await;
// Snapshot the full response, replacing the `processingTimeMs` value with a
// fixed placeholder. The expected response holds one hit and reports
// `totalHits: 1` / `totalPages: 1`.
snapshot!(json_string!(rep, {
".processingTimeMs" => "[ignored]",
}), @r#"
{
"hits": [
{
"id": 2,
"title": "Document 2",
"content": "This is the second doc."
}
],
"query": "doc",
"processingTimeMs": "[ignored]",
"hitsPerPage": 1,
"page": 1,
"totalPages": 1,
"totalHits": 1
}
"#);
}

View File

@ -210,6 +210,7 @@ impl Search<'_> {
scoring_strategy: ScoringStrategy::Detailed,
words_limit: self.words_limit,
exhaustive_number_hits: self.exhaustive_number_hits,
max_total_hits: self.max_total_hits,
rtxn: self.rtxn,
index: self.index,
semantic: self.semantic.clone(),

View File

@ -52,6 +52,7 @@ pub struct Search<'a> {
scoring_strategy: ScoringStrategy,
words_limit: usize,
exhaustive_number_hits: bool,
max_total_hits: Option<usize>,
rtxn: &'a heed::RoTxn<'a>,
index: &'a Index,
semantic: Option<SemanticSearch>,
@ -74,6 +75,7 @@ impl<'a> Search<'a> {
terms_matching_strategy: TermsMatchingStrategy::default(),
scoring_strategy: Default::default(),
exhaustive_number_hits: false,
max_total_hits: None,
words_limit: 10,
rtxn,
index,
@ -165,6 +167,11 @@ impl<'a> Search<'a> {
self
}
/// Sets the optional upper bound on the total number of hits for this search.
///
/// The value is stored as-is in the `max_total_hits` field; `None` leaves the
/// search without an explicit bound.
/// Returns `&mut Search` so that setter calls can be chained.
pub fn max_total_hits(&mut self, max_total_hits: Option<usize>) -> &mut Search<'a> {
self.max_total_hits = max_total_hits;
self
}
pub fn time_budget(&mut self, time_budget: TimeBudget) -> &mut Search<'a> {
self.time_budget = time_budget;
self
@ -243,6 +250,8 @@ impl<'a> Search<'a> {
&mut ctx,
vector,
self.scoring_strategy,
self.exhaustive_number_hits,
self.max_total_hits,
universe,
&self.sort_criteria,
&self.distinct,
@ -261,6 +270,7 @@ impl<'a> Search<'a> {
self.terms_matching_strategy,
self.scoring_strategy,
self.exhaustive_number_hits,
self.max_total_hits,
universe,
&self.sort_criteria,
&self.distinct,
@ -314,6 +324,7 @@ impl fmt::Debug for Search<'_> {
scoring_strategy,
words_limit,
exhaustive_number_hits,
max_total_hits,
rtxn: _,
index: _,
semantic,
@ -333,6 +344,7 @@ impl fmt::Debug for Search<'_> {
.field("terms_matching_strategy", terms_matching_strategy)
.field("scoring_strategy", scoring_strategy)
.field("exhaustive_number_hits", exhaustive_number_hits)
.field("max_total_hits", max_total_hits)
.field("words_limit", words_limit)
.field(
"semantic.embedder_name",

View File

@ -32,6 +32,8 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
logger: &mut dyn SearchLogger<Q>,
time_budget: TimeBudget,
ranking_score_threshold: Option<f64>,
exhaustive_number_hits: bool,
max_total_hits: Option<usize>,
) -> Result<BucketSortOutput> {
logger.initial_query(query);
logger.ranking_rules(&ranking_rules);
@ -159,7 +161,13 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
};
}
while valid_docids.len() < length {
let max_len_to_evaluate =
match (max_total_hits, exhaustive_number_hits && ranking_score_threshold.is_some()) {
(Some(max_total_hits), true) => max_total_hits,
_ => length,
};
while valid_docids.len() < max_len_to_evaluate {
if time_budget.exceeded() {
loop {
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);

View File

@ -510,6 +510,7 @@ mod tests {
crate::TermsMatchingStrategy::default(),
crate::score_details::ScoringStrategy::Skip,
false,
None,
universe,
&None,
&None,

View File

@ -626,6 +626,8 @@ pub fn execute_vector_search(
ctx: &mut SearchContext<'_>,
vector: &[f32],
scoring_strategy: ScoringStrategy,
exhaustive_number_hits: bool,
max_total_hits: Option<usize>,
universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
@ -669,6 +671,8 @@ pub fn execute_vector_search(
placeholder_search_logger,
time_budget,
ranking_score_threshold,
exhaustive_number_hits,
max_total_hits,
)?;
Ok(PartialSearchResult {
@ -689,6 +693,7 @@ pub fn execute_search(
terms_matching_strategy: TermsMatchingStrategy,
scoring_strategy: ScoringStrategy,
exhaustive_number_hits: bool,
max_total_hits: Option<usize>,
mut universe: RoaringBitmap,
sort_criteria: &Option<Vec<AscDesc>>,
distinct: &Option<String>,
@ -825,6 +830,8 @@ pub fn execute_search(
query_graph_logger,
time_budget,
ranking_score_threshold,
exhaustive_number_hits,
max_total_hits,
)?
} else {
let ranking_rules =
@ -841,6 +848,8 @@ pub fn execute_search(
placeholder_search_logger,
time_budget,
ranking_score_threshold,
exhaustive_number_hits,
max_total_hits,
)?
};