From fd4b192a39372320916f11fd9f5610b3647a6760 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 28 May 2025 17:57:31 +0200 Subject: [PATCH] Add distinct_fid function and expose distinct_single_docid --- crates/milli/src/search/new/bucket_sort.rs | 15 ++++----------- crates/milli/src/search/new/distinct.rs | 17 ++++++++++++++++- crates/milli/src/search/new/mod.rs | 4 ++-- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/crates/milli/src/search/new/bucket_sort.rs b/crates/milli/src/search/new/bucket_sort.rs index ca7a4a986..3c26cad5c 100644 --- a/crates/milli/src/search/new/bucket_sort.rs +++ b/crates/milli/src/search/new/bucket_sort.rs @@ -4,7 +4,9 @@ use super::logger::SearchLogger; use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait}; use super::SearchContext; use crate::score_details::{ScoreDetails, ScoringStrategy}; -use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput}; +use crate::search::new::distinct::{ + apply_distinct_rule, distinct_fid, distinct_single_docid, DistinctOutput, +}; use crate::{Result, TimeBudget}; pub struct BucketSortOutput { @@ -35,16 +37,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( logger.ranking_rules(&ranking_rules); logger.initial_universe(universe); - let distinct_field = match distinct { - Some(distinct) => Some(distinct), - None => ctx.index.distinct_field(ctx.txn)?, - }; - - let distinct_fid = if let Some(field) = distinct_field { - ctx.index.fields_ids_map(ctx.txn)?.id(field) - } else { - None - }; + let distinct_fid = distinct_fid(distinct, ctx.index, ctx.txn)?; if universe.len() < from as u64 { return Ok(BucketSortOutput { diff --git a/crates/milli/src/search/new/distinct.rs b/crates/milli/src/search/new/distinct.rs index 17859b6f8..36172302a 100644 --- a/crates/milli/src/search/new/distinct.rs +++ b/crates/milli/src/search/new/distinct.rs @@ -9,7 +9,7 @@ use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec, }; use crate::heed_codec::BytesRefCodec; -use crate::{Index, Result, SearchContext}; +use crate::{FieldId, Index, Result, SearchContext}; pub struct DistinctOutput { pub remaining: RoaringBitmap, @@ -121,3 +121,18 @@ pub fn facet_string_values<'a>( fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] { concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) } + +pub fn distinct_fid( + query_distinct_field: Option<&str>, + index: &Index, + rtxn: &RoTxn<'_>, +) -> Result> { + let distinct_field = match query_distinct_field { + Some(distinct) => Some(distinct), + None => index.distinct_field(rtxn)?, + }; + + let distinct_fid = + if let Some(field) = distinct_field { index.fields_ids_map(rtxn)?.id(field) } else { None }; + Ok(distinct_fid) +} diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index 0a3bc1b04..a65b4076b 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -28,6 +28,7 @@ use std::time::Duration; use bucket_sort::{bucket_sort, BucketSortOutput}; use charabia::{Language, TokenizerBuilder}; use db_cache::DatabaseCache; +pub use distinct::{distinct_fid, distinct_single_docid}; use exact_attribute::ExactAttribute; use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo}; use heed::RoTxn; @@ -47,8 +48,7 @@ use sort::Sort; use self::distinct::facet_string_values; use self::geo_sort::GeoSort; -pub use self::geo_sort::Parameter as GeoSortParameter; -pub use self::geo_sort::Strategy as GeoSortStrategy; +pub use self::geo_sort::{Parameter as GeoSortParameter, Strategy as GeoSortStrategy}; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort;