Compare commits


4 Commits

Author SHA1 Message Date
Louis Dureuil
8f65605845 TMP: remove optimization where later ranking rules are not applied on buckets of a single document 2023-05-30 11:12:28 +02:00
meili-bors[bot]
0a7817a002 Merge #3786
3786: Consistently use wrapping add to avoid overflow in debug when query s… r=dureuill a=dureuill

# Pull Request

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/3785

## What does this PR do?
- Some of the code paths erroneously used the default addition operator, which has "overflow is an error, checked at runtime in debug" semantics, instead of the intended "overflow is expected" semantics that this code relies on (it uses `u16::MAX` as a sentinel). This PR switches these code paths to the wrapping add operator everywhere.
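As a minimal illustration of the two semantics (a standalone sketch, not code from this diff):

```rust
fn main() {
    // The query parser uses `u16::MAX` as a sentinel position.
    let position: u16 = u16::MAX;

    // `wrapping_add` has "overflow is expected" semantics: it wraps around
    // in both debug and release builds.
    assert_eq!(position.wrapping_add(1), 0);

    // The default `+` operator has "overflow is an error" semantics: it
    // panics with "attempt to add with overflow" in debug builds, and only
    // wraps silently in release builds.
    // let next = position + 1; // would panic in a debug build
}
```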

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2023-05-29 12:39:54 +00:00
Louis Dureuil
1dfc4038ab Add test that fails before PR and passes now 2023-05-29 11:58:26 +02:00
Louis Dureuil
73198179f1 Consistently use wrapping add to avoid overflow in debug when query starts with a separator 2023-05-29 11:54:12 +02:00
13 changed files with 159 additions and 289 deletions

Cargo.lock (generated)

@@ -2730,7 +2730,6 @@ dependencies = [
"geoutils",
"grenad",
"heed",
"indexmap",
"insta",
"itertools",
"json-depth-checker",


@@ -14,27 +14,14 @@ default-run = "meilisearch"
[dependencies]
actix-cors = "0.6.4"
actix-http = { version = "3.3.1", default-features = false, features = [
"compress-brotli",
"compress-gzip",
"rustls",
] }
actix-web = { version = "4.3.1", default-features = false, features = [
"macros",
"compress-brotli",
"compress-gzip",
"cookies",
"rustls",
] }
actix-http = { version = "3.3.1", default-features = false, features = ["compress-brotli", "compress-gzip", "rustls"] }
actix-web = { version = "4.3.1", default-features = false, features = ["macros", "compress-brotli", "compress-gzip", "cookies", "rustls"] }
actix-web-static-files = { git = "https://github.com/kilork/actix-web-static-files.git", rev = "2d3b6160", optional = true }
anyhow = { version = "1.0.70", features = ["backtrace"] }
async-stream = "0.3.5"
async-trait = "0.1.68"
bstr = "1.4.0"
byte-unit = { version = "4.0.19", default-features = false, features = [
"std",
"serde",
] }
byte-unit = { version = "4.0.19", default-features = false, features = ["std", "serde"] }
bytes = "1.4.0"
clap = { version = "4.2.1", features = ["derive", "env"] }
crossbeam-channel = "0.5.8"
@@ -69,10 +56,7 @@ prometheus = { version = "0.13.3", features = ["process"] }
rand = "0.8.5"
rayon = "1.7.0"
regex = "1.7.3"
reqwest = { version = "0.11.16", features = [
"rustls-tls",
"json",
], default-features = false }
reqwest = { version = "0.11.16", features = ["rustls-tls", "json"], default-features = false }
rustls = "0.20.8"
rustls-pemfile = "1.0.2"
segment = { version = "0.2.2", optional = true }
@@ -86,12 +70,7 @@ sysinfo = "0.28.4"
tar = "0.4.38"
tempfile = "3.5.0"
thiserror = "1.0.40"
time = { version = "0.3.20", features = [
"serde-well-known",
"formatting",
"parsing",
"macros",
] }
time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] }
tokio = { version = "1.27.0", features = ["full"] }
tokio-stream = "0.1.12"
toml = "0.7.3"
@@ -110,7 +89,7 @@ brotli = "3.3.4"
insta = "1.29.0"
manifest-dir-macros = "0.1.16"
maplit = "1.0.2"
meili-snap = { path = "../meili-snap" }
meili-snap = {path = "../meili-snap"}
temp-env = "0.3.3"
urlencoding = "2.1.2"
yaup = "0.2.1"
@@ -119,10 +98,7 @@ yaup = "0.2.1"
anyhow = { version = "1.0.70", optional = true }
cargo_toml = { version = "0.15.2", optional = true }
hex = { version = "0.4.3", optional = true }
reqwest = { version = "0.11.16", features = [
"blocking",
"rustls-tls",
], default-features = false, optional = true }
reqwest = { version = "0.11.16", features = ["blocking", "rustls-tls"], default-features = false, optional = true }
sha-1 = { version = "0.10.1", optional = true }
static-files = { version = "0.2.3", optional = true }
tempfile = { version = "3.5.0", optional = true }
@@ -132,17 +108,7 @@ zip = { version = "0.6.4", optional = true }
[features]
default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
analytics = ["segment"]
mini-dashboard = [
"actix-web-static-files",
"static-files",
"anyhow",
"cargo_toml",
"hex",
"reqwest",
"sha-1",
"tempfile",
"zip",
]
mini-dashboard = ["actix-web-static-files", "static-files", "anyhow", "cargo_toml", "hex", "reqwest", "sha-1", "tempfile", "zip"]
chinese = ["meilisearch-types/chinese"]
hebrew = ["meilisearch-types/hebrew"]
japanese = ["meilisearch-types/japanese"]


@@ -16,9 +16,9 @@ use crate::extractors::authentication::policies::*;
use crate::extractors::authentication::GuardedData;
use crate::extractors::sequential_extractor::SeqHandler;
use crate::search::{
add_search_rules, perform_search, FacetValuesSort, MatchingStrategy, SearchQuery,
DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
add_search_rules, perform_search, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH,
DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
};
pub fn configure(cfg: &mut web::ServiceConfig) {
@@ -58,8 +58,6 @@ pub struct SearchQueryGet {
show_matches_position: Param<bool>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchFacets>)]
facets: Option<CS<String>>,
#[deserr(default, error = DeserrQueryParamError<InvalidSearchFacets>)] // TODO
sort_facet_values_by: Option<FacetValuesSort>,
#[deserr( default = DEFAULT_HIGHLIGHT_PRE_TAG(), error = DeserrQueryParamError<InvalidSearchHighlightPreTag>)]
highlight_pre_tag: String,
#[deserr( default = DEFAULT_HIGHLIGHT_POST_TAG(), error = DeserrQueryParamError<InvalidSearchHighlightPostTag>)]
@@ -94,7 +92,6 @@ impl From<SearchQueryGet> for SearchQuery {
sort: other.sort.map(|attr| fix_sort_query_parameters(&attr)),
show_matches_position: other.show_matches_position.0,
facets: other.facets.map(|o| o.into_iter().collect()),
sort_facet_values_by: other.sort_facet_values_by,
highlight_pre_tag: other.highlight_pre_tag,
highlight_post_tag: other.highlight_post_tag,
crop_marker: other.crop_marker,


@@ -5,12 +5,10 @@ use std::time::Instant;
use deserr::Deserr;
use either::Either;
use indexmap::IndexMap;
use meilisearch_auth::IndexSearchRules;
use meilisearch_types::deserr::DeserrJsonError;
use meilisearch_types::error::deserr_codes::*;
use meilisearch_types::index_uid::IndexUid;
use meilisearch_types::milli::OrderBy;
use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
use meilisearch_types::{milli, Document};
use milli::tokenizer::TokenizerBuilder;
@@ -62,8 +60,6 @@ pub struct SearchQuery {
pub sort: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)]
pub facets: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)] // TODO
pub sort_facet_values_by: Option<FacetValuesSort>,
#[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())]
pub highlight_pre_tag: String,
#[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPostTag>, default = DEFAULT_HIGHLIGHT_POST_TAG())]
@@ -115,8 +111,6 @@ pub struct SearchQueryWithIndex {
pub sort: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)]
pub facets: Option<Vec<String>>,
#[deserr(default, error = DeserrJsonError<InvalidSearchFacets>)] // TODO
pub sort_facet_values_by: Option<FacetValuesSort>,
#[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPreTag>, default = DEFAULT_HIGHLIGHT_PRE_TAG())]
pub highlight_pre_tag: String,
#[deserr(default, error = DeserrJsonError<InvalidSearchHighlightPostTag>, default = DEFAULT_HIGHLIGHT_POST_TAG())]
@@ -144,7 +138,6 @@ impl SearchQueryWithIndex {
filter,
sort,
facets,
sort_facet_values_by,
highlight_pre_tag,
highlight_post_tag,
crop_marker,
@@ -166,7 +159,6 @@ impl SearchQueryWithIndex {
filter,
sort,
facets,
sort_facet_values_by,
highlight_pre_tag,
highlight_post_tag,
crop_marker,
@@ -202,26 +194,6 @@ impl From<MatchingStrategy> for TermsMatchingStrategy {
}
}
#[derive(Debug, Default, Clone, PartialEq, Eq, Deserr)]
#[deserr(rename_all = camelCase)]
pub enum FacetValuesSort {
/// Facet values are sorted in alphabetical order, ascending from A to Z.
#[default]
Alpha,
/// Facet values are sorted by decreasing count.
/// The count is the number of records containing this facet value in the results of the query.
Count,
}
impl From<FacetValuesSort> for OrderBy {
fn from(val: FacetValuesSort) -> Self {
match val {
FacetValuesSort::Alpha => OrderBy::Lexicographic,
FacetValuesSort::Count => OrderBy::Count,
}
}
}
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct SearchHit {
#[serde(flatten)]
@@ -241,7 +213,7 @@ pub struct SearchResult {
#[serde(flatten)]
pub hits_info: HitsInfo,
#[serde(skip_serializing_if = "Option::is_none")]
pub facet_distribution: Option<BTreeMap<String, IndexMap<String, u64>>>,
pub facet_distribution: Option<BTreeMap<String, BTreeMap<String, u64>>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub facet_stats: Option<BTreeMap<String, FacetStats>>,
}
@@ -479,10 +451,7 @@ pub fn perform_search(
if fields.iter().all(|f| f != "*") {
facet_distribution.facets(fields);
}
let distribution = facet_distribution
.candidates(candidates)
.order_by(query.sort_facet_values_by.map_or_else(Default::default, Into::into))
.execute()?;
let distribution = facet_distribution.candidates(candidates).execute()?;
let stats = facet_distribution.compute_stats()?;
(Some(distribution), Some(stats))
}


@@ -32,7 +32,6 @@ heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.6", default-f
"lmdb",
"sync-read-txn",
] }
indexmap = { version = "1.9.3", features = ["serde"] }
json-depth-checker = { path = "../json-depth-checker" }
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
memmap2 = "0.5.10"


@@ -99,8 +99,8 @@ pub use self::heed_codec::{
};
pub use self::index::Index;
pub use self::search::{
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, OrderBy,
Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search,
SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
};
pub type Result<T> = std::result::Result<T, error::Error>;


@@ -4,18 +4,16 @@ use std::{fmt, mem};
use heed::types::ByteSlice;
use heed::BytesDecode;
use indexmap::IndexMap;
use roaring::RoaringBitmap;
use crate::error::UserError;
use crate::facet::FacetType;
use crate::heed_codec::facet::{
FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec,
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
OrderedF64Codec,
};
use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec};
use crate::search::facet::facet_distribution_iter::{
count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution,
};
use crate::search::facet::facet_distribution_iter;
use crate::{FieldId, Index, Result};
/// The default number of values by facets that will
@@ -26,21 +24,10 @@ pub const DEFAULT_VALUES_PER_FACET: usize = 100;
/// the system to choose between one algorithm or another.
const CANDIDATES_THRESHOLD: u64 = 3000;
/// How should we fetch the facets?
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum OrderBy {
/// By lexicographic order...
#[default]
Lexicographic,
/// Or by number of docids in common?
Count,
}
pub struct FacetDistribution<'a> {
facets: Option<HashSet<String>>,
candidates: Option<RoaringBitmap>,
max_values_per_facet: usize,
order_by: OrderBy,
rtxn: &'a heed::RoTxn<'a>,
index: &'a Index,
}
@@ -51,7 +38,6 @@ impl<'a> FacetDistribution<'a> {
facets: None,
candidates: None,
max_values_per_facet: DEFAULT_VALUES_PER_FACET,
order_by: OrderBy::default(),
rtxn,
index,
}
@@ -67,11 +53,6 @@ impl<'a> FacetDistribution<'a> {
self
}
pub fn order_by(&mut self, order_by: OrderBy) -> &mut Self {
self.order_by = order_by;
self
}
pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self {
self.candidates = Some(candidates);
self
@@ -84,7 +65,7 @@ impl<'a> FacetDistribution<'a> {
field_id: FieldId,
facet_type: FacetType,
candidates: &RoaringBitmap,
distribution: &mut IndexMap<String, u64>,
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()> {
match facet_type {
FacetType::Number => {
@@ -153,15 +134,9 @@ impl<'a> FacetDistribution<'a> {
&self,
field_id: FieldId,
candidates: &RoaringBitmap,
order_by: OrderBy,
distribution: &mut IndexMap<String, u64>,
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()> {
let search_function = match order_by {
OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
OrderBy::Count => count_iterate_over_facet_distribution,
};
search_function(
facet_distribution_iter::iterate_over_facet_distribution(
self.rtxn,
self.index
.facet_id_f64_docids
@@ -184,15 +159,9 @@ impl<'a> FacetDistribution<'a> {
&self,
field_id: FieldId,
candidates: &RoaringBitmap,
order_by: OrderBy,
distribution: &mut IndexMap<String, u64>,
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()> {
let search_function = match order_by {
OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
OrderBy::Count => count_iterate_over_facet_distribution,
};
search_function(
facet_distribution_iter::iterate_over_facet_distribution(
self.rtxn,
self.index
.facet_id_string_docids
@@ -220,46 +189,95 @@ impl<'a> FacetDistribution<'a> {
)
}
fn facet_values(&self, field_id: FieldId) -> heed::Result<IndexMap<String, u64>> {
use FacetType::{Number, String};
/// Placeholder search, a.k.a. no candidates were specified. We iterate through the
/// facet values one by one and iterate on the facet level 0 for numbers.
fn facet_values_from_raw_facet_database(
&self,
field_id: FieldId,
) -> heed::Result<BTreeMap<String, u64>> {
let mut distribution = BTreeMap::new();
let mut distribution = IndexMap::new();
match (self.order_by, &self.candidates) {
(OrderBy::Lexicographic, Some(cnd)) if cnd.len() <= CANDIDATES_THRESHOLD => {
// Classic search, candidates were specified, we must return facet values only related
// to those candidates. We also enter here for facet strings for performance reasons.
self.facet_distribution_from_documents(field_id, Number, cnd, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, cnd, &mut distribution)?;
}
_ => {
let universe;
let candidates;
match &self.candidates {
Some(cnd) => candidates = cnd,
None => {
universe = self.index.documents_ids(self.rtxn)?;
candidates = &universe;
}
}
let db = self.index.facet_id_f64_docids;
let mut prefix = vec![];
prefix.extend_from_slice(&field_id.to_be_bytes());
prefix.push(0); // read values from level 0 only
self.facet_numbers_distribution_from_facet_levels(
field_id,
candidates,
self.order_by,
&mut distribution,
)?;
self.facet_strings_distribution_from_facet_levels(
field_id,
candidates,
self.order_by,
&mut distribution,
)?;
let iter = db
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
.remap_types::<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>();
for result in iter {
let (key, value) = result?;
distribution.insert(key.left_bound.to_string(), value.bitmap.len());
if distribution.len() == self.max_values_per_facet {
break;
}
};
}
let iter = self
.index
.facet_id_string_docids
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
.remap_types::<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>();
for result in iter {
let (key, value) = result?;
let docid = value.bitmap.iter().next().unwrap();
let key: (FieldId, _, &'a str) = (field_id, docid, key.left_bound);
let original_string =
self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned();
distribution.insert(original_string, value.bitmap.len());
if distribution.len() == self.max_values_per_facet {
break;
}
}
Ok(distribution)
}
fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> {
use FacetType::{Number, String};
match self.candidates {
Some(ref candidates) => {
// Classic search, candidates were specified, we must return facet values only related
// to those candidates. We also enter here for facet strings for performance reasons.
let mut distribution = BTreeMap::new();
if candidates.len() <= CANDIDATES_THRESHOLD {
self.facet_distribution_from_documents(
field_id,
Number,
candidates,
&mut distribution,
)?;
self.facet_distribution_from_documents(
field_id,
String,
candidates,
&mut distribution,
)?;
} else {
self.facet_numbers_distribution_from_facet_levels(
field_id,
candidates,
&mut distribution,
)?;
self.facet_strings_distribution_from_facet_levels(
field_id,
candidates,
&mut distribution,
)?;
}
Ok(distribution)
}
None => self.facet_values_from_raw_facet_database(field_id),
}
}
pub fn compute_stats(&self) -> Result<BTreeMap<String, (f64, f64)>> {
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let filterable_fields = self.index.filterable_fields(self.rtxn)?;
@@ -319,7 +337,7 @@ impl<'a> FacetDistribution<'a> {
Ok(distribution)
}
pub fn execute(&self) -> Result<BTreeMap<String, IndexMap<String, u64>>> {
pub fn execute(&self) -> Result<BTreeMap<String, BTreeMap<String, u64>>> {
let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
let filterable_fields = self.index.filterable_fields(self.rtxn)?;
@@ -356,20 +374,13 @@ impl<'a> FacetDistribution<'a> {
impl fmt::Debug for FacetDistribution<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let FacetDistribution {
facets,
candidates,
max_values_per_facet,
order_by,
rtxn: _,
index: _,
} = self;
let FacetDistribution { facets, candidates, max_values_per_facet, rtxn: _, index: _ } =
self;
f.debug_struct("FacetDistribution")
.field("facets", facets)
.field("candidates", candidates)
.field("max_values_per_facet", max_values_per_facet)
.field("order_by", order_by)
.finish()
}
}


@@ -1,5 +1,3 @@
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::ops::ControlFlow;
use heed::Result;
@@ -21,7 +19,7 @@ use crate::DocumentId;
///
/// The return value of the closure is a `ControlFlow<()>` which indicates whether we should
/// keep iterating over the different facet values or stop.
pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
pub fn iterate_over_facet_distribution<'t, CB>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
@@ -31,7 +29,7 @@ pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
let mut fd = FacetDistribution { rtxn, db, field_id, callback };
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
@@ -46,99 +44,7 @@ where
}
}
pub fn count_iterate_over_facet_distribution<'t, CB>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: &RoaringBitmap,
mut callback: CB,
) -> Result<()>
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
struct LevelEntry<'t> {
/// The number of candidates in this entry.
count: u64,
/// The key level of the entry.
level: Reverse<u8>,
/// The left bound key.
left_bound: &'t [u8],
/// The number of keys we must look for after `left_bound`.
group_size: u8,
/// Any docid in the set of matching documents. Used to find the original facet string.
any_docid: u32,
}
// Represents the list of keys that we must explore.
let mut heap = BinaryHeap::new();
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
field_id,
)?;
if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? {
// We first fill the heap with values from the highest level
let starting_key =
FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
for el in db.range(rtxn, &(&starting_key..)).unwrap().take(usize::MAX) {
let (key, value) = el.unwrap();
// The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id
if key.field_id != field_id {
break;
}
let intersection = value.bitmap & candidates;
let count = intersection.len();
if count != 0 {
heap.push(LevelEntry {
count,
level: Reverse(key.level),
left_bound: key.left_bound,
group_size: value.size,
any_docid: intersection.min().unwrap(),
});
}
}
while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop()
{
if let Reverse(0) = level {
match (callback)(left_bound, count, any_docid)? {
ControlFlow::Continue(_) => (),
ControlFlow::Break(_) => return Ok(()),
}
} else {
let starting_key = FacetGroupKey { field_id, level: level.0 - 1, left_bound };
for el in db.range(rtxn, &(&starting_key..)).unwrap().take(group_size as usize) {
let (key, value) = el.unwrap();
// The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id
if key.field_id != field_id {
break;
}
let intersection = value.bitmap & candidates;
let count = intersection.len();
if count != 0 {
heap.push(LevelEntry {
count,
level: Reverse(key.level),
left_bound: key.left_bound,
group_size: value.size,
any_docid: intersection.min().unwrap(),
});
}
}
}
}
}
Ok(())
}
/// Iterate over the facets values by lexicographic order.
struct LexicographicFacetDistribution<'t, CB>
struct FacetDistribution<'t, CB>
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
@@ -148,7 +54,7 @@ where
callback: CB,
}
impl<'t, CB> LexicographicFacetDistribution<'t, CB>
impl<'t, CB> FacetDistribution<'t, CB>
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
@@ -180,7 +86,6 @@ where
}
Ok(ControlFlow::Continue(()))
}
fn iterate(
&mut self,
candidates: &RoaringBitmap,
@@ -211,7 +116,7 @@ where
value.size as usize,
)?;
match cf {
ControlFlow::Continue(_) => (),
ControlFlow::Continue(_) => {}
ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
}
}
@@ -227,7 +132,7 @@ mod tests {
use heed::BytesDecode;
use roaring::RoaringBitmap;
use super::lexicographically_iterate_over_facet_distribution;
use super::iterate_over_facet_distribution;
use crate::heed_codec::facet::OrderedF64Codec;
use crate::milli_snap;
use crate::search::facet::tests::{get_random_looking_index, get_simple_index};
@@ -239,7 +144,7 @@ mod tests {
let txn = index.env.read_txn().unwrap();
let candidates = (0..=255).collect::<RoaringBitmap>();
let mut results = String::new();
lexicographically_iterate_over_facet_distribution(
iterate_over_facet_distribution(
&txn,
index.content,
0,
@@ -256,7 +161,6 @@ mod tests {
txn.commit().unwrap();
}
}
#[test]
fn filter_distribution_all_stop_early() {
let indexes = [get_simple_index(), get_random_looking_index()];
@@ -265,7 +169,7 @@ mod tests {
let candidates = (0..=255).collect::<RoaringBitmap>();
let mut results = String::new();
let mut nbr_facets = 0;
lexicographically_iterate_over_facet_distribution(
iterate_over_facet_distribution(
&txn,
index.content,
0,

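Aside: the removed `count_iterate_over_facet_distribution` above is a best-first search over the facet tree, using a max-heap keyed by document count so the most frequent values surface first without scanning every leaf. Below is a minimal standalone sketch of that idea, with a hypothetical simplified tree type standing in for milli's LMDB facet levels:

```rust
use std::collections::BinaryHeap;

// Hypothetical, simplified facet tree: every node carries the total number
// of documents below it, mirroring how each facet-level group's bitmap
// bounds the counts of all entries beneath it.
enum Node {
    Leaf { value: &'static str, count: u64 },
    Group { count: u64, children: Vec<Node> },
}

impl Node {
    fn count(&self) -> u64 {
        match self {
            Node::Leaf { count, .. } | Node::Group { count, .. } => *count,
        }
    }
}

// Best-first traversal: always expand the pending entry with the highest
// count. Because a group's count is >= each child's count, leaves pop off
// the heap in decreasing count order, so we can stop after `limit` values.
fn top_facet_values(root: Node, limit: usize) -> Vec<(&'static str, u64)> {
    let mut arena = vec![root];
    let mut heap = BinaryHeap::new(); // max-heap of (count, arena index)
    heap.push((arena[0].count(), 0usize));
    let mut out = Vec::new();
    while out.len() < limit {
        let Some((count, idx)) = heap.pop() else { break };
        // Take the node out of the arena, leaving an empty placeholder.
        let node =
            std::mem::replace(&mut arena[idx], Node::Group { count: 0, children: Vec::new() });
        match node {
            Node::Leaf { value, .. } => out.push((value, count)),
            Node::Group { children, .. } => {
                for child in children {
                    let c = child.count();
                    arena.push(child);
                    heap.push((c, arena.len() - 1));
                }
            }
        }
    }
    out
}

fn main() {
    let tree = Node::Group {
        count: 9,
        children: vec![
            Node::Group {
                count: 7,
                children: vec![
                    Node::Leaf { value: "blue", count: 4 },
                    Node::Leaf { value: "red", count: 3 },
                ],
            },
            Node::Leaf { value: "green", count: 2 },
        ],
    };
    // The two most frequent facet values, found without a full scan.
    assert_eq!(top_facet_values(tree, 2), vec![("blue", 4), ("red", 3)]);
}
```

The real implementation additionally intersects each group's bitmap with the candidate set before pushing, so counts reflect only matching documents.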

@@ -4,7 +4,7 @@ use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesDecode, RoTxn};
use roaring::RoaringBitmap;
pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
pub use self::filter::{BadGeoError, Filter};
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec};
use crate::heed_codec::ByteSliceRefCodec;


@@ -4,7 +4,7 @@ use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
use self::new::PartialSearchResult;
use crate::{


@@ -116,16 +116,15 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
}
while valid_docids.len() < length {
// The universe for this bucket is zero or one element, so we don't need to sort
// anything, just extend the results and go back to the parent ranking rule.
if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 {
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
maybe_add_to_results!(bucket);
// The universe for this bucket is zero element, so we don't need to sort
// anything, just go back to the parent ranking rule.
if ranking_rule_universes[cur_ranking_rule_index].is_empty() {
back!();
continue;
}
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? else {
let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])?
else {
back!();
continue;
};


@@ -181,9 +181,6 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
logger: &mut dyn SearchLogger<QueryGraph>,
universe: &RoaringBitmap,
) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
// If universe.len() <= 1, the bucket sort algorithm
// should not have called this function.
assert!(universe.len() > 1);
// Will crash if `next_bucket` is called before `start_iteration` or after `end_iteration`,
// should never happen
let mut state = self.state.take().unwrap();


@@ -77,13 +77,9 @@ pub fn located_query_terms_from_tokens(
}
}
TokenKind::Separator(separator_kind) => {
match separator_kind {
SeparatorKind::Hard => {
position += 1;
}
SeparatorKind::Soft => {
position += 0;
}
// add penalty for hard separators
if let SeparatorKind::Hard = separator_kind {
position = position.wrapping_add(1);
}
phrase = 'phrase: {
@@ -288,3 +284,36 @@ impl PhraseBuilder {
})
}
}
#[cfg(test)]
mod tests {
use charabia::TokenizerBuilder;
use super::*;
use crate::index::tests::TempIndex;
fn temp_index_with_documents() -> TempIndex {
let temp_index = TempIndex::new();
temp_index
.add_documents(documents!([
{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
{ "id": 2, "name": "Westfália" },
{ "id": 3, "name": "Ŵôřlḑôle" },
]))
.unwrap();
temp_index
}
#[test]
fn start_with_hard_separator() -> Result<()> {
let tokenizer = TokenizerBuilder::new().build();
let tokens = tokenizer.tokenize(".");
let index = temp_index_with_documents();
let rtxn = index.read_txn()?;
let mut ctx = SearchContext::new(&index, &rtxn);
// panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
let located_query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None)?;
assert!(located_query_terms.is_empty());
Ok(())
}
}