Merge branch 'main' into fragment-filters

Mubelotix committed 2025-08-13 17:27:09 +02:00
80 changed files with 2723 additions and 256 deletions

View File

@@ -1,3 +1,4 @@
+use std::borrow::Cow;
use std::collections::BTreeSet;
use std::fmt::{Debug, Display};
use std::ops::Bound::{self, Excluded, Included, Unbounded};
@@ -14,10 +15,9 @@ use super::facet_range_search;
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
use crate::error::{Error, UserError};
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
-use crate::heed_codec::facet::{
-    FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
-};
+use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::index::db_name::FACET_ID_STRING_DOCIDS;
use crate::search::facet::facet_range_search::find_docids_of_facet_within_bounds;
use crate::{
    distance_between_two_points, lat_lng_to_xyz, FieldId, FieldsIdsMap,
    FilterableAttributesFeatures, FilterableAttributesRule, Index, InternalError, Result,
@@ -422,20 +422,56 @@ impl<'a> Filter<'a> {
                    return Ok(docids);
                }
                Condition::StartsWith { keyword: _, word } => {
+                    // The idea here is that "STARTS WITH baba" is the same as "baba <= value < babb".
+                    // We just increment the last byte to find the upper bound.
+                    // The upper bound may not be valid UTF-8, but LMDB doesn't care as it works over bytes.
                    let value = crate::normalize_facet(word.value());
-                    let base = FacetGroupKey { field_id, level: 0, left_bound: value.as_str() };
-                    let docids = strings_db
-                        .prefix_iter(rtxn, &base)?
-                        .map(|result| -> Result<RoaringBitmap> {
-                            match result {
-                                Ok((_facet_group_key, FacetGroupValue { bitmap, .. })) => Ok(bitmap),
-                                Err(_e) => Err(InternalError::from(SerializationError::Decoding {
-                                    db_name: Some(FACET_ID_STRING_DOCIDS),
-                                })
-                                .into()),
-                            }
-                        })
-                        .union()?;
+                    let mut value2 = value.as_bytes().to_owned();
+                    let last = match value2.last_mut() {
+                        Some(last) => last,
+                        None => {
+                            // The prefix is empty, so all documents that have the field will match.
+                            return index
+                                .exists_faceted_documents_ids(rtxn, field_id)
+                                .map_err(|e| e.into());
+                        }
+                    };
+                    if *last == u8::MAX {
+                        // u8::MAX is a forbidden UTF-8 byte, so it should never reach Meilisearch
+                        // through a filter; just in case, return an empty result.
+                        tracing::warn!(
+                            "Found a non-UTF-8 character in a filter. That shouldn't be possible"
+                        );
+                        return Ok(RoaringBitmap::new());
+                    }
+                    *last += 1;
+
+                    // This is very similar to `heed::Bytes`, but its `EItem` is `&[u8]` instead of `[u8]`.
+                    struct BytesRef;
+                    impl<'a> BytesEncode<'a> for BytesRef {
+                        type EItem = &'a [u8];
+
+                        fn bytes_encode(
+                            item: &'a Self::EItem,
+                        ) -> std::result::Result<Cow<'a, [u8]>, heed::BoxedError> {
+                            Ok(Cow::Borrowed(item))
+                        }
+                    }
+
+                    let mut docids = RoaringBitmap::new();
+                    let bytes_db =
+                        index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRef>>();
+                    find_docids_of_facet_within_bounds::<BytesRef>(
+                        rtxn,
+                        bytes_db,
+                        field_id,
+                        &Included(value.as_bytes()),
+                        &Excluded(value2.as_slice()),
+                        universe,
+                        &mut docids,
+                    )?;
                    return Ok(docids);
                }
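The heart of this change is the bound computation: a STARTS WITH prefix becomes the half-open byte range [prefix, prefix-with-last-byte-incremented). Here is a minimal, self-contained sketch of that trick; the helper name `prefix_to_bounds` is invented for illustration and is not part of milli:

```rust
use std::ops::Bound::{self, Excluded, Included};

/// Hypothetical helper mirroring the bound computation above.
/// Returns `None` when the prefix is empty (every value matches)
/// or when the last byte is already u8::MAX (cannot be incremented).
fn prefix_to_bounds(prefix: &str) -> Option<(Bound<Vec<u8>>, Bound<Vec<u8>>)> {
    let lower = prefix.as_bytes().to_owned();
    let mut upper = lower.clone();
    let last = upper.last_mut()?;
    if *last == u8::MAX {
        return None;
    }
    *last += 1; // "baba" -> "babb"
    Some((Included(lower), Excluded(upper)))
}

fn main() {
    let (lower, upper) = prefix_to_bounds("baba").unwrap();
    assert_eq!(lower, Included(b"baba".to_vec()));
    assert_eq!(upper, Excluded(b"babb".to_vec()));
    // Bytewise, every string starting with "baba" falls in [lower, upper).
    assert!(b"baba".as_slice() < b"babb".as_slice());
    assert!(b"babaorum".as_slice() < b"babb".as_slice());
}
```

With the prefix expressed as a range, the filter can reuse the same facet range-search machinery as numeric bounds (`find_docids_of_facet_within_bounds`), instead of unioning every prefix-matching bitmap one by one as the removed `prefix_iter` code did.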

View File

@@ -7,7 +7,7 @@ use roaring::RoaringBitmap;
use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
use crate::search::new::{distinct_fid, distinct_single_docid};
use crate::search::SemanticSearch;
-use crate::vector::SearchQuery;
+use crate::vector::{Embedding, SearchQuery};
use crate::{Index, MatchingWords, Result, Search, SearchResult};
struct ScoreWithRatioResult {
@@ -16,6 +16,7 @@ struct ScoreWithRatioResult {
    document_scores: Vec<(u32, ScoreWithRatio)>,
    degraded: bool,
    used_negative_operator: bool,
+    query_vector: Option<Embedding>,
}
type ScoreWithRatio = (Vec<ScoreDetails>, f32);
@@ -85,6 +86,7 @@ impl ScoreWithRatioResult {
            document_scores,
            degraded: results.degraded,
            used_negative_operator: results.used_negative_operator,
+            query_vector: results.query_vector,
        }
    }
@@ -186,6 +188,7 @@ impl ScoreWithRatioResult {
                degraded: vector_results.degraded | keyword_results.degraded,
                used_negative_operator: vector_results.used_negative_operator
                    | keyword_results.used_negative_operator,
+                query_vector: vector_results.query_vector,
            },
            semantic_hit_count,
        ))
@@ -209,6 +212,7 @@ impl Search<'_> {
            terms_matching_strategy: self.terms_matching_strategy,
            scoring_strategy: ScoringStrategy::Detailed,
            words_limit: self.words_limit,
+            retrieve_vectors: self.retrieve_vectors,
            exhaustive_number_hits: self.exhaustive_number_hits,
            max_total_hits: self.max_total_hits,
            rtxn: self.rtxn,
@@ -265,7 +269,7 @@ impl Search<'_> {
        };
        search.semantic = Some(SemanticSearch {
-            vector: Some(vector_query),
+            vector: Some(vector_query.clone()),
            embedder_name,
            embedder,
            quantized,
@@ -322,6 +326,7 @@ fn return_keyword_results(
        mut document_scores,
        degraded,
        used_negative_operator,
+        query_vector,
    }: SearchResult,
) -> (SearchResult, Option<u32>) {
    let (documents_ids, document_scores) = if offset >= documents_ids.len() ||
@@ -348,6 +353,7 @@ fn return_keyword_results(
            document_scores,
            degraded,
            used_negative_operator,
+            query_vector,
        },
        Some(0),
    )
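The intent of these scattered hunks is easy to lose: the hybrid pipeline now carries the query's embedding alongside the results. A minimal sketch of the propagation pattern, with simplified stand-in types rather than the actual milli structs: the vector arm records the embedding it searched with, and every merge or conversion copies the field along instead of recomputing it.

```rust
type Embedding = Vec<f32>;

#[derive(Debug, Default)]
struct SearchResult {
    degraded: bool,
    used_negative_operator: bool,
    // Newly threaded through: the embedding the vector search ran with,
    // filled only when the caller asked to retrieve vectors.
    query_vector: Option<Embedding>,
}

// Merging keeps the vector side's query_vector, mirroring the hunks above.
fn merge(vector: SearchResult, keyword: SearchResult) -> SearchResult {
    SearchResult {
        degraded: vector.degraded | keyword.degraded,
        used_negative_operator: vector.used_negative_operator
            | keyword.used_negative_operator,
        query_vector: vector.query_vector,
    }
}

fn main() {
    let vector = SearchResult { query_vector: Some(vec![0.1, 0.2]), ..Default::default() };
    let keyword = SearchResult::default();
    assert_eq!(merge(vector, keyword).query_vector, Some(vec![0.1, 0.2]));
}
```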

View File

@@ -52,6 +52,7 @@ pub struct Search<'a> {
    terms_matching_strategy: TermsMatchingStrategy,
    scoring_strategy: ScoringStrategy,
    words_limit: usize,
+    retrieve_vectors: bool,
    exhaustive_number_hits: bool,
    max_total_hits: Option<usize>,
    rtxn: &'a heed::RoTxn<'a>,
@@ -75,6 +76,7 @@ impl<'a> Search<'a> {
            geo_param: GeoSortParameter::default(),
            terms_matching_strategy: TermsMatchingStrategy::default(),
            scoring_strategy: Default::default(),
+            retrieve_vectors: false,
            exhaustive_number_hits: false,
            max_total_hits: None,
            words_limit: 10,
@@ -161,6 +163,11 @@ impl<'a> Search<'a> {
        self
    }

+    pub fn retrieve_vectors(&mut self, retrieve_vectors: bool) -> &mut Search<'a> {
+        self.retrieve_vectors = retrieve_vectors;
+        self
+    }

    /// Forces the search to exhaustively compute the number of candidates;
    /// this will increase the search time but allows finite pagination.
    pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> {
@@ -233,6 +240,7 @@ impl<'a> Search<'a> {
        }

        let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;
+        let mut query_vector = None;
        let PartialSearchResult {
            located_query_terms,
            candidates,
@@ -247,24 +255,29 @@ impl<'a> Search<'a> {
                embedder,
                quantized,
                media: _,
-            }) => execute_vector_search(
-                &mut ctx,
-                vector,
-                self.scoring_strategy,
-                self.exhaustive_number_hits,
-                self.max_total_hits,
-                universe,
-                &self.sort_criteria,
-                &self.distinct,
-                self.geo_param,
-                self.offset,
-                self.limit,
-                embedder_name,
-                embedder,
-                *quantized,
-                self.time_budget.clone(),
-                self.ranking_score_threshold,
-            )?,
+            }) => {
+                if self.retrieve_vectors {
+                    query_vector = Some(vector.clone());
+                }
+                execute_vector_search(
+                    &mut ctx,
+                    vector,
+                    self.scoring_strategy,
+                    self.exhaustive_number_hits,
+                    self.max_total_hits,
+                    universe,
+                    &self.sort_criteria,
+                    &self.distinct,
+                    self.geo_param,
+                    self.offset,
+                    self.limit,
+                    embedder_name,
+                    embedder,
+                    *quantized,
+                    self.time_budget.clone(),
+                    self.ranking_score_threshold,
+                )?
+            }
            _ => execute_search(
                &mut ctx,
                self.query.as_deref(),
@@ -306,6 +319,7 @@ impl<'a> Search<'a> {
            documents_ids,
            degraded,
            used_negative_operator,
+            query_vector,
        })
    }
}
@@ -324,6 +338,7 @@ impl fmt::Debug for Search<'_> {
            terms_matching_strategy,
            scoring_strategy,
            words_limit,
+            retrieve_vectors,
            exhaustive_number_hits,
            max_total_hits,
            rtxn: _,
@@ -344,6 +359,7 @@ impl fmt::Debug for Search<'_> {
            .field("searchable_attributes", searchable_attributes)
            .field("terms_matching_strategy", terms_matching_strategy)
            .field("scoring_strategy", scoring_strategy)
+            .field("retrieve_vectors", retrieve_vectors)
            .field("exhaustive_number_hits", exhaustive_number_hits)
            .field("max_total_hits", max_total_hits)
            .field("words_limit", words_limit)
@@ -366,6 +382,7 @@ pub struct SearchResult {
    pub document_scores: Vec<Vec<ScoreDetails>>,
    pub degraded: bool,
    pub used_negative_operator: bool,
+    pub query_vector: Option<Embedding>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
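Putting the new API together, a hedged usage sketch (not a complete program: `index` and `rtxn` are assumed to exist with an embedder configured, as in milli's search tests; error handling is elided):

```rust
let mut search = Search::new(&rtxn, &index);
search.query("hello");
search.retrieve_vectors(true); // opt in to receiving the query's embedding
let result = search.execute().unwrap();
// Per this diff, query_vector is only filled for semantic/vector queries,
// so keyword-only searches still yield None here.
if let Some(query_vector) = result.query_vector {
    println!("query embedded into {} dimensions", query_vector.len());
}
```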

View File

@@ -17,7 +17,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
    let path = tempfile::tempdir().unwrap();
    let options = EnvOpenOptions::new();
    let mut options = options.read_txn_without_tls();
-    options.map_size(10 * 1024 * 1024); // 10 MB
+    options.map_size(10 * 1024 * 1024); // 10 MiB
    let index = Index::new(options, &path, true).unwrap();
    let mut wtxn = index.write_txn().unwrap();

View File

@@ -130,6 +130,7 @@ impl<'a> Similar<'a> {
            document_scores,
            degraded: false,
            used_negative_operator: false,
+            query_vector: None,
        })
    }
}
}

View File

@@ -1097,6 +1097,7 @@ fn bug_3021_fourth() {
        mut documents_ids,
        degraded: _,
        used_negative_operator: _,
+        query_vector: _,
    } = search.execute().unwrap();
    let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
    documents_ids.sort_unstable();

View File

@@ -8,6 +8,7 @@ use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13};
use v1_14::Latest_V1_13_To_Latest_V1_14;
use v1_15::Latest_V1_14_To_Latest_V1_15;
+use v1_16::Latest_V1_16_To_V1_17_0;

use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::progress::{Progress, VariableNameStep};
@@ -34,6 +35,7 @@ const UPGRADE_FUNCTIONS: &[&dyn UpgradeIndex] = &[
    &Latest_V1_13_To_Latest_V1_14 {},
    &Latest_V1_14_To_Latest_V1_15 {},
    &Latest_V1_15_To_V1_16_0 {},
+    &Latest_V1_16_To_V1_17_0 {},
    // This is the last upgrade function; it will be called when the index is up to date.
    // Any other upgrade function should be added before this one.
    &ToCurrentNoOp {},
@@ -62,6 +64,7 @@ const fn start(from: (u32, u32, u32)) -> Option<usize> {
        // We must handle the current version in the match because, in case of a failure, some indexes may have been upgraded but not others.
        (1, 15, _) => function_index!(6),
        (1, 16, _) => function_index!(7),
+        (1, 17, _) => function_index!(8),
        // We deliberately don't add a placeholder with (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) here to force manually
        // considering dumpless upgrade.
        (_major, _minor, _patch) => return None,
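For context, UPGRADE_FUNCTIONS is a linear chain and `start` maps the on-disk version to the first step that still has to run; everything from that index onward executes in order. A simplified sketch of that dispatch pattern (hypothetical trait and runner invented for illustration, not milli's actual code):

```rust
// Simplified sketch of the dumpless-upgrade dispatch pattern used above.
trait UpgradeStep {
    fn target_version(&self) -> (u32, u32, u32);
    fn upgrade(&self) -> Result<(), String>;
}

// Hypothetical runner: given the starting index computed from the on-disk
// version, run every remaining step in order and report the final version.
fn run_chain(steps: &[&dyn UpgradeStep], start: usize) -> Result<(u32, u32, u32), String> {
    let mut version = (0, 0, 0);
    for step in &steps[start..] {
        step.upgrade()?;
        version = step.target_version();
    }
    Ok(version)
}

struct NoOp(u32, u32, u32);
impl UpgradeStep for NoOp {
    fn target_version(&self) -> (u32, u32, u32) { (self.0, self.1, self.2) }
    fn upgrade(&self) -> Result<(), String> { Ok(()) }
}

fn main() {
    let steps: [&dyn UpgradeStep; 2] = [&NoOp(1, 16, 0), &NoOp(1, 17, 0)];
    // An index already on 1.16 starts at index 1 and only runs the 1.17 step.
    assert_eq!(run_chain(&steps, 1), Ok((1, 17, 0)));
}
```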

View File

@@ -46,3 +46,22 @@ impl UpgradeIndex for Latest_V1_15_To_V1_16_0 {
        (1, 16, 0)
    }
}
+
+#[allow(non_camel_case_types)]
+pub(super) struct Latest_V1_16_To_V1_17_0();
+
+impl UpgradeIndex for Latest_V1_16_To_V1_17_0 {
+    fn upgrade(
+        &self,
+        _wtxn: &mut RwTxn,
+        _index: &Index,
+        _original: (u32, u32, u32),
+        _progress: Progress,
+    ) -> Result<bool> {
+        Ok(false)
+    }
+
+    fn target_version(&self) -> (u32, u32, u32) {
+        (1, 17, 0)
+    }
+}

View File

@@ -25,13 +25,16 @@ macro_rules! test_filter {
            let SearchResult { documents_ids, .. } = search.execute().unwrap();
            let filtered_ids = search::expected_filtered_ids($filter);
-            let expected_external_ids: Vec<_> =
+            let mut expected_external_ids: Vec<_> =
                search::expected_order(&criteria, TermsMatchingStrategy::default(), &[])
                    .into_iter()
                    .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None })
                    .collect();
-            let documents_ids = search::internal_to_external_ids(&index, &documents_ids);
+            let mut documents_ids = search::internal_to_external_ids(&index, &documents_ids);
+            expected_external_ids.sort_unstable();
+            documents_ids.sort_unstable();
            assert_eq!(documents_ids, expected_external_ids);
        }
    };
@@ -102,3 +105,9 @@ test_filter!(empty_filter_1_double_not, vec![Right("NOT opt1 IS NOT EMPTY")]);
test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]);
test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]);
test_filter!(not_not_in_filter, vec![Right("NOT tag_in NOT IN[1, 2, 3, four, five]")]);
+test_filter!(starts_with_filter_single_letter, vec![Right("tag STARTS WITH e")]);
+test_filter!(starts_with_filter_diacritic, vec![Right("tag STARTS WITH é")]);
+test_filter!(starts_with_filter_empty_prefix, vec![Right("tag STARTS WITH ''")]);
+test_filter!(starts_with_filter_hell, vec![Right("title STARTS WITH hell")]);
+test_filter!(starts_with_filter_hello, vec![Right("title STARTS WITH hello")]);
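The diacritic and empty-prefix tests probe the two edge cases of the byte-increment trick: a multi-byte UTF-8 prefix and a prefix with nothing to increment. A self-contained illustration of the multi-byte case (plain Rust, independent of milli; note that milli runs `normalize_facet` on values first, so the stored value may differ from the raw "é"):

```rust
fn main() {
    // "é" is two bytes in UTF-8.
    let prefix = "é".as_bytes().to_vec();
    assert_eq!(prefix, vec![0xC3, 0xA9]);

    // Incrementing the last byte yields [0xC3, 0xAA], which happens to be "ê";
    // even when the result is not valid UTF-8, bytewise comparison still works.
    let mut upper = prefix.clone();
    *upper.last_mut().unwrap() += 1;
    assert_eq!(std::str::from_utf8(&upper).unwrap(), "ê");

    // Every string starting with "é" sorts below the upper bound, bytewise.
    assert!("étoile".as_bytes() < upper.as_slice());
    assert!("ée".as_bytes() < upper.as_slice());
}
```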

View File

@@ -12,7 +12,8 @@ use milli::update::new::indexer;
use milli::update::{IndexerConfig, Settings};
use milli::vector::RuntimeEmbedders;
use milli::{
-    AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member, TermsMatchingStrategy,
+    normalize_facet, AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member,
+    TermsMatchingStrategy,
};
use serde::{Deserialize, Deserializer};
use slice_group_by::GroupBy;
@@ -36,7 +37,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
    let path = tempfile::tempdir().unwrap();
    let options = EnvOpenOptions::new();
    let mut options = options.read_txn_without_tls();
-    options.map_size(10 * 1024 * 1024); // 10 MB
+    options.map_size(10 * 1024 * 1024); // 10 MiB
    let index = Index::new(options, &path, true).unwrap();
    let mut wtxn = index.write_txn().unwrap();
@@ -46,6 +47,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
    builder.set_criteria(criteria.to_vec());
    builder.set_filterable_fields(vec![
        FilterableAttributesRule::Field(S("title")),
+        FilterableAttributesRule::Field(S("tag")),
        FilterableAttributesRule::Field(S("asc_desc_rank")),
        FilterableAttributesRule::Field(S("_geo")),
@@ -220,6 +222,19 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> {
    {
        id = Some(document.id.clone())
    }
+    } else if let Some((field, prefix)) = filter.split_once("STARTS WITH") {
+        let field = match field.trim() {
+            "tag" => &document.tag,
+            "title" => &document.title,
+            "description" => &document.description,
+            _ => panic!("Unknown field: {field}"),
+        };
+        let field = normalize_facet(field);
+        let prefix = normalize_facet(prefix.trim().trim_matches('\''));
+        if field.starts_with(&prefix) {
+            id = Some(document.id.clone());
+        }
    } else if let Some(("asc_desc_rank", filter)) = filter.split_once('<') {
        if document.asc_desc_rank < filter.parse().unwrap() {
            id = Some(document.id.clone())
@@ -271,6 +286,8 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> {
    } else if matches!(filter, "tag_in NOT IN[1, 2, 3, four, five]") {
        id = (!matches!(document.id.as_str(), "A" | "B" | "C" | "D" | "E"))
            .then(|| document.id.clone());
+    } else {
+        panic!("Unknown filter: {filter}");
    }
    id
}
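One subtlety in the new STARTS WITH branch: `split_once("STARTS WITH")` splits on the first occurrence, so the field name arrives with trailing whitespace and the prefix with leading whitespace and optional quotes, which the `trim`/`trim_matches` calls then strip. A quick self-contained check:

```rust
fn main() {
    let filter = "tag STARTS WITH 'hello'";
    let (field, prefix) = filter.split_once("STARTS WITH").unwrap();
    assert_eq!(field, "tag ");
    assert_eq!(prefix, " 'hello'");
    assert_eq!(prefix.trim().trim_matches('\''), "hello");
}
```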