From 478f374b9d42465ec5ce1bfc31fa7e692f0b259b Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 28 Jul 2025 16:23:26 +0200 Subject: [PATCH 01/11] Add benchmark --- crates/benchmarks/Cargo.toml | 4 ++ .../benchmarks/benches/filter_starts_with.rs | 72 +++++++++++++++++++ 2 files changed, 76 insertions(+) create mode 100644 crates/benchmarks/benches/filter_starts_with.rs diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml index 9dccc444b..25b44436d 100644 --- a/crates/benchmarks/Cargo.toml +++ b/crates/benchmarks/Cargo.toml @@ -51,3 +51,7 @@ harness = false [[bench]] name = "indexing" harness = false + +[[bench]] +name = "filter_starts_with" +harness = false diff --git a/crates/benchmarks/benches/filter_starts_with.rs b/crates/benchmarks/benches/filter_starts_with.rs new file mode 100644 index 000000000..b1d0b502f --- /dev/null +++ b/crates/benchmarks/benches/filter_starts_with.rs @@ -0,0 +1,72 @@ +mod datasets_paths; +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::Settings; +use milli::FilterableAttributesRule; +use utils::Conf; + +#[cfg(not(windows))] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = + ["geonameid", "name"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let filterable_fields = ["name"] + .iter() + .map(|s| FilterableAttributesRule::Field(s.to_string())) + .collect(); + builder.set_filterable_fields(filterable_fields); +} + +#[rustfmt::skip] +const BASE_CONF: Conf = Conf { + dataset: datasets_paths::SMOL_ALL_COUNTRIES, + dataset_format: "jsonl", + queries: &[ + "", + ], + configure: base_conf, + primary_key: Some("geonameid"), + ..Conf::BASE +}; + +fn filter_starts_with(c: &mut criterion::Criterion) { + #[rustfmt::skip] + let confs = &[ + utils::Conf { + group_name: "1 letter", + filter: Some("name STARTS WITH e"), + ..BASE_CONF + }, + + 
utils::Conf { + group_name: "2 letters", + filter: Some("name STARTS WITH es"), + ..BASE_CONF + }, + + utils::Conf { + group_name: "3 letters", + filter: Some("name STARTS WITH est"), + ..BASE_CONF + }, + + utils::Conf { + group_name: "6 letters", + filter: Some("name STARTS WITH estoni"), + ..BASE_CONF + } + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, filter_starts_with); +criterion_main!(benches); From e8a818f53d6387194f645df2dea87acdb6327d28 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 28 Jul 2025 16:24:04 +0200 Subject: [PATCH 02/11] Optimize the filter --- crates/milli/src/search/facet/filter.rs | 37 +++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index c3eba8031..54c7535dd 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -18,6 +18,7 @@ use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::index::db_name::FACET_ID_STRING_DOCIDS; +use crate::search::facet::facet_range_search::find_docids_of_facet_within_bounds; use crate::{ distance_between_two_points, lat_lng_to_xyz, FieldId, FieldsIdsMap, FilterableAttributesFeatures, FilterableAttributesRule, Index, InternalError, Result, @@ -416,7 +417,43 @@ impl<'a> Filter<'a> { return Ok(docids); } Condition::StartsWith { keyword: _, word } => { + // There are two algorithms: + // + // - The first one looks directly at level 0 of the facet group database. + // This pessimistic approach is more efficient when the value is unique. + // + // - The second one is recursive over levels. + // This is more efficient when the prefix is common among many values. 
+ let value = crate::normalize_facet(word.value()); + + if value.len() <= 6 { + // 6 is abitrary, but it works well in practice + let mut value2 = value.as_bytes().to_owned(); + if let Some(last) = value2.last_mut() { + if *last != 255 { + *last += 1; + if let Ok(value2) = String::from_utf8(value2) { + // The idea here is that "STARTS WITH baba" is the same as "baba <= value < babb". + // We just increase the last letter to find the upper bound. + // The result could be invalid utf8, so it can fallback. + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds( + rtxn, + strings_db, + field_id, + &Included(&value), + &Excluded(&value2), + universe, + &mut docids, + )?; + + return Ok(docids); + } + } + } + } + let base = FacetGroupKey { field_id, level: 0, left_bound: value.as_str() }; let docids = strings_db .prefix_iter(rtxn, &base)? From 691a9ae4b18c00459ca2512429a364fc2b8722cf Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 28 Jul 2025 16:24:11 +0200 Subject: [PATCH 03/11] Format --- crates/benchmarks/benches/filter_starts_with.rs | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/crates/benchmarks/benches/filter_starts_with.rs b/crates/benchmarks/benches/filter_starts_with.rs index b1d0b502f..a7682cbf8 100644 --- a/crates/benchmarks/benches/filter_starts_with.rs +++ b/crates/benchmarks/benches/filter_starts_with.rs @@ -11,17 +11,11 @@ use utils::Conf; static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; fn base_conf(builder: &mut Settings) { - let displayed_fields = - ["geonameid", "name"] - .iter() - .map(|s| s.to_string()) - .collect(); + let displayed_fields = ["geonameid", "name"].iter().map(|s| s.to_string()).collect(); builder.set_displayed_fields(displayed_fields); - let filterable_fields = ["name"] - .iter() - .map(|s| FilterableAttributesRule::Field(s.to_string())) - .collect(); + let filterable_fields = + ["name"].iter().map(|s| FilterableAttributesRule::Field(s.to_string())).collect(); 
builder.set_filterable_fields(filterable_fields); } From 224892e692fb31df55dc25b275d9412d247314a2 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 28 Jul 2025 16:28:06 +0200 Subject: [PATCH 04/11] Enable new algorithm every time --- crates/milli/src/search/facet/filter.rs | 41 ++++++++++++------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 54c7535dd..907b12f1a 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -427,29 +427,26 @@ impl<'a> Filter<'a> { let value = crate::normalize_facet(word.value()); - if value.len() <= 6 { - // 6 is abitrary, but it works well in practice - let mut value2 = value.as_bytes().to_owned(); - if let Some(last) = value2.last_mut() { - if *last != 255 { - *last += 1; - if let Ok(value2) = String::from_utf8(value2) { - // The idea here is that "STARTS WITH baba" is the same as "baba <= value < babb". - // We just increase the last letter to find the upper bound. - // The result could be invalid utf8, so it can fallback. - let mut docids = RoaringBitmap::new(); - find_docids_of_facet_within_bounds( - rtxn, - strings_db, - field_id, - &Included(&value), - &Excluded(&value2), - universe, - &mut docids, - )?; + let mut value2 = value.as_bytes().to_owned(); + if let Some(last) = value2.last_mut() { + if *last != 255 { + *last += 1; + if let Ok(value2) = String::from_utf8(value2) { + // The idea here is that "STARTS WITH baba" is the same as "baba <= value < babb". + // We just increase the last letter to find the upper bound. + // The result could be invalid utf8, so it can fallback. 
+ let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds( + rtxn, + strings_db, + field_id, + &Included(&value), + &Excluded(&value2), + universe, + &mut docids, + )?; - return Ok(docids); - } + return Ok(docids); } } } From 48a5f4db2d48b2b8fb98abf9e74acf90d349e9c0 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 28 Jul 2025 16:42:33 +0200 Subject: [PATCH 05/11] Improve comment --- crates/milli/src/search/facet/filter.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 907b12f1a..955e75753 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -419,11 +419,12 @@ impl<'a> Filter<'a> { Condition::StartsWith { keyword: _, word } => { // There are two algorithms: // - // - The first one looks directly at level 0 of the facet group database. - // This pessimistic approach is more efficient when the value is unique. - // - // - The second one is recursive over levels. + // - The first one is recursive over levels. // This is more efficient when the prefix is common among many values. + // + // - The second one looks directly at level 0 of the facet group database. + // This pessimistic approach is more efficient when the value is unique. + // It's used as a fallback. 
let value = crate::normalize_facet(word.value()); From fc814b7537cfb8c1a8abdeb2835f1ebe0f317e0b Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 5 Aug 2025 10:25:14 +0200 Subject: [PATCH 06/11] Apply review suggestion --- crates/milli/src/search/facet/filter.rs | 78 +++++++++++++------------ 1 file changed, 42 insertions(+), 36 deletions(-) diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 955e75753..af4a77814 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::collections::BTreeSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included, Unbounded}; @@ -14,9 +15,7 @@ use super::facet_range_search; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::error::{Error, UserError}; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; -use crate::heed_codec::facet::{ - FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, -}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::index::db_name::FACET_ID_STRING_DOCIDS; use crate::search::facet::facet_range_search::find_docids_of_facet_within_bounds; use crate::{ @@ -427,44 +426,51 @@ impl<'a> Filter<'a> { // It's used as a fallback. let value = crate::normalize_facet(word.value()); - let mut value2 = value.as_bytes().to_owned(); - if let Some(last) = value2.last_mut() { - if *last != 255 { - *last += 1; - if let Ok(value2) = String::from_utf8(value2) { - // The idea here is that "STARTS WITH baba" is the same as "baba <= value < babb". - // We just increase the last letter to find the upper bound. - // The result could be invalid utf8, so it can fallback. 
- let mut docids = RoaringBitmap::new(); - find_docids_of_facet_within_bounds( - rtxn, - strings_db, - field_id, - &Included(&value), - &Excluded(&value2), - universe, - &mut docids, - )?; - return Ok(docids); - } + let last = match value2.last_mut() { + Some(last) => last, + None => { + // The prefix is empty, so all documents that have the field will match. + return index + .exists_faceted_documents_ids(rtxn, field_id) + .map_err(|e| e.into()); + } + }; + + if *last == 255 { + // The prefix is invalid utf8, so no documents will match anyway + return Ok(RoaringBitmap::new()); + } + *last += 1; + + // This is very similar to `heed::Bytes` but its `EItem` is `&[u8]` instead of `[u8]` + struct BytesRef; + impl<'a> BytesEncode<'a> for BytesRef { + type EItem = &'a [u8]; + + fn bytes_encode( + item: &'a Self::EItem, + ) -> std::result::Result<Cow<'a, [u8]>, heed::BoxedError> { + Ok(Cow::Borrowed(item)) } } - let base = FacetGroupKey { field_id, level: 0, left_bound: value.as_str() }; - let docids = strings_db - .prefix_iter(rtxn, &base)? - .map(|result| -> Result<RoaringBitmap> { - match result { - Ok((_facet_group_key, FacetGroupValue { bitmap, .. })) => Ok(bitmap), - Err(_e) => Err(InternalError::from(SerializationError::Decoding { - db_name: Some(FACET_ID_STRING_DOCIDS), - }) - .into()), - } - }) - .union()?; + // The idea here is that "STARTS WITH baba" is the same as "baba <= value < babb". + // We just incremented the last letter to find the upper bound. + // The upper bound may not be valid utf8, but lmdb doesn't care as it works over bytes. 
+ let mut docids = RoaringBitmap::new(); + let bytes_db = + index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRef>>(); + find_docids_of_facet_within_bounds::<BytesRef>( + rtxn, + bytes_db, + field_id, + &Included(value.as_bytes()), + &Excluded(value2.as_slice()), + universe, + &mut docids, + )?; return Ok(docids); } From afb367c7f4a8003c0d1c45b76c3f8a201e498276 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 5 Aug 2025 10:29:39 +0200 Subject: [PATCH 07/11] Update old comment --- crates/milli/src/search/facet/filter.rs | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index af4a77814..4d1e51767 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -416,14 +416,9 @@ impl<'a> Filter<'a> { return Ok(docids); } Condition::StartsWith { keyword: _, word } => { - // There are two algorithms: - // - // - The first one is recursive over levels. - // This is more efficient when the prefix is common among many values. - // - // - The second one looks directly at level 0 of the facet group database. - // This pessimistic approach is more efficient when the value is unique. - // It's used as a fallback. + // The idea here is that "STARTS WITH baba" is the same as "baba <= value < babb". + // We just incremented the last letter to find the upper bound. + // The upper bound may not be valid utf8, but lmdb doesn't care as it works over bytes. let value = crate::normalize_facet(word.value()); let mut value2 = value.as_bytes().to_owned(); @@ -456,9 +451,6 @@ impl<'a> Filter<'a> { } } - // The idea here is that "STARTS WITH baba" is the same as "baba <= value < babb". - // We just incremented the last letter to find the upper bound. - // The upper bound may not be valid utf8, but lmdb doesn't care as it works over bytes. 
let mut docids = RoaringBitmap::new(); let bytes_db = index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRef>>(); From 3d2c204f2d528040f6775ae0102666aa5e6987f6 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Aug 2025 11:26:10 +0200 Subject: [PATCH 08/11] Update crates/milli/src/search/facet/filter.rs --- crates/milli/src/search/facet/filter.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 4d1e51767..803a0635b 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -433,8 +433,9 @@ impl<'a> Filter<'a> { } }; - if *last == 255 { - // The prefix is invalid utf8, so no documents will match anyway + if *last == u8::MAX { + // u8::MAX is a forbidden UTF-8 byte, we're guaranteed it cannot be sent through a filter to meilisearch, but just in case, we're going to return something + tracing::warn!("Found non utf-8 character in filter. That shouldn't be possible"); return Ok(RoaringBitmap::new()); } *last += 1; From 4c61a227caf32b6e947722d1c4120a75b1e25dbe Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 5 Aug 2025 11:29:54 +0200 Subject: [PATCH 09/11] fmt after my suggestion --- crates/milli/src/search/facet/filter.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 803a0635b..76d935fc6 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -435,7 +435,9 @@ impl<'a> Filter<'a> { if *last == u8::MAX { // u8::MAX is a forbidden UTF-8 byte, we're guaranteed it cannot be sent through a filter to meilisearch, but just in case, we're going to return something - tracing::warn!("Found non utf-8 character in filter. That shouldn't be possible"); + tracing::warn!( + "Found non utf-8 character in filter. 
That shouldn't be possible" + ); return Ok(RoaringBitmap::new()); } *last += 1; From c4e7bf2e6022d946260e559366767f87af999082 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 5 Aug 2025 12:14:16 +0200 Subject: [PATCH 10/11] Stabilize `STARTS WITH` filter --- crates/filter-parser/src/lib.rs | 6 +++--- crates/index-scheduler/src/features.rs | 2 +- crates/meilisearch/tests/search/errors.rs | 24 +++++++++++------------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 938702103..e25636812 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -165,9 +165,9 @@ impl<'a> FilterCondition<'a> { | Condition::Exists | Condition::LowerThan(_) | Condition::LowerThanOrEqual(_) - | Condition::Between { .. } => None, - Condition::Contains { keyword, word: _ } - | Condition::StartsWith { keyword, word: _ } => Some(keyword), + | Condition::Between { .. } + | Condition::StartsWith { .. } => None, + Condition::Contains { keyword, word: _ } => Some(keyword), }, FilterCondition::Not(this) => this.use_contains_operator(), FilterCondition::Or(seq) | FilterCondition::And(seq) => { diff --git a/crates/index-scheduler/src/features.rs b/crates/index-scheduler/src/features.rs index b52a659a6..00e706a74 100644 --- a/crates/index-scheduler/src/features.rs +++ b/crates/index-scheduler/src/features.rs @@ -85,7 +85,7 @@ impl RoFeatures { Ok(()) } else { Err(FeatureNotEnabledError { - disabled_action: "Using `CONTAINS` or `STARTS WITH` in a filter", + disabled_action: "Using `CONTAINS` in a filter", feature: "contains filter", issue_link: "https://github.com/orgs/meilisearch/discussions/763", } diff --git a/crates/meilisearch/tests/search/errors.rs b/crates/meilisearch/tests/search/errors.rs index 363ece067..9cc7e06dd 100644 --- a/crates/meilisearch/tests/search/errors.rs +++ b/crates/meilisearch/tests/search/errors.rs @@ -1270,27 +1270,27 @@ async fn 
search_with_contains_without_enabling_the_feature() { index .search(json!({ "filter": "doggo CONTAINS kefir" }), |response, code| { snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(json_string!(response), @r#" { - "message": "Using `CONTAINS` or `STARTS WITH` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir", + "message": "Using `CONTAINS` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir", "code": "feature_not_enabled", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#feature_not_enabled" } - "###); + "#); }) .await; index .search(json!({ "filter": "doggo != echo AND doggo CONTAINS kefir" }), |response, code| { snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(json_string!(response), @r#" { - "message": "Using `CONTAINS` or `STARTS WITH` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n25:33 doggo != echo AND doggo CONTAINS kefir", + "message": "Using `CONTAINS` in a filter requires enabling the `contains filter` experimental feature. 
See https://github.com/orgs/meilisearch/discussions/763\n25:33 doggo != echo AND doggo CONTAINS kefir", "code": "feature_not_enabled", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#feature_not_enabled" } - "###); + "#); }) .await; @@ -1299,24 +1299,24 @@ async fn search_with_contains_without_enabling_the_feature() { index.search_post(json!({ "filter": ["doggo != echo", "doggo CONTAINS kefir"] })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(json_string!(response), @r#" { - "message": "Using `CONTAINS` or `STARTS WITH` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir", + "message": "Using `CONTAINS` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir", "code": "feature_not_enabled", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#feature_not_enabled" } - "###); + "#); let (response, code) = index.search_post(json!({ "filter": ["doggo != echo", ["doggo CONTAINS kefir"]] })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(json_string!(response), @r#" { - "message": "Using `CONTAINS` or `STARTS WITH` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir", + "message": "Using `CONTAINS` in a filter requires enabling the `contains filter` experimental feature. 
See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir", "code": "feature_not_enabled", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#feature_not_enabled" } - "###); + "#); } From 3a9b08960abe17e46f31cf0196c2a8453df68772 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 5 Aug 2025 13:49:28 +0200 Subject: [PATCH 11/11] Add test --- .../milli/src/search/new/tests/integration.rs | 2 +- crates/milli/tests/search/filters.rs | 13 ++++++++++-- crates/milli/tests/search/mod.rs | 21 +++++++++++++++++-- 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index 38f39e18b..6b8c25ab8 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -17,7 +17,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let path = tempfile::tempdir().unwrap(); let options = EnvOpenOptions::new(); let mut options = options.read_txn_without_tls(); - options.map_size(10 * 1024 * 1024); // 10 MB + options.map_size(10 * 1024 * 1024); // 10 MiB let index = Index::new(options, &path, true).unwrap(); let mut wtxn = index.write_txn().unwrap(); diff --git a/crates/milli/tests/search/filters.rs b/crates/milli/tests/search/filters.rs index bb5943782..c97143d48 100644 --- a/crates/milli/tests/search/filters.rs +++ b/crates/milli/tests/search/filters.rs @@ -25,13 +25,16 @@ macro_rules! test_filter { let SearchResult { documents_ids, .. 
} = search.execute().unwrap(); let filtered_ids = search::expected_filtered_ids($filter); - let expected_external_ids: Vec<_> = + let mut expected_external_ids: Vec<_> = search::expected_order(&criteria, TermsMatchingStrategy::default(), &[]) .into_iter() .filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None }) .collect(); - let documents_ids = search::internal_to_external_ids(&index, &documents_ids); + let mut documents_ids = search::internal_to_external_ids(&index, &documents_ids); + + expected_external_ids.sort_unstable(); + documents_ids.sort_unstable(); assert_eq!(documents_ids, expected_external_ids); } }; @@ -102,3 +105,9 @@ test_filter!(empty_filter_1_double_not, vec![Right("NOT opt1 IS NOT EMPTY")]); test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]); test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]); test_filter!(not_not_in_filter, vec![Right("NOT tag_in NOT IN[1, 2, 3, four, five]")]); + +test_filter!(starts_with_filter_single_letter, vec![Right("tag STARTS WITH e")]); +test_filter!(starts_with_filter_diacritic, vec![Right("tag STARTS WITH é")]); +test_filter!(starts_with_filter_empty_prefix, vec![Right("tag STARTS WITH ''")]); +test_filter!(starts_with_filter_hell, vec![Right("title STARTS WITH hell")]); +test_filter!(starts_with_filter_hello, vec![Right("title STARTS WITH hello")]); diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index fa03f1cc1..578a22009 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -12,7 +12,8 @@ use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; use milli::vector::RuntimeEmbedders; use milli::{ - AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member, TermsMatchingStrategy, + normalize_facet, AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member, + TermsMatchingStrategy, }; use serde::{Deserialize, Deserializer}; use 
slice_group_by::GroupBy; @@ -36,7 +37,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let path = tempfile::tempdir().unwrap(); let options = EnvOpenOptions::new(); let mut options = options.read_txn_without_tls(); - options.map_size(10 * 1024 * 1024); // 10 MB + options.map_size(10 * 1024 * 1024); // 10 MiB let index = Index::new(options, &path, true).unwrap(); let mut wtxn = index.write_txn().unwrap(); @@ -46,6 +47,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { builder.set_criteria(criteria.to_vec()); builder.set_filterable_fields(vec![ + FilterableAttributesRule::Field(S("title")), FilterableAttributesRule::Field(S("tag")), FilterableAttributesRule::Field(S("asc_desc_rank")), FilterableAttributesRule::Field(S("_geo")), @@ -220,6 +222,19 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> { { id = Some(document.id.clone()) } + } else if let Some((field, prefix)) = filter.split_once("STARTS WITH") { + let field = match field.trim() { + "tag" => &document.tag, + "title" => &document.title, + "description" => &document.description, + _ => panic!("Unknown field: {field}"), + }; + + let field = normalize_facet(field); + let prefix = normalize_facet(prefix.trim().trim_matches('\'')); + if field.starts_with(&prefix) { + id = Some(document.id.clone()); + } } else if let Some(("asc_desc_rank", filter)) = filter.split_once('<') { if document.asc_desc_rank < filter.parse().unwrap() { id = Some(document.id.clone()) @@ -271,6 +286,8 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> { } else if matches!(filter, "tag_in NOT IN[1, 2, 3, four, five]") { id = (!matches!(document.id.as_str(), "A" | "B" | "C" | "D" | "E")) .then(|| document.id.clone()); + } else { + panic!("Unknown filter: {filter}"); } id }