mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-04 11:46:30 +00:00
Merge pull request #5783 from meilisearch/starts-with-optim
Optimize the starts_with filter
This commit is contained in:
@ -55,3 +55,7 @@ harness = false
|
||||
[[bench]]
|
||||
name = "sort"
|
||||
harness = false
|
||||
|
||||
[[bench]]
|
||||
name = "filter_starts_with"
|
||||
harness = false
|
||||
|
66
crates/benchmarks/benches/filter_starts_with.rs
Normal file
66
crates/benchmarks/benches/filter_starts_with.rs
Normal file
@ -0,0 +1,66 @@
|
||||
mod datasets_paths;
|
||||
mod utils;
|
||||
|
||||
use criterion::{criterion_group, criterion_main};
|
||||
use milli::update::Settings;
|
||||
use milli::FilterableAttributesRule;
|
||||
use utils::Conf;
|
||||
|
||||
#[cfg(not(windows))]
|
||||
#[global_allocator]
|
||||
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||
|
||||
fn base_conf(builder: &mut Settings) {
|
||||
let displayed_fields = ["geonameid", "name"].iter().map(|s| s.to_string()).collect();
|
||||
builder.set_displayed_fields(displayed_fields);
|
||||
|
||||
let filterable_fields =
|
||||
["name"].iter().map(|s| FilterableAttributesRule::Field(s.to_string())).collect();
|
||||
builder.set_filterable_fields(filterable_fields);
|
||||
}
|
||||
|
||||
#[rustfmt::skip]
|
||||
const BASE_CONF: Conf = Conf {
|
||||
dataset: datasets_paths::SMOL_ALL_COUNTRIES,
|
||||
dataset_format: "jsonl",
|
||||
queries: &[
|
||||
"",
|
||||
],
|
||||
configure: base_conf,
|
||||
primary_key: Some("geonameid"),
|
||||
..Conf::BASE
|
||||
};
|
||||
|
||||
fn filter_starts_with(c: &mut criterion::Criterion) {
|
||||
#[rustfmt::skip]
|
||||
let confs = &[
|
||||
utils::Conf {
|
||||
group_name: "1 letter",
|
||||
filter: Some("name STARTS WITH e"),
|
||||
..BASE_CONF
|
||||
},
|
||||
|
||||
utils::Conf {
|
||||
group_name: "2 letters",
|
||||
filter: Some("name STARTS WITH es"),
|
||||
..BASE_CONF
|
||||
},
|
||||
|
||||
utils::Conf {
|
||||
group_name: "3 letters",
|
||||
filter: Some("name STARTS WITH est"),
|
||||
..BASE_CONF
|
||||
},
|
||||
|
||||
utils::Conf {
|
||||
group_name: "6 letters",
|
||||
filter: Some("name STARTS WITH estoni"),
|
||||
..BASE_CONF
|
||||
}
|
||||
];
|
||||
|
||||
utils::run_benches(c, confs);
|
||||
}
|
||||
|
||||
criterion_group!(benches, filter_starts_with);
|
||||
criterion_main!(benches);
|
@ -165,9 +165,9 @@ impl<'a> FilterCondition<'a> {
|
||||
| Condition::Exists
|
||||
| Condition::LowerThan(_)
|
||||
| Condition::LowerThanOrEqual(_)
|
||||
| Condition::Between { .. } => None,
|
||||
Condition::Contains { keyword, word: _ }
|
||||
| Condition::StartsWith { keyword, word: _ } => Some(keyword),
|
||||
| Condition::Between { .. }
|
||||
| Condition::StartsWith { .. } => None,
|
||||
Condition::Contains { keyword, word: _ } => Some(keyword),
|
||||
},
|
||||
FilterCondition::Not(this) => this.use_contains_operator(),
|
||||
FilterCondition::Or(seq) | FilterCondition::And(seq) => {
|
||||
|
@ -85,7 +85,7 @@ impl RoFeatures {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(FeatureNotEnabledError {
|
||||
disabled_action: "Using `CONTAINS` or `STARTS WITH` in a filter",
|
||||
disabled_action: "Using `CONTAINS` in a filter",
|
||||
feature: "contains filter",
|
||||
issue_link: "https://github.com/orgs/meilisearch/discussions/763",
|
||||
}
|
||||
|
@ -1270,27 +1270,27 @@ async fn search_with_contains_without_enabling_the_feature() {
|
||||
index
|
||||
.search(json!({ "filter": "doggo CONTAINS kefir" }), |response, code| {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
snapshot!(json_string!(response), @r#"
|
||||
{
|
||||
"message": "Using `CONTAINS` or `STARTS WITH` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir",
|
||||
"message": "Using `CONTAINS` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir",
|
||||
"code": "feature_not_enabled",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
||||
}
|
||||
"###);
|
||||
"#);
|
||||
})
|
||||
.await;
|
||||
index
|
||||
.search(json!({ "filter": "doggo != echo AND doggo CONTAINS kefir" }), |response, code| {
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
snapshot!(json_string!(response), @r#"
|
||||
{
|
||||
"message": "Using `CONTAINS` or `STARTS WITH` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n25:33 doggo != echo AND doggo CONTAINS kefir",
|
||||
"message": "Using `CONTAINS` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n25:33 doggo != echo AND doggo CONTAINS kefir",
|
||||
"code": "feature_not_enabled",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
||||
}
|
||||
"###);
|
||||
"#);
|
||||
})
|
||||
.await;
|
||||
|
||||
@ -1299,24 +1299,24 @@ async fn search_with_contains_without_enabling_the_feature() {
|
||||
index.search_post(json!({ "filter": ["doggo != echo", "doggo CONTAINS kefir"] })).await;
|
||||
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
snapshot!(json_string!(response), @r#"
|
||||
{
|
||||
"message": "Using `CONTAINS` or `STARTS WITH` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir",
|
||||
"message": "Using `CONTAINS` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir",
|
||||
"code": "feature_not_enabled",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
||||
}
|
||||
"###);
|
||||
"#);
|
||||
let (response, code) =
|
||||
index.search_post(json!({ "filter": ["doggo != echo", ["doggo CONTAINS kefir"]] })).await;
|
||||
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(json_string!(response), @r###"
|
||||
snapshot!(json_string!(response), @r#"
|
||||
{
|
||||
"message": "Using `CONTAINS` or `STARTS WITH` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir",
|
||||
"message": "Using `CONTAINS` in a filter requires enabling the `contains filter` experimental feature. See https://github.com/orgs/meilisearch/discussions/763\n7:15 doggo CONTAINS kefir",
|
||||
"code": "feature_not_enabled",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#feature_not_enabled"
|
||||
}
|
||||
"###);
|
||||
"#);
|
||||
}
|
||||
|
@ -1,3 +1,4 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::BTreeSet;
|
||||
use std::fmt::{Debug, Display};
|
||||
use std::ops::Bound::{self, Excluded, Included, Unbounded};
|
||||
@ -14,10 +15,9 @@ use super::facet_range_search;
|
||||
use crate::constants::RESERVED_GEO_FIELD_NAME;
|
||||
use crate::error::{Error, UserError};
|
||||
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
||||
};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
use crate::index::db_name::FACET_ID_STRING_DOCIDS;
|
||||
use crate::search::facet::facet_range_search::find_docids_of_facet_within_bounds;
|
||||
use crate::{
|
||||
distance_between_two_points, lat_lng_to_xyz, FieldId, FieldsIdsMap,
|
||||
FilterableAttributesFeatures, FilterableAttributesRule, Index, InternalError, Result,
|
||||
@ -416,20 +416,56 @@ impl<'a> Filter<'a> {
|
||||
return Ok(docids);
|
||||
}
|
||||
Condition::StartsWith { keyword: _, word } => {
|
||||
// The idea here is that "STARTS WITH baba" is the same as "baba <= value < babb".
|
||||
// We just incremented the last letter to find the upper bound.
|
||||
// The upper bound may not be valid utf8, but lmdb doesn't care as it works over bytes.
|
||||
|
||||
let value = crate::normalize_facet(word.value());
|
||||
let base = FacetGroupKey { field_id, level: 0, left_bound: value.as_str() };
|
||||
let docids = strings_db
|
||||
.prefix_iter(rtxn, &base)?
|
||||
.map(|result| -> Result<RoaringBitmap> {
|
||||
match result {
|
||||
Ok((_facet_group_key, FacetGroupValue { bitmap, .. })) => Ok(bitmap),
|
||||
Err(_e) => Err(InternalError::from(SerializationError::Decoding {
|
||||
db_name: Some(FACET_ID_STRING_DOCIDS),
|
||||
})
|
||||
.into()),
|
||||
}
|
||||
})
|
||||
.union()?;
|
||||
let mut value2 = value.as_bytes().to_owned();
|
||||
|
||||
let last = match value2.last_mut() {
|
||||
Some(last) => last,
|
||||
None => {
|
||||
// The prefix is empty, so all documents that have the field will match.
|
||||
return index
|
||||
.exists_faceted_documents_ids(rtxn, field_id)
|
||||
.map_err(|e| e.into());
|
||||
}
|
||||
};
|
||||
|
||||
if *last == u8::MAX {
|
||||
// u8::MAX is a forbidden UTF-8 byte, we're guaranteed it cannot be sent through a filter to meilisearch, but just in case, we're going to return something
|
||||
tracing::warn!(
|
||||
"Found non utf-8 character in filter. That shouldn't be possible"
|
||||
);
|
||||
return Ok(RoaringBitmap::new());
|
||||
}
|
||||
*last += 1;
|
||||
|
||||
// This is very similar to `heed::Bytes` but its `EItem` is `&[u8]` instead of `[u8]`
|
||||
struct BytesRef;
|
||||
impl<'a> BytesEncode<'a> for BytesRef {
|
||||
type EItem = &'a [u8];
|
||||
|
||||
fn bytes_encode(
|
||||
item: &'a Self::EItem,
|
||||
) -> std::result::Result<Cow<'a, [u8]>, heed::BoxedError> {
|
||||
Ok(Cow::Borrowed(item))
|
||||
}
|
||||
}
|
||||
|
||||
let mut docids = RoaringBitmap::new();
|
||||
let bytes_db =
|
||||
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRef>>();
|
||||
find_docids_of_facet_within_bounds::<BytesRef>(
|
||||
rtxn,
|
||||
bytes_db,
|
||||
field_id,
|
||||
&Included(value.as_bytes()),
|
||||
&Excluded(value2.as_slice()),
|
||||
universe,
|
||||
&mut docids,
|
||||
)?;
|
||||
|
||||
return Ok(docids);
|
||||
}
|
||||
|
@ -17,7 +17,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let options = EnvOpenOptions::new();
|
||||
let mut options = options.read_txn_without_tls();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
options.map_size(10 * 1024 * 1024); // 10 MiB
|
||||
let index = Index::new(options, &path, true).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
|
@ -25,13 +25,16 @@ macro_rules! test_filter {
|
||||
let SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||
|
||||
let filtered_ids = search::expected_filtered_ids($filter);
|
||||
let expected_external_ids: Vec<_> =
|
||||
let mut expected_external_ids: Vec<_> =
|
||||
search::expected_order(&criteria, TermsMatchingStrategy::default(), &[])
|
||||
.into_iter()
|
||||
.filter_map(|d| if filtered_ids.contains(&d.id) { Some(d.id) } else { None })
|
||||
.collect();
|
||||
|
||||
let documents_ids = search::internal_to_external_ids(&index, &documents_ids);
|
||||
let mut documents_ids = search::internal_to_external_ids(&index, &documents_ids);
|
||||
|
||||
expected_external_ids.sort_unstable();
|
||||
documents_ids.sort_unstable();
|
||||
assert_eq!(documents_ids, expected_external_ids);
|
||||
}
|
||||
};
|
||||
@ -102,3 +105,9 @@ test_filter!(empty_filter_1_double_not, vec![Right("NOT opt1 IS NOT EMPTY")]);
|
||||
test_filter!(in_filter, vec![Right("tag_in IN[1, 2, 3, four, five]")]);
|
||||
test_filter!(not_in_filter, vec![Right("tag_in NOT IN[1, 2, 3, four, five]")]);
|
||||
test_filter!(not_not_in_filter, vec![Right("NOT tag_in NOT IN[1, 2, 3, four, five]")]);
|
||||
|
||||
test_filter!(starts_with_filter_single_letter, vec![Right("tag STARTS WITH e")]);
|
||||
test_filter!(starts_with_filter_diacritic, vec![Right("tag STARTS WITH é")]);
|
||||
test_filter!(starts_with_filter_empty_prefix, vec![Right("tag STARTS WITH ''")]);
|
||||
test_filter!(starts_with_filter_hell, vec![Right("title STARTS WITH hell")]);
|
||||
test_filter!(starts_with_filter_hello, vec![Right("title STARTS WITH hello")]);
|
||||
|
@ -12,7 +12,8 @@ use milli::update::new::indexer;
|
||||
use milli::update::{IndexerConfig, Settings};
|
||||
use milli::vector::RuntimeEmbedders;
|
||||
use milli::{
|
||||
AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member, TermsMatchingStrategy,
|
||||
normalize_facet, AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member,
|
||||
TermsMatchingStrategy,
|
||||
};
|
||||
use serde::{Deserialize, Deserializer};
|
||||
use slice_group_by::GroupBy;
|
||||
@ -36,7 +37,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
let path = tempfile::tempdir().unwrap();
|
||||
let options = EnvOpenOptions::new();
|
||||
let mut options = options.read_txn_without_tls();
|
||||
options.map_size(10 * 1024 * 1024); // 10 MB
|
||||
options.map_size(10 * 1024 * 1024); // 10 MiB
|
||||
let index = Index::new(options, &path, true).unwrap();
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
@ -46,6 +47,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
|
||||
|
||||
builder.set_criteria(criteria.to_vec());
|
||||
builder.set_filterable_fields(vec![
|
||||
FilterableAttributesRule::Field(S("title")),
|
||||
FilterableAttributesRule::Field(S("tag")),
|
||||
FilterableAttributesRule::Field(S("asc_desc_rank")),
|
||||
FilterableAttributesRule::Field(S("_geo")),
|
||||
@ -220,6 +222,19 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> {
|
||||
{
|
||||
id = Some(document.id.clone())
|
||||
}
|
||||
} else if let Some((field, prefix)) = filter.split_once("STARTS WITH") {
|
||||
let field = match field.trim() {
|
||||
"tag" => &document.tag,
|
||||
"title" => &document.title,
|
||||
"description" => &document.description,
|
||||
_ => panic!("Unknown field: {field}"),
|
||||
};
|
||||
|
||||
let field = normalize_facet(field);
|
||||
let prefix = normalize_facet(prefix.trim().trim_matches('\''));
|
||||
if field.starts_with(&prefix) {
|
||||
id = Some(document.id.clone());
|
||||
}
|
||||
} else if let Some(("asc_desc_rank", filter)) = filter.split_once('<') {
|
||||
if document.asc_desc_rank < filter.parse().unwrap() {
|
||||
id = Some(document.id.clone())
|
||||
@ -271,6 +286,8 @@ fn execute_filter(filter: &str, document: &TestDocument) -> Option<String> {
|
||||
} else if matches!(filter, "tag_in NOT IN[1, 2, 3, four, five]") {
|
||||
id = (!matches!(document.id.as_str(), "A" | "B" | "C" | "D" | "E"))
|
||||
.then(|| document.id.clone());
|
||||
} else {
|
||||
panic!("Unknown filter: {filter}");
|
||||
}
|
||||
id
|
||||
}
|
||||
|
Reference in New Issue
Block a user