From a9bb64c55a285b9cf90d29e85f9d622cbf893887 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 7 Jul 2025 15:28:10 +0200 Subject: [PATCH 01/49] Unrelated minor fixes --- crates/meilisearch/src/routes/indexes/documents.rs | 2 -- crates/milli/src/filterable_attributes_rules.rs | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index a93d736f7..bc5539081 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1461,8 +1461,6 @@ fn some_documents<'a, 't: 'a>( document.remove("_vectors"); } RetrieveVectors::Retrieve => { - // Clippy is simply wrong - #[allow(clippy::manual_unwrap_or_default)] let mut vectors = match document.remove("_vectors") { Some(Value::Object(map)) => map, _ => Default::default(), diff --git a/crates/milli/src/filterable_attributes_rules.rs b/crates/milli/src/filterable_attributes_rules.rs index ae1a9755a..5ba8a99d8 100644 --- a/crates/milli/src/filterable_attributes_rules.rs +++ b/crates/milli/src/filterable_attributes_rules.rs @@ -111,7 +111,7 @@ impl FilterableAttributesFeatures { self.filter.is_filterable_null() } - /// Check if `IS EXISTS` is allowed + /// Check if `EXISTS` is allowed pub fn is_filterable_exists(&self) -> bool { self.filter.is_filterable_exists() } From 20525376815e6c0d6962744701f954bf9f03f7cb Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 7 Jul 2025 15:28:35 +0200 Subject: [PATCH 02/49] Implement core filter logic --- crates/milli/src/index.rs | 2 +- crates/milli/src/search/facet/filter.rs | 15 ++- .../milli/src/search/facet/filter_vector.rs | 123 ++++++++++++++++++ crates/milli/src/search/facet/mod.rs | 1 + .../src/update/index_documents/transform.rs | 2 +- 5 files changed, 138 insertions(+), 5 deletions(-) create mode 100644 crates/milli/src/search/facet/filter_vector.rs diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index b2ec992ba..2751498bf 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1776,7 +1776,7 @@ impl Index { embedder_info.embedder_id, config.config.quantized(), ); - let embeddings = reader.item_vectors(rtxn, docid)?; + let embeddings = reader.item_vectors(rtxn, docid)?; // MARKER res.insert( config.name.to_owned(), (embeddings, embedder_info.embedding_status.must_regenerate(docid)), diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index c3eba8031..f80d1681f 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -10,7 +10,7 @@ use memchr::memmem::Finder; use roaring::{MultiOps, RoaringBitmap}; use serde_json::Value; -use super::facet_range_search; +use super::{facet_range_search, filter_vector::VectorFilter}; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::error::{Error, UserError}; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; @@ -234,8 +234,11 @@ impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn<'_>, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time let fields_ids_map = index.fields_ids_map(rtxn)?; - let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; + let filterable_attributes_rules = dbg!(index.filterable_attributes_rules(rtxn)?); + for fid in self.condition.fids(MAX_FILTER_DEPTH) { + println!("{fid:?}"); + let attribute = fid.value(); if matching_features(attribute, &filterable_attributes_rules) .is_some_and(|(_, features)| features.is_filterable()) @@ -542,7 +545,13 @@ impl<'a> Filter<'a> { .union() } FilterCondition::Condition { fid, op } => { - let Some(field_id) = field_ids_map.id(fid.value()) else { + let value = fid.value(); + if VectorFilter::matches(value, op) { + let vector_filter = VectorFilter::parse(value)?; + return vector_filter.evaluate(rtxn, index, universe); + } + + let Some(field_id) = field_ids_map.id(value) else { return Ok(RoaringBitmap::new()); }; let Some((rule_index, features)) = diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs new file mode 100644 index 000000000..701ab561c --- /dev/null +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -0,0 +1,123 @@ +use filter_parser::Condition; +use roaring::RoaringBitmap; + +use crate::error::{Error, UserError}; +use crate::vector::{ArroyStats, ArroyWrapper}; +use crate::{Index, Result}; + +pub(super) struct VectorFilter<'a> { + embedder_name: &'a str, + fragment_name: Option<&'a str>, + user_provided: bool, + // TODO: not_user_provided: bool, +} + +impl<'a> VectorFilter<'a> { + pub(super) fn matches(value: &str, op: &Condition) -> bool { + matches!(op, Condition::Exists) && value.starts_with("_vectors.") + } + + /// Parses a vector filter string. + /// + /// Valid formats: + /// - `_vectors.{embedder_name}` + /// - `_vectors.{embedder_name}.userProvided` + /// - `_vectors.{embedder_name}.fragments.{fragment_name}` + /// - `_vectors.{embedder_name}.fragments.{fragment_name}.userProvided` + pub(super) fn parse(s: &'a str) -> Result { + let mut split = s.split('.').peekable(); + + if split.next() != Some("_vectors") { + return Err(Error::UserError(UserError::InvalidFilter(String::from( + "Vector filter must start with '_vectors'", + )))); + } + + let embedder_name = split.next().ok_or_else(|| { + Error::UserError(UserError::InvalidFilter(String::from( + "Vector filter must contain an embedder name", + ))) + })?; + + let mut fragment_name = None; + if split.peek() == Some(&"fragments") { + split.next(); + + fragment_name = Some(split.next().ok_or_else(|| { + Error::UserError(UserError::InvalidFilter( + String::from("Vector filter is inconsistent: either specify a fragment name or remove the 'fragments' part"), + )) + })?); + } + + let mut user_provided = false; + if split.peek() == Some(&"userProvided") || split.peek() == Some(&"user_provided") { + split.next(); + user_provided = true; + } + + if let Some(next) = split.next() { + return Err(Error::UserError(UserError::InvalidFilter(format!( + "Unexpected part in vector filter: '{next}'" + )))); + } + + Ok(Self { embedder_name, fragment_name, user_provided }) + } + + pub(super) fn evaluate( + &self, + rtxn: &heed::RoTxn<'_>, + index: &Index, + universe: Option<&RoaringBitmap>, + ) -> Result { + let index_embedding_configs = index.embedding_configs(); + let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; + + let Some(embedder_config) = + embedding_configs.iter().find(|config| config.name == self.embedder_name) + else { + return Ok(RoaringBitmap::new()); + }; + let Some(embedder_info) = + index_embedding_configs.embedder_info(rtxn, self.embedder_name)? + else { + return Ok(RoaringBitmap::new()); + }; + + let arroy_wrapper = ArroyWrapper::new( + index.vector_arroy, + embedder_info.embedder_id, + embedder_config.config.quantized(), + ); + + let mut docids = if let Some(fragment_name) = self.fragment_name { + let Some(fragment_config) = embedder_config + .fragments + .as_slice() + .iter() + .find(|fragment| fragment.name == fragment_name) + else { + return Ok(RoaringBitmap::new()); + }; + + arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? + } else { + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + stats.documents + }; + + // FIXME: performance + if self.user_provided { + let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + docids &= user_provided_docsids; + } + + if let Some(universe) = universe { + docids &= universe; + } + + Ok(docids) + } +} diff --git a/crates/milli/src/search/facet/mod.rs b/crates/milli/src/search/facet/mod.rs index a5e65c95d..fac85df59 100644 --- a/crates/milli/src/search/facet/mod.rs +++ b/crates/milli/src/search/facet/mod.rs @@ -17,6 +17,7 @@ mod facet_range_search; mod facet_sort_ascending; mod facet_sort_descending; mod filter; +mod filter_vector; mod search; fn facet_extreme_value<'t>( diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index e07483aff..d69768d4b 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -966,7 +966,7 @@ impl<'a, 'i> Transform<'a, 'i> { // some user provided, remove only the ids that are not user provided let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| { items - infos.embedding_status.user_provided_docids() - })?; + })?; // MARKER for to_delete in to_delete { arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?; From 9c60e9689f806442abd9e9ebe40f98ba17c65db9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 7 Jul 2025 18:34:24 +0200 Subject: [PATCH 03/49] Support not specifying an embedder in the vector filter --- .../milli/src/search/facet/filter_vector.rs | 95 +++++++++++-------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 701ab561c..ee1c19923 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -6,7 +6,7 @@ use crate::vector::{ArroyStats, ArroyWrapper}; use crate::{Index, Result}; pub(super) struct VectorFilter<'a> { - embedder_name: &'a str, + embedder_name: Option<&'a str>, fragment_name: Option<&'a str>, user_provided: bool, // TODO: not_user_provided: bool, @@ -14,12 +14,14 @@ pub(super) struct VectorFilter<'a> { impl<'a> VectorFilter<'a> { pub(super) fn matches(value: &str, op: &Condition) -> bool { - matches!(op, Condition::Exists) && value.starts_with("_vectors.") + matches!(op, Condition::Exists) && (value.starts_with("_vectors.") || value == "_vectors") } /// Parses a vector filter string. /// /// Valid formats: + /// - `_vectors` + /// - `_vectors.userProvided` /// - `_vectors.{embedder_name}` /// - `_vectors.{embedder_name}.userProvided` /// - `_vectors.{embedder_name}.fragments.{fragment_name}` @@ -33,11 +35,7 @@ impl<'a> VectorFilter<'a> { )))); } - let embedder_name = split.next().ok_or_else(|| { - Error::UserError(UserError::InvalidFilter(String::from( - "Vector filter must contain an embedder name", - ))) - })?; + let embedder_name = split.next(); let mut fragment_name = None; if split.peek() == Some(&"fragments") { @@ -74,44 +72,63 @@ impl<'a> VectorFilter<'a> { let index_embedding_configs = index.embedding_configs(); let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; - let Some(embedder_config) = - embedding_configs.iter().find(|config| config.name == self.embedder_name) - else { - return Ok(RoaringBitmap::new()); - }; - let Some(embedder_info) = - index_embedding_configs.embedder_info(rtxn, self.embedder_name)? - else { - return Ok(RoaringBitmap::new()); - }; - - let arroy_wrapper = ArroyWrapper::new( - index.vector_arroy, - embedder_info.embedder_id, - embedder_config.config.quantized(), - ); - - let mut docids = if let Some(fragment_name) = self.fragment_name { - let Some(fragment_config) = embedder_config - .fragments - .as_slice() - .iter() - .find(|fragment| fragment.name == fragment_name) + let mut embedders = Vec::new(); + if let Some(embedder_name) = self.embedder_name { + let Some(embedder_config) = + embedding_configs.iter().find(|config| config.name == embedder_name) else { return Ok(RoaringBitmap::new()); }; - - arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? + let Some(embedder_info) = + index_embedding_configs.embedder_info(rtxn, embedder_name)? + else { + return Ok(RoaringBitmap::new()); + }; + + embedders.push((embedder_config, embedder_info)); } else { - let mut stats = ArroyStats::default(); - arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; - stats.documents + for embedder_config in embedding_configs.iter() { + let Some(embedder_info) = + index_embedding_configs.embedder_info(rtxn, &embedder_config.name)? + else { + continue; + }; + embedders.push((embedder_config, embedder_info)); + } }; + + let mut docids = RoaringBitmap::new(); + for (embedder_config, embedder_info) in embedders { + let arroy_wrapper = ArroyWrapper::new( + index.vector_arroy, + embedder_info.embedder_id, + embedder_config.config.quantized(), + ); - // FIXME: performance - if self.user_provided { - let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); - docids &= user_provided_docsids; + let mut new_docids = if let Some(fragment_name) = self.fragment_name { + let Some(fragment_config) = embedder_config + .fragments + .as_slice() + .iter() + .find(|fragment| fragment.name == fragment_name) + else { + return Ok(RoaringBitmap::new()); + }; + + arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? + } else { + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + stats.documents + }; + + // FIXME: performance + if self.user_provided { + let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + new_docids &= user_provided_docsids; + } + + docids |= new_docids; } if let Some(universe) = universe { From 5cced0af02018067d78f85b1348ede9b2672eb86 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 7 Jul 2025 18:41:03 +0200 Subject: [PATCH 04/49] Prevent having both a fragment name and userProvided --- crates/milli/src/search/facet/filter_vector.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index ee1c19923..9a0a50124 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -25,7 +25,6 @@ impl<'a> VectorFilter<'a> { /// - `_vectors.{embedder_name}` /// - `_vectors.{embedder_name}.userProvided` /// - `_vectors.{embedder_name}.fragments.{fragment_name}` - /// - `_vectors.{embedder_name}.fragments.{fragment_name}.userProvided` pub(super) fn parse(s: &'a str) -> Result { let mut split = s.split('.').peekable(); @@ -54,6 +53,12 @@ impl<'a> VectorFilter<'a> { user_provided = true; } + if fragment_name.is_some() && user_provided { + return Err(Error::UserError(UserError::InvalidFilter( + String::from("Vector filter cannot specify both a fragment name and userProvided"), + ))); + } + if let Some(next) = split.next() { return Err(Error::UserError(UserError::InvalidFilter(format!( "Unexpected part in vector filter: '{next}'" From 40e7284d70865600e7832a10d357bad9ae68cec5 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 10:01:35 +0200 Subject: [PATCH 05/49] Add tests --- crates/meilisearch/tests/search/filters.rs | 162 +++++++++++++++++++ crates/meilisearch/tests/vector/fragments.rs | 2 +- crates/meilisearch/tests/vector/mod.rs | 1 + 3 files changed, 164 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index ffa025f5c..361762b6c 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -731,3 +731,165 @@ async fn test_filterable_attributes_priority() { ) .await; } + +#[actix_rt::test] +async fn test_vector_filter() { + let index = crate::vector::shared_index_for_fragments().await; + + let (value, _code) = index.search_post(json!({ + "filter": "_vectors EXISTS", + "attributesToRetrieve": ["id"] + })).await; + snapshot!(value, @r#" + { + "hits": [ + { + "id": 0 + }, + { + "id": 1 + }, + { + "id": 2 + }, + { + "id": 3 + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4 + } + "#); + + let (value, _code) = index.search_post(json!({ + "filter": "_vectors.other EXISTS", + "attributesToRetrieve": ["id"] + })).await; + snapshot!(value, @r#" + { + "hits": [], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "#); + + // This one is counterintuitive, but it is the same as the previous one. + // It's because userProvided is interpreted as an embedder name + let (value, _code) = index.search_post(json!({ + "filter": "_vectors.userProvided EXISTS", + "attributesToRetrieve": ["id"] + })).await; + snapshot!(value, @r#" + { + "hits": [], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "#); + + let (value, _code) = index.search_post(json!({ + "filter": "_vectors.rest EXISTS", + "attributesToRetrieve": ["id"] + })).await; + snapshot!(value, @r#" + { + "hits": [ + { + "id": 0 + }, + { + "id": 1 + }, + { + "id": 2 + }, + { + "id": 3 + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4 + } + "#); + + let (value, _code) = index.search_post(json!({ + "filter": "_vectors.rest.userProvided EXISTS", + "attributesToRetrieve": ["id"] + })).await; + snapshot!(value, @r#" + { + "hits": [ + { + "id": 1 + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 1 + } + "#); + + let (value, _code) = index.search_post(json!({ + "filter": "_vectors.rest.fragments.withBreed EXISTS", + "attributesToRetrieve": ["id"] + })).await; + snapshot!(value, @r#" + { + "hits": [ + { + "id": 2 + }, + { + "id": 3 + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 2 + } + "#); + + let (value, _code) = index.search_post(json!({ + "filter": "_vectors.rest.fragments.basic EXISTS", + "attributesToRetrieve": ["id"] + })).await; + snapshot!(value, @r#" + { + "hits": [ + { + "id": 0 + }, + { + "id": 1 + }, + { + "id": 2 + }, + { + "id": 3 + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4 + } + "#); +} diff --git a/crates/meilisearch/tests/vector/fragments.rs b/crates/meilisearch/tests/vector/fragments.rs index 2626284a0..3ce452c1f 100644 --- a/crates/meilisearch/tests/vector/fragments.rs +++ b/crates/meilisearch/tests/vector/fragments.rs @@ -10,7 +10,7 @@ use crate::common::{Owned, Shared}; use crate::json; use crate::vector::{GetAllDocumentsOptions, Server}; -async fn shared_index_for_fragments() -> Index<'static, Shared> { +pub async fn shared_index_for_fragments() -> Index<'static, Shared> { static INDEX: OnceCell<(Server, String)> = OnceCell::const_new(); let (server, uid) = INDEX .get_or_init(|| async { diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 7f54489b6..9ba37cae3 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -14,6 +14,7 @@ use meilisearch::option::MaxThreads; use crate::common::index::Index; use crate::common::{default_settings, GetAllDocumentsOptions, Server}; use crate::json; +pub use fragments::shared_index_for_fragments; async fn get_server_vector() -> Server { Server::new().await From 2d45124d9b379fd73dba54e7d56d4f067eeb477c Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 10:01:50 +0200 Subject: [PATCH 06/49] Fix parsing --- crates/milli/src/search/facet/filter.rs | 9 +++++++-- crates/milli/src/search/facet/filter_vector.rs | 6 ++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index f80d1681f..c9728966a 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -241,7 +241,7 @@ impl<'a> Filter<'a> { let attribute = fid.value(); if matching_features(attribute, &filterable_attributes_rules) - .is_some_and(|(_, features)| features.is_filterable()) + .is_some_and(|(_, features)| features.is_filterable()) || VectorFilter::matches(attribute) { continue; } @@ -546,7 +546,12 @@ impl<'a> Filter<'a> { } FilterCondition::Condition { fid, op } => { let value = fid.value(); - if VectorFilter::matches(value, op) { + if VectorFilter::matches(value) { + if !matches!(op, Condition::Exists) { + return Err(Error::UserError(UserError::InvalidFilter( + String::from("Vector filter can only be used with the `exists` operator"), + ))); + } let vector_filter = VectorFilter::parse(value)?; return vector_filter.evaluate(rtxn, index, universe); } diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 9a0a50124..473741f14 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -1,4 +1,3 @@ -use filter_parser::Condition; use roaring::RoaringBitmap; use crate::error::{Error, UserError}; @@ -13,15 +12,14 @@ pub(super) struct VectorFilter<'a> { } impl<'a> VectorFilter<'a> { - pub(super) fn matches(value: &str, op: &Condition) -> bool { - matches!(op, Condition::Exists) && (value.starts_with("_vectors.") || value == "_vectors") + pub(super) fn matches(value: &str) -> bool { + value.starts_with("_vectors.") || value == "_vectors" } /// Parses a vector filter string. /// /// Valid formats: /// - `_vectors` - /// - `_vectors.userProvided` /// - `_vectors.{embedder_name}` /// - `_vectors.{embedder_name}.userProvided` /// - `_vectors.{embedder_name}.fragments.{fragment_name}` From 0301d8f239f9d860c663bec9b48b4b08477e7885 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 11:39:10 +0200 Subject: [PATCH 07/49] Improve error handling --- crates/filter-parser/src/lib.rs | 113 +++++++++- crates/meilisearch/tests/search/filters.rs | 109 ++++++---- crates/milli/src/search/facet/filter.rs | 11 +- .../milli/src/search/facet/filter_vector.rs | 196 ++++++++++++++---- 4 files changed, 342 insertions(+), 87 deletions(-) diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 938702103..b64477170 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -60,7 +60,7 @@ use nom::combinator::{cut, eof, map, opt}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, terminated, tuple}; -use nom::Finish; +use nom::{Finish, Slice}; use nom_locate::LocatedSpan; pub(crate) use value::parse_value; use value::word_exact; @@ -121,6 +121,16 @@ impl<'a> Token<'a> { Err(Error::new_from_kind(self.span, ErrorKind::NonFiniteFloat)) } } + + /// Split the token by a delimiter and return an iterator of tokens. + /// Each token in the iterator will have its own span that corresponds to a slice of the original token's span. + pub fn split(&self, delimiter: &'a str) -> impl Iterator> + '_ { + let original_addr = self.value().as_ptr() as usize; + self.value().split(delimiter).map(move |part| { + let offset = part.as_ptr() as usize - original_addr; + Token::new(self.span.slice(offset..offset + part.len()), Some(part.to_string())) + }) + } } impl<'a> From> for Token<'a> { @@ -604,6 +614,8 @@ impl std::fmt::Display for Token<'_> { #[cfg(test)] pub mod tests { + use std::fmt::format; + use FilterCondition as Fc; use super::*; @@ -1043,4 +1055,103 @@ pub mod tests { let token: Token = s.into(); assert_eq!(token.value(), s); } + + #[test] + fn split() { + let s = "test string that should not be parsed\n newline"; + let token: Token = s.into(); + let parts: Vec<_> = token.split(" ").collect(); + insta::assert_snapshot!(format!("{parts:#?}"), @r#" + [ + Token { + span: LocatedSpan { + offset: 0, + line: 1, + fragment: "test", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "test", + ), + }, + Token { + span: LocatedSpan { + offset: 5, + line: 1, + fragment: "string", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "string", + ), + }, + Token { + span: LocatedSpan { + offset: 12, + line: 1, + fragment: "that", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "that", + ), + }, + Token { + span: LocatedSpan { + offset: 17, + line: 1, + fragment: "should", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "should", + ), + }, + Token { + span: LocatedSpan { + offset: 24, + line: 1, + fragment: "not", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "not", + ), + }, + Token { + span: LocatedSpan { + offset: 28, + line: 1, + fragment: "be", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "be", + ), + }, + Token { + span: LocatedSpan { + offset: 31, + line: 1, + fragment: "parsed\n", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "parsed\n", + ), + }, + Token { + span: LocatedSpan { + offset: 39, + line: 2, + fragment: "newline", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "newline", + ), + }, + ] + "#); + } } diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 361762b6c..384605ca6 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -736,10 +736,12 @@ async fn test_filterable_attributes_priority() { async fn test_vector_filter() { let index = crate::vector::shared_index_for_fragments().await; - let (value, _code) = index.search_post(json!({ - "filter": "_vectors EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { "hits": [ @@ -764,42 +766,44 @@ async fn test_vector_filter() { } "#); - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.other EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.other EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { - "hits": [], - "query": "", - "processingTimeMs": "[duration]", - "limit": 20, - "offset": 0, - "estimatedTotalHits": 0 + "message": "Index `[uuid]`: The embedder `other` does not exist. Available embedders are: `rest`.\n10:15 _vectors.other EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" } "#); - + // This one is counterintuitive, but it is the same as the previous one. // It's because userProvided is interpreted as an embedder name - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.userProvided EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.userProvided EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { - "hits": [], - "query": "", - "processingTimeMs": "[duration]", - "limit": 20, - "offset": 0, - "estimatedTotalHits": 0 + "message": "Index `[uuid]`: The embedder `userProvided` does not exist. Available embedders are: `rest`.\n10:22 _vectors.userProvided EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" } "#); - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.rest EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { "hits": [ @@ -824,10 +828,12 @@ async fn test_vector_filter() { } "#); - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.rest.userProvided EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.userProvided EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { "hits": [ @@ -843,10 +849,12 @@ async fn test_vector_filter() { } "#); - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.rest.fragments.withBreed EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.fragments.withBreed EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { "hits": [ @@ -864,11 +872,13 @@ async fn test_vector_filter() { "estimatedTotalHits": 2 } "#); - - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.rest.fragments.basic EXISTS", - "attributesToRetrieve": ["id"] - })).await; + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.fragments.basic EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { "hits": [ @@ -892,4 +902,19 @@ async fn test_vector_filter() { "estimatedTotalHits": 4 } "#); + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.fragments.other EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; + snapshot!(value, @r#" + { + "message": "Index `[uuid]`: The fragment `other` does not exist on embedder `rest`. Available fragments on this embedder are: `basic`, `withBreed`.\n25:30 _vectors.rest.fragments.other EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); } diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index c9728966a..c0419997c 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -241,7 +241,8 @@ impl<'a> Filter<'a> { let attribute = fid.value(); if matching_features(attribute, &filterable_attributes_rules) - .is_some_and(|(_, features)| features.is_filterable()) || VectorFilter::matches(attribute) + .is_some_and(|(_, features)| features.is_filterable()) + || VectorFilter::matches(attribute) { continue; } @@ -548,11 +549,11 @@ impl<'a> Filter<'a> { let value = fid.value(); if VectorFilter::matches(value) { if !matches!(op, Condition::Exists) { - return Err(Error::UserError(UserError::InvalidFilter( - String::from("Vector filter can only be used with the `exists` operator"), - ))); + return Err(Error::UserError(UserError::InvalidFilter(String::from( + "Vector filter can only be used with the `exists` operator", + )))); } - let vector_filter = VectorFilter::parse(value)?; + let vector_filter = VectorFilter::parse(fid)?; return vector_filter.evaluate(rtxn, index, universe); } diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 473741f14..0a7ac313a 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -1,16 +1,118 @@ +use filter_parser::Token; use roaring::RoaringBitmap; use crate::error::{Error, UserError}; use crate::vector::{ArroyStats, ArroyWrapper}; -use crate::{Index, Result}; +use crate::Index; pub(super) struct VectorFilter<'a> { - embedder_name: Option<&'a str>, - fragment_name: Option<&'a str>, + embedder_token: Option>, + fragment_token: Option>, user_provided: bool, // TODO: not_user_provided: bool, } +#[derive(Debug)] +pub enum VectorFilterError<'a> { + EmptyFilter, + InvalidPrefix(Token<'a>), + MissingFragmentName(Token<'a>), + UserProvidedWithFragment(Token<'a>), + LeftoverToken(Token<'a>), + EmbedderDoesNotExist { + embedder: &'a Token<'a>, + available: Vec, + }, + FragmentDoesNotExist { + embedder: &'a Token<'a>, + fragment: &'a Token<'a>, + available: Vec, + }, +} + +use VectorFilterError::*; + +impl std::error::Error for VectorFilterError<'_> {} + +impl std::fmt::Display for VectorFilterError<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + EmptyFilter => { + write!(f, "Vector filter cannot be empty.") + } + InvalidPrefix(prefix) => { + write!( + f, + "Vector filter must start with `_vectors` but found `{}`.", + prefix.value() + ) + } + MissingFragmentName(_token) => { + write!(f, "Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.") + } + UserProvidedWithFragment(_token) => { + write!(f, "Vector filter cannot specify both a fragment name and userProvided.") + } + LeftoverToken(token) => { + write!(f, "Vector filter has leftover token: `{}`.", token.value()) + } + EmbedderDoesNotExist { embedder, available } => { + write!(f, "The embedder `{}` does not exist.", embedder.value())?; + if available.is_empty() { + write!(f, " This index does not have configured embedders.") + } else { + write!(f, " Available embedders are: ")?; + let mut available = available.clone(); + available.sort_unstable(); + for (idx, embedder) in available.iter().enumerate() { + write!(f, "`{embedder}`")?; + if idx != available.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, ".") + } + } + FragmentDoesNotExist { embedder, fragment, available } => { + write!( + f, + "The fragment `{}` does not exist on embedder `{}`.", + fragment.value(), + embedder.value(), + )?; + if available.is_empty() { + write!(f, " This embedder does not have configured fragments.") + } else { + write!(f, " Available fragments on this embedder are: ")?; + let mut available = available.clone(); + available.sort_unstable(); + for (idx, fragment) in available.iter().enumerate() { + write!(f, "`{fragment}`")?; + if idx != available.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, ".") + } + } + } + } +} + +impl<'a> From> for Error { + fn from(err: VectorFilterError<'a>) -> Self { + match &err { + EmptyFilter => Error::UserError(UserError::InvalidFilter(err.to_string())), + InvalidPrefix(token) + | MissingFragmentName(token) + | UserProvidedWithFragment(token) + | LeftoverToken(token) => token.clone().as_external_error(err).into(), + EmbedderDoesNotExist { embedder: token, .. } + | FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(), + } + } +} + impl<'a> VectorFilter<'a> { pub(super) fn matches(value: &str) -> bool { value.starts_with("_vectors.") || value == "_vectors" @@ -23,71 +125,74 @@ impl<'a> VectorFilter<'a> { /// - `_vectors.{embedder_name}` /// - `_vectors.{embedder_name}.userProvided` /// - `_vectors.{embedder_name}.fragments.{fragment_name}` - pub(super) fn parse(s: &'a str) -> Result { - let mut split = s.split('.').peekable(); + pub(super) fn parse(s: &'a Token<'a>) -> Result> { + let mut split = s.split(".").peekable(); - if split.next() != Some("_vectors") { - return Err(Error::UserError(UserError::InvalidFilter(String::from( - "Vector filter must start with '_vectors'", - )))); + match split.next() { + Some(token) if token.value() == "_vectors" => (), + Some(token) => return Err(InvalidPrefix(token)), + None => return Err(EmptyFilter), } let embedder_name = split.next(); let mut fragment_name = None; - if split.peek() == Some(&"fragments") { - split.next(); + if split.peek().map(|t| t.value()) == Some("fragments") { + let token = split.next().expect("it was peeked before"); - fragment_name = Some(split.next().ok_or_else(|| { - Error::UserError(UserError::InvalidFilter( - String::from("Vector filter is inconsistent: either specify a fragment name or remove the 'fragments' part"), - )) - })?); + fragment_name = Some(split.next().ok_or(MissingFragmentName(token))?); } - let mut user_provided = false; - if split.peek() == Some(&"userProvided") || split.peek() == Some(&"user_provided") { - split.next(); - user_provided = true; + let mut user_provided_token = None; + if split.peek().map(|t| t.value()) == Some("userProvided") + || split.peek().map(|t| t.value()) == Some("user_provided") + { + user_provided_token = split.next(); } - if fragment_name.is_some() && user_provided { - return Err(Error::UserError(UserError::InvalidFilter( - String::from("Vector filter cannot specify both a fragment name and userProvided"), - ))); + if let (Some(_), Some(user_provided_token)) = (&fragment_name, &user_provided_token) { + return Err(UserProvidedWithFragment(user_provided_token.clone()))?; } if let Some(next) = split.next() { - return Err(Error::UserError(UserError::InvalidFilter(format!( - "Unexpected part in vector filter: '{next}'" - )))); + return Err(LeftoverToken(next))?; } - Ok(Self { embedder_name, fragment_name, user_provided }) + Ok(Self { + embedder_token: embedder_name, + fragment_token: fragment_name, + user_provided: user_provided_token.is_some(), + }) } pub(super) fn evaluate( - &self, + self, rtxn: &heed::RoTxn<'_>, index: &Index, universe: Option<&RoaringBitmap>, - ) -> Result { + ) -> crate::Result { let index_embedding_configs = index.embedding_configs(); let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; let mut embedders = Vec::new(); - if let Some(embedder_name) = self.embedder_name { + if let Some(embedder_token) = &self.embedder_token { + let embedder_name = embedder_token.value(); let Some(embedder_config) = embedding_configs.iter().find(|config| config.name == embedder_name) else { - return Ok(RoaringBitmap::new()); + return Err(EmbedderDoesNotExist { + embedder: embedder_token, + available: embedding_configs.iter().map(|c| c.name.clone()).collect(), + })?; }; - let Some(embedder_info) = - index_embedding_configs.embedder_info(rtxn, embedder_name)? + let Some(embedder_info) = index_embedding_configs.embedder_info(rtxn, embedder_name)? else { - return Ok(RoaringBitmap::new()); + return Err(EmbedderDoesNotExist { + embedder: embedder_token, + available: embedding_configs.iter().map(|c| c.name.clone()).collect(), + })?; }; - + embedders.push((embedder_config, embedder_info)); } else { for embedder_config in embedding_configs.iter() { @@ -99,7 +204,7 @@ impl<'a> VectorFilter<'a> { embedders.push((embedder_config, embedder_info)); } }; - + let mut docids = RoaringBitmap::new(); for (embedder_config, embedder_info) in embedders { let arroy_wrapper = ArroyWrapper::new( @@ -108,14 +213,27 @@ impl<'a> VectorFilter<'a> { embedder_config.config.quantized(), ); - let mut new_docids = if let Some(fragment_name) = self.fragment_name { + let mut new_docids = if let Some(fragment_token) = &self.fragment_token { + let fragment_name = fragment_token.value(); let Some(fragment_config) = embedder_config .fragments .as_slice() .iter() .find(|fragment| fragment.name == fragment_name) else { - return Ok(RoaringBitmap::new()); + return Err(FragmentDoesNotExist { + embedder: self + .embedder_token + .as_ref() + .expect("there can't be a fragment without an embedder"), + fragment: fragment_token, + available: embedder_config + .fragments + .as_slice() + .iter() + .map(|f| f.name.clone()) + .collect(), + })?; }; arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? From d43cd40807f87624bb4a7c87c44830b01f887458 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 11:48:23 +0200 Subject: [PATCH 08/49] Split tests --- crates/meilisearch/tests/search/filters.rs | 98 ++++++++++++++++------ 1 file changed, 74 insertions(+), 24 deletions(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 384605ca6..1c49fa5e0 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -733,29 +733,29 @@ async fn test_filterable_attributes_priority() { } #[actix_rt::test] -async fn test_vector_filter() { +async fn vector_filter_all_embedders() { let index = crate::vector::shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ "filter": "_vectors EXISTS", - "attributesToRetrieve": ["id"] + "attributesToRetrieve": ["name"] })) .await; snapshot!(value, @r#" { "hits": [ { - "id": 0 + "name": "kefir" }, { - "id": 1 + "name": "echo" }, { - "id": 2 + "name": "intel" }, { - "id": 3 + "name": "dustin" } ], "query": "", @@ -765,11 +765,16 @@ async fn test_vector_filter() { "estimatedTotalHits": 4 } "#); +} + +#[actix_rt::test] +async fn vector_filter_non_existant_embedder() { + let index = crate::vector::shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ "filter": "_vectors.other EXISTS", - "attributesToRetrieve": ["id"] + "attributesToRetrieve": ["name"] })) .await; snapshot!(value, @r#" @@ -780,13 +785,18 @@ async fn test_vector_filter() { "link": "https://docs.meilisearch.com/errors#invalid_search_filter" } "#); +} + +#[actix_rt::test] +async fn vector_filter_all_embedders_user_provided() { + let index = crate::vector::shared_index_for_fragments().await; // This one is counterintuitive, but it is the same as the previous one. // It's because userProvided is interpreted as an embedder name let (value, _code) = index .search_post(json!({ "filter": "_vectors.userProvided EXISTS", - "attributesToRetrieve": ["id"] + "attributesToRetrieve": ["name"] })) .await; snapshot!(value, @r#" @@ -797,27 +807,32 @@ async fn test_vector_filter() { "link": "https://docs.meilisearch.com/errors#invalid_search_filter" } "#); +} + +#[actix_rt::test] +async fn vector_filter_specific_embedder() { + let index = crate::vector::shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ "filter": "_vectors.rest EXISTS", - "attributesToRetrieve": ["id"] + "attributesToRetrieve": ["name"] })) .await; snapshot!(value, @r#" { "hits": [ { - "id": 0 + "name": "kefir" }, { - "id": 1 + "name": "echo" }, { - "id": 2 + "name": "intel" }, { - "id": 3 + "name": "dustin" } ], "query": "", @@ -827,18 +842,23 @@ async fn test_vector_filter() { "estimatedTotalHits": 4 } "#); +} + +#[actix_rt::test] +async fn vector_filter_user_provided() { + let index = crate::vector::shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ "filter": "_vectors.rest.userProvided EXISTS", - "attributesToRetrieve": ["id"] + "attributesToRetrieve": ["name"] })) .await; snapshot!(value, @r#" { "hits": [ { - "id": 1 + "name": "echo" } ], "query": "", @@ -848,21 +868,26 @@ async fn test_vector_filter() { "estimatedTotalHits": 1 } "#); +} + +#[actix_rt::test] +async fn vector_filter_specific_fragment() { + let index = crate::vector::shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ "filter": "_vectors.rest.fragments.withBreed EXISTS", - "attributesToRetrieve": ["id"] + "attributesToRetrieve": ["name"] })) .await; snapshot!(value, @r#" { "hits": [ { - "id": 2 + "name": "intel" }, { - "id": 3 + "name": "dustin" } ], "query": "", @@ -876,23 +901,23 @@ async fn test_vector_filter() { let (value, _code) = index .search_post(json!({ "filter": "_vectors.rest.fragments.basic EXISTS", - "attributesToRetrieve": ["id"] + "attributesToRetrieve": ["name"] })) .await; snapshot!(value, @r#" { "hits": [ { - "id": 0 + "name": "kefir" }, { - "id": 1 + "name": "echo" }, { - "id": 2 + "name": "intel" }, { - "id": 3 + "name": "dustin" } ], "query": "", @@ -902,11 +927,16 @@ async fn test_vector_filter() { "estimatedTotalHits": 4 } "#); +} + +#[actix_rt::test] +async fn vector_filter_non_existant_fragment() { + let index = crate::vector::shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ "filter": "_vectors.rest.fragments.other EXISTS", - "attributesToRetrieve": ["id"] + "attributesToRetrieve": ["name"] })) .await; snapshot!(value, @r#" @@ -918,3 +948,23 @@ async fn test_vector_filter() { } "#); } + +#[actix_rt::test] +async fn vector_filter_specific_fragment_user_provided() { + let index = crate::vector::shared_index_for_fragments().await; + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.fragments.other.userProvided EXISTS", + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" + { + "message": "Index `[uuid]`: Vector filter cannot specify both a fragment name and userProvided.\n31:43 _vectors.rest.fragments.other.userProvided EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); +} From b4cafec8b3941b2add4fe4ce99047b1b28da18bf Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 11:56:19 +0200 Subject: [PATCH 09/49] Add tests for operators along vector filter --- crates/meilisearch/tests/search/filters.rs | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 1c49fa5e0..2e71b5435 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -968,3 +968,67 @@ async fn vector_filter_specific_fragment_user_provided() { } "#); } + +#[actix_rt::test] +async fn vector_filter_negation() { + let index = crate::vector::shared_index_for_fragments().await; + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.userProvided NOT EXISTS", + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" + { + "hits": [ + { + "name": "kefir" + }, + { + "name": "intel" + }, + { + "name": "dustin" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3 + } + "#); +} + +#[actix_rt::test] +async fn vector_filter_or_combination() { +let index = crate::vector::shared_index_for_fragments().await; + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.fragments.withBreed EXISTS OR _vectors.rest.userProvided EXISTS", + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" + { + "hits": [ + { + "name": "echo" + }, + { + "name": "intel" + }, + { + "name": "dustin" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3 + } + "#); +} From 29b74424ad6313ea673869a658e10883fca7e4b1 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 12:03:32 +0200 Subject: [PATCH 10/49] Clean code --- crates/meilisearch/tests/search/filters.rs | 2 +- crates/milli/src/index.rs | 2 +- crates/milli/src/search/facet/filter.rs | 4 +--- crates/milli/src/search/facet/filter_vector.rs | 1 - crates/milli/src/update/index_documents/transform.rs | 2 +- 5 files changed, 4 insertions(+), 7 deletions(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 2e71b5435..3cc7cbab5 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -1003,7 +1003,7 @@ async fn vector_filter_negation() { #[actix_rt::test] async fn vector_filter_or_combination() { -let index = crate::vector::shared_index_for_fragments().await; + let index = crate::vector::shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 2751498bf..b2ec992ba 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1776,7 +1776,7 @@ impl Index { embedder_info.embedder_id, config.config.quantized(), ); - let embeddings = reader.item_vectors(rtxn, docid)?; // MARKER + let embeddings = reader.item_vectors(rtxn, docid)?; res.insert( config.name.to_owned(), (embeddings, embedder_info.embedding_status.must_regenerate(docid)), diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index c0419997c..1afdf87e6 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -234,11 +234,9 @@ impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn<'_>, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time let fields_ids_map = index.fields_ids_map(rtxn)?; - let filterable_attributes_rules = dbg!(index.filterable_attributes_rules(rtxn)?); + let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; for fid in self.condition.fids(MAX_FILTER_DEPTH) { - println!("{fid:?}"); - let attribute = fid.value(); if matching_features(attribute, &filterable_attributes_rules) .is_some_and(|(_, features)| features.is_filterable()) diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 0a7ac313a..79e35366c 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -9,7 +9,6 @@ pub(super) struct VectorFilter<'a> { embedder_token: Option>, fragment_token: Option>, user_provided: bool, - // TODO: not_user_provided: bool, } #[derive(Debug)] diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index d69768d4b..e07483aff 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -966,7 +966,7 @@ impl<'a, 'i> Transform<'a, 'i> { // some user provided, remove only the ids that are not user provided let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| { items - infos.embedding_status.user_provided_docids() - })?; // MARKER + })?; for to_delete in to_delete { arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?; From fb73b83abe67532e496b7ac2f56aa5594ec81bdd Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 12:14:34 +0200 Subject: [PATCH 11/49] Fix performance --- .../milli/src/search/facet/filter_vector.rs | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 79e35366c..a0dc52bac 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -235,19 +235,33 @@ impl<'a> VectorFilter<'a> { })?; }; - arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? + if let Some(universe) = universe { + arroy_wrapper + .items_in_store(rtxn, fragment_config.id, |bitmap| bitmap & universe)? + } else { + arroy_wrapper + .items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? + } } else { + let mut universe = universe.cloned(); + if self.user_provided { + let user_provided_docsids = + embedder_info.embedding_status.user_provided_docids(); + match &mut universe { + Some(universe) => *universe &= user_provided_docsids, + None => universe = Some(user_provided_docsids.clone()), + } + } + let mut stats = ArroyStats::default(); arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; - stats.documents + if let Some(universe) = &universe { + stats.documents & universe + } else { + stats.documents + } }; - // FIXME: performance - if self.user_provided { - let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); - new_docids &= user_provided_docsids; - } - docids |= new_docids; } From 9e98a25e45ab68540e2aba398a0e3eadd9fc1b9d Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 15:56:09 +0200 Subject: [PATCH 12/49] Fix clippy --- crates/filter-parser/src/lib.rs | 2 -- crates/milli/src/search/facet/filter_vector.rs | 4 +--- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index b64477170..02f338673 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -614,8 +614,6 @@ impl std::fmt::Display for Token<'_> { #[cfg(test)] pub mod tests { - use std::fmt::format; - use FilterCondition as Fc; use super::*; diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index a0dc52bac..0b9cad702 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -212,7 +212,7 @@ impl<'a> VectorFilter<'a> { embedder_config.config.quantized(), ); - let mut new_docids = if let Some(fragment_token) = &self.fragment_token { + docids |= if let Some(fragment_token) = &self.fragment_token { let fragment_name = fragment_token.value(); let Some(fragment_config) = embedder_config .fragments @@ -261,8 +261,6 @@ impl<'a> VectorFilter<'a> { stats.documents } }; - - docids |= new_docids; } if let Some(universe) = universe { From 881c37393fb79208f114c9a16696f4841096daa0 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 16:06:27 +0200 Subject: [PATCH 13/49] Add telemetry --- .../src/routes/indexes/documents.rs | 25 +++++++++++++++++-- .../src/routes/indexes/search_analytics.rs | 6 +++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index bc5539081..173b3ecc8 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -135,6 +135,7 @@ pub struct DocumentsFetchAggregator { per_document_id: bool, // if a filter was used per_filter: bool, + with_vector_filter: bool, #[serde(rename = "vector.retrieve_vectors")] retrieve_vectors: bool, @@ -153,8 +154,17 @@ pub struct DocumentsFetchAggregator { #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DocumentFetchKind { - PerDocumentId { retrieve_vectors: bool }, - Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool, ids: usize }, + PerDocumentId { + retrieve_vectors: bool, + }, + Normal { + with_filter: bool, + with_vector_filter: bool, + limit: usize, + offset: usize, + retrieve_vectors: bool, + ids: usize, + }, } impl DocumentsFetchAggregator { @@ -174,6 +184,7 @@ impl DocumentsFetchAggregator { Self { per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), + with_vector_filter: matches!(query, DocumentFetchKind::Normal { with_vector_filter, .. } if *with_vector_filter), max_limit: limit, max_offset: offset, retrieve_vectors, @@ -193,6 +204,7 @@ impl Aggregate for DocumentsFetchAggregator { Box::new(Self { per_document_id: self.per_document_id | new.per_document_id, per_filter: self.per_filter | new.per_filter, + with_vector_filter: self.with_vector_filter | new.with_vector_filter, retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors, max_limit: self.max_limit.max(new.max_limit), max_offset: self.max_offset.max(new.max_offset), @@ -276,6 +288,7 @@ pub async fn get_document( retrieve_vectors: param_retrieve_vectors.0, per_document_id: true, per_filter: false, + with_vector_filter: false, max_limit: 0, max_offset: 0, max_document_ids: 0, @@ -495,6 +508,10 @@ pub async fn documents_by_query_post( analytics.publish( DocumentsFetchAggregator:: { per_filter: body.filter.is_some(), + with_vector_filter: body + .filter + .as_ref() + .is_some_and(|f| f.to_string().contains("_vectors")), retrieve_vectors: body.retrieve_vectors, max_limit: body.limit, max_offset: body.offset, @@ -596,6 +613,10 @@ pub async fn get_documents( analytics.publish( DocumentsFetchAggregator:: { per_filter: query.filter.is_some(), + with_vector_filter: query + .filter + .as_ref() + .is_some_and(|f| f.to_string().contains("_vectors")), retrieve_vectors: query.retrieve_vectors, max_limit: query.limit, max_offset: query.offset, diff --git a/crates/meilisearch/src/routes/indexes/search_analytics.rs b/crates/meilisearch/src/routes/indexes/search_analytics.rs index 07f79eba7..6b3b7ea46 100644 --- a/crates/meilisearch/src/routes/indexes/search_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/search_analytics.rs @@ -40,6 +40,7 @@ pub struct SearchAggregator { // filter filter_with_geo_radius: bool, filter_with_geo_bounding_box: bool, + filter_on_vectors: bool, // every time a request has a filter, this field must be incremented by the number of terms it contains filter_sum_of_criteria_terms: usize, // every time a request has a filter, this field must be incremented by one @@ -163,6 +164,7 @@ impl SearchAggregator { let stringified_filters = filter.to_string(); ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); + ret.filter_on_vectors = stringified_filters.contains("_vectors"); ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); } @@ -260,6 +262,7 @@ impl Aggregate for SearchAggregator { distinct, filter_with_geo_radius, filter_with_geo_bounding_box, + filter_on_vectors, filter_sum_of_criteria_terms, filter_total_number_of_criteria, used_syntax, @@ -314,6 +317,7 @@ impl Aggregate for SearchAggregator { // filter self.filter_with_geo_radius |= filter_with_geo_radius; self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; + self.filter_on_vectors |= filter_on_vectors; self.filter_sum_of_criteria_terms = self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); self.filter_total_number_of_criteria = @@ -388,6 +392,7 @@ impl Aggregate for SearchAggregator { distinct, filter_with_geo_radius, filter_with_geo_bounding_box, + filter_on_vectors, filter_sum_of_criteria_terms, filter_total_number_of_criteria, used_syntax, @@ -445,6 +450,7 @@ impl Aggregate for SearchAggregator { "filter": { "with_geoRadius": filter_with_geo_radius, "with_geoBoundingBox": filter_with_geo_bounding_box, + "on_vectors": filter_on_vectors, "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), }, From feb53104e51a244660e3eb2d082f7b6be42d226c Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 16:19:55 +0200 Subject: [PATCH 14/49] Grammar --- .../src/scheduler/test_document_addition.rs | 2 +- crates/meilisearch/src/routes/tasks.rs | 8 ++++---- crates/meilisearch/tests/documents/errors.rs | 2 +- crates/meilisearch/tests/search/errors.rs | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/test_document_addition.rs b/crates/index-scheduler/src/scheduler/test_document_addition.rs index b642f5604..7ca72da95 100644 --- a/crates/index-scheduler/src/scheduler/test_document_addition.rs +++ b/crates/index-scheduler/src/scheduler/test_document_addition.rs @@ -736,7 +736,7 @@ fn test_document_addition_mixed_rights_with_index() { #[test] fn test_document_addition_mixed_right_without_index_starts_with_cant_create() { // We're going to autobatch multiple document addition. - // - The index does not exists + // - The index does not exist // - The first document addition don't have the right to create an index // - The second do. They should not batch together. // - The second should batch with everything else as it's going to create an index. diff --git a/crates/meilisearch/src/routes/tasks.rs b/crates/meilisearch/src/routes/tasks.rs index 95c105894..fb0f73425 100644 --- a/crates/meilisearch/src/routes/tasks.rs +++ b/crates/meilisearch/src/routes/tasks.rs @@ -336,7 +336,7 @@ impl Aggregate for TaskFilterAnalytics Date: Tue, 8 Jul 2025 16:23:45 +0200 Subject: [PATCH 15/49] Add test --- crates/meilisearch/tests/search/filters.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 3cc7cbab5..3b26ab4ee 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -767,6 +767,26 @@ async fn vector_filter_all_embedders() { "#); } +#[actix_rt::test] +async fn vector_filter_missing_fragment() { + let index = crate::vector::shared_index_for_fragments().await; + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.fragments EXISTS", + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" + { + "message": "Index `[uuid]`: Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.\n15:24 _vectors.rest.fragments EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); +} + #[actix_rt::test] async fn vector_filter_non_existant_embedder() { let index = crate::vector::shared_index_for_fragments().await; From 8adf6141e09cc9dce662ddcc31a65f89fd81fdec Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 16:55:43 +0200 Subject: [PATCH 16/49] Fix old test --- crates/milli/src/test_index.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index 6bb6b1345..0ec348301 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -19,7 +19,9 @@ use crate::update::{ }; use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; use crate::vector::RuntimeEmbedders; -use crate::{db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult}; +use crate::{ + db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult, UserError, +}; pub(crate) struct TempIndex { pub inner: Index, @@ -1341,8 +1343,8 @@ fn vectors_are_never_indexed_as_searchable_or_filterable() { let results = search .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) .execute() - .unwrap(); - assert!(results.candidates.is_empty()); + .unwrap_err(); + assert!(matches!(results, Error::UserError(UserError::InvalidFilter(_)))); index .update_settings(|settings| { @@ -1373,6 +1375,6 @@ fn vectors_are_never_indexed_as_searchable_or_filterable() { let results = search .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) .execute() - .unwrap(); - assert!(results.candidates.is_empty()); + .unwrap_err(); + assert!(matches!(results, Error::UserError(UserError::InvalidFilter(_)))); } From 39f808714d6ae7667d914817dcf645c17c027374 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 9 Jul 2025 18:03:32 +0200 Subject: [PATCH 17/49] Implement a documentTemplate filter --- crates/meilisearch/tests/search/filters.rs | 79 +++++++++++++++++++ crates/meilisearch/tests/vector/mod.rs | 4 +- crates/meilisearch/tests/vector/rest.rs | 2 +- .../milli/src/search/facet/filter_vector.rs | 39 +++++++-- 4 files changed, 115 insertions(+), 9 deletions(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 3b26ab4ee..d0f388220 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -989,6 +989,85 @@ async fn vector_filter_specific_fragment_user_provided() { "#); } +#[actix_rt::test] +async fn vector_filter_document_template_but_fragments_used() { + let index = crate::vector::shared_index_for_fragments().await; + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.documentTemplate EXISTS", + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" + { + "hits": [], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0 + } + "#); +} + +#[actix_rt::test] +async fn vector_filter_document_template() { + let (_mock, setting) = crate::vector::create_mock().await; + let server = crate::vector::get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "rest": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel"}, + {"id": 3, "name": "iko" } + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await.succeeded(); + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.documentTemplate EXISTS", + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" + { + "hits": [ + { + "name": "kefir" + }, + { + "name": "echo" + }, + { + "name": "intel" + }, + { + "name": "iko" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 4 + } + "#); +} + #[actix_rt::test] async fn vector_filter_negation() { let index = crate::vector::shared_index_for_fragments().await; diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 9ba37cae3..8851d029e 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -14,9 +14,9 @@ use meilisearch::option::MaxThreads; use crate::common::index::Index; use crate::common::{default_settings, GetAllDocumentsOptions, Server}; use crate::json; -pub use fragments::shared_index_for_fragments; +pub use {fragments::shared_index_for_fragments, rest::create_mock}; -async fn get_server_vector() -> Server { +pub async fn get_server_vector() -> Server { Server::new().await } diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 974341cd0..dae9e9139 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -12,7 +12,7 @@ use crate::common::Value; use crate::json; use crate::vector::{get_server_vector, GetAllDocumentsOptions}; -async fn create_mock() -> (&'static MockServer, Value) { +pub async fn create_mock() -> (&'static MockServer, Value) { let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 0b9cad702..e3ec698f5 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -8,6 +8,7 @@ use crate::Index; pub(super) struct VectorFilter<'a> { embedder_token: Option>, fragment_token: Option>, + document_template: bool, user_provided: bool, } @@ -17,6 +18,7 @@ pub enum VectorFilterError<'a> { InvalidPrefix(Token<'a>), MissingFragmentName(Token<'a>), UserProvidedWithFragment(Token<'a>), + DocumentTemplateWithFragment(Token<'a>), LeftoverToken(Token<'a>), EmbedderDoesNotExist { embedder: &'a Token<'a>, @@ -52,6 +54,9 @@ impl std::fmt::Display for VectorFilterError<'_> { UserProvidedWithFragment(_token) => { write!(f, "Vector filter cannot specify both a fragment name and userProvided.") } + DocumentTemplateWithFragment(_token) => { + write!(f, "Vector filter cannot specify both a fragment name and documentTemplate.") + } LeftoverToken(token) => { write!(f, "Vector filter has leftover token: `{}`.", token.value()) } @@ -105,6 +110,7 @@ impl<'a> From> for Error { InvalidPrefix(token) | MissingFragmentName(token) | UserProvidedWithFragment(token) + | DocumentTemplateWithFragment(token) | LeftoverToken(token) => token.clone().as_external_error(err).into(), EmbedderDoesNotExist { embedder: token, .. } | FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(), @@ -123,6 +129,8 @@ impl<'a> VectorFilter<'a> { /// - `_vectors` /// - `_vectors.{embedder_name}` /// - `_vectors.{embedder_name}.userProvided` + /// - `_vectors.{embedder_name}.documentTemplate` + /// - `_vectors.{embedder_name}.documentTemplate.userProvided` /// - `_vectors.{embedder_name}.fragments.{fragment_name}` pub(super) fn parse(s: &'a Token<'a>) -> Result> { let mut split = s.split(".").peekable(); @@ -149,10 +157,22 @@ impl<'a> VectorFilter<'a> { user_provided_token = split.next(); } + let mut document_template_token = None; + if split.peek().map(|t| t.value()) == Some("documentTemplate") + || split.peek().map(|t| t.value()) == Some("document_template") + { + document_template_token = split.next(); + } + if let (Some(_), Some(user_provided_token)) = (&fragment_name, &user_provided_token) { return Err(UserProvidedWithFragment(user_provided_token.clone()))?; } + if let (Some(_), Some(document_template_token)) = (&fragment_name, &document_template_token) + { + return Err(DocumentTemplateWithFragment(document_template_token.clone()))?; + } + if let Some(next) = split.next() { return Err(LeftoverToken(next))?; } @@ -161,6 +181,7 @@ impl<'a> VectorFilter<'a> { embedder_token: embedder_name, fragment_token: fragment_name, user_provided: user_provided_token.is_some(), + document_template: document_template_token.is_some(), }) } @@ -176,7 +197,8 @@ impl<'a> VectorFilter<'a> { let mut embedders = Vec::new(); if let Some(embedder_token) = &self.embedder_token { let embedder_name = embedder_token.value(); - let Some(embedder_config) = + + let Some(embedding_config) = embedding_configs.iter().find(|config| config.name == embedder_name) else { return Err(EmbedderDoesNotExist { @@ -184,6 +206,7 @@ impl<'a> VectorFilter<'a> { available: embedding_configs.iter().map(|c| c.name.clone()).collect(), })?; }; + let Some(embedder_info) = index_embedding_configs.embedder_info(rtxn, embedder_name)? else { return Err(EmbedderDoesNotExist { @@ -192,7 +215,11 @@ impl<'a> VectorFilter<'a> { })?; }; - embedders.push((embedder_config, embedder_info)); + if self.document_template && !embedding_config.fragments.as_slice().is_empty() { + return Ok(RoaringBitmap::new()); + } + + embedders.push((embedding_config, embedder_info)); } else { for embedder_config in embedding_configs.iter() { let Some(embedder_info) = @@ -205,16 +232,16 @@ impl<'a> VectorFilter<'a> { }; let mut docids = RoaringBitmap::new(); - for (embedder_config, embedder_info) in embedders { + for (embedding_config, embedder_info) in embedders { let arroy_wrapper = ArroyWrapper::new( index.vector_arroy, embedder_info.embedder_id, - embedder_config.config.quantized(), + embedding_config.config.quantized(), ); docids |= if let Some(fragment_token) = &self.fragment_token { let fragment_name = fragment_token.value(); - let Some(fragment_config) = embedder_config + let Some(fragment_config) = embedding_config .fragments .as_slice() .iter() @@ -226,7 +253,7 @@ impl<'a> VectorFilter<'a> { .as_ref() .expect("there can't be a fragment without an embedder"), fragment: fragment_token, - available: embedder_config + available: embedding_config .fragments .as_slice() .iter() From a3b8c2b71fc1aa2766a94b6daaf7c3872c466d01 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 9 Jul 2025 18:21:52 +0200 Subject: [PATCH 18/49] Gate behind multimodal experimental feature --- crates/filter-parser/src/lib.rs | 19 +++++++++++++++++++ crates/meilisearch/src/search/mod.rs | 14 +++++++++++++- crates/meilisearch/tests/search/filters.rs | 20 ++++++++++++++++++++ crates/milli/src/search/facet/filter.rs | 4 ++++ 4 files changed, 56 insertions(+), 1 deletion(-) diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 02f338673..1590b08fd 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -189,6 +189,25 @@ impl<'a> FilterCondition<'a> { } } + pub fn use_vector_filter(&self) -> Option<&Token> { + match self { + FilterCondition::Condition { fid, op: _ } => { + if fid.value().starts_with("_vectors.") || fid.value() == "_vectors" { + Some(fid) + } else { + None + } + } + FilterCondition::Not(this) => this.use_vector_filter(), + FilterCondition::Or(seq) | FilterCondition::And(seq) => { + seq.iter().find_map(|filter| filter.use_vector_filter()) + } + FilterCondition::GeoLowerThan { .. } + | FilterCondition::GeoBoundingBox { .. } + | FilterCondition::In { .. } => None, + } + } + pub fn fids(&self, depth: usize) -> Box + '_> { if depth == 0 { return Box::new(std::iter::empty()); diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 1c987a70c..e82f4dff7 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -2077,7 +2077,7 @@ pub(crate) fn parse_filter( })?; if let Some(ref filter) = filter { - // If the contains operator is used while the contains filter features is not enabled, errors out + // If the contains operator is used while the contains filter feature is not enabled, errors out if let Some((token, error)) = filter.use_contains_operator().zip(features.check_contains_filter().err()) { @@ -2088,6 +2088,18 @@ pub(crate) fn parse_filter( } } + if let Some(ref filter) = filter { + // If a vector filter is used while the multi modal feature is not enabled, errors out + if let Some((token, error)) = + filter.use_vector_filter().zip(features.check_multimodal("using a vector filter").err()) + { + return Err(ResponseError::from_msg( + token.as_external_error(error).to_string(), + Code::FeatureNotEnabled, + )); + } + } + Ok(filter) } diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index d0f388220..ff6a0cb17 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -1068,6 +1068,26 @@ async fn vector_filter_document_template() { "#); } +#[actix_rt::test] +async fn vector_filter_feature_gate() { + let index = shared_index_with_documents().await; + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors EXISTS", + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" + { + "message": "using a vector filter requires enabling the `multimodal` experimental feature. See https://github.com/orgs/meilisearch/discussions/846\n1:9 _vectors EXISTS", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "#); +} + #[actix_rt::test] async fn vector_filter_negation() { let index = crate::vector::shared_index_for_fragments().await; diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 1afdf87e6..21a552965 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -228,6 +228,10 @@ impl<'a> Filter<'a> { pub fn use_contains_operator(&self) -> Option<&Token> { self.condition.use_contains_operator() } + + pub fn use_vector_filter(&self) -> Option<&Token> { + self.condition.use_vector_filter() + } } impl<'a> Filter<'a> { From a9309774602667c47a63ab04684c72bb73376338 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 10 Jul 2025 09:37:58 +0200 Subject: [PATCH 19/49] Fix test --- crates/meilisearch/tests/search/filters.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index ff6a0cb17..aa2b06e76 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -1017,6 +1017,9 @@ async fn vector_filter_document_template() { let server = crate::vector::get_server_vector().await; let index = server.index("doggo"); + let (_response, code) = server.set_features(json!({"multimodal": true})).await; + snapshot!(code, @"200 OK"); + let (response, code) = index .update_settings(json!({ "embedders": { From 30fd546c1227bbfc3e71e3541f41847fca4c4713 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 10 Jul 2025 16:43:10 +0200 Subject: [PATCH 20/49] Format --- crates/milli/src/vector/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index f64223e41..3d77e78d7 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -485,6 +485,8 @@ impl ArroyWrapper { limit: usize, filter: Option<&RoaringBitmap>, ) -> Result, arroy::Error> { + println!("nns_by_vector: quantized={} {:?} limit={} filter={:?}", + self.quantized, vector, limit, filter); if self.quantized { self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter) } else { @@ -517,6 +519,8 @@ impl ArroyWrapper { results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + println!("nns_by_vector: results={:?}", results); + Ok(results) } From f244439b4f7493edb4a7f2387f80ae932cd3229b Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 10 Jul 2025 16:43:45 +0200 Subject: [PATCH 21/49] Revert "Format" This reverts commit 30fd546c1227bbfc3e71e3541f41847fca4c4713. --- crates/milli/src/vector/mod.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 3d77e78d7..f64223e41 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -485,8 +485,6 @@ impl ArroyWrapper { limit: usize, filter: Option<&RoaringBitmap>, ) -> Result, arroy::Error> { - println!("nns_by_vector: quantized={} {:?} limit={} filter={:?}", - self.quantized, vector, limit, filter); if self.quantized { self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter) } else { @@ -519,8 +517,6 @@ impl ArroyWrapper { results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); - println!("nns_by_vector: results={:?}", results); - Ok(results) } From ab07e9480ec8364dc2555252e5bc8b32c02d7379 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Mon, 21 Jul 2025 18:22:10 +0200 Subject: [PATCH 22/49] Resolve post-merge issues --- .../src/routes/indexes/documents.rs | 2 +- crates/meilisearch/tests/search/filters.rs | 28 +++++++++---------- crates/meilisearch/tests/vector/mod.rs | 2 +- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 0b70ad86f..cf6181de5 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -139,7 +139,7 @@ pub struct DocumentsFetchAggregator { // if a filter was used per_filter: bool, with_vector_filter: bool, - + // if documents were sorted sort: bool, diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index aa2b06e76..fd5bc57db 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -4,8 +4,8 @@ use tempfile::TempDir; use super::test_settings_documents_indexing_swapping_and_search; use crate::common::{ - default_settings, shared_index_with_documents, shared_index_with_nested_documents, Server, - DOCUMENTS, NESTED_DOCUMENTS, + default_settings, shared_index_for_fragments, shared_index_with_documents, + shared_index_with_nested_documents, Server, DOCUMENTS, NESTED_DOCUMENTS, }; use crate::json; @@ -734,7 +734,7 @@ async fn test_filterable_attributes_priority() { #[actix_rt::test] async fn vector_filter_all_embedders() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ @@ -769,7 +769,7 @@ async fn vector_filter_all_embedders() { #[actix_rt::test] async fn vector_filter_missing_fragment() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ @@ -789,7 +789,7 @@ async fn vector_filter_missing_fragment() { #[actix_rt::test] async fn vector_filter_non_existant_embedder() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ @@ -809,7 +809,7 @@ async fn vector_filter_non_existant_embedder() { #[actix_rt::test] async fn vector_filter_all_embedders_user_provided() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; // This one is counterintuitive, but it is the same as the previous one. // It's because userProvided is interpreted as an embedder name @@ -831,7 +831,7 @@ async fn vector_filter_all_embedders_user_provided() { #[actix_rt::test] async fn vector_filter_specific_embedder() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ @@ -866,7 +866,7 @@ async fn vector_filter_specific_embedder() { #[actix_rt::test] async fn vector_filter_user_provided() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ @@ -892,7 +892,7 @@ async fn vector_filter_user_provided() { #[actix_rt::test] async fn vector_filter_specific_fragment() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ @@ -951,7 +951,7 @@ async fn vector_filter_specific_fragment() { #[actix_rt::test] async fn vector_filter_non_existant_fragment() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ @@ -971,7 +971,7 @@ async fn vector_filter_non_existant_fragment() { #[actix_rt::test] async fn vector_filter_specific_fragment_user_provided() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ @@ -991,7 +991,7 @@ async fn vector_filter_specific_fragment_user_provided() { #[actix_rt::test] async fn vector_filter_document_template_but_fragments_used() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ @@ -1093,7 +1093,7 @@ async fn vector_filter_feature_gate() { #[actix_rt::test] async fn vector_filter_negation() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ @@ -1125,7 +1125,7 @@ async fn vector_filter_negation() { #[actix_rt::test] async fn vector_filter_or_combination() { - let index = crate::vector::shared_index_for_fragments().await; + let index = shared_index_for_fragments().await; let (value, _code) = index .search_post(json!({ diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 8851d029e..8a701ac83 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -14,7 +14,7 @@ use meilisearch::option::MaxThreads; use crate::common::index::Index; use crate::common::{default_settings, GetAllDocumentsOptions, Server}; use crate::json; -pub use {fragments::shared_index_for_fragments, rest::create_mock}; +pub use rest::create_mock; pub async fn get_server_vector() -> Server { Server::new().await From 0014ed31145b1c9a4021ab6069362cc23c884b37 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 22 Jul 2025 10:56:05 +0200 Subject: [PATCH 23/49] Apply review suggestions --- crates/meilisearch/tests/search/filters.rs | 21 +- .../milli/src/search/facet/filter_vector.rs | 278 ++++++++++-------- crates/milli/src/vector/db.rs | 1 + 3 files changed, 161 insertions(+), 139 deletions(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index fd5bc57db..4701e986d 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -981,7 +981,7 @@ async fn vector_filter_specific_fragment_user_provided() { .await; snapshot!(value, @r#" { - "message": "Index `[uuid]`: Vector filter cannot specify both a fragment name and userProvided.\n31:43 _vectors.rest.fragments.other.userProvided EXISTS", + "message": "Index `[uuid]`: Vector filter cannot have both `other` and `userProvided`.\n31:43 _vectors.rest.fragments.other.userProvided EXISTS", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1022,19 +1022,19 @@ async fn vector_filter_document_template() { let (response, code) = index .update_settings(json!({ - "embedders": { - "rest": setting, - }, + "embedders": { + "rest": setting, + }, })) .await; snapshot!(code, @"202 Accepted"); server.wait_task(response.uid()).await.succeeded(); let documents = json!([ - {"id": 0, "name": "kefir"}, - {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, - {"id": 2, "name": "intel"}, - {"id": 3, "name": "iko" } + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel"}, + {"id": 3, "name": "iko" } ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); @@ -1052,9 +1052,6 @@ async fn vector_filter_document_template() { { "name": "kefir" }, - { - "name": "echo" - }, { "name": "intel" }, @@ -1066,7 +1063,7 @@ async fn vector_filter_document_template() { "processingTimeMs": "[duration]", "limit": 20, "offset": 0, - "estimatedTotalHits": 4 + "estimatedTotalHits": 3 } "#); } diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index e3ec698f5..3a0f86637 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -2,14 +2,107 @@ use filter_parser::Token; use roaring::RoaringBitmap; use crate::error::{Error, UserError}; +use crate::vector::db::IndexEmbeddingConfig; use crate::vector::{ArroyStats, ArroyWrapper}; use crate::Index; +#[derive(Debug)] +enum VectorFilterInner<'a> { + Fragment { embedder_token: Token<'a>, fragment_token: Token<'a> }, + DocumentTemplate { embedder_token: Token<'a> }, + UserProvided { embedder_token: Token<'a> }, + FullEmbedder { embedder_token: Token<'a> }, +} + +impl VectorFilterInner<'_> { + fn evaluate_inner( + &self, + rtxn: &heed::RoTxn<'_>, + index: &Index, + embedding_configs: &[IndexEmbeddingConfig], + regenerate: bool, + ) -> crate::Result { + let embedder = match self { + VectorFilterInner::Fragment { embedder_token, .. } => embedder_token, + VectorFilterInner::DocumentTemplate { embedder_token } => embedder_token, + VectorFilterInner::UserProvided { embedder_token } => embedder_token, + VectorFilterInner::FullEmbedder { embedder_token } => embedder_token, + }; + let embedder_name = embedder.value(); + let available_embedders = + || embedding_configs.iter().map(|c| c.name.clone()).collect::>(); + + let embedding_config = embedding_configs + .iter() + .find(|config| config.name == embedder_name) + .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; + + let embedder_info = index + .embedding_configs() + .embedder_info(rtxn, embedder_name)? + .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; + + let arroy_wrapper = ArroyWrapper::new( + index.vector_arroy, + embedder_info.embedder_id, + embedding_config.config.quantized(), + ); + + let mut docids = match self { + VectorFilterInner::Fragment { embedder_token: embedder, fragment_token: fragment } => { + let fragment_name = fragment.value(); + let fragment_config = embedding_config + .fragments + .as_slice() + .iter() + .find(|fragment| fragment.name == fragment_name) + .ok_or_else(|| FragmentDoesNotExist { + embedder, + fragment, + available: embedding_config + .fragments + .as_slice() + .iter() + .map(|f| f.name.clone()) + .collect(), + })?; + + arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? + } + VectorFilterInner::DocumentTemplate { .. } => { + if !embedding_config.fragments.as_slice().is_empty() { + return Ok(RoaringBitmap::new()); + } + + let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + stats.documents - user_provided_docsids.clone() + } + VectorFilterInner::UserProvided { .. } => { + let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + user_provided_docsids.clone() + } + VectorFilterInner::FullEmbedder { .. } => { + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + stats.documents + } + }; + + if regenerate { + let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids(); + docids &= skip_regenerate; + } + + Ok(docids) + } +} + +#[derive(Debug)] pub(super) struct VectorFilter<'a> { - embedder_token: Option>, - fragment_token: Option>, - document_template: bool, - user_provided: bool, + inner: Option>, + regenerate: bool, } #[derive(Debug)] @@ -17,8 +110,7 @@ pub enum VectorFilterError<'a> { EmptyFilter, InvalidPrefix(Token<'a>), MissingFragmentName(Token<'a>), - UserProvidedWithFragment(Token<'a>), - DocumentTemplateWithFragment(Token<'a>), + ExclusiveOptions(Box<(Token<'a>, Token<'a>)>), LeftoverToken(Token<'a>), EmbedderDoesNotExist { embedder: &'a Token<'a>, @@ -51,11 +143,13 @@ impl std::fmt::Display for VectorFilterError<'_> { MissingFragmentName(_token) => { write!(f, "Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.") } - UserProvidedWithFragment(_token) => { - write!(f, "Vector filter cannot specify both a fragment name and userProvided.") - } - DocumentTemplateWithFragment(_token) => { - write!(f, "Vector filter cannot specify both a fragment name and documentTemplate.") + ExclusiveOptions(tokens) => { + write!( + f, + "Vector filter cannot have both `{}` and `{}`.", + tokens.0.value(), + tokens.1.value() + ) } LeftoverToken(token) => { write!(f, "Vector filter has leftover token: `{}`.", token.value()) @@ -107,11 +201,10 @@ impl<'a> From> for Error { fn from(err: VectorFilterError<'a>) -> Self { match &err { EmptyFilter => Error::UserError(UserError::InvalidFilter(err.to_string())), - InvalidPrefix(token) - | MissingFragmentName(token) - | UserProvidedWithFragment(token) - | DocumentTemplateWithFragment(token) - | LeftoverToken(token) => token.clone().as_external_error(err).into(), + InvalidPrefix(token) | MissingFragmentName(token) | LeftoverToken(token) => { + token.clone().as_external_error(err).into() + } + ExclusiveOptions(tokens) => tokens.1.clone().as_external_error(err).into(), EmbedderDoesNotExist { embedder: token, .. } | FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(), } @@ -127,11 +220,15 @@ impl<'a> VectorFilter<'a> { /// /// Valid formats: /// - `_vectors` + /// - `_vectors.mustRegenerate` /// - `_vectors.{embedder_name}` + /// - `_vectors.{embedder_name}.mustRegenerate` /// - `_vectors.{embedder_name}.userProvided` + /// - `_vectors.{embedder_name}.userProvided.mustRegenerate` /// - `_vectors.{embedder_name}.documentTemplate` - /// - `_vectors.{embedder_name}.documentTemplate.userProvided` + /// - `_vectors.{embedder_name}.documentTemplate.mustRegenerate` /// - `_vectors.{embedder_name}.fragments.{fragment_name}` + /// - `_vectors.{embedder_name}.fragments.{fragment_name}.mustRegenerate` pub(super) fn parse(s: &'a Token<'a>) -> Result> { let mut split = s.split(".").peekable(); @@ -151,38 +248,46 @@ impl<'a> VectorFilter<'a> { } let mut user_provided_token = None; - if split.peek().map(|t| t.value()) == Some("userProvided") - || split.peek().map(|t| t.value()) == Some("user_provided") - { + if split.peek().map(|t| t.value()) == Some("userProvided") { user_provided_token = split.next(); } let mut document_template_token = None; - if split.peek().map(|t| t.value()) == Some("documentTemplate") - || split.peek().map(|t| t.value()) == Some("document_template") - { + if split.peek().map(|t| t.value()) == Some("documentTemplate") { document_template_token = split.next(); } - if let (Some(_), Some(user_provided_token)) = (&fragment_name, &user_provided_token) { - return Err(UserProvidedWithFragment(user_provided_token.clone()))?; + let mut regenerate_token = None; + if split.peek().map(|t| t.value()) == Some("regenerate") { + regenerate_token = split.next(); } - if let (Some(_), Some(document_template_token)) = (&fragment_name, &document_template_token) - { - return Err(DocumentTemplateWithFragment(document_template_token.clone()))?; - } + let inner = match (fragment_name, user_provided_token, document_template_token) { + (Some(fragment_name), None, None) => Some(VectorFilterInner::Fragment { + embedder_token: embedder_name + .expect("embedder name comes before fragment so it's always Some"), + fragment_token: fragment_name, + }), + (None, Some(_), None) => Some(VectorFilterInner::UserProvided { + embedder_token: embedder_name + .expect("embedder name comes before userProvided so it's always Some"), + }), + (None, None, Some(_)) => Some(VectorFilterInner::DocumentTemplate { + embedder_token: embedder_name + .expect("embedder name comes before documentTemplate so it's always Some"), + }), + (Some(a), Some(b), _) | (_, Some(a), Some(b)) | (Some(a), None, Some(b)) => { + return Err(ExclusiveOptions(Box::new((a, b)))); + } + (None, None, None) => embedder_name + .map(|embedder_token| VectorFilterInner::FullEmbedder { embedder_token }), + }; if let Some(next) = split.next() { return Err(LeftoverToken(next))?; } - Ok(Self { - embedder_token: embedder_name, - fragment_token: fragment_name, - user_provided: user_provided_token.is_some(), - document_template: document_template_token.is_some(), - }) + Ok(Self { inner, regenerate: regenerate_token.is_some() }) } pub(super) fn evaluate( @@ -194,100 +299,19 @@ impl<'a> VectorFilter<'a> { let index_embedding_configs = index.embedding_configs(); let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; - let mut embedders = Vec::new(); - if let Some(embedder_token) = &self.embedder_token { - let embedder_name = embedder_token.value(); - - let Some(embedding_config) = - embedding_configs.iter().find(|config| config.name == embedder_name) - else { - return Err(EmbedderDoesNotExist { - embedder: embedder_token, - available: embedding_configs.iter().map(|c| c.name.clone()).collect(), - })?; - }; - - let Some(embedder_info) = index_embedding_configs.embedder_info(rtxn, embedder_name)? - else { - return Err(EmbedderDoesNotExist { - embedder: embedder_token, - available: embedding_configs.iter().map(|c| c.name.clone()).collect(), - })?; - }; - - if self.document_template && !embedding_config.fragments.as_slice().is_empty() { - return Ok(RoaringBitmap::new()); - } - - embedders.push((embedding_config, embedder_info)); - } else { - for embedder_config in embedding_configs.iter() { - let Some(embedder_info) = - index_embedding_configs.embedder_info(rtxn, &embedder_config.name)? - else { - continue; - }; - embedders.push((embedder_config, embedder_info)); - } - }; + let inners = dbg!(match self.inner { + Some(inner) => vec![inner], + None => embedding_configs + .iter() + .map(|config| VectorFilterInner::FullEmbedder { + embedder_token: Token::from(config.name.as_str()), + }) + .collect(), + }); let mut docids = RoaringBitmap::new(); - for (embedding_config, embedder_info) in embedders { - let arroy_wrapper = ArroyWrapper::new( - index.vector_arroy, - embedder_info.embedder_id, - embedding_config.config.quantized(), - ); - - docids |= if let Some(fragment_token) = &self.fragment_token { - let fragment_name = fragment_token.value(); - let Some(fragment_config) = embedding_config - .fragments - .as_slice() - .iter() - .find(|fragment| fragment.name == fragment_name) - else { - return Err(FragmentDoesNotExist { - embedder: self - .embedder_token - .as_ref() - .expect("there can't be a fragment without an embedder"), - fragment: fragment_token, - available: embedding_config - .fragments - .as_slice() - .iter() - .map(|f| f.name.clone()) - .collect(), - })?; - }; - - if let Some(universe) = universe { - arroy_wrapper - .items_in_store(rtxn, fragment_config.id, |bitmap| bitmap & universe)? - } else { - arroy_wrapper - .items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? - } - } else { - let mut universe = universe.cloned(); - if self.user_provided { - let user_provided_docsids = - embedder_info.embedding_status.user_provided_docids(); - match &mut universe { - Some(universe) => *universe &= user_provided_docsids, - None => universe = Some(user_provided_docsids.clone()), - } - } - - let mut stats = ArroyStats::default(); - arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; - if let Some(universe) = &universe { - stats.documents & universe - } else { - stats.documents - } - }; + for inner in inners.iter() { + docids |= inner.evaluate_inner(rtxn, index, &embedding_configs, self.regenerate)?; } if let Some(universe) = universe { diff --git a/crates/milli/src/vector/db.rs b/crates/milli/src/vector/db.rs index 2fea75d68..d445b47c0 100644 --- a/crates/milli/src/vector/db.rs +++ b/crates/milli/src/vector/db.rs @@ -128,6 +128,7 @@ impl EmbeddingStatus { pub fn is_user_provided(&self, docid: DocumentId) -> bool { self.user_provided.contains(docid) } + /// Whether vectors should be regenerated for that document and that embedder. pub fn must_regenerate(&self, docid: DocumentId) -> bool { let invert = self.skip_regenerate_different_from_user_provided.contains(docid); From 982e9898861f4e29b0409cf3a51e610ba308cdc5 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 22 Jul 2025 11:10:05 +0200 Subject: [PATCH 24/49] Test regenerate filter --- crates/meilisearch/tests/search/filters.rs | 36 +++++++++++++++++++ .../milli/src/search/facet/filter_vector.rs | 11 +++--- 2 files changed, 41 insertions(+), 6 deletions(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 4701e986d..1a27bdf99 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -1151,3 +1151,39 @@ async fn vector_filter_or_combination() { } "#); } + + +#[actix_rt::test] +async fn vector_filter_regenerate() { + let index = shared_index_for_fragments().await; + + for selector in ["_vectors.rest.regenerate", "_vectors.rest.fragments.basic.regenerate"] { + let (value, _code) = index + .search_post(json!({ + "filter": format!("{selector} EXISTS"), + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" + { + "hits": [ + { + "name": "kefir" + }, + { + "name": "intel" + }, + { + "name": "dustin" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3 + } + "#); + } +} + diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 3a0f86637..7fbd9c916 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -92,7 +92,7 @@ impl VectorFilterInner<'_> { if regenerate { let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids(); - docids &= skip_regenerate; + docids -= skip_regenerate; } Ok(docids) @@ -220,15 +220,14 @@ impl<'a> VectorFilter<'a> { /// /// Valid formats: /// - `_vectors` - /// - `_vectors.mustRegenerate` /// - `_vectors.{embedder_name}` - /// - `_vectors.{embedder_name}.mustRegenerate` + /// - `_vectors.{embedder_name}.regenerate` /// - `_vectors.{embedder_name}.userProvided` - /// - `_vectors.{embedder_name}.userProvided.mustRegenerate` + /// - `_vectors.{embedder_name}.userProvided.regenerate` /// - `_vectors.{embedder_name}.documentTemplate` - /// - `_vectors.{embedder_name}.documentTemplate.mustRegenerate` + /// - `_vectors.{embedder_name}.documentTemplate.regenerate` /// - `_vectors.{embedder_name}.fragments.{fragment_name}` - /// - `_vectors.{embedder_name}.fragments.{fragment_name}.mustRegenerate` + /// - `_vectors.{embedder_name}.fragments.{fragment_name}.regenerate` pub(super) fn parse(s: &'a Token<'a>) -> Result> { let mut split = s.split(".").peekable(); From 6d93b36279c75901a637fe8a4aaf69461274286e Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 22 Jul 2025 11:18:41 +0200 Subject: [PATCH 25/49] Format --- crates/meilisearch/tests/search/filters.rs | 14 +- .../milli/src/search/facet/filter_vector.rs | 278 ++++++++---------- 2 files changed, 122 insertions(+), 170 deletions(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 1a27bdf99..12bfbe2ea 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -1152,19 +1152,18 @@ async fn vector_filter_or_combination() { "#); } - #[actix_rt::test] async fn vector_filter_regenerate() { let index = shared_index_for_fragments().await; for selector in ["_vectors.rest.regenerate", "_vectors.rest.fragments.basic.regenerate"] { let (value, _code) = index - .search_post(json!({ - "filter": format!("{selector} EXISTS"), - "attributesToRetrieve": ["name"] - })) - .await; - snapshot!(value, @r#" + .search_post(json!({ + "filter": format!("{selector} EXISTS"), + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" { "hits": [ { @@ -1186,4 +1185,3 @@ async fn vector_filter_regenerate() { "#); } } - diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 7fbd9c916..a59bbb5f9 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -14,108 +14,49 @@ enum VectorFilterInner<'a> { FullEmbedder { embedder_token: Token<'a> }, } -impl VectorFilterInner<'_> { - fn evaluate_inner( - &self, - rtxn: &heed::RoTxn<'_>, - index: &Index, - embedding_configs: &[IndexEmbeddingConfig], - regenerate: bool, - ) -> crate::Result { - let embedder = match self { - VectorFilterInner::Fragment { embedder_token, .. } => embedder_token, - VectorFilterInner::DocumentTemplate { embedder_token } => embedder_token, - VectorFilterInner::UserProvided { embedder_token } => embedder_token, - VectorFilterInner::FullEmbedder { embedder_token } => embedder_token, - }; - let embedder_name = embedder.value(); - let available_embedders = - || embedding_configs.iter().map(|c| c.name.clone()).collect::>(); - - let embedding_config = embedding_configs - .iter() - .find(|config| config.name == embedder_name) - .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; - - let embedder_info = index - .embedding_configs() - .embedder_info(rtxn, embedder_name)? - .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; - - let arroy_wrapper = ArroyWrapper::new( - index.vector_arroy, - embedder_info.embedder_id, - embedding_config.config.quantized(), - ); - - let mut docids = match self { - VectorFilterInner::Fragment { embedder_token: embedder, fragment_token: fragment } => { - let fragment_name = fragment.value(); - let fragment_config = embedding_config - .fragments - .as_slice() - .iter() - .find(|fragment| fragment.name == fragment_name) - .ok_or_else(|| FragmentDoesNotExist { - embedder, - fragment, - available: embedding_config - .fragments - .as_slice() - .iter() - .map(|f| f.name.clone()) - .collect(), - })?; - - arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? - } - VectorFilterInner::DocumentTemplate { .. } => { - if !embedding_config.fragments.as_slice().is_empty() { - return Ok(RoaringBitmap::new()); - } - - let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); - let mut stats = ArroyStats::default(); - arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; - stats.documents - user_provided_docsids.clone() - } - VectorFilterInner::UserProvided { .. } => { - let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); - user_provided_docsids.clone() - } - VectorFilterInner::FullEmbedder { .. } => { - let mut stats = ArroyStats::default(); - arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; - stats.documents - } - }; - - if regenerate { - let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids(); - docids -= skip_regenerate; - } - - Ok(docids) - } -} - #[derive(Debug)] pub(super) struct VectorFilter<'a> { inner: Option>, regenerate: bool, } -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum VectorFilterError<'a> { + #[error("Vector filter cannot be empty.")] EmptyFilter, + + #[error("Vector filter must start with `_vectors` but found `{}`.", _0.value())] InvalidPrefix(Token<'a>), + + #[error("Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.")] MissingFragmentName(Token<'a>), + + #[error("Vector filter cannot have both `{}` and `{}`.", _0.0.value(), _0.1.value())] ExclusiveOptions(Box<(Token<'a>, Token<'a>)>), + + #[error("Vector filter has leftover token: `{}`.", _0.value())] LeftoverToken(Token<'a>), - EmbedderDoesNotExist { - embedder: &'a Token<'a>, - available: Vec, - }, + + #[error("The embedder `{}` does not exist. {}", embedder.value(), { + if available.is_empty() { + String::from("This index does not have any configured embedders.") + } else { + let mut available = available.clone(); + available.sort_unstable(); + format!("Available embedders are: {}.", available.iter().map(|e| format!("`{e}`")).collect::>().join(", ")) + } + })] + EmbedderDoesNotExist { embedder: &'a Token<'a>, available: Vec }, + + #[error("The fragment `{}` does not exist on embedder `{}`. {}", fragment.value(), embedder.value(), { + if available.is_empty() { + String::from("This embedder does not have any configured fragments.") + } else { + let mut available = available.clone(); + available.sort_unstable(); + format!("Available fragments on this embedder are: {}.", available.iter().map(|f| format!("`{f}`")).collect::>().join(", ")) + } + })] FragmentDoesNotExist { embedder: &'a Token<'a>, fragment: &'a Token<'a>, @@ -125,78 +66,6 @@ pub enum VectorFilterError<'a> { use VectorFilterError::*; -impl std::error::Error for VectorFilterError<'_> {} - -impl std::fmt::Display for VectorFilterError<'_> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - EmptyFilter => { - write!(f, "Vector filter cannot be empty.") - } - InvalidPrefix(prefix) => { - write!( - f, - "Vector filter must start with `_vectors` but found `{}`.", - prefix.value() - ) - } - MissingFragmentName(_token) => { - write!(f, "Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.") - } - ExclusiveOptions(tokens) => { - write!( - f, - "Vector filter cannot have both `{}` and `{}`.", - tokens.0.value(), - tokens.1.value() - ) - } - LeftoverToken(token) => { - write!(f, "Vector filter has leftover token: `{}`.", token.value()) - } - EmbedderDoesNotExist { embedder, available } => { - write!(f, "The embedder `{}` does not exist.", embedder.value())?; - if available.is_empty() { - write!(f, " This index does not have configured embedders.") - } else { - write!(f, " Available embedders are: ")?; - let mut available = available.clone(); - available.sort_unstable(); - for (idx, embedder) in available.iter().enumerate() { - write!(f, "`{embedder}`")?; - if idx != available.len() - 1 { - write!(f, ", ")?; - } - } - write!(f, ".") - } - } - FragmentDoesNotExist { embedder, fragment, available } => { - write!( - f, - "The fragment `{}` does not exist on embedder `{}`.", - fragment.value(), - embedder.value(), - )?; - if available.is_empty() { - write!(f, " This embedder does not have configured fragments.") - } else { - write!(f, " Available fragments on this embedder are: ")?; - let mut available = available.clone(); - available.sort_unstable(); - for (idx, fragment) in available.iter().enumerate() { - write!(f, "`{fragment}`")?; - if idx != available.len() - 1 { - write!(f, ", ")?; - } - } - write!(f, ".") - } - } - } - } -} - impl<'a> From> for Error { fn from(err: VectorFilterError<'a>) -> Self { match &err { @@ -320,3 +189,88 @@ impl<'a> VectorFilter<'a> { Ok(docids) } } + +impl VectorFilterInner<'_> { + fn evaluate_inner( + &self, + rtxn: &heed::RoTxn<'_>, + index: &Index, + embedding_configs: &[IndexEmbeddingConfig], + regenerate: bool, + ) -> crate::Result { + let embedder = match self { + VectorFilterInner::Fragment { embedder_token, .. } => embedder_token, + VectorFilterInner::DocumentTemplate { embedder_token } => embedder_token, + VectorFilterInner::UserProvided { embedder_token } => embedder_token, + VectorFilterInner::FullEmbedder { embedder_token } => embedder_token, + }; + let embedder_name = embedder.value(); + let available_embedders = + || embedding_configs.iter().map(|c| c.name.clone()).collect::>(); + + let embedding_config = embedding_configs + .iter() + .find(|config| config.name == embedder_name) + .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; + + let embedder_info = index + .embedding_configs() + .embedder_info(rtxn, embedder_name)? + .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; + + let arroy_wrapper = ArroyWrapper::new( + index.vector_arroy, + embedder_info.embedder_id, + embedding_config.config.quantized(), + ); + + let mut docids = match self { + VectorFilterInner::Fragment { embedder_token: embedder, fragment_token: fragment } => { + let fragment_name = fragment.value(); + let fragment_config = embedding_config + .fragments + .as_slice() + .iter() + .find(|fragment| fragment.name == fragment_name) + .ok_or_else(|| FragmentDoesNotExist { + embedder, + fragment, + available: embedding_config + .fragments + .as_slice() + .iter() + .map(|f| f.name.clone()) + .collect(), + })?; + + arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? + } + VectorFilterInner::DocumentTemplate { .. } => { + if !embedding_config.fragments.as_slice().is_empty() { + return Ok(RoaringBitmap::new()); + } + + let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + stats.documents - user_provided_docsids.clone() + } + VectorFilterInner::UserProvided { .. } => { + let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + user_provided_docsids.clone() + } + VectorFilterInner::FullEmbedder { .. } => { + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + stats.documents + } + }; + + if regenerate { + let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids(); + docids -= skip_regenerate; + } + + Ok(docids) + } +} From 3362fb8476860efa36fb6a1b9fdf296e47b42d11 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 22 Jul 2025 11:21:06 +0200 Subject: [PATCH 26/49] Remove print --- crates/milli/src/search/facet/filter_vector.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index a59bbb5f9..e2c00a16a 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -167,7 +167,7 @@ impl<'a> VectorFilter<'a> { let index_embedding_configs = index.embedding_configs(); let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; - let inners = dbg!(match self.inner { + let inners = match self.inner { Some(inner) => vec![inner], None => embedding_configs .iter() @@ -175,7 +175,7 @@ impl<'a> VectorFilter<'a> { embedder_token: Token::from(config.name.as_str()), }) .collect(), - }); + }; let mut docids = RoaringBitmap::new(); for inner in inners.iter() { From 776e55d2096febba707582273243721eebb7b1dc Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 22 Jul 2025 11:37:21 +0200 Subject: [PATCH 27/49] Improve code readability --- crates/milli/src/search/facet/filter_vector.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index e2c00a16a..83b3adcc3 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -1,5 +1,5 @@ use filter_parser::Token; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use crate::error::{Error, UserError}; use crate::vector::db::IndexEmbeddingConfig; @@ -177,10 +177,10 @@ impl<'a> VectorFilter<'a> { .collect(), }; - let mut docids = RoaringBitmap::new(); - for inner in inners.iter() { - docids |= inner.evaluate_inner(rtxn, index, &embedding_configs, self.regenerate)?; - } + let mut docids = inners + .iter() + .map(|i| i.evaluate_inner(rtxn, index, &embedding_configs, self.regenerate)) + .union()?; if let Some(universe) = universe { docids &= universe; From aa5a1f333aa01a3161fa979cec4a85da35c5b803 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 23 Jul 2025 15:33:17 +0200 Subject: [PATCH 28/49] Refactor to support less combinations --- crates/meilisearch/tests/search/filters.rs | 69 +++++---- .../milli/src/search/facet/filter_vector.rs | 140 +++++++++--------- 2 files changed, 109 insertions(+), 100 deletions(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 12bfbe2ea..cd2da747d 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -981,7 +981,7 @@ async fn vector_filter_specific_fragment_user_provided() { .await; snapshot!(value, @r#" { - "message": "Index `[uuid]`: Vector filter cannot have both `other` and `userProvided`.\n31:43 _vectors.rest.fragments.other.userProvided EXISTS", + "message": "Index `[uuid]`: Vector filter cannot have both `fragments` and `userProvided`.\n15:24 _vectors.rest.fragments.other.userProvided EXISTS", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1156,32 +1156,45 @@ async fn vector_filter_or_combination() { async fn vector_filter_regenerate() { let index = shared_index_for_fragments().await; - for selector in ["_vectors.rest.regenerate", "_vectors.rest.fragments.basic.regenerate"] { - let (value, _code) = index - .search_post(json!({ - "filter": format!("{selector} EXISTS"), - "attributesToRetrieve": ["name"] - })) - .await; - snapshot!(value, @r#" - { - "hits": [ - { - "name": "kefir" - }, - { - "name": "intel" - }, - { - "name": "dustin" - } - ], - "query": "", - "processingTimeMs": "[duration]", - "limit": 20, - "offset": 0, - "estimatedTotalHits": 3 - } - "#); + let (value, _code) = index + .search_post(json!({ + "filter": format!("_vectors.rest.regenerate EXISTS"), + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" + { + "hits": [ + { + "name": "kefir" + }, + { + "name": "intel" + }, + { + "name": "dustin" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 20, + "offset": 0, + "estimatedTotalHits": 3 } + "#); + + let (value, _code) = index + .search_post(json!({ + "filter": format!("_vectors.rest.fragments.basic.regenerate EXISTS"), + "attributesToRetrieve": ["name"] + })) + .await; + snapshot!(value, @r#" + { + "message": "Index `[uuid]`: Vector filter cannot have both `fragments` and `regenerate`.\n15:24 _vectors.rest.fragments.basic.regenerate EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); } diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 83b3adcc3..91f138685 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -8,16 +8,17 @@ use crate::Index; #[derive(Debug)] enum VectorFilterInner<'a> { - Fragment { embedder_token: Token<'a>, fragment_token: Token<'a> }, - DocumentTemplate { embedder_token: Token<'a> }, - UserProvided { embedder_token: Token<'a> }, - FullEmbedder { embedder_token: Token<'a> }, + Fragment(Token<'a>), + DocumentTemplate, + UserProvided, + Regenerate, + None, } #[derive(Debug)] pub(super) struct VectorFilter<'a> { - inner: Option>, - regenerate: bool, + embedder: Option>, + inner: VectorFilterInner<'a>, } #[derive(Debug, thiserror::Error)] @@ -31,8 +32,10 @@ pub enum VectorFilterError<'a> { #[error("Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.")] MissingFragmentName(Token<'a>), - #[error("Vector filter cannot have both `{}` and `{}`.", _0.0.value(), _0.1.value())] - ExclusiveOptions(Box<(Token<'a>, Token<'a>)>), + #[error("Vector filter cannot have both {}.", { + _0.iter().map(|t| format!("`{}`", t.value())).collect::>().join(" and ") + })] + ExclusiveOptions(Vec>), #[error("Vector filter has leftover token: `{}`.", _0.value())] LeftoverToken(Token<'a>), @@ -73,7 +76,12 @@ impl<'a> From> for Error { InvalidPrefix(token) | MissingFragmentName(token) | LeftoverToken(token) => { token.clone().as_external_error(err).into() } - ExclusiveOptions(tokens) => tokens.1.clone().as_external_error(err).into(), + ExclusiveOptions(tokens) => tokens + .first() + .cloned() + .unwrap_or_else(|| Token::from("")) // Should never happen: tokens is never created empty + .as_external_error(err) + .into(), EmbedderDoesNotExist { embedder: token, .. } | FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(), } @@ -92,11 +100,8 @@ impl<'a> VectorFilter<'a> { /// - `_vectors.{embedder_name}` /// - `_vectors.{embedder_name}.regenerate` /// - `_vectors.{embedder_name}.userProvided` - /// - `_vectors.{embedder_name}.userProvided.regenerate` /// - `_vectors.{embedder_name}.documentTemplate` - /// - `_vectors.{embedder_name}.documentTemplate.regenerate` /// - `_vectors.{embedder_name}.fragments.{fragment_name}` - /// - `_vectors.{embedder_name}.fragments.{fragment_name}.regenerate` pub(super) fn parse(s: &'a Token<'a>) -> Result> { let mut split = s.split(".").peekable(); @@ -108,54 +113,53 @@ impl<'a> VectorFilter<'a> { let embedder_name = split.next(); - let mut fragment_name = None; + let mut fragment_tokens = None; if split.peek().map(|t| t.value()) == Some("fragments") { let token = split.next().expect("it was peeked before"); + let name = split.next().ok_or_else(|| MissingFragmentName(token.clone()))?; - fragment_name = Some(split.next().ok_or(MissingFragmentName(token))?); + fragment_tokens = Some((token, name)); } + let mut remaining_tokens = split.collect::>(); + let mut user_provided_token = None; - if split.peek().map(|t| t.value()) == Some("userProvided") { - user_provided_token = split.next(); + if let Some(position) = remaining_tokens.iter().position(|t| t.value() == "userProvided") { + user_provided_token = Some(remaining_tokens.remove(position)); } let mut document_template_token = None; - if split.peek().map(|t| t.value()) == Some("documentTemplate") { - document_template_token = split.next(); + if let Some(position) = + remaining_tokens.iter().position(|t| t.value() == "documentTemplate") + { + document_template_token = Some(remaining_tokens.remove(position)); } let mut regenerate_token = None; - if split.peek().map(|t| t.value()) == Some("regenerate") { - regenerate_token = split.next(); + if let Some(position) = remaining_tokens.iter().position(|t| t.value() == "regenerate") { + regenerate_token = Some(remaining_tokens.remove(position)); } - let inner = match (fragment_name, user_provided_token, document_template_token) { - (Some(fragment_name), None, None) => Some(VectorFilterInner::Fragment { - embedder_token: embedder_name - .expect("embedder name comes before fragment so it's always Some"), - fragment_token: fragment_name, - }), - (None, Some(_), None) => Some(VectorFilterInner::UserProvided { - embedder_token: embedder_name - .expect("embedder name comes before userProvided so it's always Some"), - }), - (None, None, Some(_)) => Some(VectorFilterInner::DocumentTemplate { - embedder_token: embedder_name - .expect("embedder name comes before documentTemplate so it's always Some"), - }), - (Some(a), Some(b), _) | (_, Some(a), Some(b)) | (Some(a), None, Some(b)) => { - return Err(ExclusiveOptions(Box::new((a, b)))); - } - (None, None, None) => embedder_name - .map(|embedder_token| VectorFilterInner::FullEmbedder { embedder_token }), - }; - - if let Some(next) = split.next() { - return Err(LeftoverToken(next))?; + if !remaining_tokens.is_empty() { + return Err(LeftoverToken(remaining_tokens.remove(0))); } - Ok(Self { inner, regenerate: regenerate_token.is_some() }) + let inner = + match (fragment_tokens, user_provided_token, document_template_token, regenerate_token) + { + (Some((_token, name)), None, None, None) => VectorFilterInner::Fragment(name), + (None, Some(_), None, None) => VectorFilterInner::UserProvided, + (None, None, Some(_), None) => VectorFilterInner::DocumentTemplate, + (None, None, None, Some(_)) => VectorFilterInner::Regenerate, + (None, None, None, None) => VectorFilterInner::None, + (a, b, c, d) => { + let a = a.map(|(token, _)| token); + let present = [a, b, c, d].into_iter().flatten().collect(); + return Err(ExclusiveOptions(present)); + } + }; + + Ok(Self { inner, embedder: embedder_name }) } pub(super) fn evaluate( @@ -167,19 +171,16 @@ impl<'a> VectorFilter<'a> { let index_embedding_configs = index.embedding_configs(); let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; - let inners = match self.inner { - Some(inner) => vec![inner], - None => embedding_configs - .iter() - .map(|config| VectorFilterInner::FullEmbedder { - embedder_token: Token::from(config.name.as_str()), - }) - .collect(), + let embedders = match self.embedder { + Some(embedder) => vec![embedder], + None => { + embedding_configs.iter().map(|config| Token::from(config.name.as_str())).collect() + } }; - let mut docids = inners + let mut docids = embedders .iter() - .map(|i| i.evaluate_inner(rtxn, index, &embedding_configs, self.regenerate)) + .map(|e| self.inner.evaluate(rtxn, index, e, &embedding_configs)) .union()?; if let Some(universe) = universe { @@ -191,19 +192,13 @@ impl<'a> VectorFilter<'a> { } impl VectorFilterInner<'_> { - fn evaluate_inner( + fn evaluate( &self, rtxn: &heed::RoTxn<'_>, index: &Index, + embedder: &Token<'_>, embedding_configs: &[IndexEmbeddingConfig], - regenerate: bool, ) -> crate::Result { - let embedder = match self { - VectorFilterInner::Fragment { embedder_token, .. } => embedder_token, - VectorFilterInner::DocumentTemplate { embedder_token } => embedder_token, - VectorFilterInner::UserProvided { embedder_token } => embedder_token, - VectorFilterInner::FullEmbedder { embedder_token } => embedder_token, - }; let embedder_name = embedder.value(); let available_embedders = || embedding_configs.iter().map(|c| c.name.clone()).collect::>(); @@ -224,8 +219,8 @@ impl VectorFilterInner<'_> { embedding_config.config.quantized(), ); - let mut docids = match self { - VectorFilterInner::Fragment { embedder_token: embedder, fragment_token: fragment } => { + let docids = match self { + VectorFilterInner::Fragment(fragment) => { let fragment_name = fragment.value(); let fragment_config = embedding_config .fragments @@ -245,7 +240,7 @@ impl VectorFilterInner<'_> { arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? } - VectorFilterInner::DocumentTemplate { .. } => { + VectorFilterInner::DocumentTemplate => { if !embedding_config.fragments.as_slice().is_empty() { return Ok(RoaringBitmap::new()); } @@ -255,22 +250,23 @@ impl VectorFilterInner<'_> { arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; stats.documents - user_provided_docsids.clone() } - VectorFilterInner::UserProvided { .. } => { + VectorFilterInner::UserProvided => { let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); user_provided_docsids.clone() } - VectorFilterInner::FullEmbedder { .. } => { + VectorFilterInner::Regenerate => { + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids(); + stats.documents - skip_regenerate + } + VectorFilterInner::None => { let mut stats = ArroyStats::default(); arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; stats.documents } }; - if regenerate { - let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids(); - docids -= skip_regenerate; - } - Ok(docids) } } From bb4d57386280796928f311133eb14d1a4af470a4 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 24 Jul 2025 14:56:35 +0200 Subject: [PATCH 29/49] Switch to a nom parser --- crates/filter-parser/src/condition.rs | 60 ++++ crates/filter-parser/src/error.rs | 34 ++ crates/filter-parser/src/lib.rs | 46 ++- crates/filter-parser/src/value.rs | 33 ++ crates/meilisearch/tests/search/filters.rs | 18 +- crates/milli/src/search/facet/filter.rs | 23 +- .../milli/src/search/facet/filter_vector.rs | 306 ++++++------------ 7 files changed, 269 insertions(+), 251 deletions(-) diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index 0fc007bf1..af0767706 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -7,11 +7,20 @@ use nom::branch::alt; use nom::bytes::complete::tag; +use nom::character::complete::char; +use nom::character::complete::multispace0; use nom::character::complete::multispace1; use nom::combinator::cut; +use nom::combinator::map; +use nom::combinator::value; +use nom::sequence::preceded; use nom::sequence::{terminated, tuple}; use Condition::*; +use crate::error::IResultExt; +use crate::value::parse_vector_value; +use crate::ErrorKind; +use crate::VectorFilter; use crate::{parse_value, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] @@ -113,6 +122,57 @@ pub fn parse_not_exists(input: Span) -> IResult { Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Exists })))) } +fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_>)> { + let (input, _) = multispace0(input)?; + let (input, fid) = tag("_vectors")(input)?; + + if let Ok((input, _)) = multispace1::<_, crate::Error>(input) { + return Ok((input, (Token::from(fid), None, VectorFilter::None))); + } + + let (input, _) = char('.')(input)?; + + // From this point, we are certain this is a vector filter, so our errors must be final. + // We could use nom's `cut`` but it's better to be explicit about the errors + + let (input, embedder_name) = parse_vector_value(input).map_cut(ErrorKind::VectorFilterInvalidEmbedder)?; + + let (input, filter) = alt(( + map( + preceded(tag(".fragments"), |input| { + let (input, _) = tag(".")(input).map_cut(ErrorKind::VectorFilterMissingFragment)?; + parse_vector_value(input).map_cut(ErrorKind::VectorFilterInvalidFragment) + }), + VectorFilter::Fragment, + ), + value(VectorFilter::UserProvided, tag(".userProvided")), + value(VectorFilter::DocumentTemplate, tag(".documentTemplate")), + value(VectorFilter::Regenerate, tag(".regenerate")), + value(VectorFilter::None, nom::combinator::success("")), + ))(input)?; + + let (input, _) = multispace1(input).map_cut(ErrorKind::VectorFilterLeftover)?; + + Ok((input, (Token::from(fid), Some(embedder_name), filter))) +} + +/// vectors_exists = vectors "EXISTS" +pub fn parse_vectors_exists(input: Span) -> IResult { + let (input, (fid, embedder, filter)) = terminated(parse_vectors, tag("EXISTS"))(input)?; + + Ok((input, FilterCondition::VectorExists { fid, embedder, filter })) +} +/// vectors_not_exists = vectors "NOT" WS+ "EXISTS" +pub fn parse_vectors_not_exists(input: Span) -> IResult { + let (input, (fid, embedder, filter)) = parse_vectors(input)?; + + let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?; + Ok(( + input, + FilterCondition::Not(Box::new(FilterCondition::VectorExists { fid, embedder, filter })), + )) +} + /// contains = value "CONTAINS" value pub fn parse_contains(input: Span) -> IResult { let (input, (fid, contains, value)) = diff --git a/crates/filter-parser/src/error.rs b/crates/filter-parser/src/error.rs index 855ce983e..cf2419b01 100644 --- a/crates/filter-parser/src/error.rs +++ b/crates/filter-parser/src/error.rs @@ -42,6 +42,23 @@ pub fn cut_with_err<'a, O>( } } +pub trait IResultExt<'a> { + fn map_cut(self, kind: ErrorKind<'a>) -> Self; +} + +impl<'a, T> IResultExt<'a> for IResult<'a, T> { + fn map_cut(self, kind: ErrorKind<'a>) -> Self { + self.map_err(move |e: nom::Err>| { + let input = match e { + nom::Err::Incomplete(_) => return e, + nom::Err::Error(e) => *e.context(), + nom::Err::Failure(e) => *e.context(), + }; + nom::Err::Failure(Error::new_from_kind(input, kind)) + }) + } +} + #[derive(Debug)] pub struct Error<'a> { context: Span<'a>, @@ -76,6 +93,11 @@ pub enum ErrorKind<'a> { InternalError(error::ErrorKind), DepthLimitReached, External(String), + + VectorFilterLeftover, + VectorFilterInvalidEmbedder, + VectorFilterMissingFragment, + VectorFilterInvalidFragment, } impl<'a> Error<'a> { @@ -169,6 +191,18 @@ impl Display for Error<'_> { ErrorKind::MisusedGeoBoundingBox => { writeln!(f, "The `_geoBoundingBox` filter is an operation and can't be used as a value.")? } + ErrorKind::VectorFilterLeftover => { + writeln!(f, "The vector filter has leftover tokens.")? + } + ErrorKind::VectorFilterInvalidFragment => { + writeln!(f, "The vector filter's fragment is invalid.")? + } + ErrorKind::VectorFilterMissingFragment => { + writeln!(f, "The vector filter is missing a fragment name.")? + } + ErrorKind::VectorFilterInvalidEmbedder => { + writeln!(f, "The vector filter's embedder is invalid.")? + } ErrorKind::ReservedKeyword(word) => { writeln!(f, "`{word}` is a reserved keyword and thus cannot be used as a field name unless it is put inside quotes. Use \"{word}\" or \'{word}\' instead.")? } diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 1590b08fd..b5697f914 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -65,6 +65,9 @@ use nom_locate::LocatedSpan; pub(crate) use value::parse_value; use value::word_exact; +use crate::condition::{parse_vectors_exists, parse_vectors_not_exists}; +use crate::error::IResultExt; + pub type Span<'a> = LocatedSpan<&'a str, &'a str>; type IResult<'a, Ret> = nom::IResult, Ret, Error<'a>>; @@ -146,6 +149,15 @@ impl<'a> From<&'a str> for Token<'a> { } } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum VectorFilter<'a> { + Fragment(Token<'a>), + DocumentTemplate, + UserProvided, + Regenerate, + None, +} + #[derive(Debug, Clone, PartialEq, Eq)] pub enum FilterCondition<'a> { Not(Box), @@ -153,6 +165,7 @@ pub enum FilterCondition<'a> { In { fid: Token<'a>, els: Vec> }, Or(Vec), And(Vec), + VectorExists { fid: Token<'a>, embedder: Option>, filter: VectorFilter<'a> }, GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> }, GeoBoundingBox { top_right_point: [Token<'a>; 2], bottom_left_point: [Token<'a>; 2] }, } @@ -183,7 +196,8 @@ impl<'a> FilterCondition<'a> { FilterCondition::Or(seq) | FilterCondition::And(seq) => { seq.iter().find_map(|filter| filter.use_contains_operator()) } - FilterCondition::GeoLowerThan { .. } + FilterCondition::VectorExists { .. } + | FilterCondition::GeoLowerThan { .. } | FilterCondition::GeoBoundingBox { .. } | FilterCondition::In { .. } => None, } @@ -191,13 +205,7 @@ impl<'a> FilterCondition<'a> { pub fn use_vector_filter(&self) -> Option<&Token> { match self { - FilterCondition::Condition { fid, op: _ } => { - if fid.value().starts_with("_vectors.") || fid.value() == "_vectors" { - Some(fid) - } else { - None - } - } + FilterCondition::Condition { .. } => None, FilterCondition::Not(this) => this.use_vector_filter(), FilterCondition::Or(seq) | FilterCondition::And(seq) => { seq.iter().find_map(|filter| filter.use_vector_filter()) @@ -205,6 +213,7 @@ impl<'a> FilterCondition<'a> { FilterCondition::GeoLowerThan { .. } | FilterCondition::GeoBoundingBox { .. } | FilterCondition::In { .. } => None, + FilterCondition::VectorExists { fid, .. } => Some(fid), } } @@ -292,10 +301,7 @@ fn parse_in_body(input: Span) -> IResult> { let (input, _) = ws(word_exact("IN"))(input)?; // everything after `IN` can be a failure - let (input, _) = - cut_with_err(tag("["), |_| Error::new_from_kind(input, ErrorKind::InOpeningBracket))( - input, - )?; + let (input, _) = tag("[")(input).map_cut(ErrorKind::InOpeningBracket)?; let (input, content) = cut(parse_value_list)(input)?; @@ -529,8 +535,7 @@ fn parse_primary(input: Span, depth: usize) -> IResult { parse_is_not_null, parse_is_empty, parse_is_not_empty, - parse_exists, - parse_not_exists, + alt((parse_vectors_exists, parse_vectors_not_exists, parse_exists, parse_not_exists)), parse_to, parse_contains, parse_not_contains, @@ -586,6 +591,19 @@ impl std::fmt::Display for FilterCondition<'_> { } write!(f, "]") } + FilterCondition::VectorExists { fid: _, embedder, filter: inner } => { + write!(f, "_vectors")?; + if let Some(embedder) = embedder { + write!(f, ".{embedder:?}")?; + } + match inner { + VectorFilter::Fragment(fragment) => write!(f, ".fragments.{fragment:?}"), + VectorFilter::DocumentTemplate => write!(f, ".documentTemplate"), + VectorFilter::UserProvided => write!(f, ".userProvided"), + VectorFilter::Regenerate => write!(f, ".regenerate"), + VectorFilter::None => Ok(()), + } + } FilterCondition::GeoLowerThan { point, radius } => { write!(f, "_geoRadius({}, {}, {})", point[0], point[1], radius) } diff --git a/crates/filter-parser/src/value.rs b/crates/filter-parser/src/value.rs index 98cac39fe..345f0b0a2 100644 --- a/crates/filter-parser/src/value.rs +++ b/crates/filter-parser/src/value.rs @@ -80,6 +80,39 @@ pub fn word_exact<'a, 'b: 'a>(tag: &'b str) -> impl Fn(Span<'a>) -> IResult<'a, } } +/// vector_value = ( non_dot_word | singleQuoted | doubleQuoted) +pub fn parse_vector_value(input: Span) -> IResult { + pub fn non_dot_word(input: Span) -> IResult { + let (input, word) = take_while1(|c| is_value_component(c) && c != '.')(input)?; + Ok((input, word.into())) + } + + let (input, value) = alt(( + delimited(char('\''), cut(|input| quoted_by('\'', input)), cut(char('\''))), + delimited(char('"'), cut(|input| quoted_by('"', input)), cut(char('"'))), + non_dot_word, + ))(input)?; + + match unescaper::unescape(value.value()) { + Ok(content) => { + if content.len() != value.value().len() { + Ok((input, Token::new(value.original_span(), Some(content)))) + } else { + Ok((input, value)) + } + } + Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)), + Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind( + value.original_span(), + ErrorKind::InvalidEscapedNumber, + ))), + Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind( + value.original_span(), + ErrorKind::MalformedValue, + ))), + } +} + /// value = WS* ( word | singleQuoted | doubleQuoted) WS+ pub fn parse_value(input: Span) -> IResult { // to get better diagnostic message we are going to strip the left whitespaces from the input right now diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index cd2da747d..67f9ebb71 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -779,7 +779,7 @@ async fn vector_filter_missing_fragment() { .await; snapshot!(value, @r#" { - "message": "Index `[uuid]`: Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.\n15:24 _vectors.rest.fragments EXISTS", + "message": "The vector filter is missing a fragment name.\n24:31 _vectors.rest.fragments EXISTS", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -981,7 +981,7 @@ async fn vector_filter_specific_fragment_user_provided() { .await; snapshot!(value, @r#" { - "message": "Index `[uuid]`: Vector filter cannot have both `fragments` and `userProvided`.\n15:24 _vectors.rest.fragments.other.userProvided EXISTS", + "message": "The vector filter has leftover tokens.\n30:50 _vectors.rest.fragments.other.userProvided EXISTS", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" @@ -1190,11 +1190,11 @@ async fn vector_filter_regenerate() { })) .await; snapshot!(value, @r#" - { - "message": "Index `[uuid]`: Vector filter cannot have both `fragments` and `regenerate`.\n15:24 _vectors.rest.fragments.basic.regenerate EXISTS", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - } - "#); + { + "message": "The vector filter has leftover tokens.\n30:48 _vectors.rest.fragments.basic.regenerate EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); } diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 21a552965..4e67814d3 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -10,8 +10,8 @@ use memchr::memmem::Finder; use roaring::{MultiOps, RoaringBitmap}; use serde_json::Value; -use super::{facet_range_search, filter_vector::VectorFilter}; -use crate::constants::RESERVED_GEO_FIELD_NAME; +use super::facet_range_search; +use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::error::{Error, UserError}; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::heed_codec::facet::{ @@ -230,7 +230,7 @@ impl<'a> Filter<'a> { } pub fn use_vector_filter(&self) -> Option<&Token> { - self.condition.use_vector_filter() + dbg!(self.condition.use_vector_filter()) } } @@ -241,10 +241,10 @@ impl<'a> Filter<'a> { let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; for fid in self.condition.fids(MAX_FILTER_DEPTH) { - let attribute = fid.value(); + let attribute = dbg!(fid.value()); if matching_features(attribute, &filterable_attributes_rules) .is_some_and(|(_, features)| features.is_filterable()) - || VectorFilter::matches(attribute) + || attribute == RESERVED_VECTORS_FIELD_NAME { continue; } @@ -549,16 +549,6 @@ impl<'a> Filter<'a> { } FilterCondition::Condition { fid, op } => { let value = fid.value(); - if VectorFilter::matches(value) { - if !matches!(op, Condition::Exists) { - return Err(Error::UserError(UserError::InvalidFilter(String::from( - "Vector filter can only be used with the `exists` operator", - )))); - } - let vector_filter = VectorFilter::parse(fid)?; - return vector_filter.evaluate(rtxn, index, universe); - } - let Some(field_id) = field_ids_map.id(value) else { return Ok(RoaringBitmap::new()); }; @@ -616,6 +606,9 @@ impl<'a> Filter<'a> { Ok(RoaringBitmap::new()) } } + FilterCondition::VectorExists { fid: _, embedder, filter } => { + super::filter_vector::evaluate(rtxn, index, universe, embedder.clone(), filter) + } FilterCondition::GeoLowerThan { point, radius } => { if index.is_geo_filtering_enabled(rtxn)? { let base_point: [f64; 2] = diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 91f138685..2ddd801ed 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -1,45 +1,13 @@ -use filter_parser::Token; +use filter_parser::{Token, VectorFilter}; use roaring::{MultiOps, RoaringBitmap}; -use crate::error::{Error, UserError}; +use crate::error::Error; use crate::vector::db::IndexEmbeddingConfig; use crate::vector::{ArroyStats, ArroyWrapper}; use crate::Index; -#[derive(Debug)] -enum VectorFilterInner<'a> { - Fragment(Token<'a>), - DocumentTemplate, - UserProvided, - Regenerate, - None, -} - -#[derive(Debug)] -pub(super) struct VectorFilter<'a> { - embedder: Option>, - inner: VectorFilterInner<'a>, -} - #[derive(Debug, thiserror::Error)] pub enum VectorFilterError<'a> { - #[error("Vector filter cannot be empty.")] - EmptyFilter, - - #[error("Vector filter must start with `_vectors` but found `{}`.", _0.value())] - InvalidPrefix(Token<'a>), - - #[error("Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.")] - MissingFragmentName(Token<'a>), - - #[error("Vector filter cannot have both {}.", { - _0.iter().map(|t| format!("`{}`", t.value())).collect::>().join(" and ") - })] - ExclusiveOptions(Vec>), - - #[error("Vector filter has leftover token: `{}`.", _0.value())] - LeftoverToken(Token<'a>), - #[error("The embedder `{}` does not exist. {}", embedder.value(), { if available.is_empty() { String::from("This index does not have any configured embedders.") @@ -72,201 +40,113 @@ use VectorFilterError::*; impl<'a> From> for Error { fn from(err: VectorFilterError<'a>) -> Self { match &err { - EmptyFilter => Error::UserError(UserError::InvalidFilter(err.to_string())), - InvalidPrefix(token) | MissingFragmentName(token) | LeftoverToken(token) => { - token.clone().as_external_error(err).into() - } - ExclusiveOptions(tokens) => tokens - .first() - .cloned() - .unwrap_or_else(|| Token::from("")) // Should never happen: tokens is never created empty - .as_external_error(err) - .into(), EmbedderDoesNotExist { embedder: token, .. } | FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(), } } } -impl<'a> VectorFilter<'a> { - pub(super) fn matches(value: &str) -> bool { - value.starts_with("_vectors.") || value == "_vectors" +pub(super) fn evaluate( + rtxn: &heed::RoTxn<'_>, + index: &Index, + universe: Option<&RoaringBitmap>, + embedder: Option>, + filter: &VectorFilter<'_>, +) -> crate::Result { + let index_embedding_configs = index.embedding_configs(); + let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; + + let embedders = match embedder { + Some(embedder) => vec![embedder], + None => embedding_configs.iter().map(|config| Token::from(config.name.as_str())).collect(), + }; + + let mut docids = embedders + .iter() + .map(|e| evaluate_inner(rtxn, index, e, &embedding_configs, filter)) + .union()?; + + if let Some(universe) = universe { + docids &= universe; } - /// Parses a vector filter string. - /// - /// Valid formats: - /// - `_vectors` - /// - `_vectors.{embedder_name}` - /// - `_vectors.{embedder_name}.regenerate` - /// - `_vectors.{embedder_name}.userProvided` - /// - `_vectors.{embedder_name}.documentTemplate` - /// - `_vectors.{embedder_name}.fragments.{fragment_name}` - pub(super) fn parse(s: &'a Token<'a>) -> Result> { - let mut split = s.split(".").peekable(); - - match split.next() { - Some(token) if token.value() == "_vectors" => (), - Some(token) => return Err(InvalidPrefix(token)), - None => return Err(EmptyFilter), - } - - let embedder_name = split.next(); - - let mut fragment_tokens = None; - if split.peek().map(|t| t.value()) == Some("fragments") { - let token = split.next().expect("it was peeked before"); - let name = split.next().ok_or_else(|| MissingFragmentName(token.clone()))?; - - fragment_tokens = Some((token, name)); - } - - let mut remaining_tokens = split.collect::>(); - - let mut user_provided_token = None; - if let Some(position) = remaining_tokens.iter().position(|t| t.value() == "userProvided") { - user_provided_token = Some(remaining_tokens.remove(position)); - } - - let mut document_template_token = None; - if let Some(position) = - remaining_tokens.iter().position(|t| t.value() == "documentTemplate") - { - document_template_token = Some(remaining_tokens.remove(position)); - } - - let mut regenerate_token = None; - if let Some(position) = remaining_tokens.iter().position(|t| t.value() == "regenerate") { - regenerate_token = Some(remaining_tokens.remove(position)); - } - - if !remaining_tokens.is_empty() { - return Err(LeftoverToken(remaining_tokens.remove(0))); - } - - let inner = - match (fragment_tokens, user_provided_token, document_template_token, regenerate_token) - { - (Some((_token, name)), None, None, None) => VectorFilterInner::Fragment(name), - (None, Some(_), None, None) => VectorFilterInner::UserProvided, - (None, None, Some(_), None) => VectorFilterInner::DocumentTemplate, - (None, None, None, Some(_)) => VectorFilterInner::Regenerate, - (None, None, None, None) => VectorFilterInner::None, - (a, b, c, d) => { - let a = a.map(|(token, _)| token); - let present = [a, b, c, d].into_iter().flatten().collect(); - return Err(ExclusiveOptions(present)); - } - }; - - Ok(Self { inner, embedder: embedder_name }) - } - - pub(super) fn evaluate( - self, - rtxn: &heed::RoTxn<'_>, - index: &Index, - universe: Option<&RoaringBitmap>, - ) -> crate::Result { - let index_embedding_configs = index.embedding_configs(); - let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; - - let embedders = match self.embedder { - Some(embedder) => vec![embedder], - None => { - embedding_configs.iter().map(|config| Token::from(config.name.as_str())).collect() - } - }; - - let mut docids = embedders - .iter() - .map(|e| self.inner.evaluate(rtxn, index, e, &embedding_configs)) - .union()?; - - if let Some(universe) = universe { - docids &= universe; - } - - Ok(docids) - } + Ok(docids) } -impl VectorFilterInner<'_> { - fn evaluate( - &self, - rtxn: &heed::RoTxn<'_>, - index: &Index, - embedder: &Token<'_>, - embedding_configs: &[IndexEmbeddingConfig], - ) -> crate::Result { - let embedder_name = embedder.value(); - let available_embedders = - || embedding_configs.iter().map(|c| c.name.clone()).collect::>(); +fn evaluate_inner( + rtxn: &heed::RoTxn<'_>, + index: &Index, + embedder: &Token<'_>, + embedding_configs: &[IndexEmbeddingConfig], + filter: &VectorFilter<'_>, +) -> crate::Result { + let embedder_name = embedder.value(); + let available_embedders = + || embedding_configs.iter().map(|c| c.name.clone()).collect::>(); - let embedding_config = embedding_configs - .iter() - .find(|config| config.name == embedder_name) - .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; + let embedding_config = embedding_configs + .iter() + .find(|config| config.name == embedder_name) + .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; - let embedder_info = index - .embedding_configs() - .embedder_info(rtxn, embedder_name)? - .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; + let embedder_info = index + .embedding_configs() + .embedder_info(rtxn, embedder_name)? + .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; - let arroy_wrapper = ArroyWrapper::new( - index.vector_arroy, - embedder_info.embedder_id, - embedding_config.config.quantized(), - ); + let arroy_wrapper = ArroyWrapper::new( + index.vector_arroy, + embedder_info.embedder_id, + embedding_config.config.quantized(), + ); - let docids = match self { - VectorFilterInner::Fragment(fragment) => { - let fragment_name = fragment.value(); - let fragment_config = embedding_config - .fragments - .as_slice() - .iter() - .find(|fragment| fragment.name == fragment_name) - .ok_or_else(|| FragmentDoesNotExist { - embedder, - fragment, - available: embedding_config - .fragments - .as_slice() - .iter() - .map(|f| f.name.clone()) - .collect(), - })?; + let docids = match filter { + VectorFilter::Fragment(fragment) => { + let fragment_name = fragment.value(); + let fragment_config = embedding_config + .fragments + .as_slice() + .iter() + .find(|fragment| fragment.name == fragment_name) + .ok_or_else(|| FragmentDoesNotExist { + embedder, + fragment, + available: embedding_config + .fragments + .as_slice() + .iter() + .map(|f| f.name.clone()) + .collect(), + })?; - arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? + arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? + } + VectorFilter::DocumentTemplate => { + if !embedding_config.fragments.as_slice().is_empty() { + return Ok(RoaringBitmap::new()); } - VectorFilterInner::DocumentTemplate => { - if !embedding_config.fragments.as_slice().is_empty() { - return Ok(RoaringBitmap::new()); - } - let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); - let mut stats = ArroyStats::default(); - arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; - stats.documents - user_provided_docsids.clone() - } - VectorFilterInner::UserProvided => { - let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); - user_provided_docsids.clone() - } - VectorFilterInner::Regenerate => { - let mut stats = ArroyStats::default(); - arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; - let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids(); - stats.documents - skip_regenerate - } - VectorFilterInner::None => { - let mut stats = ArroyStats::default(); - arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; - stats.documents - } - }; + let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + stats.documents - user_provided_docsids.clone() + } + VectorFilter::UserProvided => { + let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + user_provided_docsids.clone() + } + VectorFilter::Regenerate => { + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids(); + stats.documents - skip_regenerate + } + VectorFilter::None => { + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + stats.documents + } + }; - Ok(docids) - } + Ok(docids) } From 8f1b697b911c41a86ba91a1f8119690dd8ecadae Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 24 Jul 2025 14:57:06 +0200 Subject: [PATCH 30/49] Format --- crates/filter-parser/src/condition.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index af0767706..1e8deef64 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -135,7 +135,8 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> // From this point, we are certain this is a vector filter, so our errors must be final. // We could use nom's `cut`` but it's better to be explicit about the errors - let (input, embedder_name) = parse_vector_value(input).map_cut(ErrorKind::VectorFilterInvalidEmbedder)?; + let (input, embedder_name) = + parse_vector_value(input).map_cut(ErrorKind::VectorFilterInvalidEmbedder)?; let (input, filter) = alt(( map( From ad068286856e5360952863d26066e6cac1c63e86 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 24 Jul 2025 15:24:42 +0200 Subject: [PATCH 31/49] Add tests on parser --- crates/filter-parser/src/lib.rs | 83 ++++++++++++++++++++++++++++++--- 1 file changed, 77 insertions(+), 6 deletions(-) diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index b5697f914..1e342d8d2 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -594,15 +594,18 @@ impl std::fmt::Display for FilterCondition<'_> { FilterCondition::VectorExists { fid: _, embedder, filter: inner } => { write!(f, "_vectors")?; if let Some(embedder) = embedder { - write!(f, ".{embedder:?}")?; + write!(f, ".{:?}", embedder.value())?; } match inner { - VectorFilter::Fragment(fragment) => write!(f, ".fragments.{fragment:?}"), - VectorFilter::DocumentTemplate => write!(f, ".documentTemplate"), - VectorFilter::UserProvided => write!(f, ".userProvided"), - VectorFilter::Regenerate => write!(f, ".regenerate"), - VectorFilter::None => Ok(()), + VectorFilter::Fragment(fragment) => { + write!(f, ".fragments.{:?}", fragment.value())? + } + VectorFilter::DocumentTemplate => write!(f, ".documentTemplate")?, + VectorFilter::UserProvided => write!(f, ".userProvided")?, + VectorFilter::Regenerate => write!(f, ".regenerate")?, + VectorFilter::None => (), } + write!(f, " EXISTS") } FilterCondition::GeoLowerThan { point, radius } => { write!(f, "_geoRadius({}, {}, {})", point[0], point[1], radius) @@ -677,6 +680,9 @@ pub mod tests { insta::assert_snapshot!(p(r"title = 'foo\\\\\\\\'"), @r#"{title} = {foo\\\\}"#); // but it also works with other sequences insta::assert_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}"); + + insta::assert_snapshot!(p(r#"_vectors." valid.name ".fragments."also.. valid! " EXISTS"#), @r#"_vectors." valid.name ".fragments."also.. valid! " EXISTS"#); + insta::assert_snapshot!(p("_vectors.\"\n\t\r\\\"\" EXISTS"), @r#"_vectors."\n\t\r\"" EXISTS"#); } #[test] @@ -739,6 +745,18 @@ pub mod tests { insta::assert_snapshot!(p("NOT subscribers IS NOT EMPTY"), @"{subscribers} IS EMPTY"); insta::assert_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)"); + // Test _vectors EXISTS + _vectors NOT EXITS + insta::assert_snapshot!(p("_vectors EXISTS"), @"_vectors EXISTS"); + insta::assert_snapshot!(p("_vectors.embedderName EXISTS"), @r#"_vectors."embedderName" EXISTS"#); + insta::assert_snapshot!(p("_vectors.embedderName.documentTemplate EXISTS"), @r#"_vectors."embedderName".documentTemplate EXISTS"#); + insta::assert_snapshot!(p("_vectors.embedderName.regenerate EXISTS"), @r#"_vectors."embedderName".regenerate EXISTS"#); + insta::assert_snapshot!(p("_vectors.embedderName.regenerate EXISTS"), @r#"_vectors."embedderName".regenerate EXISTS"#); + insta::assert_snapshot!(p("_vectors.embedderName.fragments.fragmentName EXISTS"), @r#"_vectors."embedderName".fragments."fragmentName" EXISTS"#); + insta::assert_snapshot!(p(" _vectors.embedderName.fragments.fragmentName EXISTS"), @r#"_vectors."embedderName".fragments."fragmentName" EXISTS"#); + insta::assert_snapshot!(p("NOT _vectors EXISTS"), @"NOT (_vectors EXISTS)"); + insta::assert_snapshot!(p(" NOT _vectors EXISTS"), @"NOT (_vectors EXISTS)"); + insta::assert_snapshot!(p(" _vectors NOT EXISTS"), @"NOT (_vectors EXISTS)"); + // Test EXISTS + NOT EXITS insta::assert_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); insta::assert_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); @@ -993,6 +1011,59 @@ pub mod tests { "### ); + insta::assert_snapshot!(p(r#"_vectors _vectors EXISTS"#), @r" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors _vectors EXISTS`. + 1:25 _vectors _vectors EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors. embedderName EXISTS"#), @r" + The vector filter's embedder is invalid. + 10:30 _vectors. embedderName EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors .embedderName EXISTS"#), @r" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors .embedderName EXISTS`. + 1:30 _vectors .embedderName EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName. EXISTS"#), @r" + The vector filter has leftover tokens. + 22:30 _vectors.embedderName. EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors."embedderName EXISTS"#), @r#" + The vector filter's embedder is invalid. + 30:30 _vectors."embedderName EXISTS + "#); + insta::assert_snapshot!(p(r#"_vectors."embedderNam"e EXISTS"#), @r#" + The vector filter has leftover tokens. + 23:31 _vectors."embedderNam"e EXISTS + "#); + insta::assert_snapshot!(p(r#"_vectors.embedderName.documentTemplate. EXISTS"#), @r" + The vector filter has leftover tokens. + 39:47 _vectors.embedderName.documentTemplate. EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments EXISTS"#), @r" + The vector filter is missing a fragment name. + 32:39 _vectors.embedderName.fragments EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments. EXISTS"#), @r" + The vector filter's fragment is invalid. + 33:40 _vectors.embedderName.fragments. EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments.test test EXISTS"#), @r" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors.embedderName.fragments.test test EXISTS`. + 1:49 _vectors.embedderName.fragments.test test EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments. test EXISTS"#), @r" + The vector filter's fragment is invalid. + 33:45 _vectors.embedderName.fragments. test EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName .fragments. test EXISTS"#), @r" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors.embedderName .fragments. test EXISTS`. + 1:46 _vectors.embedderName .fragments. test EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName .fragments.test EXISTS"#), @r" + Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors.embedderName .fragments.test EXISTS`. + 1:45 _vectors.embedderName .fragments.test EXISTS + "); + insta::assert_snapshot!(p(r#"NOT OR EXISTS AND EXISTS NOT EXISTS"#), @r###" Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. 5:7 NOT OR EXISTS AND EXISTS NOT EXISTS From a92e36ab836626bfbbcc2bdade5d029f0a458ba2 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 24 Jul 2025 15:28:17 +0200 Subject: [PATCH 32/49] Small improvements --- crates/filter-parser/src/condition.rs | 2 +- crates/filter-parser/src/error.rs | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index 1e8deef64..4d156c269 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -133,7 +133,7 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> let (input, _) = char('.')(input)?; // From this point, we are certain this is a vector filter, so our errors must be final. - // We could use nom's `cut`` but it's better to be explicit about the errors + // We could use nom's `cut` but it's better to be explicit about the errors let (input, embedder_name) = parse_vector_value(input).map_cut(ErrorKind::VectorFilterInvalidEmbedder)?; diff --git a/crates/filter-parser/src/error.rs b/crates/filter-parser/src/error.rs index cf2419b01..bbf2c8d17 100644 --- a/crates/filter-parser/src/error.rs +++ b/crates/filter-parser/src/error.rs @@ -78,6 +78,10 @@ pub enum ErrorKind<'a> { GeoBoundingBox, MisusedGeoRadius, MisusedGeoBoundingBox, + VectorFilterLeftover, + VectorFilterInvalidEmbedder, + VectorFilterMissingFragment, + VectorFilterInvalidFragment, InvalidPrimary, InvalidEscapedNumber, ExpectedEof, @@ -93,11 +97,6 @@ pub enum ErrorKind<'a> { InternalError(error::ErrorKind), DepthLimitReached, External(String), - - VectorFilterLeftover, - VectorFilterInvalidEmbedder, - VectorFilterMissingFragment, - VectorFilterInvalidFragment, } impl<'a> Error<'a> { From dbb670a9eeabed8c4d1a465fb8a0223296988086 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 24 Jul 2025 15:28:58 +0200 Subject: [PATCH 33/49] Remove old split function --- crates/filter-parser/src/lib.rs | 109 -------------------------------- 1 file changed, 109 deletions(-) diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 1e342d8d2..608f73290 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -124,16 +124,6 @@ impl<'a> Token<'a> { Err(Error::new_from_kind(self.span, ErrorKind::NonFiniteFloat)) } } - - /// Split the token by a delimiter and return an iterator of tokens. - /// Each token in the iterator will have its own span that corresponds to a slice of the original token's span. - pub fn split(&self, delimiter: &'a str) -> impl Iterator> + '_ { - let original_addr = self.value().as_ptr() as usize; - self.value().split(delimiter).map(move |part| { - let offset = part.as_ptr() as usize - original_addr; - Token::new(self.span.slice(offset..offset + part.len()), Some(part.to_string())) - }) - } } impl<'a> From> for Token<'a> { @@ -1161,103 +1151,4 @@ pub mod tests { let token: Token = s.into(); assert_eq!(token.value(), s); } - - #[test] - fn split() { - let s = "test string that should not be parsed\n newline"; - let token: Token = s.into(); - let parts: Vec<_> = token.split(" ").collect(); - insta::assert_snapshot!(format!("{parts:#?}"), @r#" - [ - Token { - span: LocatedSpan { - offset: 0, - line: 1, - fragment: "test", - extra: "test string that should not be parsed\n newline", - }, - value: Some( - "test", - ), - }, - Token { - span: LocatedSpan { - offset: 5, - line: 1, - fragment: "string", - extra: "test string that should not be parsed\n newline", - }, - value: Some( - "string", - ), - }, - Token { - span: LocatedSpan { - offset: 12, - line: 1, - fragment: "that", - extra: "test string that should not be parsed\n newline", - }, - value: Some( - "that", - ), - }, - Token { - span: LocatedSpan { - offset: 17, - line: 1, - fragment: "should", - extra: "test string that should not be parsed\n newline", - }, - value: Some( - "should", - ), - }, - Token { - span: LocatedSpan { - offset: 24, - line: 1, - fragment: "not", - extra: "test string that should not be parsed\n newline", - }, - value: Some( - "not", - ), - }, - Token { - span: LocatedSpan { - offset: 28, - line: 1, - fragment: "be", - extra: "test string that should not be parsed\n newline", - }, - value: Some( - "be", - ), - }, - Token { - span: LocatedSpan { - offset: 31, - line: 1, - fragment: "parsed\n", - extra: "test string that should not be parsed\n newline", - }, - value: Some( - "parsed\n", - ), - }, - Token { - span: LocatedSpan { - offset: 39, - line: 2, - fragment: "newline", - extra: "test string that should not be parsed\n newline", - }, - value: Some( - "newline", - ), - }, - ] - "#); - } } From 4264abda23d9b1723a0a0e89d94076bbc10a1214 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 24 Jul 2025 15:30:36 +0200 Subject: [PATCH 34/49] Remove debugs --- crates/milli/src/search/facet/filter.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 4e67814d3..1ddfe96c7 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -230,7 +230,7 @@ impl<'a> Filter<'a> { } pub fn use_vector_filter(&self) -> Option<&Token> { - dbg!(self.condition.use_vector_filter()) + self.condition.use_vector_filter() } } @@ -241,7 +241,7 @@ impl<'a> Filter<'a> { let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; for fid in self.condition.fids(MAX_FILTER_DEPTH) { - let attribute = dbg!(fid.value()); + let attribute = fid.value(); if matching_features(attribute, &filterable_attributes_rules) .is_some_and(|(_, features)| features.is_filterable()) || attribute == RESERVED_VECTORS_FIELD_NAME From 13d38d59bfacfe7dbefe3de003de3bddee44df4c Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Thu, 24 Jul 2025 15:44:11 +0200 Subject: [PATCH 35/49] Remove useless import --- crates/filter-parser/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 608f73290..ae11ccf55 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -60,7 +60,7 @@ use nom::combinator::{cut, eof, map, opt}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, terminated, tuple}; -use nom::{Finish, Slice}; +use nom::Finish; use nom_locate::LocatedSpan; pub(crate) use value::parse_value; use value::word_exact; From 6c3dd83ae528a8b82614880c0206cd7886a187c7 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 29 Jul 2025 09:03:48 +0200 Subject: [PATCH 36/49] Fix old test --- crates/milli/src/test_index.rs | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index 0ec348301..a6ed9ad91 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -1340,11 +1340,10 @@ fn vectors_are_never_indexed_as_searchable_or_filterable() { assert!(results.candidates.is_empty()); let mut search = index.search(&rtxn); - let results = search - .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) - .execute() - .unwrap_err(); - assert!(matches!(results, Error::UserError(UserError::InvalidFilter(_)))); + let results = + dbg!(search.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()).execute()) + .unwrap(); + assert!(results.candidates.is_empty()); index .update_settings(|settings| { @@ -1375,6 +1374,6 @@ fn vectors_are_never_indexed_as_searchable_or_filterable() { let results = search .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) .execute() - .unwrap_err(); - assert!(matches!(results, Error::UserError(UserError::InvalidFilter(_)))); + .unwrap(); + assert!(results.candidates.is_empty()); } From 66b6e47494c0d6809c0c78fd7f794dcd673afc62 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 29 Jul 2025 10:52:21 +0200 Subject: [PATCH 37/49] Remove warning --- crates/milli/src/test_index.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index a6ed9ad91..12ac4e158 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -19,9 +19,7 @@ use crate::update::{ }; use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; use crate::vector::RuntimeEmbedders; -use crate::{ - db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult, UserError, -}; +use crate::{db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult}; pub(crate) struct TempIndex { pub inner: Index, From 3580b3a4ef5370c5168954378e2aa4df9464b6af Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 29 Jul 2025 10:56:54 +0200 Subject: [PATCH 38/49] Remove userProvided from fragments --- crates/meilisearch/tests/search/filters.rs | 5 +---- crates/milli/src/search/facet/filter_vector.rs | 5 ++++- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 67f9ebb71..209261f70 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -930,9 +930,6 @@ async fn vector_filter_specific_fragment() { { "name": "kefir" }, - { - "name": "echo" - }, { "name": "intel" }, @@ -944,7 +941,7 @@ async fn vector_filter_specific_fragment() { "processingTimeMs": "[duration]", "limit": 20, "offset": 0, - "estimatedTotalHits": 4 + "estimatedTotalHits": 3 } "#); } diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 2ddd801ed..2cde3aaa7 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -119,7 +119,10 @@ fn evaluate_inner( .collect(), })?; - arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())? + let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| { + bitmap.clone() - user_provided_docsids + })? } VectorFilter::DocumentTemplate => { if !embedding_config.fragments.as_slice().is_empty() { From 223df5a43346ea4bdd2c63218c9e0985d21a9f30 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 29 Jul 2025 11:02:59 +0200 Subject: [PATCH 39/49] Remove incorrect break --- crates/milli/src/vector/mod.rs | 6 ------ 1 file changed, 6 deletions(-) diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index f64223e41..088d98b72 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -556,9 +556,6 @@ impl ArroyWrapper { for reader in self.readers(rtxn, self.quantized_db()) { let reader = reader?; let documents = reader.item_ids(); - if documents.is_empty() { - break; - } stats.documents |= documents; stats.number_of_embeddings += documents.len(); } @@ -566,9 +563,6 @@ impl ArroyWrapper { for reader in self.readers(rtxn, self.angular_db()) { let reader = reader?; let documents = reader.item_ids(); - if documents.is_empty() { - break; - } stats.documents |= documents; stats.number_of_embeddings += documents.len(); } From 93864009cc68a5af622ef0ce2f6d7f96ec85f332 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 29 Jul 2025 11:04:08 +0200 Subject: [PATCH 40/49] Rename variable with typo Co-Authored-By: Louis Dureuil --- crates/milli/src/search/facet/filter_vector.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 2cde3aaa7..625bd5dde 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -119,9 +119,9 @@ fn evaluate_inner( .collect(), })?; - let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + let user_provided_docids = embedder_info.embedding_status.user_provided_docids(); arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| { - bitmap.clone() - user_provided_docsids + bitmap.clone() - user_provided_docids })? } VectorFilter::DocumentTemplate => { @@ -129,14 +129,14 @@ fn evaluate_inner( return Ok(RoaringBitmap::new()); } - let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); + let user_provided_docids = embedder_info.embedding_status.user_provided_docids(); let mut stats = ArroyStats::default(); arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; - stats.documents - user_provided_docsids.clone() + stats.documents - user_provided_docids.clone() } VectorFilter::UserProvided => { - let user_provided_docsids = embedder_info.embedding_status.user_provided_docids(); - user_provided_docsids.clone() + let user_provided_docids = embedder_info.embedding_status.user_provided_docids(); + user_provided_docids.clone() } VectorFilter::Regenerate => { let mut stats = ArroyStats::default(); From 60acdf8574e9e0d2e7e26eb553dd5410554f8b9c Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 29 Jul 2025 11:05:16 +0200 Subject: [PATCH 41/49] Fix grammar Co-Authored-By: Louis Dureuil --- crates/meilisearch/tests/search/filters.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 209261f70..76f252a6d 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -788,7 +788,7 @@ async fn vector_filter_missing_fragment() { } #[actix_rt::test] -async fn vector_filter_non_existant_embedder() { +async fn vector_filter_nonexistent_embedder() { let index = shared_index_for_fragments().await; let (value, _code) = index From 2121819c66a0d66b9627a46962316045f77a1af0 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 5 Aug 2025 14:18:45 +0200 Subject: [PATCH 42/49] Fix tests --- crates/meilisearch/tests/search/filters.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 76f252a6d..8e3ee9249 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -1035,7 +1035,7 @@ async fn vector_filter_document_template() { ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await.succeeded(); + server.wait_task(value.uid()).await.succeeded(); let (value, _code) = index .search_post(json!({ From b5ba0e42b32ced20909a961b056a2b329be7c479 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 13 Aug 2025 09:58:16 +0200 Subject: [PATCH 43/49] Add new error --- crates/filter-parser/src/condition.rs | 7 +++++++ crates/filter-parser/src/error.rs | 10 +++++++++- crates/filter-parser/src/lib.rs | 12 ++++++------ crates/filter-parser/src/value.rs | 18 ++++-------------- 4 files changed, 26 insertions(+), 21 deletions(-) diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index 4d156c269..24c6c50cc 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -135,6 +135,13 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> // From this point, we are certain this is a vector filter, so our errors must be final. // We could use nom's `cut` but it's better to be explicit about the errors + if let Ok((_, space)) = tag::<_, _, ()>(" ")(input) { + return Err(crate::Error::new_failure_from_kind( + space, + ErrorKind::VectorFilterMissingEmbedder, + )); + } + let (input, embedder_name) = parse_vector_value(input).map_cut(ErrorKind::VectorFilterInvalidEmbedder)?; diff --git a/crates/filter-parser/src/error.rs b/crates/filter-parser/src/error.rs index bbf2c8d17..a5905b1cb 100644 --- a/crates/filter-parser/src/error.rs +++ b/crates/filter-parser/src/error.rs @@ -54,7 +54,7 @@ impl<'a, T> IResultExt<'a> for IResult<'a, T> { nom::Err::Error(e) => *e.context(), nom::Err::Failure(e) => *e.context(), }; - nom::Err::Failure(Error::new_from_kind(input, kind)) + Error::new_failure_from_kind(input, kind) }) } } @@ -79,6 +79,7 @@ pub enum ErrorKind<'a> { MisusedGeoRadius, MisusedGeoBoundingBox, VectorFilterLeftover, + VectorFilterMissingEmbedder, VectorFilterInvalidEmbedder, VectorFilterMissingFragment, VectorFilterInvalidFragment, @@ -112,6 +113,10 @@ impl<'a> Error<'a> { Self { context, kind } } + pub fn new_failure_from_kind(context: Span<'a>, kind: ErrorKind<'a>) -> nom::Err { + nom::Err::Failure(Self::new_from_kind(context, kind)) + } + pub fn new_from_external(context: Span<'a>, error: impl std::error::Error) -> Self { Self::new_from_kind(context, ErrorKind::External(error.to_string())) } @@ -199,6 +204,9 @@ impl Display for Error<'_> { ErrorKind::VectorFilterMissingFragment => { writeln!(f, "The vector filter is missing a fragment name.")? } + ErrorKind::VectorFilterMissingEmbedder => { + writeln!(f, "Was expecting embedder name but found nothing.")? + } ErrorKind::VectorFilterInvalidEmbedder => { writeln!(f, "The vector filter's embedder is invalid.")? } diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index ae11ccf55..8ecbf5dc4 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -437,7 +437,7 @@ fn parse_geo_bounding_box(input: Span) -> IResult { let (input, args) = parsed?; if args.len() != 2 || args[0].len() != 2 || args[1].len() != 2 { - return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::GeoBoundingBox))); + return Err(Error::new_failure_from_kind(input, ErrorKind::GeoBoundingBox)); } let res = FilterCondition::GeoBoundingBox { @@ -458,7 +458,7 @@ fn parse_geo_point(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))))?; // if we succeeded we still return a `Failure` because geoPoints are not allowed - Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) + Err(Error::new_failure_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))) } /// geoPoint = WS* "_geoDistance(float WS* "," WS* float WS* "," WS* float) @@ -472,7 +472,7 @@ fn parse_geo_distance(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoDistance"))))?; // if we succeeded we still return a `Failure` because `geoDistance` filters are not allowed - Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoDistance")))) + Err(Error::new_failure_from_kind(input, ErrorKind::ReservedGeo("_geoDistance"))) } /// geo = WS* "_geo(float WS* "," WS* float WS* "," WS* float) @@ -486,7 +486,7 @@ fn parse_geo(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geo"))))?; // if we succeeded we still return a `Failure` because `_geo` filter is not allowed - Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geo")))) + Err(Error::new_failure_from_kind(input, ErrorKind::ReservedGeo("_geo"))) } fn parse_error_reserved_keyword(input: Span) -> IResult { @@ -1006,8 +1006,8 @@ pub mod tests { 1:25 _vectors _vectors EXISTS "); insta::assert_snapshot!(p(r#"_vectors. embedderName EXISTS"#), @r" - The vector filter's embedder is invalid. - 10:30 _vectors. embedderName EXISTS + Was expecting embedder name but found nothing. + 10:11 _vectors. embedderName EXISTS "); insta::assert_snapshot!(p(r#"_vectors .embedderName EXISTS"#), @r" Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors .embedderName EXISTS`. diff --git a/crates/filter-parser/src/value.rs b/crates/filter-parser/src/value.rs index 345f0b0a2..ac645799b 100644 --- a/crates/filter-parser/src/value.rs +++ b/crates/filter-parser/src/value.rs @@ -132,31 +132,21 @@ pub fn parse_value(input: Span) -> IResult { } match parse_geo_radius(input) { - Ok(_) => { - return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::MisusedGeoRadius))) - } + Ok(_) => return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoRadius)), // if we encountered a failure it means the user badly wrote a _geoRadius filter. // But instead of showing them how to fix his syntax we are going to tell them they should not use this filter as a value. Err(e) if e.is_failure() => { - return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::MisusedGeoRadius))) + return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoRadius)) } _ => (), } match parse_geo_bounding_box(input) { - Ok(_) => { - return Err(nom::Err::Failure(Error::new_from_kind( - input, - ErrorKind::MisusedGeoBoundingBox, - ))) - } + Ok(_) => return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)), // if we encountered a failure it means the user badly wrote a _geoBoundingBox filter. // But instead of showing them how to fix his syntax we are going to tell them they should not use this filter as a value. Err(e) if e.is_failure() => { - return Err(nom::Err::Failure(Error::new_from_kind( - input, - ErrorKind::MisusedGeoBoundingBox, - ))) + return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)) } _ => (), } From f6559258ced2f025a8e54dc9be0d7e41be82d0dc Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 13 Aug 2025 10:32:28 +0200 Subject: [PATCH 44/49] Improve operation error on vector filters --- crates/filter-parser/src/condition.rs | 27 +++++++++++++++------------ crates/filter-parser/src/error.rs | 4 ++++ crates/filter-parser/src/lib.rs | 24 ++++++++++++------------ 3 files changed, 31 insertions(+), 24 deletions(-) diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index 24c6c50cc..12542110a 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -164,21 +164,24 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> Ok((input, (Token::from(fid), Some(embedder_name), filter))) } -/// vectors_exists = vectors "EXISTS" +/// vectors_exists = vectors ("EXISTS" | ("NOT" WS+ "EXISTS")) pub fn parse_vectors_exists(input: Span) -> IResult { - let (input, (fid, embedder, filter)) = terminated(parse_vectors, tag("EXISTS"))(input)?; - - Ok((input, FilterCondition::VectorExists { fid, embedder, filter })) -} -/// vectors_not_exists = vectors "NOT" WS+ "EXISTS" -pub fn parse_vectors_not_exists(input: Span) -> IResult { let (input, (fid, embedder, filter)) = parse_vectors(input)?; - let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?; - Ok(( - input, - FilterCondition::Not(Box::new(FilterCondition::VectorExists { fid, embedder, filter })), - )) + // Try parsing "EXISTS" first + if let Ok((input, _)) = tag::<_, _, ()>("EXISTS")(input) { + return Ok((input, FilterCondition::VectorExists { fid, embedder, filter })); + } + + // Try parsing "NOT EXISTS" + if let Ok((input, _)) = tuple::<_, _, (), _>((tag("NOT"), multispace1, tag("EXISTS")))(input) { + return Ok(( + input, + FilterCondition::Not(Box::new(FilterCondition::VectorExists { fid, embedder, filter })), + )); + } + + Err(crate::Error::new_failure_from_kind(input, ErrorKind::VectorFilterOperation)) } /// contains = value "CONTAINS" value diff --git a/crates/filter-parser/src/error.rs b/crates/filter-parser/src/error.rs index a5905b1cb..f6356f3fd 100644 --- a/crates/filter-parser/src/error.rs +++ b/crates/filter-parser/src/error.rs @@ -83,6 +83,7 @@ pub enum ErrorKind<'a> { VectorFilterInvalidEmbedder, VectorFilterMissingFragment, VectorFilterInvalidFragment, + VectorFilterOperation, InvalidPrimary, InvalidEscapedNumber, ExpectedEof, @@ -210,6 +211,9 @@ impl Display for Error<'_> { ErrorKind::VectorFilterInvalidEmbedder => { writeln!(f, "The vector filter's embedder is invalid.")? } + ErrorKind::VectorFilterOperation => { + writeln!(f, "Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter.")? + } ErrorKind::ReservedKeyword(word) => { writeln!(f, "`{word}` is a reserved keyword and thus cannot be used as a field name unless it is put inside quotes. Use \"{word}\" or \'{word}\' instead.")? } diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 8ecbf5dc4..2c8dfac80 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -65,7 +65,7 @@ use nom_locate::LocatedSpan; pub(crate) use value::parse_value; use value::word_exact; -use crate::condition::{parse_vectors_exists, parse_vectors_not_exists}; +use crate::condition::parse_vectors_exists; use crate::error::IResultExt; pub type Span<'a> = LocatedSpan<&'a str, &'a str>; @@ -525,7 +525,7 @@ fn parse_primary(input: Span, depth: usize) -> IResult { parse_is_not_null, parse_is_empty, parse_is_not_empty, - alt((parse_vectors_exists, parse_vectors_not_exists, parse_exists, parse_not_exists)), + alt((parse_vectors_exists, parse_exists, parse_not_exists)), parse_to, parse_contains, parse_not_contains, @@ -1002,16 +1002,16 @@ pub mod tests { ); insta::assert_snapshot!(p(r#"_vectors _vectors EXISTS"#), @r" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors _vectors EXISTS`. - 1:25 _vectors _vectors EXISTS + Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. + 10:25 _vectors _vectors EXISTS "); insta::assert_snapshot!(p(r#"_vectors. embedderName EXISTS"#), @r" Was expecting embedder name but found nothing. 10:11 _vectors. embedderName EXISTS "); insta::assert_snapshot!(p(r#"_vectors .embedderName EXISTS"#), @r" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors .embedderName EXISTS`. - 1:30 _vectors .embedderName EXISTS + Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. + 10:30 _vectors .embedderName EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName. EXISTS"#), @r" The vector filter has leftover tokens. @@ -1038,20 +1038,20 @@ pub mod tests { 33:40 _vectors.embedderName.fragments. EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments.test test EXISTS"#), @r" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors.embedderName.fragments.test test EXISTS`. - 1:49 _vectors.embedderName.fragments.test test EXISTS + Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. + 38:49 _vectors.embedderName.fragments.test test EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments. test EXISTS"#), @r" The vector filter's fragment is invalid. 33:45 _vectors.embedderName.fragments. test EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName .fragments. test EXISTS"#), @r" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors.embedderName .fragments. test EXISTS`. - 1:46 _vectors.embedderName .fragments. test EXISTS + Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. + 23:46 _vectors.embedderName .fragments. test EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName .fragments.test EXISTS"#), @r" - Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `CONTAINS`, `NOT CONTAINS`, `STARTS WITH`, `NOT STARTS WITH`, `_geoRadius`, or `_geoBoundingBox` at `_vectors.embedderName .fragments.test EXISTS`. - 1:45 _vectors.embedderName .fragments.test EXISTS + Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. + 23:45 _vectors.embedderName .fragments.test EXISTS "); insta::assert_snapshot!(p(r#"NOT OR EXISTS AND EXISTS NOT EXISTS"#), @r###" From 666ae1a3e745f167946cacc039ca511bd9fc52c9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 13 Aug 2025 13:00:38 +0200 Subject: [PATCH 45/49] Add "did you mean" message --- Cargo.lock | 1 + crates/filter-parser/Cargo.toml | 1 + crates/filter-parser/src/condition.rs | 18 ++++++++++++----- crates/filter-parser/src/error.rs | 29 +++++++++++++++++++++++++-- crates/filter-parser/src/lib.rs | 20 ++++++++++-------- crates/filter-parser/src/value.rs | 8 ++++---- 6 files changed, 58 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8413b3d14..43f491f1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2031,6 +2031,7 @@ name = "filter-parser" version = "1.16.0" dependencies = [ "insta", + "levenshtein_automata", "nom", "nom_locate", "unescaper", diff --git a/crates/filter-parser/Cargo.toml b/crates/filter-parser/Cargo.toml index 6eeb0794b..173cabd4b 100644 --- a/crates/filter-parser/Cargo.toml +++ b/crates/filter-parser/Cargo.toml @@ -15,6 +15,7 @@ license.workspace = true nom = "7.1.3" nom_locate = "4.2.0" unescaper = "0.1.6" +levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } [dev-dependencies] # fixed version due to format breakages in v1.40 diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index 12542110a..e4795d048 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -19,6 +19,7 @@ use Condition::*; use crate::error::IResultExt; use crate::value::parse_vector_value; +use crate::Error; use crate::ErrorKind; use crate::VectorFilter; use crate::{parse_value, FilterCondition, IResult, Span, Token}; @@ -136,10 +137,7 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> // We could use nom's `cut` but it's better to be explicit about the errors if let Ok((_, space)) = tag::<_, _, ()>(" ")(input) { - return Err(crate::Error::new_failure_from_kind( - space, - ErrorKind::VectorFilterMissingEmbedder, - )); + return Err(crate::Error::failure_from_kind(space, ErrorKind::VectorFilterMissingEmbedder)); } let (input, embedder_name) = @@ -159,6 +157,16 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> value(VectorFilter::None, nom::combinator::success("")), ))(input)?; + if let Ok((input, point)) = tag::<_, _, ()>(".")(input) { + let opt_value = parse_vector_value(input).ok().map(|(_, v)| v); + let value = opt_value + .as_ref() + .map(|v| v.original_span().to_string()) + .unwrap_or_else(|| point.to_string()); + let context = opt_value.map(|v| v.original_span()).unwrap_or(point); + return Err(Error::failure_from_kind(context, ErrorKind::VectorFilterUnknownSuffix(value))); + } + let (input, _) = multispace1(input).map_cut(ErrorKind::VectorFilterLeftover)?; Ok((input, (Token::from(fid), Some(embedder_name), filter))) @@ -181,7 +189,7 @@ pub fn parse_vectors_exists(input: Span) -> IResult { )); } - Err(crate::Error::new_failure_from_kind(input, ErrorKind::VectorFilterOperation)) + Err(crate::Error::failure_from_kind(input, ErrorKind::VectorFilterOperation)) } /// contains = value "CONTAINS" value diff --git a/crates/filter-parser/src/error.rs b/crates/filter-parser/src/error.rs index f6356f3fd..91ae2e33c 100644 --- a/crates/filter-parser/src/error.rs +++ b/crates/filter-parser/src/error.rs @@ -54,7 +54,7 @@ impl<'a, T> IResultExt<'a> for IResult<'a, T> { nom::Err::Error(e) => *e.context(), nom::Err::Failure(e) => *e.context(), }; - Error::new_failure_from_kind(input, kind) + Error::failure_from_kind(input, kind) }) } } @@ -83,6 +83,7 @@ pub enum ErrorKind<'a> { VectorFilterInvalidEmbedder, VectorFilterMissingFragment, VectorFilterInvalidFragment, + VectorFilterUnknownSuffix(String), VectorFilterOperation, InvalidPrimary, InvalidEscapedNumber, @@ -114,7 +115,7 @@ impl<'a> Error<'a> { Self { context, kind } } - pub fn new_failure_from_kind(context: Span<'a>, kind: ErrorKind<'a>) -> nom::Err { + pub fn failure_from_kind(context: Span<'a>, kind: ErrorKind<'a>) -> nom::Err { nom::Err::Failure(Self::new_from_kind(context, kind)) } @@ -155,6 +156,20 @@ impl Display for Error<'_> { // first line being the diagnostic and the second line being the incriminated filter. let escaped_input = input.escape_debug(); + fn key_suggestion<'a>(key: &str, keys: &[&'a str]) -> Option<&'a str> { + let typos = + levenshtein_automata::LevenshteinAutomatonBuilder::new(2, true).build_dfa(key); + for key in keys.iter() { + match typos.eval(key) { + levenshtein_automata::Distance::Exact(_) => { + return Some(key); + } + levenshtein_automata::Distance::AtLeast(_) => continue, + } + } + None + } + match &self.kind { ErrorKind::ExpectedValue(_) if input.trim().is_empty() => { writeln!(f, "Was expecting a value but instead got nothing.")? @@ -199,6 +214,16 @@ impl Display for Error<'_> { ErrorKind::VectorFilterLeftover => { writeln!(f, "The vector filter has leftover tokens.")? } + ErrorKind::VectorFilterUnknownSuffix(value) if value.as_str() == "." => { + writeln!(f, "Was expecting one of `.fragments`, `.userProvided`, `.documentTemplate`, `.regenerate` or nothing, but instead found a point without a valid value.")?; + } + ErrorKind::VectorFilterUnknownSuffix(value) => { + if let Some(suggestion) = key_suggestion(value, &["fragments", "userProvided", "documentTemplate", "regenerate"]) { + writeln!(f, "Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `{value}`. Did you mean `{suggestion}`?")?; + } else { + writeln!(f, "Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `{value}`.")?; + } + } ErrorKind::VectorFilterInvalidFragment => { writeln!(f, "The vector filter's fragment is invalid.")? } diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 2c8dfac80..75cbecc26 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -437,7 +437,7 @@ fn parse_geo_bounding_box(input: Span) -> IResult { let (input, args) = parsed?; if args.len() != 2 || args[0].len() != 2 || args[1].len() != 2 { - return Err(Error::new_failure_from_kind(input, ErrorKind::GeoBoundingBox)); + return Err(Error::failure_from_kind(input, ErrorKind::GeoBoundingBox)); } let res = FilterCondition::GeoBoundingBox { @@ -458,7 +458,7 @@ fn parse_geo_point(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))))?; // if we succeeded we still return a `Failure` because geoPoints are not allowed - Err(Error::new_failure_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))) + Err(Error::failure_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))) } /// geoPoint = WS* "_geoDistance(float WS* "," WS* float WS* "," WS* float) @@ -472,7 +472,7 @@ fn parse_geo_distance(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoDistance"))))?; // if we succeeded we still return a `Failure` because `geoDistance` filters are not allowed - Err(Error::new_failure_from_kind(input, ErrorKind::ReservedGeo("_geoDistance"))) + Err(Error::failure_from_kind(input, ErrorKind::ReservedGeo("_geoDistance"))) } /// geo = WS* "_geo(float WS* "," WS* float WS* "," WS* float) @@ -486,7 +486,7 @@ fn parse_geo(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geo"))))?; // if we succeeded we still return a `Failure` because `_geo` filter is not allowed - Err(Error::new_failure_from_kind(input, ErrorKind::ReservedGeo("_geo"))) + Err(Error::failure_from_kind(input, ErrorKind::ReservedGeo("_geo"))) } fn parse_error_reserved_keyword(input: Span) -> IResult { @@ -1014,8 +1014,8 @@ pub mod tests { 10:30 _vectors .embedderName EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName. EXISTS"#), @r" - The vector filter has leftover tokens. - 22:30 _vectors.embedderName. EXISTS + Was expecting one of `.fragments`, `.userProvided`, `.documentTemplate`, `.regenerate` or nothing, but instead found a point without a valid value. + 22:23 _vectors.embedderName. EXISTS "); insta::assert_snapshot!(p(r#"_vectors."embedderName EXISTS"#), @r#" The vector filter's embedder is invalid. @@ -1026,8 +1026,8 @@ pub mod tests { 23:31 _vectors."embedderNam"e EXISTS "#); insta::assert_snapshot!(p(r#"_vectors.embedderName.documentTemplate. EXISTS"#), @r" - The vector filter has leftover tokens. - 39:47 _vectors.embedderName.documentTemplate. EXISTS + Was expecting one of `.fragments`, `.userProvided`, `.documentTemplate`, `.regenerate` or nothing, but instead found a point without a valid value. + 39:40 _vectors.embedderName.documentTemplate. EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments EXISTS"#), @r" The vector filter is missing a fragment name. @@ -1053,6 +1053,10 @@ pub mod tests { Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. 23:45 _vectors.embedderName .fragments.test EXISTS "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fargments.test EXISTS"#), @r" + Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `fargments`. Did you mean `fragments`? + 23:32 _vectors.embedderName.fargments.test EXISTS + "); insta::assert_snapshot!(p(r#"NOT OR EXISTS AND EXISTS NOT EXISTS"#), @r###" Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. diff --git a/crates/filter-parser/src/value.rs b/crates/filter-parser/src/value.rs index ac645799b..dac96b4f4 100644 --- a/crates/filter-parser/src/value.rs +++ b/crates/filter-parser/src/value.rs @@ -132,21 +132,21 @@ pub fn parse_value(input: Span) -> IResult { } match parse_geo_radius(input) { - Ok(_) => return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoRadius)), + Ok(_) => return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoRadius)), // if we encountered a failure it means the user badly wrote a _geoRadius filter. // But instead of showing them how to fix his syntax we are going to tell them they should not use this filter as a value. Err(e) if e.is_failure() => { - return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoRadius)) + return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoRadius)) } _ => (), } match parse_geo_bounding_box(input) { - Ok(_) => return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)), + Ok(_) => return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)), // if we encountered a failure it means the user badly wrote a _geoBoundingBox filter. // But instead of showing them how to fix his syntax we are going to tell them they should not use this filter as a value. Err(e) if e.is_failure() => { - return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)) + return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)) } _ => (), } From b80869f2beaab82705911a10afe8bcfeea2324ab Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 13 Aug 2025 13:16:25 +0200 Subject: [PATCH 46/49] Add two other "did you mean" messages --- crates/filter-parser/src/condition.rs | 6 ++--- crates/meilisearch/tests/search/filters.rs | 4 +-- crates/milli/src/error.rs | 26 +++++++++++++++++++ .../milli/src/search/facet/filter_vector.rs | 8 +++--- 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index e4795d048..bdc5038e8 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -159,10 +159,8 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> if let Ok((input, point)) = tag::<_, _, ()>(".")(input) { let opt_value = parse_vector_value(input).ok().map(|(_, v)| v); - let value = opt_value - .as_ref() - .map(|v| v.original_span().to_string()) - .unwrap_or_else(|| point.to_string()); + let value = + opt_value.as_ref().map(|v| v.value().to_owned()).unwrap_or_else(|| point.to_string()); let context = opt_value.map(|v| v.original_span()).unwrap_or(point); return Err(Error::failure_from_kind(context, ErrorKind::VectorFilterUnknownSuffix(value))); } diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 8e3ee9249..9cd6575ac 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -952,13 +952,13 @@ async fn vector_filter_non_existant_fragment() { let (value, _code) = index .search_post(json!({ - "filter": "_vectors.rest.fragments.other EXISTS", + "filter": "_vectors.rest.fragments.withBred EXISTS", "attributesToRetrieve": ["name"] })) .await; snapshot!(value, @r#" { - "message": "Index `[uuid]`: The fragment `other` does not exist on embedder `rest`. Available fragments on this embedder are: `basic`, `withBreed`.\n25:30 _vectors.rest.fragments.other EXISTS", + "message": "Index `[uuid]`: The fragment `withBred` does not exist on embedder `rest`. Available fragments on this embedder are: `basic`, `withBreed`. Did you mean `withBreed`?\n25:33 _vectors.rest.fragments.withBred EXISTS", "code": "invalid_search_filter", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_search_filter" diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 9ad9d0511..76ad3fda0 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -639,3 +639,29 @@ fn conditionally_lookup_for_error_message() { assert_eq!(err.to_string(), format!("{} {}", prefix, suffix)); } } + +pub struct DidYouMean<'a>(Option<&'a str>); + +impl<'a> DidYouMean<'a> { + pub fn new(key: &str, keys: &'a [String]) -> DidYouMean<'a> { + let typos = levenshtein_automata::LevenshteinAutomatonBuilder::new(2, true).build_dfa(key); + for key in keys.iter() { + match typos.eval(key) { + levenshtein_automata::Distance::Exact(_) => { + return DidYouMean(Some(key)); + } + levenshtein_automata::Distance::AtLeast(_) => continue, + } + } + DidYouMean(None) + } +} + +impl std::fmt::Display for DidYouMean<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(suggestion) = self.0 { + write!(f, " Did you mean `{suggestion}`?")?; + } + Ok(()) + } +} diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 625bd5dde..1ef4b8e3d 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -1,7 +1,7 @@ use filter_parser::{Token, VectorFilter}; use roaring::{MultiOps, RoaringBitmap}; -use crate::error::Error; +use crate::error::{DidYouMean, Error}; use crate::vector::db::IndexEmbeddingConfig; use crate::vector::{ArroyStats, ArroyWrapper}; use crate::Index; @@ -14,7 +14,8 @@ pub enum VectorFilterError<'a> { } else { let mut available = available.clone(); available.sort_unstable(); - format!("Available embedders are: {}.", available.iter().map(|e| format!("`{e}`")).collect::>().join(", ")) + let did_you_mean = DidYouMean::new(embedder.value(), &available); + format!("Available embedders are: {}.{did_you_mean}", available.iter().map(|e| format!("`{e}`")).collect::>().join(", ")) } })] EmbedderDoesNotExist { embedder: &'a Token<'a>, available: Vec }, @@ -25,7 +26,8 @@ pub enum VectorFilterError<'a> { } else { let mut available = available.clone(); available.sort_unstable(); - format!("Available fragments on this embedder are: {}.", available.iter().map(|f| format!("`{f}`")).collect::>().join(", ")) + let did_you_mean = DidYouMean::new(fragment.value(), &available); + format!("Available fragments on this embedder are: {}.{did_you_mean}", available.iter().map(|f| format!("`{f}`")).collect::>().join(", ")) } })] FragmentDoesNotExist { From 8529e2161acc5c3894a1123c320ef02d60b4d380 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 13 Aug 2025 13:37:19 +0200 Subject: [PATCH 47/49] Clarify more errors --- crates/filter-parser/src/condition.rs | 12 +++++++++++- crates/filter-parser/src/error.rs | 15 +++++++++++---- crates/filter-parser/src/lib.rs | 10 +++++++++- 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index bdc5038e8..c407a1e45 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -162,7 +162,17 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> let value = opt_value.as_ref().map(|v| v.value().to_owned()).unwrap_or_else(|| point.to_string()); let context = opt_value.map(|v| v.original_span()).unwrap_or(point); - return Err(Error::failure_from_kind(context, ErrorKind::VectorFilterUnknownSuffix(value))); + let previous_kind = match filter { + VectorFilter::Fragment(_) => Some("fragments"), + VectorFilter::DocumentTemplate => Some("documentTemplate"), + VectorFilter::UserProvided => Some("userProvided"), + VectorFilter::Regenerate => Some("regenerate"), + VectorFilter::None => None, + }; + return Err(Error::failure_from_kind( + context, + ErrorKind::VectorFilterUnknownSuffix(previous_kind, value), + )); } let (input, _) = multispace1(input).map_cut(ErrorKind::VectorFilterLeftover)?; diff --git a/crates/filter-parser/src/error.rs b/crates/filter-parser/src/error.rs index 91ae2e33c..05aaf8c17 100644 --- a/crates/filter-parser/src/error.rs +++ b/crates/filter-parser/src/error.rs @@ -83,7 +83,7 @@ pub enum ErrorKind<'a> { VectorFilterInvalidEmbedder, VectorFilterMissingFragment, VectorFilterInvalidFragment, - VectorFilterUnknownSuffix(String), + VectorFilterUnknownSuffix(Option<&'static str>, String), VectorFilterOperation, InvalidPrimary, InvalidEscapedNumber, @@ -214,16 +214,23 @@ impl Display for Error<'_> { ErrorKind::VectorFilterLeftover => { writeln!(f, "The vector filter has leftover tokens.")? } - ErrorKind::VectorFilterUnknownSuffix(value) if value.as_str() == "." => { + ErrorKind::VectorFilterUnknownSuffix(_, value) if value.as_str() == "." => { writeln!(f, "Was expecting one of `.fragments`, `.userProvided`, `.documentTemplate`, `.regenerate` or nothing, but instead found a point without a valid value.")?; } - ErrorKind::VectorFilterUnknownSuffix(value) => { + ErrorKind::VectorFilterUnknownSuffix(None, value) if ["fragments", "userProvided", "documentTemplate", "regenerate"].contains(&value.as_str()) => { + // This will happen with "_vectors.rest.\"userProvided\"" for instance + writeln!(f, "Was expecting this part to be unquoted.")? + } + ErrorKind::VectorFilterUnknownSuffix(None, value) => { if let Some(suggestion) = key_suggestion(value, &["fragments", "userProvided", "documentTemplate", "regenerate"]) { writeln!(f, "Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `{value}`. Did you mean `{suggestion}`?")?; } else { writeln!(f, "Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `{value}`.")?; } } + ErrorKind::VectorFilterUnknownSuffix(Some(previous_filter_kind), value) => { + writeln!(f, "Vector filter can only accept one of `fragments`, `userProvided`, `documentTemplate` or `regenerate`, but found both `{previous_filter_kind}` and `{value}`.")? + }, ErrorKind::VectorFilterInvalidFragment => { writeln!(f, "The vector filter's fragment is invalid.")? } @@ -234,7 +241,7 @@ impl Display for Error<'_> { writeln!(f, "Was expecting embedder name but found nothing.")? } ErrorKind::VectorFilterInvalidEmbedder => { - writeln!(f, "The vector filter's embedder is invalid.")? + writeln!(f, "The vector filter's embedder name is invalid.")? } ErrorKind::VectorFilterOperation => { writeln!(f, "Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter.")? diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 75cbecc26..8f6f02691 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -1018,7 +1018,7 @@ pub mod tests { 22:23 _vectors.embedderName. EXISTS "); insta::assert_snapshot!(p(r#"_vectors."embedderName EXISTS"#), @r#" - The vector filter's embedder is invalid. + The vector filter's embedder name is invalid. 30:30 _vectors."embedderName EXISTS "#); insta::assert_snapshot!(p(r#"_vectors."embedderNam"e EXISTS"#), @r#" @@ -1057,6 +1057,14 @@ pub mod tests { Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `fargments`. Did you mean `fragments`? 23:32 _vectors.embedderName.fargments.test EXISTS "); + insta::assert_snapshot!(p(r#"_vectors.embedderName."userProvided" EXISTS"#), @r#" + Was expecting this part to be unquoted. + 24:36 _vectors.embedderName."userProvided" EXISTS + "#); + insta::assert_snapshot!(p(r#"_vectors.embedderName.userProvided.fragments.test EXISTS"#), @r" + Vector filter can only accept one of `fragments`, `userProvided`, `documentTemplate` or `regenerate`, but found both `userProvided` and `fragments`. + 36:45 _vectors.embedderName.userProvided.fragments.test EXISTS + "); insta::assert_snapshot!(p(r#"NOT OR EXISTS AND EXISTS NOT EXISTS"#), @r###" Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. From cdeca595872bb8fa668f0c2fc2f9e714f3da5294 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 13 Aug 2025 17:14:36 +0200 Subject: [PATCH 48/49] Add error message for quoting errors --- crates/filter-parser/src/condition.rs | 5 +++-- crates/filter-parser/src/error.rs | 6 +++++- crates/filter-parser/src/lib.rs | 8 ++++---- crates/filter-parser/src/value.rs | 12 ++++++++++++ 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index c407a1e45..8e3c04040 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -19,6 +19,7 @@ use Condition::*; use crate::error::IResultExt; use crate::value::parse_vector_value; +use crate::value::parse_vector_value_cut; use crate::Error; use crate::ErrorKind; use crate::VectorFilter; @@ -141,13 +142,13 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> } let (input, embedder_name) = - parse_vector_value(input).map_cut(ErrorKind::VectorFilterInvalidEmbedder)?; + parse_vector_value_cut(input, ErrorKind::VectorFilterInvalidEmbedder)?; let (input, filter) = alt(( map( preceded(tag(".fragments"), |input| { let (input, _) = tag(".")(input).map_cut(ErrorKind::VectorFilterMissingFragment)?; - parse_vector_value(input).map_cut(ErrorKind::VectorFilterInvalidFragment) + parse_vector_value_cut(input, ErrorKind::VectorFilterInvalidFragment) }), VectorFilter::Fragment, ), diff --git a/crates/filter-parser/src/error.rs b/crates/filter-parser/src/error.rs index 05aaf8c17..e381f45e2 100644 --- a/crates/filter-parser/src/error.rs +++ b/crates/filter-parser/src/error.rs @@ -79,6 +79,7 @@ pub enum ErrorKind<'a> { MisusedGeoRadius, MisusedGeoBoundingBox, VectorFilterLeftover, + VectorFilterInvalidQuotes, VectorFilterMissingEmbedder, VectorFilterInvalidEmbedder, VectorFilterMissingFragment, @@ -232,7 +233,7 @@ impl Display for Error<'_> { writeln!(f, "Vector filter can only accept one of `fragments`, `userProvided`, `documentTemplate` or `regenerate`, but found both `{previous_filter_kind}` and `{value}`.")? }, ErrorKind::VectorFilterInvalidFragment => { - writeln!(f, "The vector filter's fragment is invalid.")? + writeln!(f, "The vector filter's fragment name is invalid.")? } ErrorKind::VectorFilterMissingFragment => { writeln!(f, "The vector filter is missing a fragment name.")? @@ -246,6 +247,9 @@ impl Display for Error<'_> { ErrorKind::VectorFilterOperation => { writeln!(f, "Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter.")? } + ErrorKind::VectorFilterInvalidQuotes => { + writeln!(f, "The quotes in one of the values are inconsistent.")? + } ErrorKind::ReservedKeyword(word) => { writeln!(f, "`{word}` is a reserved keyword and thus cannot be used as a field name unless it is put inside quotes. Use \"{word}\" or \'{word}\' instead.")? } diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 8f6f02691..c761c583b 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -1018,8 +1018,8 @@ pub mod tests { 22:23 _vectors.embedderName. EXISTS "); insta::assert_snapshot!(p(r#"_vectors."embedderName EXISTS"#), @r#" - The vector filter's embedder name is invalid. - 30:30 _vectors."embedderName EXISTS + The quotes in one of the values are inconsistent. + 10:30 _vectors."embedderName EXISTS "#); insta::assert_snapshot!(p(r#"_vectors."embedderNam"e EXISTS"#), @r#" The vector filter has leftover tokens. @@ -1034,7 +1034,7 @@ pub mod tests { 32:39 _vectors.embedderName.fragments EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments. EXISTS"#), @r" - The vector filter's fragment is invalid. + The vector filter's fragment name is invalid. 33:40 _vectors.embedderName.fragments. EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments.test test EXISTS"#), @r" @@ -1042,7 +1042,7 @@ pub mod tests { 38:49 _vectors.embedderName.fragments.test test EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments. test EXISTS"#), @r" - The vector filter's fragment is invalid. + The vector filter's fragment name is invalid. 33:45 _vectors.embedderName.fragments. test EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName .fragments. test EXISTS"#), @r" diff --git a/crates/filter-parser/src/value.rs b/crates/filter-parser/src/value.rs index dac96b4f4..35a5c0ab4 100644 --- a/crates/filter-parser/src/value.rs +++ b/crates/filter-parser/src/value.rs @@ -113,6 +113,18 @@ pub fn parse_vector_value(input: Span) -> IResult { } } +pub fn parse_vector_value_cut<'a>(input: Span<'a>, kind: ErrorKind<'a>) -> IResult<'a, Token<'a>> { + parse_vector_value(input).map_err(|e| match e { + nom::Err::Failure(e) => match e.kind() { + ErrorKind::Char(c) if *c == '"' || *c == '\'' => { + crate::Error::failure_from_kind(input, ErrorKind::VectorFilterInvalidQuotes) + } + _ => crate::Error::failure_from_kind(input, kind), + }, + _ => crate::Error::failure_from_kind(input, kind), + }) +} + /// value = WS* ( word | singleQuoted | doubleQuoted) WS+ pub fn parse_value(input: Span) -> IResult { // to get better diagnostic message we are going to strip the left whitespaces from the input right now From 307ea38c2a8e3c08e5da6734f280a583388a653a Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 13 Aug 2025 17:19:37 +0200 Subject: [PATCH 49/49] Remove old irrelevant tests --- crates/meilisearch/tests/search/filters.rs | 35 ---------------------- 1 file changed, 35 deletions(-) diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 9cd6575ac..ef562bf4f 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -966,26 +966,6 @@ async fn vector_filter_non_existant_fragment() { "#); } -#[actix_rt::test] -async fn vector_filter_specific_fragment_user_provided() { - let index = shared_index_for_fragments().await; - - let (value, _code) = index - .search_post(json!({ - "filter": "_vectors.rest.fragments.other.userProvided EXISTS", - "attributesToRetrieve": ["name"] - })) - .await; - snapshot!(value, @r#" - { - "message": "The vector filter has leftover tokens.\n30:50 _vectors.rest.fragments.other.userProvided EXISTS", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - } - "#); -} - #[actix_rt::test] async fn vector_filter_document_template_but_fragments_used() { let index = shared_index_for_fragments().await; @@ -1179,19 +1159,4 @@ async fn vector_filter_regenerate() { "estimatedTotalHits": 3 } "#); - - let (value, _code) = index - .search_post(json!({ - "filter": format!("_vectors.rest.fragments.basic.regenerate EXISTS"), - "attributesToRetrieve": ["name"] - })) - .await; - snapshot!(value, @r#" - { - "message": "The vector filter has leftover tokens.\n30:48 _vectors.rest.fragments.basic.regenerate EXISTS", - "code": "invalid_search_filter", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_search_filter" - } - "#); }