From 0301d8f239f9d860c663bec9b48b4b08477e7885 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 8 Jul 2025 11:39:10 +0200 Subject: [PATCH] Improve error handling --- crates/filter-parser/src/lib.rs | 113 +++++++++- crates/meilisearch/tests/search/filters.rs | 109 ++++++---- crates/milli/src/search/facet/filter.rs | 11 +- .../milli/src/search/facet/filter_vector.rs | 196 ++++++++++++++---- 4 files changed, 342 insertions(+), 87 deletions(-) diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 938702103..b64477170 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -60,7 +60,7 @@ use nom::combinator::{cut, eof, map, opt}; use nom::multi::{many0, separated_list1}; use nom::number::complete::recognize_float; use nom::sequence::{delimited, preceded, terminated, tuple}; -use nom::Finish; +use nom::{Finish, Slice}; use nom_locate::LocatedSpan; pub(crate) use value::parse_value; use value::word_exact; @@ -121,6 +121,16 @@ impl<'a> Token<'a> { Err(Error::new_from_kind(self.span, ErrorKind::NonFiniteFloat)) } } + + /// Split the token by a delimiter and return an iterator of tokens. + /// Each token in the iterator will have its own span that corresponds to a slice of the original token's span. + pub fn split(&self, delimiter: &'a str) -> impl Iterator> + '_ { + let original_addr = self.value().as_ptr() as usize; + self.value().split(delimiter).map(move |part| { + let offset = part.as_ptr() as usize - original_addr; + Token::new(self.span.slice(offset..offset + part.len()), Some(part.to_string())) + }) + } } impl<'a> From> for Token<'a> { @@ -604,6 +614,8 @@ impl std::fmt::Display for Token<'_> { #[cfg(test)] pub mod tests { + use std::fmt::format; + use FilterCondition as Fc; use super::*; @@ -1043,4 +1055,103 @@ pub mod tests { let token: Token = s.into(); assert_eq!(token.value(), s); } + + #[test] + fn split() { + let s = "test string that should not be parsed\n newline"; + let token: Token = s.into(); + let parts: Vec<_> = token.split(" ").collect(); + insta::assert_snapshot!(format!("{parts:#?}"), @r#" + [ + Token { + span: LocatedSpan { + offset: 0, + line: 1, + fragment: "test", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "test", + ), + }, + Token { + span: LocatedSpan { + offset: 5, + line: 1, + fragment: "string", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "string", + ), + }, + Token { + span: LocatedSpan { + offset: 12, + line: 1, + fragment: "that", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "that", + ), + }, + Token { + span: LocatedSpan { + offset: 17, + line: 1, + fragment: "should", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "should", + ), + }, + Token { + span: LocatedSpan { + offset: 24, + line: 1, + fragment: "not", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "not", + ), + }, + Token { + span: LocatedSpan { + offset: 28, + line: 1, + fragment: "be", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "be", + ), + }, + Token { + span: LocatedSpan { + offset: 31, + line: 1, + fragment: "parsed\n", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "parsed\n", + ), + }, + Token { + span: LocatedSpan { + offset: 39, + line: 2, + fragment: "newline", + extra: "test string that should not be parsed\n newline", + }, + value: Some( + "newline", + ), + }, + ] + "#); + } } diff --git a/crates/meilisearch/tests/search/filters.rs b/crates/meilisearch/tests/search/filters.rs index 361762b6c..384605ca6 100644 --- a/crates/meilisearch/tests/search/filters.rs +++ b/crates/meilisearch/tests/search/filters.rs @@ -736,10 +736,12 @@ async fn test_filterable_attributes_priority() { async fn test_vector_filter() { let index = crate::vector::shared_index_for_fragments().await; - let (value, _code) = index.search_post(json!({ - "filter": "_vectors EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { "hits": [ @@ -764,42 +766,44 @@ async fn test_vector_filter() { } "#); - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.other EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.other EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { - "hits": [], - "query": "", - "processingTimeMs": "[duration]", - "limit": 20, - "offset": 0, - "estimatedTotalHits": 0 + "message": "Index `[uuid]`: The embedder `other` does not exist. Available embedders are: `rest`.\n10:15 _vectors.other EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" } "#); - + // This one is counterintuitive, but it is the same as the previous one. // It's because userProvided is interpreted as an embedder name - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.userProvided EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.userProvided EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { - "hits": [], - "query": "", - "processingTimeMs": "[duration]", - "limit": 20, - "offset": 0, - "estimatedTotalHits": 0 + "message": "Index `[uuid]`: The embedder `userProvided` does not exist. Available embedders are: `rest`.\n10:22 _vectors.userProvided EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" } "#); - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.rest EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { "hits": [ @@ -824,10 +828,12 @@ async fn test_vector_filter() { } "#); - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.rest.userProvided EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.userProvided EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { "hits": [ @@ -843,10 +849,12 @@ async fn test_vector_filter() { } "#); - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.rest.fragments.withBreed EXISTS", - "attributesToRetrieve": ["id"] - })).await; + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.fragments.withBreed EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { "hits": [ @@ -864,11 +872,13 @@ async fn test_vector_filter() { "estimatedTotalHits": 2 } "#); - - let (value, _code) = index.search_post(json!({ - "filter": "_vectors.rest.fragments.basic EXISTS", - "attributesToRetrieve": ["id"] - })).await; + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.fragments.basic EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; snapshot!(value, @r#" { "hits": [ @@ -892,4 +902,19 @@ async fn test_vector_filter() { "estimatedTotalHits": 4 } "#); + + let (value, _code) = index + .search_post(json!({ + "filter": "_vectors.rest.fragments.other EXISTS", + "attributesToRetrieve": ["id"] + })) + .await; + snapshot!(value, @r#" + { + "message": "Index `[uuid]`: The fragment `other` does not exist on embedder `rest`. Available fragments on this embedder are: `basic`, `withBreed`.\n25:30 _vectors.rest.fragments.other EXISTS", + "code": "invalid_search_filter", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_filter" + } + "#); } diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index c9728966a..c0419997c 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -241,7 +241,8 @@ impl<'a> Filter<'a> { let attribute = fid.value(); if matching_features(attribute, &filterable_attributes_rules) - .is_some_and(|(_, features)| features.is_filterable()) || VectorFilter::matches(attribute) + .is_some_and(|(_, features)| features.is_filterable()) + || VectorFilter::matches(attribute) { continue; } @@ -548,11 +549,11 @@ impl<'a> Filter<'a> { let value = fid.value(); if VectorFilter::matches(value) { if !matches!(op, Condition::Exists) { - return Err(Error::UserError(UserError::InvalidFilter( - String::from("Vector filter can only be used with the `exists` operator"), - ))); + return Err(Error::UserError(UserError::InvalidFilter(String::from( + "Vector filter can only be used with the `exists` operator", + )))); } - let vector_filter = VectorFilter::parse(value)?; + let vector_filter = VectorFilter::parse(fid)?; return vector_filter.evaluate(rtxn, index, universe); } diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 473741f14..0a7ac313a 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -1,16 +1,118 @@ +use filter_parser::Token; use roaring::RoaringBitmap; use crate::error::{Error, UserError}; use crate::vector::{ArroyStats, ArroyWrapper}; -use crate::{Index, Result}; +use crate::Index; pub(super) struct VectorFilter<'a> { - embedder_name: Option<&'a str>, - fragment_name: Option<&'a str>, + embedder_token: Option>, + fragment_token: Option>, user_provided: bool, // TODO: not_user_provided: bool, } +#[derive(Debug)] +pub enum VectorFilterError<'a> { + EmptyFilter, + InvalidPrefix(Token<'a>), + MissingFragmentName(Token<'a>), + UserProvidedWithFragment(Token<'a>), + LeftoverToken(Token<'a>), + EmbedderDoesNotExist { + embedder: &'a Token<'a>, + available: Vec, + }, + FragmentDoesNotExist { + embedder: &'a Token<'a>, + fragment: &'a Token<'a>, + available: Vec, + }, +} + +use VectorFilterError::*; + +impl std::error::Error for VectorFilterError<'_> {} + +impl std::fmt::Display for VectorFilterError<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + EmptyFilter => { + write!(f, "Vector filter cannot be empty.") + } + InvalidPrefix(prefix) => { + write!( + f, + "Vector filter must start with `_vectors` but found `{}`.", + prefix.value() + ) + } + MissingFragmentName(_token) => { + write!(f, "Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.") + } + UserProvidedWithFragment(_token) => { + write!(f, "Vector filter cannot specify both a fragment name and userProvided.") + } + LeftoverToken(token) => { + write!(f, "Vector filter has leftover token: `{}`.", token.value()) + } + EmbedderDoesNotExist { embedder, available } => { + write!(f, "The embedder `{}` does not exist.", embedder.value())?; + if available.is_empty() { + write!(f, " This index does not have configured embedders.") + } else { + write!(f, " Available embedders are: ")?; + let mut available = available.clone(); + available.sort_unstable(); + for (idx, embedder) in available.iter().enumerate() { + write!(f, "`{embedder}`")?; + if idx != available.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, ".") + } + } + FragmentDoesNotExist { embedder, fragment, available } => { + write!( + f, + "The fragment `{}` does not exist on embedder `{}`.", + fragment.value(), + embedder.value(), + )?; + if available.is_empty() { + write!(f, " This embedder does not have configured fragments.") + } else { + write!(f, " Available fragments on this embedder are: ")?; + let mut available = available.clone(); + available.sort_unstable(); + for (idx, fragment) in available.iter().enumerate() { + write!(f, "`{fragment}`")?; + if idx != available.len() - 1 { + write!(f, ", ")?; + } + } + write!(f, ".") + } + } + } + } +} + +impl<'a> From> for Error { + fn from(err: VectorFilterError<'a>) -> Self { + match &err { + EmptyFilter => Error::UserError(UserError::InvalidFilter(err.to_string())), + InvalidPrefix(token) + | MissingFragmentName(token) + | UserProvidedWithFragment(token) + | LeftoverToken(token) => token.clone().as_external_error(err).into(), + EmbedderDoesNotExist { embedder: token, .. } + | FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(), + } + } +} + impl<'a> VectorFilter<'a> { pub(super) fn matches(value: &str) -> bool { value.starts_with("_vectors.") || value == "_vectors" @@ -23,71 +125,74 @@ impl<'a> VectorFilter<'a> { /// - `_vectors.{embedder_name}` /// - `_vectors.{embedder_name}.userProvided` /// - `_vectors.{embedder_name}.fragments.{fragment_name}` - pub(super) fn parse(s: &'a str) -> Result { - let mut split = s.split('.').peekable(); + pub(super) fn parse(s: &'a Token<'a>) -> Result> { + let mut split = s.split(".").peekable(); - if split.next() != Some("_vectors") { - return Err(Error::UserError(UserError::InvalidFilter(String::from( - "Vector filter must start with '_vectors'", - )))); + match split.next() { + Some(token) if token.value() == "_vectors" => (), + Some(token) => return Err(InvalidPrefix(token)), + None => return Err(EmptyFilter), } let embedder_name = split.next(); let mut fragment_name = None; - if split.peek() == Some(&"fragments") { - split.next(); + if split.peek().map(|t| t.value()) == Some("fragments") { + let token = split.next().expect("it was peeked before"); - fragment_name = Some(split.next().ok_or_else(|| { - Error::UserError(UserError::InvalidFilter( - String::from("Vector filter is inconsistent: either specify a fragment name or remove the 'fragments' part"), - )) - })?); + fragment_name = Some(split.next().ok_or(MissingFragmentName(token))?); } - let mut user_provided = false; - if split.peek() == Some(&"userProvided") || split.peek() == Some(&"user_provided") { - split.next(); - user_provided = true; + let mut user_provided_token = None; + if split.peek().map(|t| t.value()) == Some("userProvided") + || split.peek().map(|t| t.value()) == Some("user_provided") + { + user_provided_token = split.next(); } - if fragment_name.is_some() && user_provided { - return Err(Error::UserError(UserError::InvalidFilter( - String::from("Vector filter cannot specify both a fragment name and userProvided"), - ))); + if let (Some(_), Some(user_provided_token)) = (&fragment_name, &user_provided_token) { + return Err(UserProvidedWithFragment(user_provided_token.clone()))?; } if let Some(next) = split.next() { - return Err(Error::UserError(UserError::InvalidFilter(format!( - "Unexpected part in vector filter: '{next}'" - )))); + return Err(LeftoverToken(next))?; } - Ok(Self { embedder_name, fragment_name, user_provided }) + Ok(Self { + embedder_token: embedder_name, + fragment_token: fragment_name, + user_provided: user_provided_token.is_some(), + }) } pub(super) fn evaluate( - &self, + self, rtxn: &heed::RoTxn<'_>, index: &Index, universe: Option<&RoaringBitmap>, - ) -> Result { + ) -> crate::Result { let index_embedding_configs = index.embedding_configs(); let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; let mut embedders = Vec::new(); - if let Some(embedder_name) = self.embedder_name { + if let Some(embedder_token) = &self.embedder_token { + let embedder_name = embedder_token.value(); let Some(embedder_config) = embedding_configs.iter().find(|config| config.name == embedder_name) else { - return Ok(RoaringBitmap::new()); + return Err(EmbedderDoesNotExist { + embedder: embedder_token, + available: embedding_configs.iter().map(|c| c.name.clone()).collect(), + })?; }; - let Some(embedder_info) = - index_embedding_configs.embedder_info(rtxn, embedder_name)? + let Some(embedder_info) = index_embedding_configs.embedder_info(rtxn, embedder_name)? else { - return Ok(RoaringBitmap::new()); + return Err(EmbedderDoesNotExist { + embedder: embedder_token, + available: embedding_configs.iter().map(|c| c.name.clone()).collect(), + })?; }; - + embedders.push((embedder_config, embedder_info)); } else { for embedder_config in embedding_configs.iter() { @@ -99,7 +204,7 @@ impl<'a> VectorFilter<'a> { embedders.push((embedder_config, embedder_info)); } }; - + let mut docids = RoaringBitmap::new(); for (embedder_config, embedder_info) in embedders { let arroy_wrapper = ArroyWrapper::new( @@ -108,14 +213,27 @@ impl<'a> VectorFilter<'a> { embedder_config.config.quantized(), ); - let mut new_docids = if let Some(fragment_name) = self.fragment_name { + let mut new_docids = if let Some(fragment_token) = &self.fragment_token { + let fragment_name = fragment_token.value(); let Some(fragment_config) = embedder_config .fragments .as_slice() .iter() .find(|fragment| fragment.name == fragment_name) else { - return Ok(RoaringBitmap::new()); + return Err(FragmentDoesNotExist { + embedder: self + .embedder_token + .as_ref() + .expect("there can't be a fragment without an embedder"), + fragment: fragment_token, + available: embedder_config + .fragments + .as_slice() + .iter() + .map(|f| f.name.clone()) + .collect(), + })?; }; arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())?