diff --git a/Cargo.lock b/Cargo.lock index 30f774e33..04a3cdb76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2031,6 +2031,7 @@ name = "filter-parser" version = "1.17.1" dependencies = [ "insta", + "levenshtein_automata", "nom", "nom_locate", "unescaper", diff --git a/crates/filter-parser/Cargo.toml b/crates/filter-parser/Cargo.toml index 6eeb0794b..173cabd4b 100644 --- a/crates/filter-parser/Cargo.toml +++ b/crates/filter-parser/Cargo.toml @@ -15,6 +15,7 @@ license.workspace = true nom = "7.1.3" nom_locate = "4.2.0" unescaper = "0.1.6" +levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } [dev-dependencies] # fixed version due to format breakages in v1.40 diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index 0fc007bf1..8e3c04040 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -7,11 +7,22 @@ use nom::branch::alt; use nom::bytes::complete::tag; +use nom::character::complete::char; +use nom::character::complete::multispace0; use nom::character::complete::multispace1; use nom::combinator::cut; +use nom::combinator::map; +use nom::combinator::value; +use nom::sequence::preceded; use nom::sequence::{terminated, tuple}; use Condition::*; +use crate::error::IResultExt; +use crate::value::parse_vector_value; +use crate::value::parse_vector_value_cut; +use crate::Error; +use crate::ErrorKind; +use crate::VectorFilter; use crate::{parse_value, FilterCondition, IResult, Span, Token}; #[derive(Debug, Clone, PartialEq, Eq)] @@ -113,6 +124,83 @@ pub fn parse_not_exists(input: Span) -> IResult { Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Exists })))) } +fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_>)> { + let (input, _) = multispace0(input)?; + let (input, fid) = tag("_vectors")(input)?; + + if let Ok((input, _)) = multispace1::<_, crate::Error>(input) { + return Ok((input, (Token::from(fid), None, VectorFilter::None))); + } + + let (input, _) = char('.')(input)?; + + // From this point, we are certain this is a vector filter, so our errors must be final. 
+ // We could use nom's `cut` but it's better to be explicit about the errors + + if let Ok((_, space)) = tag::<_, _, ()>(" ")(input) { + return Err(crate::Error::failure_from_kind(space, ErrorKind::VectorFilterMissingEmbedder)); + } + + let (input, embedder_name) = + parse_vector_value_cut(input, ErrorKind::VectorFilterInvalidEmbedder)?; + + let (input, filter) = alt(( + map( + preceded(tag(".fragments"), |input| { + let (input, _) = tag(".")(input).map_cut(ErrorKind::VectorFilterMissingFragment)?; + parse_vector_value_cut(input, ErrorKind::VectorFilterInvalidFragment) + }), + VectorFilter::Fragment, + ), + value(VectorFilter::UserProvided, tag(".userProvided")), + value(VectorFilter::DocumentTemplate, tag(".documentTemplate")), + value(VectorFilter::Regenerate, tag(".regenerate")), + value(VectorFilter::None, nom::combinator::success("")), + ))(input)?; + + if let Ok((input, point)) = tag::<_, _, ()>(".")(input) { + let opt_value = parse_vector_value(input).ok().map(|(_, v)| v); + let value = + opt_value.as_ref().map(|v| v.value().to_owned()).unwrap_or_else(|| point.to_string()); + let context = opt_value.map(|v| v.original_span()).unwrap_or(point); + let previous_kind = match filter { + VectorFilter::Fragment(_) => Some("fragments"), + VectorFilter::DocumentTemplate => Some("documentTemplate"), + VectorFilter::UserProvided => Some("userProvided"), + VectorFilter::Regenerate => Some("regenerate"), + VectorFilter::None => None, + }; + return Err(Error::failure_from_kind( + context, + ErrorKind::VectorFilterUnknownSuffix(previous_kind, value), + )); + } + + let (input, _) = multispace1(input).map_cut(ErrorKind::VectorFilterLeftover)?; + + Ok((input, (Token::from(fid), Some(embedder_name), filter))) +} + +/// vectors_exists = vectors ("EXISTS" | ("NOT" WS+ "EXISTS")) +pub fn parse_vectors_exists(input: Span) -> IResult { + let (input, (fid, embedder, filter)) = parse_vectors(input)?; + + // Try parsing "EXISTS" first + if let Ok((input, _)) = tag::<_, _, ()>("EXISTS")(input) { + return Ok((input, FilterCondition::VectorExists { fid, embedder, filter })); + } + + // Try parsing "NOT EXISTS" + if let Ok((input, _)) = tuple::<_, _, (), _>((tag("NOT"), multispace1, tag("EXISTS")))(input) { + return Ok(( + input, + FilterCondition::Not(Box::new(FilterCondition::VectorExists { fid, embedder, filter })), + )); + } + + Err(crate::Error::failure_from_kind(input, ErrorKind::VectorFilterOperation)) +} + /// contains = value "CONTAINS" value pub fn parse_contains(input: Span) -> IResult { let (input, (fid, contains, value)) = diff --git a/crates/filter-parser/src/error.rs b/crates/filter-parser/src/error.rs index 855ce983e..e381f45e2 100644 --- a/crates/filter-parser/src/error.rs +++ b/crates/filter-parser/src/error.rs @@ -42,6 +42,23 @@ pub fn cut_with_err<'a, O>( } } +pub trait IResultExt<'a> { + fn map_cut(self, kind: ErrorKind<'a>) -> Self; +} + +impl<'a, T> IResultExt<'a> for IResult<'a, T> { + fn map_cut(self, kind: ErrorKind<'a>) -> Self { + self.map_err(move |e: nom::Err>| { + let input = match e { + nom::Err::Incomplete(_) => return e, + nom::Err::Error(e) => *e.context(), + nom::Err::Failure(e) => *e.context(), + }; + Error::failure_from_kind(input, kind) + }) + } +} + #[derive(Debug)] pub struct Error<'a> { context: Span<'a>, @@ -61,6 +78,14 @@ pub enum ErrorKind<'a> { GeoBoundingBox, MisusedGeoRadius, MisusedGeoBoundingBox, + VectorFilterLeftover, + VectorFilterInvalidQuotes, + VectorFilterMissingEmbedder, + VectorFilterInvalidEmbedder, + VectorFilterMissingFragment, + 
VectorFilterInvalidFragment, + VectorFilterUnknownSuffix(Option<&'static str>, String), + VectorFilterOperation, InvalidPrimary, InvalidEscapedNumber, ExpectedEof, @@ -91,6 +116,10 @@ impl<'a> Error<'a> { Self { context, kind } } + pub fn failure_from_kind(context: Span<'a>, kind: ErrorKind<'a>) -> nom::Err { + nom::Err::Failure(Self::new_from_kind(context, kind)) + } + pub fn new_from_external(context: Span<'a>, error: impl std::error::Error) -> Self { Self::new_from_kind(context, ErrorKind::External(error.to_string())) } @@ -128,6 +157,20 @@ impl Display for Error<'_> { // first line being the diagnostic and the second line being the incriminated filter. let escaped_input = input.escape_debug(); + fn key_suggestion<'a>(key: &str, keys: &[&'a str]) -> Option<&'a str> { + let typos = + levenshtein_automata::LevenshteinAutomatonBuilder::new(2, true).build_dfa(key); + for key in keys.iter() { + match typos.eval(key) { + levenshtein_automata::Distance::Exact(_) => { + return Some(key); + } + levenshtein_automata::Distance::AtLeast(_) => continue, + } + } + None + } + match &self.kind { ErrorKind::ExpectedValue(_) if input.trim().is_empty() => { writeln!(f, "Was expecting a value but instead got nothing.")? @@ -169,6 +212,44 @@ impl Display for Error<'_> { ErrorKind::MisusedGeoBoundingBox => { writeln!(f, "The `_geoBoundingBox` filter is an operation and can't be used as a value.")? } + ErrorKind::VectorFilterLeftover => { + writeln!(f, "The vector filter has leftover tokens.")? + } + ErrorKind::VectorFilterUnknownSuffix(_, value) if value.as_str() == "." => { + writeln!(f, "Was expecting one of `.fragments`, `.userProvided`, `.documentTemplate`, `.regenerate` or nothing, but instead found a point without a valid value.")?; + } + ErrorKind::VectorFilterUnknownSuffix(None, value) if ["fragments", "userProvided", "documentTemplate", "regenerate"].contains(&value.as_str()) => { + // This will happen with "_vectors.rest.\"userProvided\"" for instance + writeln!(f, "Was expecting this part to be unquoted.")? + } + ErrorKind::VectorFilterUnknownSuffix(None, value) => { + if let Some(suggestion) = key_suggestion(value, &["fragments", "userProvided", "documentTemplate", "regenerate"]) { + writeln!(f, "Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `{value}`. Did you mean `{suggestion}`?")?; + } else { + writeln!(f, "Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `{value}`.")?; + } + } + ErrorKind::VectorFilterUnknownSuffix(Some(previous_filter_kind), value) => { + writeln!(f, "Vector filter can only accept one of `fragments`, `userProvided`, `documentTemplate` or `regenerate`, but found both `{previous_filter_kind}` and `{value}`.")? + }, + ErrorKind::VectorFilterInvalidFragment => { + writeln!(f, "The vector filter's fragment name is invalid.")? + } + ErrorKind::VectorFilterMissingFragment => { + writeln!(f, "The vector filter is missing a fragment name.")? + } + ErrorKind::VectorFilterMissingEmbedder => { + writeln!(f, "Was expecting embedder name but found nothing.")? + } + ErrorKind::VectorFilterInvalidEmbedder => { + writeln!(f, "The vector filter's embedder name is invalid.")? + } + ErrorKind::VectorFilterOperation => { + writeln!(f, "Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter.")? + } + ErrorKind::VectorFilterInvalidQuotes => { + writeln!(f, "The quotes in one of the values are inconsistent.")? 
+ } ErrorKind::ReservedKeyword(word) => { writeln!(f, "`{word}` is a reserved keyword and thus cannot be used as a field name unless it is put inside quotes. Use \"{word}\" or \'{word}\' instead.")? } diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index e25636812..9ee00c3eb 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -65,6 +65,9 @@ use nom_locate::LocatedSpan; pub(crate) use value::parse_value; use value::word_exact; +use crate::condition::parse_vectors_exists; +use crate::error::IResultExt; + pub type Span<'a> = LocatedSpan<&'a str, &'a str>; type IResult<'a, Ret> = nom::IResult, Ret, Error<'a>>; @@ -136,6 +139,15 @@ impl<'a> From<&'a str> for Token<'a> { } } +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum VectorFilter<'a> { + Fragment(Token<'a>), + DocumentTemplate, + UserProvided, + Regenerate, + None, +} + #[derive(Debug, Clone, PartialEq, Eq)] pub enum FilterCondition<'a> { Not(Box), @@ -143,6 +155,7 @@ pub enum FilterCondition<'a> { In { fid: Token<'a>, els: Vec> }, Or(Vec), And(Vec), + VectorExists { fid: Token<'a>, embedder: Option>, filter: VectorFilter<'a> }, GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> }, GeoBoundingBox { top_right_point: [Token<'a>; 2], bottom_left_point: [Token<'a>; 2] }, } @@ -173,9 +186,24 @@ impl<'a> FilterCondition<'a> { FilterCondition::Or(seq) | FilterCondition::And(seq) => { seq.iter().find_map(|filter| filter.use_contains_operator()) } + FilterCondition::VectorExists { .. } + | FilterCondition::GeoLowerThan { .. } + | FilterCondition::GeoBoundingBox { .. } + | FilterCondition::In { .. } => None, + } + } + + pub fn use_vector_filter(&self) -> Option<&Token> { + match self { + FilterCondition::Condition { .. } => None, + FilterCondition::Not(this) => this.use_vector_filter(), + FilterCondition::Or(seq) | FilterCondition::And(seq) => { + seq.iter().find_map(|filter| filter.use_vector_filter()) + } FilterCondition::GeoLowerThan { .. } | FilterCondition::GeoBoundingBox { .. } | FilterCondition::In { .. } => None, + FilterCondition::VectorExists { fid, .. 
} => Some(fid), } } @@ -263,10 +291,7 @@ fn parse_in_body(input: Span) -> IResult> { let (input, _) = ws(word_exact("IN"))(input)?; // everything after `IN` can be a failure - let (input, _) = - cut_with_err(tag("["), |_| Error::new_from_kind(input, ErrorKind::InOpeningBracket))( - input, - )?; + let (input, _) = tag("[")(input).map_cut(ErrorKind::InOpeningBracket)?; let (input, content) = cut(parse_value_list)(input)?; @@ -412,7 +437,7 @@ fn parse_geo_bounding_box(input: Span) -> IResult { let (input, args) = parsed?; if args.len() != 2 || args[0].len() != 2 || args[1].len() != 2 { - return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::GeoBoundingBox))); + return Err(Error::failure_from_kind(input, ErrorKind::GeoBoundingBox)); } let res = FilterCondition::GeoBoundingBox { @@ -433,7 +458,7 @@ fn parse_geo_point(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))))?; // if we succeeded we still return a `Failure` because geoPoints are not allowed - Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint")))) + Err(Error::failure_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))) } /// geoPoint = WS* "_geoDistance(float WS* "," WS* float WS* "," WS* float) @@ -447,7 +472,7 @@ fn parse_geo_distance(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoDistance"))))?; // if we succeeded we still return a `Failure` because `geoDistance` filters are not allowed - Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoDistance")))) + Err(Error::failure_from_kind(input, ErrorKind::ReservedGeo("_geoDistance"))) } /// geo = WS* "_geo(float WS* "," WS* float WS* "," WS* float) @@ -461,7 +486,7 @@ fn parse_geo(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geo"))))?; // if we succeeded we still return a `Failure` because `_geo` filter is not allowed - Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::ReservedGeo("_geo")))) + Err(Error::failure_from_kind(input, ErrorKind::ReservedGeo("_geo"))) } fn parse_error_reserved_keyword(input: Span) -> IResult { @@ -500,8 +525,7 @@ fn parse_primary(input: Span, depth: usize) -> IResult { parse_is_not_null, parse_is_empty, parse_is_not_empty, - parse_exists, - parse_not_exists, + alt((parse_vectors_exists, parse_exists, parse_not_exists)), parse_to, parse_contains, parse_not_contains, @@ -557,6 +581,22 @@ impl std::fmt::Display for FilterCondition<'_> { } write!(f, "]") } + FilterCondition::VectorExists { fid: _, embedder, filter: inner } => { + write!(f, "_vectors")?; + if let Some(embedder) = embedder { + write!(f, ".{:?}", embedder.value())?; + } + match inner { + VectorFilter::Fragment(fragment) => { + write!(f, ".fragments.{:?}", fragment.value())? 
+ } + VectorFilter::DocumentTemplate => write!(f, ".documentTemplate")?, + VectorFilter::UserProvided => write!(f, ".userProvided")?, + VectorFilter::Regenerate => write!(f, ".regenerate")?, + VectorFilter::None => (), + } + write!(f, " EXISTS") + } FilterCondition::GeoLowerThan { point, radius } => { write!(f, "_geoRadius({}, {}, {})", point[0], point[1], radius) } @@ -630,6 +670,9 @@ pub mod tests { insta::assert_snapshot!(p(r"title = 'foo\\\\\\\\'"), @r#"{title} = {foo\\\\}"#); // but it also works with other sequences insta::assert_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}"); + + insta::assert_snapshot!(p(r#"_vectors." valid.name ".fragments."also.. valid! " EXISTS"#), @r#"_vectors." valid.name ".fragments."also.. valid! " EXISTS"#); + insta::assert_snapshot!(p("_vectors.\"\n\t\r\\\"\" EXISTS"), @r#"_vectors."\n\t\r\"" EXISTS"#); } #[test] @@ -692,6 +735,18 @@ pub mod tests { insta::assert_snapshot!(p("NOT subscribers IS NOT EMPTY"), @"{subscribers} IS EMPTY"); insta::assert_snapshot!(p("subscribers IS NOT EMPTY"), @"NOT ({subscribers} IS EMPTY)"); + // Test _vectors EXISTS + _vectors NOT EXITS + insta::assert_snapshot!(p("_vectors EXISTS"), @"_vectors EXISTS"); + insta::assert_snapshot!(p("_vectors.embedderName EXISTS"), @r#"_vectors."embedderName" EXISTS"#); + insta::assert_snapshot!(p("_vectors.embedderName.documentTemplate EXISTS"), @r#"_vectors."embedderName".documentTemplate EXISTS"#); + insta::assert_snapshot!(p("_vectors.embedderName.regenerate EXISTS"), @r#"_vectors."embedderName".regenerate EXISTS"#); + insta::assert_snapshot!(p("_vectors.embedderName.regenerate EXISTS"), @r#"_vectors."embedderName".regenerate EXISTS"#); + insta::assert_snapshot!(p("_vectors.embedderName.fragments.fragmentName EXISTS"), @r#"_vectors."embedderName".fragments."fragmentName" EXISTS"#); + insta::assert_snapshot!(p(" _vectors.embedderName.fragments.fragmentName EXISTS"), @r#"_vectors."embedderName".fragments."fragmentName" EXISTS"#); + insta::assert_snapshot!(p("NOT _vectors EXISTS"), @"NOT (_vectors EXISTS)"); + insta::assert_snapshot!(p(" NOT _vectors EXISTS"), @"NOT (_vectors EXISTS)"); + insta::assert_snapshot!(p(" _vectors NOT EXISTS"), @"NOT (_vectors EXISTS)"); + // Test EXISTS + NOT EXITS insta::assert_snapshot!(p("subscribers EXISTS"), @"{subscribers} EXISTS"); insta::assert_snapshot!(p("NOT subscribers EXISTS"), @"NOT ({subscribers} EXISTS)"); @@ -946,6 +1001,71 @@ pub mod tests { "### ); + insta::assert_snapshot!(p(r#"_vectors _vectors EXISTS"#), @r" + Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. + 10:25 _vectors _vectors EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors. embedderName EXISTS"#), @r" + Was expecting embedder name but found nothing. + 10:11 _vectors. embedderName EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors .embedderName EXISTS"#), @r" + Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. + 10:30 _vectors .embedderName EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName. EXISTS"#), @r" + Was expecting one of `.fragments`, `.userProvided`, `.documentTemplate`, `.regenerate` or nothing, but instead found a point without a valid value. + 22:23 _vectors.embedderName. EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors."embedderName EXISTS"#), @r#" + The quotes in one of the values are inconsistent. + 10:30 _vectors."embedderName EXISTS + "#); + insta::assert_snapshot!(p(r#"_vectors."embedderNam"e EXISTS"#), @r#" + The vector filter has leftover tokens. 
+ 23:31 _vectors."embedderNam"e EXISTS + "#); + insta::assert_snapshot!(p(r#"_vectors.embedderName.documentTemplate. EXISTS"#), @r" + Was expecting one of `.fragments`, `.userProvided`, `.documentTemplate`, `.regenerate` or nothing, but instead found a point without a valid value. + 39:40 _vectors.embedderName.documentTemplate. EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments EXISTS"#), @r" + The vector filter is missing a fragment name. + 32:39 _vectors.embedderName.fragments EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments. EXISTS"#), @r" + The vector filter's fragment name is invalid. + 33:40 _vectors.embedderName.fragments. EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments.test test EXISTS"#), @r" + Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. + 38:49 _vectors.embedderName.fragments.test test EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments. test EXISTS"#), @r" + The vector filter's fragment name is invalid. + 33:45 _vectors.embedderName.fragments. test EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName .fragments. test EXISTS"#), @r" + Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. + 23:46 _vectors.embedderName .fragments. test EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName .fragments.test EXISTS"#), @r" + Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. + 23:45 _vectors.embedderName .fragments.test EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fargments.test EXISTS"#), @r" + Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `fargments`. Did you mean `fragments`? + 23:32 _vectors.embedderName.fargments.test EXISTS + "); + insta::assert_snapshot!(p(r#"_vectors.embedderName."userProvided" EXISTS"#), @r#" + Was expecting this part to be unquoted. + 24:36 _vectors.embedderName."userProvided" EXISTS + "#); + insta::assert_snapshot!(p(r#"_vectors.embedderName.userProvided.fragments.test EXISTS"#), @r" + Vector filter can only accept one of `fragments`, `userProvided`, `documentTemplate` or `regenerate`, but found both `userProvided` and `fragments`. + 36:45 _vectors.embedderName.userProvided.fragments.test EXISTS + "); + insta::assert_snapshot!(p(r#"NOT OR EXISTS AND EXISTS NOT EXISTS"#), @r###" Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. 
5:7 NOT OR EXISTS AND EXISTS NOT EXISTS diff --git a/crates/filter-parser/src/value.rs b/crates/filter-parser/src/value.rs index 98cac39fe..35a5c0ab4 100644 --- a/crates/filter-parser/src/value.rs +++ b/crates/filter-parser/src/value.rs @@ -80,6 +80,51 @@ pub fn word_exact<'a, 'b: 'a>(tag: &'b str) -> impl Fn(Span<'a>) -> IResult<'a, } } +/// vector_value = ( non_dot_word | singleQuoted | doubleQuoted) +pub fn parse_vector_value(input: Span) -> IResult { + pub fn non_dot_word(input: Span) -> IResult { + let (input, word) = take_while1(|c| is_value_component(c) && c != '.')(input)?; + Ok((input, word.into())) + } + + let (input, value) = alt(( + delimited(char('\''), cut(|input| quoted_by('\'', input)), cut(char('\''))), + delimited(char('"'), cut(|input| quoted_by('"', input)), cut(char('"'))), + non_dot_word, + ))(input)?; + + match unescaper::unescape(value.value()) { + Ok(content) => { + if content.len() != value.value().len() { + Ok((input, Token::new(value.original_span(), Some(content)))) + } else { + Ok((input, value)) + } + } + Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)), + Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind( + value.original_span(), + ErrorKind::InvalidEscapedNumber, + ))), + Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind( + value.original_span(), + ErrorKind::MalformedValue, + ))), + } +} + +pub fn parse_vector_value_cut<'a>(input: Span<'a>, kind: ErrorKind<'a>) -> IResult<'a, Token<'a>> { + parse_vector_value(input).map_err(|e| match e { + nom::Err::Failure(e) => match e.kind() { + ErrorKind::Char(c) if *c == '"' || *c == '\'' => { + crate::Error::failure_from_kind(input, ErrorKind::VectorFilterInvalidQuotes) + } + _ => crate::Error::failure_from_kind(input, kind), + }, + _ => crate::Error::failure_from_kind(input, kind), + }) +} + /// value = WS* ( word | singleQuoted | doubleQuoted) WS+ pub fn parse_value(input: Span) -> IResult { // to get better diagnostic message we are going to strip the left whitespaces from the input right now @@ -99,31 +144,21 @@ pub fn parse_value(input: Span) -> IResult { } match parse_geo_radius(input) { - Ok(_) => { - return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::MisusedGeoRadius))) - } + Ok(_) => return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoRadius)), // if we encountered a failure it means the user badly wrote a _geoRadius filter. // But instead of showing them how to fix his syntax we are going to tell them they should not use this filter as a value. Err(e) if e.is_failure() => { - return Err(nom::Err::Failure(Error::new_from_kind(input, ErrorKind::MisusedGeoRadius))) + return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoRadius)) } _ => (), } match parse_geo_bounding_box(input) { - Ok(_) => { - return Err(nom::Err::Failure(Error::new_from_kind( - input, - ErrorKind::MisusedGeoBoundingBox, - ))) - } + Ok(_) => return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)), // if we encountered a failure it means the user badly wrote a _geoBoundingBox filter. // But instead of showing them how to fix his syntax we are going to tell them they should not use this filter as a value. 
Err(e) if e.is_failure() => { - return Err(nom::Err::Failure(Error::new_from_kind( - input, - ErrorKind::MisusedGeoBoundingBox, - ))) + return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)) } _ => (), } diff --git a/crates/index-scheduler/src/scheduler/test_document_addition.rs b/crates/index-scheduler/src/scheduler/test_document_addition.rs index b642f5604..7ca72da95 100644 --- a/crates/index-scheduler/src/scheduler/test_document_addition.rs +++ b/crates/index-scheduler/src/scheduler/test_document_addition.rs @@ -736,7 +736,7 @@ fn test_document_addition_mixed_rights_with_index() { #[test] fn test_document_addition_mixed_right_without_index_starts_with_cant_create() { // We're going to autobatch multiple document addition. - // - The index does not exists + // - The index does not exist // - The first document addition don't have the right to create an index // - The second do. They should not batch together. // - The second should batch with everything else as it's going to create an index. diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 138f5140f..5ced4603e 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -139,6 +139,8 @@ pub struct DocumentsFetchAggregator { per_document_id: bool, // if a filter was used per_filter: bool, + with_vector_filter: bool, + // if documents were sorted sort: bool, @@ -166,6 +168,7 @@ impl Aggregate for DocumentsFetchAggregator { Box::new(Self { per_document_id: self.per_document_id | new.per_document_id, per_filter: self.per_filter | new.per_filter, + with_vector_filter: self.with_vector_filter | new.with_vector_filter, sort: self.sort | new.sort, retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors, max_limit: self.max_limit.max(new.max_limit), @@ -250,6 +253,7 @@ pub async fn get_document( retrieve_vectors: param_retrieve_vectors.0, per_document_id: true, per_filter: false, + with_vector_filter: false, sort: false, max_limit: 0, max_offset: 0, @@ -475,6 +479,10 @@ pub async fn documents_by_query_post( analytics.publish( DocumentsFetchAggregator:: { per_filter: body.filter.is_some(), + with_vector_filter: body + .filter + .as_ref() + .is_some_and(|f| f.to_string().contains("_vectors")), sort: body.sort.is_some(), retrieve_vectors: body.retrieve_vectors, max_limit: body.limit, @@ -576,6 +584,10 @@ pub async fn get_documents( analytics.publish( DocumentsFetchAggregator:: { per_filter: query.filter.is_some(), + with_vector_filter: query + .filter + .as_ref() + .is_some_and(|f| f.to_string().contains("_vectors")), sort: query.sort.is_some(), retrieve_vectors: query.retrieve_vectors, max_limit: query.limit, @@ -1455,8 +1467,6 @@ fn some_documents<'a, 't: 'a>( document.remove("_vectors"); } RetrieveVectors::Retrieve => { - // Clippy is simply wrong - #[allow(clippy::manual_unwrap_or_default)] let mut vectors = match document.remove("_vectors") { Some(Value::Object(map)) => map, _ => Default::default(), diff --git a/crates/meilisearch/src/routes/indexes/search_analytics.rs b/crates/meilisearch/src/routes/indexes/search_analytics.rs index 2a8e78059..e27e6347b 100644 --- a/crates/meilisearch/src/routes/indexes/search_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/search_analytics.rs @@ -40,6 +40,7 @@ pub struct SearchAggregator { // filter filter_with_geo_radius: bool, filter_with_geo_bounding_box: bool, + filter_on_vectors: bool, // every time a request has a filter, this 
field must be incremented by the number of terms it contains filter_sum_of_criteria_terms: usize, // every time a request has a filter, this field must be incremented by one @@ -163,6 +164,7 @@ impl SearchAggregator { let stringified_filters = filter.to_string(); ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius("); ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox("); + ret.filter_on_vectors = stringified_filters.contains("_vectors"); ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count(); } @@ -261,6 +263,7 @@ impl Aggregate for SearchAggregator { distinct, filter_with_geo_radius, filter_with_geo_bounding_box, + filter_on_vectors, filter_sum_of_criteria_terms, filter_total_number_of_criteria, used_syntax, @@ -315,6 +318,7 @@ impl Aggregate for SearchAggregator { // filter self.filter_with_geo_radius |= filter_with_geo_radius; self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box; + self.filter_on_vectors |= filter_on_vectors; self.filter_sum_of_criteria_terms = self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms); self.filter_total_number_of_criteria = @@ -389,6 +393,7 @@ impl Aggregate for SearchAggregator { distinct, filter_with_geo_radius, filter_with_geo_bounding_box, + filter_on_vectors, filter_sum_of_criteria_terms, filter_total_number_of_criteria, used_syntax, @@ -446,6 +451,7 @@ impl Aggregate for SearchAggregator { "filter": { "with_geoRadius": filter_with_geo_radius, "with_geoBoundingBox": filter_with_geo_bounding_box, + "on_vectors": filter_on_vectors, "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), }, diff --git a/crates/meilisearch/src/routes/tasks.rs b/crates/meilisearch/src/routes/tasks.rs index 95c105894..fb0f73425 100644 --- a/crates/meilisearch/src/routes/tasks.rs +++ b/crates/meilisearch/src/routes/tasks.rs @@ -336,7 +336,7 @@ impl Aggregate for TaskFilterAnalytics Server { +pub async fn get_server_vector() -> Server { Server::new().await } diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index e68a87dbf..7668dbcc3 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -12,7 +12,7 @@ use crate::common::Value; use crate::json; use crate::vector::{get_server_vector, GetAllDocumentsOptions}; -async fn create_mock() -> (&'static MockServer, Value) { +pub async fn create_mock() -> (&'static MockServer, Value) { let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 9ad9d0511..76ad3fda0 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -639,3 +639,29 @@ fn conditionally_lookup_for_error_message() { assert_eq!(err.to_string(), format!("{} {}", prefix, suffix)); } } + +pub struct DidYouMean<'a>(Option<&'a str>); + +impl<'a> DidYouMean<'a> { + pub fn new(key: &str, keys: &'a [String]) -> DidYouMean<'a> { + let typos = levenshtein_automata::LevenshteinAutomatonBuilder::new(2, true).build_dfa(key); + for key in keys.iter() { + match typos.eval(key) { + levenshtein_automata::Distance::Exact(_) => { + return DidYouMean(Some(key)); + } + levenshtein_automata::Distance::AtLeast(_) => continue, + } + } + DidYouMean(None) + } +} + +impl 
std::fmt::Display for DidYouMean<'_> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if let Some(suggestion) = self.0 { + write!(f, " Did you mean `{suggestion}`?")?; + } + Ok(()) + } +} diff --git a/crates/milli/src/filterable_attributes_rules.rs b/crates/milli/src/filterable_attributes_rules.rs index ae1a9755a..5ba8a99d8 100644 --- a/crates/milli/src/filterable_attributes_rules.rs +++ b/crates/milli/src/filterable_attributes_rules.rs @@ -111,7 +111,7 @@ impl FilterableAttributesFeatures { self.filter.is_filterable_null() } - /// Check if `IS EXISTS` is allowed + /// Check if `EXISTS` is allowed pub fn is_filterable_exists(&self) -> bool { self.filter.is_filterable_exists() } diff --git a/crates/milli/src/search/facet/filter.rs b/crates/milli/src/search/facet/filter.rs index 76d935fc6..f9262f855 100644 --- a/crates/milli/src/search/facet/filter.rs +++ b/crates/milli/src/search/facet/filter.rs @@ -12,7 +12,7 @@ use roaring::{MultiOps, RoaringBitmap}; use serde_json::Value; use super::facet_range_search; -use crate::constants::RESERVED_GEO_FIELD_NAME; +use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::error::{Error, UserError}; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; @@ -228,6 +228,10 @@ impl<'a> Filter<'a> { pub fn use_contains_operator(&self) -> Option<&Token> { self.condition.use_contains_operator() } + + pub fn use_vector_filter(&self) -> Option<&Token> { + self.condition.use_vector_filter() + } } impl<'a> Filter<'a> { @@ -235,10 +239,12 @@ impl<'a> Filter<'a> { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; + for fid in self.condition.fids(MAX_FILTER_DEPTH) { let attribute = fid.value(); if matching_features(attribute, &filterable_attributes_rules) .is_some_and(|(_, features)| features.is_filterable()) + || attribute == RESERVED_VECTORS_FIELD_NAME { continue; } @@ -578,7 +584,8 @@ impl<'a> Filter<'a> { .union() } FilterCondition::Condition { fid, op } => { - let Some(field_id) = field_ids_map.id(fid.value()) else { + let value = fid.value(); + let Some(field_id) = field_ids_map.id(value) else { return Ok(RoaringBitmap::new()); }; let Some((rule_index, features)) = @@ -635,6 +642,9 @@ impl<'a> Filter<'a> { Ok(RoaringBitmap::new()) } } + FilterCondition::VectorExists { fid: _, embedder, filter } => { + super::filter_vector::evaluate(rtxn, index, universe, embedder.clone(), filter) + } FilterCondition::GeoLowerThan { point, radius } => { if index.is_geo_filtering_enabled(rtxn)? { let base_point: [f64; 2] = diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs new file mode 100644 index 000000000..1ef4b8e3d --- /dev/null +++ b/crates/milli/src/search/facet/filter_vector.rs @@ -0,0 +1,157 @@ +use filter_parser::{Token, VectorFilter}; +use roaring::{MultiOps, RoaringBitmap}; + +use crate::error::{DidYouMean, Error}; +use crate::vector::db::IndexEmbeddingConfig; +use crate::vector::{ArroyStats, ArroyWrapper}; +use crate::Index; + +#[derive(Debug, thiserror::Error)] +pub enum VectorFilterError<'a> { + #[error("The embedder `{}` does not exist. 
{}", embedder.value(), { + if available.is_empty() { + String::from("This index does not have any configured embedders.") + } else { + let mut available = available.clone(); + available.sort_unstable(); + let did_you_mean = DidYouMean::new(embedder.value(), &available); + format!("Available embedders are: {}.{did_you_mean}", available.iter().map(|e| format!("`{e}`")).collect::>().join(", ")) + } + })] + EmbedderDoesNotExist { embedder: &'a Token<'a>, available: Vec }, + + #[error("The fragment `{}` does not exist on embedder `{}`. {}", fragment.value(), embedder.value(), { + if available.is_empty() { + String::from("This embedder does not have any configured fragments.") + } else { + let mut available = available.clone(); + available.sort_unstable(); + let did_you_mean = DidYouMean::new(fragment.value(), &available); + format!("Available fragments on this embedder are: {}.{did_you_mean}", available.iter().map(|f| format!("`{f}`")).collect::>().join(", ")) + } + })] + FragmentDoesNotExist { + embedder: &'a Token<'a>, + fragment: &'a Token<'a>, + available: Vec, + }, +} + +use VectorFilterError::*; + +impl<'a> From> for Error { + fn from(err: VectorFilterError<'a>) -> Self { + match &err { + EmbedderDoesNotExist { embedder: token, .. } + | FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(), + } + } +} + +pub(super) fn evaluate( + rtxn: &heed::RoTxn<'_>, + index: &Index, + universe: Option<&RoaringBitmap>, + embedder: Option>, + filter: &VectorFilter<'_>, +) -> crate::Result { + let index_embedding_configs = index.embedding_configs(); + let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; + + let embedders = match embedder { + Some(embedder) => vec![embedder], + None => embedding_configs.iter().map(|config| Token::from(config.name.as_str())).collect(), + }; + + let mut docids = embedders + .iter() + .map(|e| evaluate_inner(rtxn, index, e, &embedding_configs, filter)) + .union()?; + + if let Some(universe) = universe { + docids &= universe; + } + + Ok(docids) +} + +fn evaluate_inner( + rtxn: &heed::RoTxn<'_>, + index: &Index, + embedder: &Token<'_>, + embedding_configs: &[IndexEmbeddingConfig], + filter: &VectorFilter<'_>, +) -> crate::Result { + let embedder_name = embedder.value(); + let available_embedders = + || embedding_configs.iter().map(|c| c.name.clone()).collect::>(); + + let embedding_config = embedding_configs + .iter() + .find(|config| config.name == embedder_name) + .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; + + let embedder_info = index + .embedding_configs() + .embedder_info(rtxn, embedder_name)? + .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; + + let arroy_wrapper = ArroyWrapper::new( + index.vector_arroy, + embedder_info.embedder_id, + embedding_config.config.quantized(), + ); + + let docids = match filter { + VectorFilter::Fragment(fragment) => { + let fragment_name = fragment.value(); + let fragment_config = embedding_config + .fragments + .as_slice() + .iter() + .find(|fragment| fragment.name == fragment_name) + .ok_or_else(|| FragmentDoesNotExist { + embedder, + fragment, + available: embedding_config + .fragments + .as_slice() + .iter() + .map(|f| f.name.clone()) + .collect(), + })?; + + let user_provided_docids = embedder_info.embedding_status.user_provided_docids(); + arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| { + bitmap.clone() - user_provided_docids + })? 
+ } + VectorFilter::DocumentTemplate => { + if !embedding_config.fragments.as_slice().is_empty() { + return Ok(RoaringBitmap::new()); + } + + let user_provided_docids = embedder_info.embedding_status.user_provided_docids(); + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + stats.documents - user_provided_docids.clone() + } + VectorFilter::UserProvided => { + let user_provided_docids = embedder_info.embedding_status.user_provided_docids(); + user_provided_docids.clone() + } + VectorFilter::Regenerate => { + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids(); + stats.documents - skip_regenerate + } + VectorFilter::None => { + let mut stats = ArroyStats::default(); + arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + stats.documents + } + }; + + Ok(docids) +} diff --git a/crates/milli/src/search/facet/mod.rs b/crates/milli/src/search/facet/mod.rs index a5e65c95d..fac85df59 100644 --- a/crates/milli/src/search/facet/mod.rs +++ b/crates/milli/src/search/facet/mod.rs @@ -17,6 +17,7 @@ mod facet_range_search; mod facet_sort_ascending; mod facet_sort_descending; mod filter; +mod filter_vector; mod search; fn facet_extreme_value<'t>( diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index d174319d0..6e34961e7 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -1339,10 +1339,9 @@ fn vectors_are_never_indexed_as_searchable_or_filterable() { assert!(results.candidates.is_empty()); let mut search = index.search(&rtxn); - let results = search - .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) - .execute() - .unwrap(); + let results = + dbg!(search.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()).execute()) + .unwrap(); assert!(results.candidates.is_empty()); index diff --git a/crates/milli/src/vector/db.rs b/crates/milli/src/vector/db.rs index 2fea75d68..d445b47c0 100644 --- a/crates/milli/src/vector/db.rs +++ b/crates/milli/src/vector/db.rs @@ -128,6 +128,7 @@ impl EmbeddingStatus { pub fn is_user_provided(&self, docid: DocumentId) -> bool { self.user_provided.contains(docid) } + /// Whether vectors should be regenerated for that document and that embedder. pub fn must_regenerate(&self, docid: DocumentId) -> bool { let invert = self.skip_regenerate_different_from_user_provided.contains(docid); diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 873693a34..1f07f6c4f 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -556,9 +556,6 @@ impl ArroyWrapper { for reader in self.readers(rtxn, self.quantized_db()) { let reader = reader?; let documents = reader.item_ids(); - if documents.is_empty() { - break; - } stats.documents |= documents; stats.number_of_embeddings += documents.len(); } @@ -566,9 +563,6 @@ impl ArroyWrapper { for reader in self.readers(rtxn, self.angular_db()) { let reader = reader?; let documents = reader.item_ids(); - if documents.is_empty() { - break; - } stats.documents |= documents; stats.number_of_embeddings += documents.len(); }
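
A minimal sketch of how the new `_vectors` grammar behaves end to end, assuming the crate's existing `FilterCondition::parse` entry point (the `p` helper in the tests above is assumed to wrap it); `rest` and `basic` are placeholder embedder and fragment names, not defaults:

```rust
use filter_parser::{FilterCondition, VectorFilter};

fn main() {
    // Bare form: matches documents that have vectors for any embedder.
    let filter = FilterCondition::parse("_vectors EXISTS").unwrap().unwrap();
    assert_eq!(filter.to_string(), "_vectors EXISTS");

    // Scoped form: one embedder, one fragment. The new Display impl re-quotes the names.
    let filter =
        FilterCondition::parse("_vectors.rest.fragments.basic EXISTS").unwrap().unwrap();
    assert_eq!(filter.to_string(), r#"_vectors."rest".fragments."basic" EXISTS"#);
    match filter {
        FilterCondition::VectorExists {
            embedder,
            filter: VectorFilter::Fragment(fragment),
            ..
        } => {
            // `rest` and `basic` are the placeholder names used above.
            assert_eq!(embedder.unwrap().value(), "rest");
            assert_eq!(fragment.value(), "basic");
        }
        other => panic!("unexpected parse result: {other}"),
    }

    // A typo in the suffix is a hard failure carrying a suggestion
    // ("Did you mean `fragments`?"), not a fallback to a plain field filter.
    assert!(FilterCondition::parse("_vectors.rest.fargments.basic EXISTS").is_err());
}
```

Per the evaluation added in `filter_vector.rs`, the suffixes select, respectively: vectors produced by a named fragment (minus user-provided ones), vectors regenerated from the document template (only meaningful when the embedder has no fragments), user-provided vectors, and documents whose vectors are due for regeneration; the bare form matches any document with at least one embedding for the embedder, unioned over all configured embedders when no embedder is named.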
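
The `Did you mean` suggestions added in both `filter-parser` (`key_suggestion`) and `milli` (`DidYouMean`) follow the same pattern; here is a standalone sketch of that logic, using the `levenshtein_automata` API exactly as the patch does (maximum distance 2, transpositions counted as a single edit):

```rust
use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder};

/// Returns the first known key within two edits of the unknown key, if any.
fn suggest<'a>(unknown: &str, known: &[&'a str]) -> Option<&'a str> {
    // Build a DFA that recognizes every string within distance 2 of `unknown`.
    let dfa = LevenshteinAutomatonBuilder::new(2, true).build_dfa(unknown);
    known
        .iter()
        .copied()
        .find(|candidate| matches!(dfa.eval(candidate), Distance::Exact(_)))
}

fn main() {
    let keys = ["fragments", "userProvided", "documentTemplate", "regenerate"];
    // `fargments` is one transposition away from `fragments`, so it gets a suggestion.
    assert_eq!(suggest("fargments", &keys), Some("fragments"));
    // Nothing close enough: no suggestion is appended to the error message.
    assert_eq!(suggest("embedding", &keys), None);
}
```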