From 666ae1a3e745f167946cacc039ca511bd9fc52c9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Wed, 13 Aug 2025 13:00:38 +0200 Subject: [PATCH] Add "did you mean" message --- Cargo.lock | 1 + crates/filter-parser/Cargo.toml | 1 + crates/filter-parser/src/condition.rs | 18 ++++++++++++----- crates/filter-parser/src/error.rs | 29 +++++++++++++++++++++++++-- crates/filter-parser/src/lib.rs | 20 ++++++++++-------- crates/filter-parser/src/value.rs | 8 ++++---- 6 files changed, 58 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 8413b3d14..43f491f1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2031,6 +2031,7 @@ name = "filter-parser" version = "1.16.0" dependencies = [ "insta", + "levenshtein_automata", "nom", "nom_locate", "unescaper", diff --git a/crates/filter-parser/Cargo.toml b/crates/filter-parser/Cargo.toml index 6eeb0794b..173cabd4b 100644 --- a/crates/filter-parser/Cargo.toml +++ b/crates/filter-parser/Cargo.toml @@ -15,6 +15,7 @@ license.workspace = true nom = "7.1.3" nom_locate = "4.2.0" unescaper = "0.1.6" +levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } [dev-dependencies] # fixed version due to format breakages in v1.40 diff --git a/crates/filter-parser/src/condition.rs b/crates/filter-parser/src/condition.rs index 12542110a..e4795d048 100644 --- a/crates/filter-parser/src/condition.rs +++ b/crates/filter-parser/src/condition.rs @@ -19,6 +19,7 @@ use Condition::*; use crate::error::IResultExt; use crate::value::parse_vector_value; +use crate::Error; use crate::ErrorKind; use crate::VectorFilter; use crate::{parse_value, FilterCondition, IResult, Span, Token}; @@ -136,10 +137,7 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> // We could use nom's `cut` but it's better to be explicit about the errors if let Ok((_, space)) = tag::<_, _, ()>(" ")(input) { - return Err(crate::Error::new_failure_from_kind( - space, - ErrorKind::VectorFilterMissingEmbedder, - )); + return Err(crate::Error::failure_from_kind(space, ErrorKind::VectorFilterMissingEmbedder)); } let (input, embedder_name) = @@ -159,6 +157,16 @@ fn parse_vectors(input: Span) -> IResult<(Token, Option, VectorFilter<'_> value(VectorFilter::None, nom::combinator::success("")), ))(input)?; + if let Ok((input, point)) = tag::<_, _, ()>(".")(input) { + let opt_value = parse_vector_value(input).ok().map(|(_, v)| v); + let value = opt_value + .as_ref() + .map(|v| v.original_span().to_string()) + .unwrap_or_else(|| point.to_string()); + let context = opt_value.map(|v| v.original_span()).unwrap_or(point); + return Err(Error::failure_from_kind(context, ErrorKind::VectorFilterUnknownSuffix(value))); + } + let (input, _) = multispace1(input).map_cut(ErrorKind::VectorFilterLeftover)?; Ok((input, (Token::from(fid), Some(embedder_name), filter))) @@ -181,7 +189,7 @@ pub fn parse_vectors_exists(input: Span) -> IResult { )); } - Err(crate::Error::new_failure_from_kind(input, ErrorKind::VectorFilterOperation)) + Err(crate::Error::failure_from_kind(input, ErrorKind::VectorFilterOperation)) } /// contains = value "CONTAINS" value diff --git a/crates/filter-parser/src/error.rs b/crates/filter-parser/src/error.rs index f6356f3fd..91ae2e33c 100644 --- a/crates/filter-parser/src/error.rs +++ b/crates/filter-parser/src/error.rs @@ -54,7 +54,7 @@ impl<'a, T> IResultExt<'a> for IResult<'a, T> { nom::Err::Error(e) => *e.context(), nom::Err::Failure(e) => *e.context(), }; - Error::new_failure_from_kind(input, kind) + Error::failure_from_kind(input, kind) }) } } @@ -83,6 +83,7 @@ pub enum ErrorKind<'a> { VectorFilterInvalidEmbedder, VectorFilterMissingFragment, VectorFilterInvalidFragment, + VectorFilterUnknownSuffix(String), VectorFilterOperation, InvalidPrimary, InvalidEscapedNumber, @@ -114,7 +115,7 @@ impl<'a> Error<'a> { Self { context, kind } } - pub fn new_failure_from_kind(context: Span<'a>, kind: ErrorKind<'a>) -> nom::Err { + pub fn failure_from_kind(context: Span<'a>, kind: ErrorKind<'a>) -> nom::Err { nom::Err::Failure(Self::new_from_kind(context, kind)) } @@ -155,6 +156,20 @@ impl Display for Error<'_> { // first line being the diagnostic and the second line being the incriminated filter. let escaped_input = input.escape_debug(); + fn key_suggestion<'a>(key: &str, keys: &[&'a str]) -> Option<&'a str> { + let typos = + levenshtein_automata::LevenshteinAutomatonBuilder::new(2, true).build_dfa(key); + for key in keys.iter() { + match typos.eval(key) { + levenshtein_automata::Distance::Exact(_) => { + return Some(key); + } + levenshtein_automata::Distance::AtLeast(_) => continue, + } + } + None + } + match &self.kind { ErrorKind::ExpectedValue(_) if input.trim().is_empty() => { writeln!(f, "Was expecting a value but instead got nothing.")? @@ -199,6 +214,16 @@ impl Display for Error<'_> { ErrorKind::VectorFilterLeftover => { writeln!(f, "The vector filter has leftover tokens.")? } + ErrorKind::VectorFilterUnknownSuffix(value) if value.as_str() == "." => { + writeln!(f, "Was expecting one of `.fragments`, `.userProvided`, `.documentTemplate`, `.regenerate` or nothing, but instead found a point without a valid value.")?; + } + ErrorKind::VectorFilterUnknownSuffix(value) => { + if let Some(suggestion) = key_suggestion(value, &["fragments", "userProvided", "documentTemplate", "regenerate"]) { + writeln!(f, "Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `{value}`. Did you mean `{suggestion}`?")?; + } else { + writeln!(f, "Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `{value}`.")?; + } + } ErrorKind::VectorFilterInvalidFragment => { writeln!(f, "The vector filter's fragment is invalid.")? } diff --git a/crates/filter-parser/src/lib.rs b/crates/filter-parser/src/lib.rs index 2c8dfac80..75cbecc26 100644 --- a/crates/filter-parser/src/lib.rs +++ b/crates/filter-parser/src/lib.rs @@ -437,7 +437,7 @@ fn parse_geo_bounding_box(input: Span) -> IResult { let (input, args) = parsed?; if args.len() != 2 || args[0].len() != 2 || args[1].len() != 2 { - return Err(Error::new_failure_from_kind(input, ErrorKind::GeoBoundingBox)); + return Err(Error::failure_from_kind(input, ErrorKind::GeoBoundingBox)); } let res = FilterCondition::GeoBoundingBox { @@ -458,7 +458,7 @@ fn parse_geo_point(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))))?; // if we succeeded we still return a `Failure` because geoPoints are not allowed - Err(Error::new_failure_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))) + Err(Error::failure_from_kind(input, ErrorKind::ReservedGeo("_geoPoint"))) } /// geoPoint = WS* "_geoDistance(float WS* "," WS* float WS* "," WS* float) @@ -472,7 +472,7 @@ fn parse_geo_distance(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geoDistance"))))?; // if we succeeded we still return a `Failure` because `geoDistance` filters are not allowed - Err(Error::new_failure_from_kind(input, ErrorKind::ReservedGeo("_geoDistance"))) + Err(Error::failure_from_kind(input, ErrorKind::ReservedGeo("_geoDistance"))) } /// geo = WS* "_geo(float WS* "," WS* float WS* "," WS* float) @@ -486,7 +486,7 @@ fn parse_geo(input: Span) -> IResult { ))(input) .map_err(|e| e.map(|_| Error::new_from_kind(input, ErrorKind::ReservedGeo("_geo"))))?; // if we succeeded we still return a `Failure` because `_geo` filter is not allowed - Err(Error::new_failure_from_kind(input, ErrorKind::ReservedGeo("_geo"))) + Err(Error::failure_from_kind(input, ErrorKind::ReservedGeo("_geo"))) } fn parse_error_reserved_keyword(input: Span) -> IResult { @@ -1014,8 +1014,8 @@ pub mod tests { 10:30 _vectors .embedderName EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName. EXISTS"#), @r" - The vector filter has leftover tokens. - 22:30 _vectors.embedderName. EXISTS + Was expecting one of `.fragments`, `.userProvided`, `.documentTemplate`, `.regenerate` or nothing, but instead found a point without a valid value. + 22:23 _vectors.embedderName. EXISTS "); insta::assert_snapshot!(p(r#"_vectors."embedderName EXISTS"#), @r#" The vector filter's embedder is invalid. @@ -1026,8 +1026,8 @@ pub mod tests { 23:31 _vectors."embedderNam"e EXISTS "#); insta::assert_snapshot!(p(r#"_vectors.embedderName.documentTemplate. EXISTS"#), @r" - The vector filter has leftover tokens. - 39:47 _vectors.embedderName.documentTemplate. EXISTS + Was expecting one of `.fragments`, `.userProvided`, `.documentTemplate`, `.regenerate` or nothing, but instead found a point without a valid value. + 39:40 _vectors.embedderName.documentTemplate. EXISTS "); insta::assert_snapshot!(p(r#"_vectors.embedderName.fragments EXISTS"#), @r" The vector filter is missing a fragment name. @@ -1053,6 +1053,10 @@ pub mod tests { Was expecting an operation like `EXISTS` or `NOT EXISTS` after the vector filter. 23:45 _vectors.embedderName .fragments.test EXISTS "); + insta::assert_snapshot!(p(r#"_vectors.embedderName.fargments.test EXISTS"#), @r" + Was expecting one of `fragments`, `userProvided`, `documentTemplate`, `regenerate` or nothing, but instead found `fargments`. Did you mean `fragments`? + 23:32 _vectors.embedderName.fargments.test EXISTS + "); insta::assert_snapshot!(p(r#"NOT OR EXISTS AND EXISTS NOT EXISTS"#), @r###" Was expecting a value but instead got `OR`, which is a reserved keyword. To use `OR` as a field name or a value, surround it by quotes. diff --git a/crates/filter-parser/src/value.rs b/crates/filter-parser/src/value.rs index ac645799b..dac96b4f4 100644 --- a/crates/filter-parser/src/value.rs +++ b/crates/filter-parser/src/value.rs @@ -132,21 +132,21 @@ pub fn parse_value(input: Span) -> IResult { } match parse_geo_radius(input) { - Ok(_) => return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoRadius)), + Ok(_) => return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoRadius)), // if we encountered a failure it means the user badly wrote a _geoRadius filter. // But instead of showing them how to fix his syntax we are going to tell them they should not use this filter as a value. Err(e) if e.is_failure() => { - return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoRadius)) + return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoRadius)) } _ => (), } match parse_geo_bounding_box(input) { - Ok(_) => return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)), + Ok(_) => return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)), // if we encountered a failure it means the user badly wrote a _geoBoundingBox filter. // But instead of showing them how to fix his syntax we are going to tell them they should not use this filter as a value. Err(e) if e.is_failure() => { - return Err(Error::new_failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)) + return Err(Error::failure_from_kind(input, ErrorKind::MisusedGeoBoundingBox)) } _ => (), }