mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-05 20:26:31 +00:00
Switch to a nom parser
This commit is contained in:
@ -7,11 +7,20 @@
|
|||||||
|
|
||||||
use nom::branch::alt;
|
use nom::branch::alt;
|
||||||
use nom::bytes::complete::tag;
|
use nom::bytes::complete::tag;
|
||||||
|
use nom::character::complete::char;
|
||||||
|
use nom::character::complete::multispace0;
|
||||||
use nom::character::complete::multispace1;
|
use nom::character::complete::multispace1;
|
||||||
use nom::combinator::cut;
|
use nom::combinator::cut;
|
||||||
|
use nom::combinator::map;
|
||||||
|
use nom::combinator::value;
|
||||||
|
use nom::sequence::preceded;
|
||||||
use nom::sequence::{terminated, tuple};
|
use nom::sequence::{terminated, tuple};
|
||||||
use Condition::*;
|
use Condition::*;
|
||||||
|
|
||||||
|
use crate::error::IResultExt;
|
||||||
|
use crate::value::parse_vector_value;
|
||||||
|
use crate::ErrorKind;
|
||||||
|
use crate::VectorFilter;
|
||||||
use crate::{parse_value, FilterCondition, IResult, Span, Token};
|
use crate::{parse_value, FilterCondition, IResult, Span, Token};
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
@ -113,6 +122,57 @@ pub fn parse_not_exists(input: Span) -> IResult<FilterCondition> {
|
|||||||
Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Exists }))))
|
Ok((input, FilterCondition::Not(Box::new(FilterCondition::Condition { fid: key, op: Exists }))))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn parse_vectors(input: Span) -> IResult<(Token, Option<Token>, VectorFilter<'_>)> {
|
||||||
|
let (input, _) = multispace0(input)?;
|
||||||
|
let (input, fid) = tag("_vectors")(input)?;
|
||||||
|
|
||||||
|
if let Ok((input, _)) = multispace1::<_, crate::Error>(input) {
|
||||||
|
return Ok((input, (Token::from(fid), None, VectorFilter::None)));
|
||||||
|
}
|
||||||
|
|
||||||
|
let (input, _) = char('.')(input)?;
|
||||||
|
|
||||||
|
// From this point, we are certain this is a vector filter, so our errors must be final.
|
||||||
|
// We could use nom's `cut`` but it's better to be explicit about the errors
|
||||||
|
|
||||||
|
let (input, embedder_name) = parse_vector_value(input).map_cut(ErrorKind::VectorFilterInvalidEmbedder)?;
|
||||||
|
|
||||||
|
let (input, filter) = alt((
|
||||||
|
map(
|
||||||
|
preceded(tag(".fragments"), |input| {
|
||||||
|
let (input, _) = tag(".")(input).map_cut(ErrorKind::VectorFilterMissingFragment)?;
|
||||||
|
parse_vector_value(input).map_cut(ErrorKind::VectorFilterInvalidFragment)
|
||||||
|
}),
|
||||||
|
VectorFilter::Fragment,
|
||||||
|
),
|
||||||
|
value(VectorFilter::UserProvided, tag(".userProvided")),
|
||||||
|
value(VectorFilter::DocumentTemplate, tag(".documentTemplate")),
|
||||||
|
value(VectorFilter::Regenerate, tag(".regenerate")),
|
||||||
|
value(VectorFilter::None, nom::combinator::success("")),
|
||||||
|
))(input)?;
|
||||||
|
|
||||||
|
let (input, _) = multispace1(input).map_cut(ErrorKind::VectorFilterLeftover)?;
|
||||||
|
|
||||||
|
Ok((input, (Token::from(fid), Some(embedder_name), filter)))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// vectors_exists = vectors "EXISTS"
|
||||||
|
pub fn parse_vectors_exists(input: Span) -> IResult<FilterCondition> {
|
||||||
|
let (input, (fid, embedder, filter)) = terminated(parse_vectors, tag("EXISTS"))(input)?;
|
||||||
|
|
||||||
|
Ok((input, FilterCondition::VectorExists { fid, embedder, filter }))
|
||||||
|
}
|
||||||
|
/// vectors_not_exists = vectors "NOT" WS+ "EXISTS"
|
||||||
|
pub fn parse_vectors_not_exists(input: Span) -> IResult<FilterCondition> {
|
||||||
|
let (input, (fid, embedder, filter)) = parse_vectors(input)?;
|
||||||
|
|
||||||
|
let (input, _) = tuple((tag("NOT"), multispace1, tag("EXISTS")))(input)?;
|
||||||
|
Ok((
|
||||||
|
input,
|
||||||
|
FilterCondition::Not(Box::new(FilterCondition::VectorExists { fid, embedder, filter })),
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
/// contains = value "CONTAINS" value
|
/// contains = value "CONTAINS" value
|
||||||
pub fn parse_contains(input: Span) -> IResult<FilterCondition> {
|
pub fn parse_contains(input: Span) -> IResult<FilterCondition> {
|
||||||
let (input, (fid, contains, value)) =
|
let (input, (fid, contains, value)) =
|
||||||
|
@ -42,6 +42,23 @@ pub fn cut_with_err<'a, O>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub trait IResultExt<'a> {
|
||||||
|
fn map_cut(self, kind: ErrorKind<'a>) -> Self;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, T> IResultExt<'a> for IResult<'a, T> {
|
||||||
|
fn map_cut(self, kind: ErrorKind<'a>) -> Self {
|
||||||
|
self.map_err(move |e: nom::Err<Error<'a>>| {
|
||||||
|
let input = match e {
|
||||||
|
nom::Err::Incomplete(_) => return e,
|
||||||
|
nom::Err::Error(e) => *e.context(),
|
||||||
|
nom::Err::Failure(e) => *e.context(),
|
||||||
|
};
|
||||||
|
nom::Err::Failure(Error::new_from_kind(input, kind))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct Error<'a> {
|
pub struct Error<'a> {
|
||||||
context: Span<'a>,
|
context: Span<'a>,
|
||||||
@ -76,6 +93,11 @@ pub enum ErrorKind<'a> {
|
|||||||
InternalError(error::ErrorKind),
|
InternalError(error::ErrorKind),
|
||||||
DepthLimitReached,
|
DepthLimitReached,
|
||||||
External(String),
|
External(String),
|
||||||
|
|
||||||
|
VectorFilterLeftover,
|
||||||
|
VectorFilterInvalidEmbedder,
|
||||||
|
VectorFilterMissingFragment,
|
||||||
|
VectorFilterInvalidFragment,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Error<'a> {
|
impl<'a> Error<'a> {
|
||||||
@ -169,6 +191,18 @@ impl Display for Error<'_> {
|
|||||||
ErrorKind::MisusedGeoBoundingBox => {
|
ErrorKind::MisusedGeoBoundingBox => {
|
||||||
writeln!(f, "The `_geoBoundingBox` filter is an operation and can't be used as a value.")?
|
writeln!(f, "The `_geoBoundingBox` filter is an operation and can't be used as a value.")?
|
||||||
}
|
}
|
||||||
|
ErrorKind::VectorFilterLeftover => {
|
||||||
|
writeln!(f, "The vector filter has leftover tokens.")?
|
||||||
|
}
|
||||||
|
ErrorKind::VectorFilterInvalidFragment => {
|
||||||
|
writeln!(f, "The vector filter's fragment is invalid.")?
|
||||||
|
}
|
||||||
|
ErrorKind::VectorFilterMissingFragment => {
|
||||||
|
writeln!(f, "The vector filter is missing a fragment name.")?
|
||||||
|
}
|
||||||
|
ErrorKind::VectorFilterInvalidEmbedder => {
|
||||||
|
writeln!(f, "The vector filter's embedder is invalid.")?
|
||||||
|
}
|
||||||
ErrorKind::ReservedKeyword(word) => {
|
ErrorKind::ReservedKeyword(word) => {
|
||||||
writeln!(f, "`{word}` is a reserved keyword and thus cannot be used as a field name unless it is put inside quotes. Use \"{word}\" or \'{word}\' instead.")?
|
writeln!(f, "`{word}` is a reserved keyword and thus cannot be used as a field name unless it is put inside quotes. Use \"{word}\" or \'{word}\' instead.")?
|
||||||
}
|
}
|
||||||
|
@ -65,6 +65,9 @@ use nom_locate::LocatedSpan;
|
|||||||
pub(crate) use value::parse_value;
|
pub(crate) use value::parse_value;
|
||||||
use value::word_exact;
|
use value::word_exact;
|
||||||
|
|
||||||
|
use crate::condition::{parse_vectors_exists, parse_vectors_not_exists};
|
||||||
|
use crate::error::IResultExt;
|
||||||
|
|
||||||
pub type Span<'a> = LocatedSpan<&'a str, &'a str>;
|
pub type Span<'a> = LocatedSpan<&'a str, &'a str>;
|
||||||
|
|
||||||
type IResult<'a, Ret> = nom::IResult<Span<'a>, Ret, Error<'a>>;
|
type IResult<'a, Ret> = nom::IResult<Span<'a>, Ret, Error<'a>>;
|
||||||
@ -146,6 +149,15 @@ impl<'a> From<&'a str> for Token<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
|
pub enum VectorFilter<'a> {
|
||||||
|
Fragment(Token<'a>),
|
||||||
|
DocumentTemplate,
|
||||||
|
UserProvided,
|
||||||
|
Regenerate,
|
||||||
|
None,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, PartialEq, Eq)]
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
||||||
pub enum FilterCondition<'a> {
|
pub enum FilterCondition<'a> {
|
||||||
Not(Box<Self>),
|
Not(Box<Self>),
|
||||||
@ -153,6 +165,7 @@ pub enum FilterCondition<'a> {
|
|||||||
In { fid: Token<'a>, els: Vec<Token<'a>> },
|
In { fid: Token<'a>, els: Vec<Token<'a>> },
|
||||||
Or(Vec<Self>),
|
Or(Vec<Self>),
|
||||||
And(Vec<Self>),
|
And(Vec<Self>),
|
||||||
|
VectorExists { fid: Token<'a>, embedder: Option<Token<'a>>, filter: VectorFilter<'a> },
|
||||||
GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> },
|
GeoLowerThan { point: [Token<'a>; 2], radius: Token<'a> },
|
||||||
GeoBoundingBox { top_right_point: [Token<'a>; 2], bottom_left_point: [Token<'a>; 2] },
|
GeoBoundingBox { top_right_point: [Token<'a>; 2], bottom_left_point: [Token<'a>; 2] },
|
||||||
}
|
}
|
||||||
@ -183,7 +196,8 @@ impl<'a> FilterCondition<'a> {
|
|||||||
FilterCondition::Or(seq) | FilterCondition::And(seq) => {
|
FilterCondition::Or(seq) | FilterCondition::And(seq) => {
|
||||||
seq.iter().find_map(|filter| filter.use_contains_operator())
|
seq.iter().find_map(|filter| filter.use_contains_operator())
|
||||||
}
|
}
|
||||||
FilterCondition::GeoLowerThan { .. }
|
FilterCondition::VectorExists { .. }
|
||||||
|
| FilterCondition::GeoLowerThan { .. }
|
||||||
| FilterCondition::GeoBoundingBox { .. }
|
| FilterCondition::GeoBoundingBox { .. }
|
||||||
| FilterCondition::In { .. } => None,
|
| FilterCondition::In { .. } => None,
|
||||||
}
|
}
|
||||||
@ -191,13 +205,7 @@ impl<'a> FilterCondition<'a> {
|
|||||||
|
|
||||||
pub fn use_vector_filter(&self) -> Option<&Token> {
|
pub fn use_vector_filter(&self) -> Option<&Token> {
|
||||||
match self {
|
match self {
|
||||||
FilterCondition::Condition { fid, op: _ } => {
|
FilterCondition::Condition { .. } => None,
|
||||||
if fid.value().starts_with("_vectors.") || fid.value() == "_vectors" {
|
|
||||||
Some(fid)
|
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
}
|
|
||||||
FilterCondition::Not(this) => this.use_vector_filter(),
|
FilterCondition::Not(this) => this.use_vector_filter(),
|
||||||
FilterCondition::Or(seq) | FilterCondition::And(seq) => {
|
FilterCondition::Or(seq) | FilterCondition::And(seq) => {
|
||||||
seq.iter().find_map(|filter| filter.use_vector_filter())
|
seq.iter().find_map(|filter| filter.use_vector_filter())
|
||||||
@ -205,6 +213,7 @@ impl<'a> FilterCondition<'a> {
|
|||||||
FilterCondition::GeoLowerThan { .. }
|
FilterCondition::GeoLowerThan { .. }
|
||||||
| FilterCondition::GeoBoundingBox { .. }
|
| FilterCondition::GeoBoundingBox { .. }
|
||||||
| FilterCondition::In { .. } => None,
|
| FilterCondition::In { .. } => None,
|
||||||
|
FilterCondition::VectorExists { fid, .. } => Some(fid),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -292,10 +301,7 @@ fn parse_in_body(input: Span) -> IResult<Vec<Token>> {
|
|||||||
let (input, _) = ws(word_exact("IN"))(input)?;
|
let (input, _) = ws(word_exact("IN"))(input)?;
|
||||||
|
|
||||||
// everything after `IN` can be a failure
|
// everything after `IN` can be a failure
|
||||||
let (input, _) =
|
let (input, _) = tag("[")(input).map_cut(ErrorKind::InOpeningBracket)?;
|
||||||
cut_with_err(tag("["), |_| Error::new_from_kind(input, ErrorKind::InOpeningBracket))(
|
|
||||||
input,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
let (input, content) = cut(parse_value_list)(input)?;
|
let (input, content) = cut(parse_value_list)(input)?;
|
||||||
|
|
||||||
@ -529,8 +535,7 @@ fn parse_primary(input: Span, depth: usize) -> IResult<FilterCondition> {
|
|||||||
parse_is_not_null,
|
parse_is_not_null,
|
||||||
parse_is_empty,
|
parse_is_empty,
|
||||||
parse_is_not_empty,
|
parse_is_not_empty,
|
||||||
parse_exists,
|
alt((parse_vectors_exists, parse_vectors_not_exists, parse_exists, parse_not_exists)),
|
||||||
parse_not_exists,
|
|
||||||
parse_to,
|
parse_to,
|
||||||
parse_contains,
|
parse_contains,
|
||||||
parse_not_contains,
|
parse_not_contains,
|
||||||
@ -586,6 +591,19 @@ impl std::fmt::Display for FilterCondition<'_> {
|
|||||||
}
|
}
|
||||||
write!(f, "]")
|
write!(f, "]")
|
||||||
}
|
}
|
||||||
|
FilterCondition::VectorExists { fid: _, embedder, filter: inner } => {
|
||||||
|
write!(f, "_vectors")?;
|
||||||
|
if let Some(embedder) = embedder {
|
||||||
|
write!(f, ".{embedder:?}")?;
|
||||||
|
}
|
||||||
|
match inner {
|
||||||
|
VectorFilter::Fragment(fragment) => write!(f, ".fragments.{fragment:?}"),
|
||||||
|
VectorFilter::DocumentTemplate => write!(f, ".documentTemplate"),
|
||||||
|
VectorFilter::UserProvided => write!(f, ".userProvided"),
|
||||||
|
VectorFilter::Regenerate => write!(f, ".regenerate"),
|
||||||
|
VectorFilter::None => Ok(()),
|
||||||
|
}
|
||||||
|
}
|
||||||
FilterCondition::GeoLowerThan { point, radius } => {
|
FilterCondition::GeoLowerThan { point, radius } => {
|
||||||
write!(f, "_geoRadius({}, {}, {})", point[0], point[1], radius)
|
write!(f, "_geoRadius({}, {}, {})", point[0], point[1], radius)
|
||||||
}
|
}
|
||||||
|
@ -80,6 +80,39 @@ pub fn word_exact<'a, 'b: 'a>(tag: &'b str) -> impl Fn(Span<'a>) -> IResult<'a,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// vector_value = ( non_dot_word | singleQuoted | doubleQuoted)
|
||||||
|
pub fn parse_vector_value(input: Span) -> IResult<Token> {
|
||||||
|
pub fn non_dot_word(input: Span) -> IResult<Token> {
|
||||||
|
let (input, word) = take_while1(|c| is_value_component(c) && c != '.')(input)?;
|
||||||
|
Ok((input, word.into()))
|
||||||
|
}
|
||||||
|
|
||||||
|
let (input, value) = alt((
|
||||||
|
delimited(char('\''), cut(|input| quoted_by('\'', input)), cut(char('\''))),
|
||||||
|
delimited(char('"'), cut(|input| quoted_by('"', input)), cut(char('"'))),
|
||||||
|
non_dot_word,
|
||||||
|
))(input)?;
|
||||||
|
|
||||||
|
match unescaper::unescape(value.value()) {
|
||||||
|
Ok(content) => {
|
||||||
|
if content.len() != value.value().len() {
|
||||||
|
Ok((input, Token::new(value.original_span(), Some(content))))
|
||||||
|
} else {
|
||||||
|
Ok((input, value))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
|
||||||
|
Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind(
|
||||||
|
value.original_span(),
|
||||||
|
ErrorKind::InvalidEscapedNumber,
|
||||||
|
))),
|
||||||
|
Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind(
|
||||||
|
value.original_span(),
|
||||||
|
ErrorKind::MalformedValue,
|
||||||
|
))),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// value = WS* ( word | singleQuoted | doubleQuoted) WS+
|
/// value = WS* ( word | singleQuoted | doubleQuoted) WS+
|
||||||
pub fn parse_value(input: Span) -> IResult<Token> {
|
pub fn parse_value(input: Span) -> IResult<Token> {
|
||||||
// to get better diagnostic message we are going to strip the left whitespaces from the input right now
|
// to get better diagnostic message we are going to strip the left whitespaces from the input right now
|
||||||
|
@ -779,7 +779,7 @@ async fn vector_filter_missing_fragment() {
|
|||||||
.await;
|
.await;
|
||||||
snapshot!(value, @r#"
|
snapshot!(value, @r#"
|
||||||
{
|
{
|
||||||
"message": "Index `[uuid]`: Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.\n15:24 _vectors.rest.fragments EXISTS",
|
"message": "The vector filter is missing a fragment name.\n24:31 _vectors.rest.fragments EXISTS",
|
||||||
"code": "invalid_search_filter",
|
"code": "invalid_search_filter",
|
||||||
"type": "invalid_request",
|
"type": "invalid_request",
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
|
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
|
||||||
@ -981,7 +981,7 @@ async fn vector_filter_specific_fragment_user_provided() {
|
|||||||
.await;
|
.await;
|
||||||
snapshot!(value, @r#"
|
snapshot!(value, @r#"
|
||||||
{
|
{
|
||||||
"message": "Index `[uuid]`: Vector filter cannot have both `fragments` and `userProvided`.\n15:24 _vectors.rest.fragments.other.userProvided EXISTS",
|
"message": "The vector filter has leftover tokens.\n30:50 _vectors.rest.fragments.other.userProvided EXISTS",
|
||||||
"code": "invalid_search_filter",
|
"code": "invalid_search_filter",
|
||||||
"type": "invalid_request",
|
"type": "invalid_request",
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
|
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
|
||||||
@ -1190,11 +1190,11 @@ async fn vector_filter_regenerate() {
|
|||||||
}))
|
}))
|
||||||
.await;
|
.await;
|
||||||
snapshot!(value, @r#"
|
snapshot!(value, @r#"
|
||||||
{
|
{
|
||||||
"message": "Index `[uuid]`: Vector filter cannot have both `fragments` and `regenerate`.\n15:24 _vectors.rest.fragments.basic.regenerate EXISTS",
|
"message": "The vector filter has leftover tokens.\n30:48 _vectors.rest.fragments.basic.regenerate EXISTS",
|
||||||
"code": "invalid_search_filter",
|
"code": "invalid_search_filter",
|
||||||
"type": "invalid_request",
|
"type": "invalid_request",
|
||||||
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
|
"link": "https://docs.meilisearch.com/errors#invalid_search_filter"
|
||||||
}
|
}
|
||||||
"#);
|
"#);
|
||||||
}
|
}
|
||||||
|
@ -10,8 +10,8 @@ use memchr::memmem::Finder;
|
|||||||
use roaring::{MultiOps, RoaringBitmap};
|
use roaring::{MultiOps, RoaringBitmap};
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use super::{facet_range_search, filter_vector::VectorFilter};
|
use super::facet_range_search;
|
||||||
use crate::constants::RESERVED_GEO_FIELD_NAME;
|
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
|
||||||
use crate::error::{Error, UserError};
|
use crate::error::{Error, UserError};
|
||||||
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
|
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
|
||||||
use crate::heed_codec::facet::{
|
use crate::heed_codec::facet::{
|
||||||
@ -230,7 +230,7 @@ impl<'a> Filter<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the token reported by the underlying condition's `use_vector_filter`, if any.
///
/// This is a plain delegation: the result must not be wrapped in `dbg!`, which would
/// print to stderr on every call.
pub fn use_vector_filter(&self) -> Option<&Token> {
    self.condition.use_vector_filter()
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -241,10 +241,10 @@ impl<'a> Filter<'a> {
|
|||||||
let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?;
|
let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?;
|
||||||
|
|
||||||
for fid in self.condition.fids(MAX_FILTER_DEPTH) {
|
for fid in self.condition.fids(MAX_FILTER_DEPTH) {
|
||||||
let attribute = fid.value();
|
let attribute = dbg!(fid.value());
|
||||||
if matching_features(attribute, &filterable_attributes_rules)
|
if matching_features(attribute, &filterable_attributes_rules)
|
||||||
.is_some_and(|(_, features)| features.is_filterable())
|
.is_some_and(|(_, features)| features.is_filterable())
|
||||||
|| VectorFilter::matches(attribute)
|
|| attribute == RESERVED_VECTORS_FIELD_NAME
|
||||||
{
|
{
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@ -549,16 +549,6 @@ impl<'a> Filter<'a> {
|
|||||||
}
|
}
|
||||||
FilterCondition::Condition { fid, op } => {
|
FilterCondition::Condition { fid, op } => {
|
||||||
let value = fid.value();
|
let value = fid.value();
|
||||||
if VectorFilter::matches(value) {
|
|
||||||
if !matches!(op, Condition::Exists) {
|
|
||||||
return Err(Error::UserError(UserError::InvalidFilter(String::from(
|
|
||||||
"Vector filter can only be used with the `exists` operator",
|
|
||||||
))));
|
|
||||||
}
|
|
||||||
let vector_filter = VectorFilter::parse(fid)?;
|
|
||||||
return vector_filter.evaluate(rtxn, index, universe);
|
|
||||||
}
|
|
||||||
|
|
||||||
let Some(field_id) = field_ids_map.id(value) else {
|
let Some(field_id) = field_ids_map.id(value) else {
|
||||||
return Ok(RoaringBitmap::new());
|
return Ok(RoaringBitmap::new());
|
||||||
};
|
};
|
||||||
@ -616,6 +606,9 @@ impl<'a> Filter<'a> {
|
|||||||
Ok(RoaringBitmap::new())
|
Ok(RoaringBitmap::new())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
FilterCondition::VectorExists { fid: _, embedder, filter } => {
|
||||||
|
super::filter_vector::evaluate(rtxn, index, universe, embedder.clone(), filter)
|
||||||
|
}
|
||||||
FilterCondition::GeoLowerThan { point, radius } => {
|
FilterCondition::GeoLowerThan { point, radius } => {
|
||||||
if index.is_geo_filtering_enabled(rtxn)? {
|
if index.is_geo_filtering_enabled(rtxn)? {
|
||||||
let base_point: [f64; 2] =
|
let base_point: [f64; 2] =
|
||||||
|
@ -1,45 +1,13 @@
|
|||||||
use filter_parser::Token;
|
use filter_parser::{Token, VectorFilter};
|
||||||
use roaring::{MultiOps, RoaringBitmap};
|
use roaring::{MultiOps, RoaringBitmap};
|
||||||
|
|
||||||
use crate::error::{Error, UserError};
|
use crate::error::Error;
|
||||||
use crate::vector::db::IndexEmbeddingConfig;
|
use crate::vector::db::IndexEmbeddingConfig;
|
||||||
use crate::vector::{ArroyStats, ArroyWrapper};
|
use crate::vector::{ArroyStats, ArroyWrapper};
|
||||||
use crate::Index;
|
use crate::Index;
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
enum VectorFilterInner<'a> {
|
|
||||||
Fragment(Token<'a>),
|
|
||||||
DocumentTemplate,
|
|
||||||
UserProvided,
|
|
||||||
Regenerate,
|
|
||||||
None,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug)]
|
|
||||||
pub(super) struct VectorFilter<'a> {
|
|
||||||
embedder: Option<Token<'a>>,
|
|
||||||
inner: VectorFilterInner<'a>,
|
|
||||||
}
|
|
||||||
|
|
||||||
#[derive(Debug, thiserror::Error)]
|
#[derive(Debug, thiserror::Error)]
|
||||||
pub enum VectorFilterError<'a> {
|
pub enum VectorFilterError<'a> {
|
||||||
#[error("Vector filter cannot be empty.")]
|
|
||||||
EmptyFilter,
|
|
||||||
|
|
||||||
#[error("Vector filter must start with `_vectors` but found `{}`.", _0.value())]
|
|
||||||
InvalidPrefix(Token<'a>),
|
|
||||||
|
|
||||||
#[error("Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.")]
|
|
||||||
MissingFragmentName(Token<'a>),
|
|
||||||
|
|
||||||
#[error("Vector filter cannot have both {}.", {
|
|
||||||
_0.iter().map(|t| format!("`{}`", t.value())).collect::<Vec<_>>().join(" and ")
|
|
||||||
})]
|
|
||||||
ExclusiveOptions(Vec<Token<'a>>),
|
|
||||||
|
|
||||||
#[error("Vector filter has leftover token: `{}`.", _0.value())]
|
|
||||||
LeftoverToken(Token<'a>),
|
|
||||||
|
|
||||||
#[error("The embedder `{}` does not exist. {}", embedder.value(), {
|
#[error("The embedder `{}` does not exist. {}", embedder.value(), {
|
||||||
if available.is_empty() {
|
if available.is_empty() {
|
||||||
String::from("This index does not have any configured embedders.")
|
String::from("This index does not have any configured embedders.")
|
||||||
@ -72,201 +40,113 @@ use VectorFilterError::*;
|
|||||||
impl<'a> From<VectorFilterError<'a>> for Error {
|
impl<'a> From<VectorFilterError<'a>> for Error {
|
||||||
fn from(err: VectorFilterError<'a>) -> Self {
|
fn from(err: VectorFilterError<'a>) -> Self {
|
||||||
match &err {
|
match &err {
|
||||||
EmptyFilter => Error::UserError(UserError::InvalidFilter(err.to_string())),
|
|
||||||
InvalidPrefix(token) | MissingFragmentName(token) | LeftoverToken(token) => {
|
|
||||||
token.clone().as_external_error(err).into()
|
|
||||||
}
|
|
||||||
ExclusiveOptions(tokens) => tokens
|
|
||||||
.first()
|
|
||||||
.cloned()
|
|
||||||
.unwrap_or_else(|| Token::from("")) // Should never happen: tokens is never created empty
|
|
||||||
.as_external_error(err)
|
|
||||||
.into(),
|
|
||||||
EmbedderDoesNotExist { embedder: token, .. }
|
EmbedderDoesNotExist { embedder: token, .. }
|
||||||
| FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(),
|
| FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> VectorFilter<'a> {
|
pub(super) fn evaluate(
|
||||||
pub(super) fn matches(value: &str) -> bool {
|
rtxn: &heed::RoTxn<'_>,
|
||||||
value.starts_with("_vectors.") || value == "_vectors"
|
index: &Index,
|
||||||
|
universe: Option<&RoaringBitmap>,
|
||||||
|
embedder: Option<Token<'_>>,
|
||||||
|
filter: &VectorFilter<'_>,
|
||||||
|
) -> crate::Result<RoaringBitmap> {
|
||||||
|
let index_embedding_configs = index.embedding_configs();
|
||||||
|
let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?;
|
||||||
|
|
||||||
|
let embedders = match embedder {
|
||||||
|
Some(embedder) => vec![embedder],
|
||||||
|
None => embedding_configs.iter().map(|config| Token::from(config.name.as_str())).collect(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut docids = embedders
|
||||||
|
.iter()
|
||||||
|
.map(|e| evaluate_inner(rtxn, index, e, &embedding_configs, filter))
|
||||||
|
.union()?;
|
||||||
|
|
||||||
|
if let Some(universe) = universe {
|
||||||
|
docids &= universe;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parses a vector filter string.
|
Ok(docids)
|
||||||
///
|
|
||||||
/// Valid formats:
|
|
||||||
/// - `_vectors`
|
|
||||||
/// - `_vectors.{embedder_name}`
|
|
||||||
/// - `_vectors.{embedder_name}.regenerate`
|
|
||||||
/// - `_vectors.{embedder_name}.userProvided`
|
|
||||||
/// - `_vectors.{embedder_name}.documentTemplate`
|
|
||||||
/// - `_vectors.{embedder_name}.fragments.{fragment_name}`
|
|
||||||
pub(super) fn parse(s: &'a Token<'a>) -> Result<Self, VectorFilterError<'a>> {
|
|
||||||
let mut split = s.split(".").peekable();
|
|
||||||
|
|
||||||
match split.next() {
|
|
||||||
Some(token) if token.value() == "_vectors" => (),
|
|
||||||
Some(token) => return Err(InvalidPrefix(token)),
|
|
||||||
None => return Err(EmptyFilter),
|
|
||||||
}
|
|
||||||
|
|
||||||
let embedder_name = split.next();
|
|
||||||
|
|
||||||
let mut fragment_tokens = None;
|
|
||||||
if split.peek().map(|t| t.value()) == Some("fragments") {
|
|
||||||
let token = split.next().expect("it was peeked before");
|
|
||||||
let name = split.next().ok_or_else(|| MissingFragmentName(token.clone()))?;
|
|
||||||
|
|
||||||
fragment_tokens = Some((token, name));
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut remaining_tokens = split.collect::<Vec<_>>();
|
|
||||||
|
|
||||||
let mut user_provided_token = None;
|
|
||||||
if let Some(position) = remaining_tokens.iter().position(|t| t.value() == "userProvided") {
|
|
||||||
user_provided_token = Some(remaining_tokens.remove(position));
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut document_template_token = None;
|
|
||||||
if let Some(position) =
|
|
||||||
remaining_tokens.iter().position(|t| t.value() == "documentTemplate")
|
|
||||||
{
|
|
||||||
document_template_token = Some(remaining_tokens.remove(position));
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut regenerate_token = None;
|
|
||||||
if let Some(position) = remaining_tokens.iter().position(|t| t.value() == "regenerate") {
|
|
||||||
regenerate_token = Some(remaining_tokens.remove(position));
|
|
||||||
}
|
|
||||||
|
|
||||||
if !remaining_tokens.is_empty() {
|
|
||||||
return Err(LeftoverToken(remaining_tokens.remove(0)));
|
|
||||||
}
|
|
||||||
|
|
||||||
let inner =
|
|
||||||
match (fragment_tokens, user_provided_token, document_template_token, regenerate_token)
|
|
||||||
{
|
|
||||||
(Some((_token, name)), None, None, None) => VectorFilterInner::Fragment(name),
|
|
||||||
(None, Some(_), None, None) => VectorFilterInner::UserProvided,
|
|
||||||
(None, None, Some(_), None) => VectorFilterInner::DocumentTemplate,
|
|
||||||
(None, None, None, Some(_)) => VectorFilterInner::Regenerate,
|
|
||||||
(None, None, None, None) => VectorFilterInner::None,
|
|
||||||
(a, b, c, d) => {
|
|
||||||
let a = a.map(|(token, _)| token);
|
|
||||||
let present = [a, b, c, d].into_iter().flatten().collect();
|
|
||||||
return Err(ExclusiveOptions(present));
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(Self { inner, embedder: embedder_name })
|
|
||||||
}
|
|
||||||
|
|
||||||
pub(super) fn evaluate(
|
|
||||||
self,
|
|
||||||
rtxn: &heed::RoTxn<'_>,
|
|
||||||
index: &Index,
|
|
||||||
universe: Option<&RoaringBitmap>,
|
|
||||||
) -> crate::Result<RoaringBitmap> {
|
|
||||||
let index_embedding_configs = index.embedding_configs();
|
|
||||||
let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?;
|
|
||||||
|
|
||||||
let embedders = match self.embedder {
|
|
||||||
Some(embedder) => vec![embedder],
|
|
||||||
None => {
|
|
||||||
embedding_configs.iter().map(|config| Token::from(config.name.as_str())).collect()
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut docids = embedders
|
|
||||||
.iter()
|
|
||||||
.map(|e| self.inner.evaluate(rtxn, index, e, &embedding_configs))
|
|
||||||
.union()?;
|
|
||||||
|
|
||||||
if let Some(universe) = universe {
|
|
||||||
docids &= universe;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(docids)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl VectorFilterInner<'_> {
|
fn evaluate_inner(
|
||||||
fn evaluate(
|
rtxn: &heed::RoTxn<'_>,
|
||||||
&self,
|
index: &Index,
|
||||||
rtxn: &heed::RoTxn<'_>,
|
embedder: &Token<'_>,
|
||||||
index: &Index,
|
embedding_configs: &[IndexEmbeddingConfig],
|
||||||
embedder: &Token<'_>,
|
filter: &VectorFilter<'_>,
|
||||||
embedding_configs: &[IndexEmbeddingConfig],
|
) -> crate::Result<RoaringBitmap> {
|
||||||
) -> crate::Result<RoaringBitmap> {
|
let embedder_name = embedder.value();
|
||||||
let embedder_name = embedder.value();
|
let available_embedders =
|
||||||
let available_embedders =
|
|| embedding_configs.iter().map(|c| c.name.clone()).collect::<Vec<_>>();
|
||||||
|| embedding_configs.iter().map(|c| c.name.clone()).collect::<Vec<_>>();
|
|
||||||
|
|
||||||
let embedding_config = embedding_configs
|
let embedding_config = embedding_configs
|
||||||
.iter()
|
.iter()
|
||||||
.find(|config| config.name == embedder_name)
|
.find(|config| config.name == embedder_name)
|
||||||
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
|
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
|
||||||
|
|
||||||
let embedder_info = index
|
let embedder_info = index
|
||||||
.embedding_configs()
|
.embedding_configs()
|
||||||
.embedder_info(rtxn, embedder_name)?
|
.embedder_info(rtxn, embedder_name)?
|
||||||
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
|
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
|
||||||
|
|
||||||
let arroy_wrapper = ArroyWrapper::new(
|
let arroy_wrapper = ArroyWrapper::new(
|
||||||
index.vector_arroy,
|
index.vector_arroy,
|
||||||
embedder_info.embedder_id,
|
embedder_info.embedder_id,
|
||||||
embedding_config.config.quantized(),
|
embedding_config.config.quantized(),
|
||||||
);
|
);
|
||||||
|
|
||||||
let docids = match self {
|
let docids = match filter {
|
||||||
VectorFilterInner::Fragment(fragment) => {
|
VectorFilter::Fragment(fragment) => {
|
||||||
let fragment_name = fragment.value();
|
let fragment_name = fragment.value();
|
||||||
let fragment_config = embedding_config
|
let fragment_config = embedding_config
|
||||||
.fragments
|
.fragments
|
||||||
.as_slice()
|
.as_slice()
|
||||||
.iter()
|
.iter()
|
||||||
.find(|fragment| fragment.name == fragment_name)
|
.find(|fragment| fragment.name == fragment_name)
|
||||||
.ok_or_else(|| FragmentDoesNotExist {
|
.ok_or_else(|| FragmentDoesNotExist {
|
||||||
embedder,
|
embedder,
|
||||||
fragment,
|
fragment,
|
||||||
available: embedding_config
|
available: embedding_config
|
||||||
.fragments
|
.fragments
|
||||||
.as_slice()
|
.as_slice()
|
||||||
.iter()
|
.iter()
|
||||||
.map(|f| f.name.clone())
|
.map(|f| f.name.clone())
|
||||||
.collect(),
|
.collect(),
|
||||||
})?;
|
})?;
|
||||||
|
|
||||||
arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())?
|
arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())?
|
||||||
|
}
|
||||||
|
VectorFilter::DocumentTemplate => {
|
||||||
|
if !embedding_config.fragments.as_slice().is_empty() {
|
||||||
|
return Ok(RoaringBitmap::new());
|
||||||
}
|
}
|
||||||
VectorFilterInner::DocumentTemplate => {
|
|
||||||
if !embedding_config.fragments.as_slice().is_empty() {
|
|
||||||
return Ok(RoaringBitmap::new());
|
|
||||||
}
|
|
||||||
|
|
||||||
let user_provided_docsids = embedder_info.embedding_status.user_provided_docids();
|
let user_provided_docsids = embedder_info.embedding_status.user_provided_docids();
|
||||||
let mut stats = ArroyStats::default();
|
let mut stats = ArroyStats::default();
|
||||||
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
|
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
|
||||||
stats.documents - user_provided_docsids.clone()
|
stats.documents - user_provided_docsids.clone()
|
||||||
}
|
}
|
||||||
VectorFilterInner::UserProvided => {
|
VectorFilter::UserProvided => {
|
||||||
let user_provided_docsids = embedder_info.embedding_status.user_provided_docids();
|
let user_provided_docsids = embedder_info.embedding_status.user_provided_docids();
|
||||||
user_provided_docsids.clone()
|
user_provided_docsids.clone()
|
||||||
}
|
}
|
||||||
VectorFilterInner::Regenerate => {
|
VectorFilter::Regenerate => {
|
||||||
let mut stats = ArroyStats::default();
|
let mut stats = ArroyStats::default();
|
||||||
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
|
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
|
||||||
let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids();
|
let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids();
|
||||||
stats.documents - skip_regenerate
|
stats.documents - skip_regenerate
|
||||||
}
|
}
|
||||||
VectorFilterInner::None => {
|
VectorFilter::None => {
|
||||||
let mut stats = ArroyStats::default();
|
let mut stats = ArroyStats::default();
|
||||||
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
|
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
|
||||||
stats.documents
|
stats.documents
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(docids)
|
Ok(docids)
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user