Merge branch 'fragment-filters' into render-route

This commit is contained in:
Mubelotix
2025-08-01 09:13:17 +02:00
19 changed files with 920 additions and 30 deletions

View File

@ -111,7 +111,7 @@ impl FilterableAttributesFeatures {
self.filter.is_filterable_null()
}
/// Check if `IS EXISTS` is allowed
/// Check if `EXISTS` is allowed
pub fn is_filterable_exists(&self) -> bool {
// Delegates to the per-field filter feature flags.
self.filter.is_filterable_exists()
}

View File

@ -11,7 +11,7 @@ use roaring::{MultiOps, RoaringBitmap};
use serde_json::Value;
use super::facet_range_search;
use crate::constants::RESERVED_GEO_FIELD_NAME;
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
use crate::error::{Error, UserError};
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
use crate::heed_codec::facet::{
@ -228,6 +228,10 @@ impl<'a> Filter<'a> {
/// Returns a token from the filter when it uses the `CONTAINS` operator,
/// `None` otherwise. Delegates to the underlying parsed condition.
pub fn use_contains_operator(&self) -> Option<&Token> {
self.condition.use_contains_operator()
}
/// Returns a token from the filter when it filters on `_vectors`,
/// `None` otherwise. Delegates to the underlying parsed condition.
pub fn use_vector_filter(&self) -> Option<&Token> {
self.condition.use_vector_filter()
}
}
impl<'a> Filter<'a> {
@ -235,10 +239,12 @@ impl<'a> Filter<'a> {
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
let fields_ids_map = index.fields_ids_map(rtxn)?;
let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?;
for fid in self.condition.fids(MAX_FILTER_DEPTH) {
let attribute = fid.value();
if matching_features(attribute, &filterable_attributes_rules)
.is_some_and(|(_, features)| features.is_filterable())
|| attribute == RESERVED_VECTORS_FIELD_NAME
{
continue;
}
@ -542,7 +548,8 @@ impl<'a> Filter<'a> {
.union()
}
FilterCondition::Condition { fid, op } => {
let Some(field_id) = field_ids_map.id(fid.value()) else {
let value = fid.value();
let Some(field_id) = field_ids_map.id(value) else {
return Ok(RoaringBitmap::new());
};
let Some((rule_index, features)) =
@ -599,6 +606,9 @@ impl<'a> Filter<'a> {
Ok(RoaringBitmap::new())
}
}
FilterCondition::VectorExists { fid: _, embedder, filter } => {
super::filter_vector::evaluate(rtxn, index, universe, embedder.clone(), filter)
}
FilterCondition::GeoLowerThan { point, radius } => {
if index.is_geo_filtering_enabled(rtxn)? {
let base_point: [f64; 2] =

View File

@ -0,0 +1,155 @@
use filter_parser::{Token, VectorFilter};
use roaring::{MultiOps, RoaringBitmap};
use crate::error::Error;
use crate::vector::db::IndexEmbeddingConfig;
use crate::vector::{ArroyStats, ArroyWrapper};
use crate::Index;
/// Errors raised while evaluating a `_vectors` filter.
///
/// Each variant keeps a reference to the offending [`Token`] from the parsed
/// filter so the error can later be attached to the exact span the user wrote
/// (see the `From<VectorFilterError> for Error` impl below).
#[derive(Debug, thiserror::Error)]
pub enum VectorFilterError<'a> {
// The message lists the configured embedders, sorted alphabetically, when any exist.
#[error("The embedder `{}` does not exist. {}", embedder.value(), {
if available.is_empty() {
String::from("This index does not have any configured embedders.")
} else {
let mut available = available.clone();
available.sort_unstable();
format!("Available embedders are: {}.", available.iter().map(|e| format!("`{e}`")).collect::<Vec<_>>().join(", "))
}
})]
// The filter names an embedder that is not configured on the index.
EmbedderDoesNotExist { embedder: &'a Token<'a>, available: Vec<String> },
// The message lists the embedder's configured fragments, sorted alphabetically, when any exist.
#[error("The fragment `{}` does not exist on embedder `{}`. {}", fragment.value(), embedder.value(), {
if available.is_empty() {
String::from("This embedder does not have any configured fragments.")
} else {
let mut available = available.clone();
available.sort_unstable();
format!("Available fragments on this embedder are: {}.", available.iter().map(|f| format!("`{f}`")).collect::<Vec<_>>().join(", "))
}
})]
// The filter names a fragment that is not configured on the targeted embedder.
FragmentDoesNotExist {
embedder: &'a Token<'a>,
fragment: &'a Token<'a>,
available: Vec<String>,
},
}
use VectorFilterError::*;
impl<'a> From<VectorFilterError<'a>> for Error {
    /// Converts the vector-filter error into the crate-wide [`Error`],
    /// anchoring it on the filter token that caused it.
    fn from(err: VectorFilterError<'a>) -> Self {
        // Copy the `&'a Token` out of the error first: the token reference
        // outlives `err`, so `err` can then be moved into the conversion.
        let offending = match &err {
            EmbedderDoesNotExist { embedder: token, .. } => *token,
            FragmentDoesNotExist { fragment: token, .. } => *token,
        };
        offending.as_external_error(err).into()
    }
}
pub(super) fn evaluate(
rtxn: &heed::RoTxn<'_>,
index: &Index,
universe: Option<&RoaringBitmap>,
embedder: Option<Token<'_>>,
filter: &VectorFilter<'_>,
) -> crate::Result<RoaringBitmap> {
let index_embedding_configs = index.embedding_configs();
let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?;
let embedders = match embedder {
Some(embedder) => vec![embedder],
None => embedding_configs.iter().map(|config| Token::from(config.name.as_str())).collect(),
};
let mut docids = embedders
.iter()
.map(|e| evaluate_inner(rtxn, index, e, &embedding_configs, filter))
.union()?;
if let Some(universe) = universe {
docids &= universe;
}
Ok(docids)
}
/// Evaluates a `_vectors` filter against a single embedder and returns the
/// matching document ids.
///
/// Errors with [`EmbedderDoesNotExist`] when `embedder` is not in
/// `embedding_configs` or has no stored embedder info, and with
/// [`FragmentDoesNotExist`] when a fragment filter names an unknown fragment.
fn evaluate_inner(
rtxn: &heed::RoTxn<'_>,
index: &Index,
embedder: &Token<'_>,
embedding_configs: &[IndexEmbeddingConfig],
filter: &VectorFilter<'_>,
) -> crate::Result<RoaringBitmap> {
let embedder_name = embedder.value();
// Lazily built list of embedder names, used only for error messages.
let available_embedders =
|| embedding_configs.iter().map(|c| c.name.clone()).collect::<Vec<_>>();
let embedding_config = embedding_configs
.iter()
.find(|config| config.name == embedder_name)
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
let embedder_info = index
.embedding_configs()
.embedder_info(rtxn, embedder_name)?
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
let arroy_wrapper = ArroyWrapper::new(
index.vector_arroy,
embedder_info.embedder_id,
embedding_config.config.quantized(),
);
let docids = match filter {
// `_vectors.<embedder>.fragments.<name>`: documents whose embedding for
// that fragment was generated (user-provided vectors are excluded).
VectorFilter::Fragment(fragment) => {
let fragment_name = fragment.value();
let fragment_config = embedding_config
.fragments
.as_slice()
.iter()
.find(|fragment| fragment.name == fragment_name)
.ok_or_else(|| FragmentDoesNotExist {
embedder,
fragment,
available: embedding_config
.fragments
.as_slice()
.iter()
.map(|f| f.name.clone())
.collect(),
})?;
let user_provided_docids = embedder_info.embedding_status.user_provided_docids();
arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| {
bitmap.clone() - user_provided_docids
})?
}
// `_vectors.<embedder>.documentTemplate`: only meaningful when the
// embedder has no fragments — otherwise no document matches.
VectorFilter::DocumentTemplate => {
if !embedding_config.fragments.as_slice().is_empty() {
return Ok(RoaringBitmap::new());
}
let user_provided_docids = embedder_info.embedding_status.user_provided_docids();
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
// All documents with stored embeddings, minus those whose vectors
// were supplied by the user rather than generated from the template.
stats.documents - user_provided_docids.clone()
}
// `_vectors.<embedder>.userProvided`: documents whose vectors were
// supplied directly by the user.
VectorFilter::UserProvided => {
let user_provided_docids = embedder_info.embedding_status.user_provided_docids();
user_provided_docids.clone()
}
// `_vectors.<embedder>.regenerate`: documents with stored embeddings
// that are not flagged to skip regeneration.
VectorFilter::Regenerate => {
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids();
stats.documents - skip_regenerate
}
// Bare `_vectors.<embedder>`: every document that has embeddings
// stored for this embedder.
VectorFilter::None => {
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
stats.documents
}
};
Ok(docids)
}

View File

@ -17,6 +17,7 @@ mod facet_range_search;
mod facet_sort_ascending;
mod facet_sort_descending;
mod filter;
mod filter_vector;
mod search;
fn facet_extreme_value<'t>(

View File

@ -1338,10 +1338,9 @@ fn vectors_are_never_indexed_as_searchable_or_filterable() {
assert!(results.candidates.is_empty());
let mut search = index.search(&rtxn);
let results = search
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
.execute()
.unwrap();
let results =
dbg!(search.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()).execute())
.unwrap();
assert!(results.candidates.is_empty());
index

View File

@ -128,6 +128,7 @@ impl EmbeddingStatus {
pub fn is_user_provided(&self, docid: DocumentId) -> bool {
self.user_provided.contains(docid)
}
/// Whether vectors should be regenerated for that document and that embedder.
pub fn must_regenerate(&self, docid: DocumentId) -> bool {
let invert = self.skip_regenerate_different_from_user_provided.contains(docid);

View File

@ -556,9 +556,6 @@ impl ArroyWrapper {
for reader in self.readers(rtxn, self.quantized_db()) {
let reader = reader?;
let documents = reader.item_ids();
if documents.is_empty() {
break;
}
stats.documents |= documents;
stats.number_of_embeddings += documents.len();
}
@ -566,9 +563,6 @@ impl ArroyWrapper {
for reader in self.readers(rtxn, self.angular_db()) {
let reader = reader?;
let documents = reader.item_ids();
if documents.is_empty() {
break;
}
stats.documents |= documents;
stats.number_of_embeddings += documents.len();
}