Apply review suggestions

This commit is contained in:
Mubelotix
2025-07-22 10:56:05 +02:00
parent ab07e9480e
commit 0014ed3114
3 changed files with 161 additions and 139 deletions

View File

@ -981,7 +981,7 @@ async fn vector_filter_specific_fragment_user_provided() {
.await; .await;
snapshot!(value, @r#" snapshot!(value, @r#"
{ {
"message": "Index `[uuid]`: Vector filter cannot specify both a fragment name and userProvided.\n31:43 _vectors.rest.fragments.other.userProvided EXISTS", "message": "Index `[uuid]`: Vector filter cannot have both `other` and `userProvided`.\n31:43 _vectors.rest.fragments.other.userProvided EXISTS",
"code": "invalid_search_filter", "code": "invalid_search_filter",
"type": "invalid_request", "type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_search_filter" "link": "https://docs.meilisearch.com/errors#invalid_search_filter"
@ -1052,9 +1052,6 @@ async fn vector_filter_document_template() {
{ {
"name": "kefir" "name": "kefir"
}, },
{
"name": "echo"
},
{ {
"name": "intel" "name": "intel"
}, },
@ -1066,7 +1063,7 @@ async fn vector_filter_document_template() {
"processingTimeMs": "[duration]", "processingTimeMs": "[duration]",
"limit": 20, "limit": 20,
"offset": 0, "offset": 0,
"estimatedTotalHits": 4 "estimatedTotalHits": 3
} }
"#); "#);
} }

View File

@ -2,14 +2,107 @@ use filter_parser::Token;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::error::{Error, UserError}; use crate::error::{Error, UserError};
use crate::vector::db::IndexEmbeddingConfig;
use crate::vector::{ArroyStats, ArroyWrapper}; use crate::vector::{ArroyStats, ArroyWrapper};
use crate::Index; use crate::Index;
#[derive(Debug)]
enum VectorFilterInner<'a> {
Fragment { embedder_token: Token<'a>, fragment_token: Token<'a> },
DocumentTemplate { embedder_token: Token<'a> },
UserProvided { embedder_token: Token<'a> },
FullEmbedder { embedder_token: Token<'a> },
}
impl VectorFilterInner<'_> {
fn evaluate_inner(
&self,
rtxn: &heed::RoTxn<'_>,
index: &Index,
embedding_configs: &[IndexEmbeddingConfig],
regenerate: bool,
) -> crate::Result<RoaringBitmap> {
let embedder = match self {
VectorFilterInner::Fragment { embedder_token, .. } => embedder_token,
VectorFilterInner::DocumentTemplate { embedder_token } => embedder_token,
VectorFilterInner::UserProvided { embedder_token } => embedder_token,
VectorFilterInner::FullEmbedder { embedder_token } => embedder_token,
};
let embedder_name = embedder.value();
let available_embedders =
|| embedding_configs.iter().map(|c| c.name.clone()).collect::<Vec<_>>();
let embedding_config = embedding_configs
.iter()
.find(|config| config.name == embedder_name)
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
let embedder_info = index
.embedding_configs()
.embedder_info(rtxn, embedder_name)?
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
let arroy_wrapper = ArroyWrapper::new(
index.vector_arroy,
embedder_info.embedder_id,
embedding_config.config.quantized(),
);
let mut docids = match self {
VectorFilterInner::Fragment { embedder_token: embedder, fragment_token: fragment } => {
let fragment_name = fragment.value();
let fragment_config = embedding_config
.fragments
.as_slice()
.iter()
.find(|fragment| fragment.name == fragment_name)
.ok_or_else(|| FragmentDoesNotExist {
embedder,
fragment,
available: embedding_config
.fragments
.as_slice()
.iter()
.map(|f| f.name.clone())
.collect(),
})?;
arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())?
}
VectorFilterInner::DocumentTemplate { .. } => {
if !embedding_config.fragments.as_slice().is_empty() {
return Ok(RoaringBitmap::new());
}
let user_provided_docsids = embedder_info.embedding_status.user_provided_docids();
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
stats.documents - user_provided_docsids.clone()
}
VectorFilterInner::UserProvided { .. } => {
let user_provided_docsids = embedder_info.embedding_status.user_provided_docids();
user_provided_docsids.clone()
}
VectorFilterInner::FullEmbedder { .. } => {
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
stats.documents
}
};
if regenerate {
let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids();
docids &= skip_regenerate;
}
Ok(docids)
}
}
#[derive(Debug)]
pub(super) struct VectorFilter<'a> { pub(super) struct VectorFilter<'a> {
embedder_token: Option<Token<'a>>, inner: Option<VectorFilterInner<'a>>,
fragment_token: Option<Token<'a>>, regenerate: bool,
document_template: bool,
user_provided: bool,
} }
#[derive(Debug)] #[derive(Debug)]
@ -17,8 +110,7 @@ pub enum VectorFilterError<'a> {
EmptyFilter, EmptyFilter,
InvalidPrefix(Token<'a>), InvalidPrefix(Token<'a>),
MissingFragmentName(Token<'a>), MissingFragmentName(Token<'a>),
UserProvidedWithFragment(Token<'a>), ExclusiveOptions(Box<(Token<'a>, Token<'a>)>),
DocumentTemplateWithFragment(Token<'a>),
LeftoverToken(Token<'a>), LeftoverToken(Token<'a>),
EmbedderDoesNotExist { EmbedderDoesNotExist {
embedder: &'a Token<'a>, embedder: &'a Token<'a>,
@ -51,11 +143,13 @@ impl std::fmt::Display for VectorFilterError<'_> {
MissingFragmentName(_token) => { MissingFragmentName(_token) => {
write!(f, "Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.") write!(f, "Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.")
} }
UserProvidedWithFragment(_token) => { ExclusiveOptions(tokens) => {
write!(f, "Vector filter cannot specify both a fragment name and userProvided.") write!(
} f,
DocumentTemplateWithFragment(_token) => { "Vector filter cannot have both `{}` and `{}`.",
write!(f, "Vector filter cannot specify both a fragment name and documentTemplate.") tokens.0.value(),
tokens.1.value()
)
} }
LeftoverToken(token) => { LeftoverToken(token) => {
write!(f, "Vector filter has leftover token: `{}`.", token.value()) write!(f, "Vector filter has leftover token: `{}`.", token.value())
@ -107,11 +201,10 @@ impl<'a> From<VectorFilterError<'a>> for Error {
fn from(err: VectorFilterError<'a>) -> Self { fn from(err: VectorFilterError<'a>) -> Self {
match &err { match &err {
EmptyFilter => Error::UserError(UserError::InvalidFilter(err.to_string())), EmptyFilter => Error::UserError(UserError::InvalidFilter(err.to_string())),
InvalidPrefix(token) InvalidPrefix(token) | MissingFragmentName(token) | LeftoverToken(token) => {
| MissingFragmentName(token) token.clone().as_external_error(err).into()
| UserProvidedWithFragment(token) }
| DocumentTemplateWithFragment(token) ExclusiveOptions(tokens) => tokens.1.clone().as_external_error(err).into(),
| LeftoverToken(token) => token.clone().as_external_error(err).into(),
EmbedderDoesNotExist { embedder: token, .. } EmbedderDoesNotExist { embedder: token, .. }
| FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(), | FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(),
} }
@ -127,11 +220,15 @@ impl<'a> VectorFilter<'a> {
/// ///
/// Valid formats: /// Valid formats:
/// - `_vectors` /// - `_vectors`
/// - `_vectors.mustRegenerate`
/// - `_vectors.{embedder_name}` /// - `_vectors.{embedder_name}`
/// - `_vectors.{embedder_name}.mustRegenerate`
/// - `_vectors.{embedder_name}.userProvided` /// - `_vectors.{embedder_name}.userProvided`
/// - `_vectors.{embedder_name}.userProvided.mustRegenerate`
/// - `_vectors.{embedder_name}.documentTemplate` /// - `_vectors.{embedder_name}.documentTemplate`
/// - `_vectors.{embedder_name}.documentTemplate.userProvided` /// - `_vectors.{embedder_name}.documentTemplate.mustRegenerate`
/// - `_vectors.{embedder_name}.fragments.{fragment_name}` /// - `_vectors.{embedder_name}.fragments.{fragment_name}`
/// - `_vectors.{embedder_name}.fragments.{fragment_name}.mustRegenerate`
pub(super) fn parse(s: &'a Token<'a>) -> Result<Self, VectorFilterError<'a>> { pub(super) fn parse(s: &'a Token<'a>) -> Result<Self, VectorFilterError<'a>> {
let mut split = s.split(".").peekable(); let mut split = s.split(".").peekable();
@ -151,38 +248,46 @@ impl<'a> VectorFilter<'a> {
} }
let mut user_provided_token = None; let mut user_provided_token = None;
if split.peek().map(|t| t.value()) == Some("userProvided") if split.peek().map(|t| t.value()) == Some("userProvided") {
|| split.peek().map(|t| t.value()) == Some("user_provided")
{
user_provided_token = split.next(); user_provided_token = split.next();
} }
let mut document_template_token = None; let mut document_template_token = None;
if split.peek().map(|t| t.value()) == Some("documentTemplate") if split.peek().map(|t| t.value()) == Some("documentTemplate") {
|| split.peek().map(|t| t.value()) == Some("document_template")
{
document_template_token = split.next(); document_template_token = split.next();
} }
if let (Some(_), Some(user_provided_token)) = (&fragment_name, &user_provided_token) { let mut regenerate_token = None;
return Err(UserProvidedWithFragment(user_provided_token.clone()))?; if split.peek().map(|t| t.value()) == Some("regenerate") {
regenerate_token = split.next();
} }
if let (Some(_), Some(document_template_token)) = (&fragment_name, &document_template_token) let inner = match (fragment_name, user_provided_token, document_template_token) {
{ (Some(fragment_name), None, None) => Some(VectorFilterInner::Fragment {
return Err(DocumentTemplateWithFragment(document_template_token.clone()))?; embedder_token: embedder_name
.expect("embedder name comes before fragment so it's always Some"),
fragment_token: fragment_name,
}),
(None, Some(_), None) => Some(VectorFilterInner::UserProvided {
embedder_token: embedder_name
.expect("embedder name comes before userProvided so it's always Some"),
}),
(None, None, Some(_)) => Some(VectorFilterInner::DocumentTemplate {
embedder_token: embedder_name
.expect("embedder name comes before documentTemplate so it's always Some"),
}),
(Some(a), Some(b), _) | (_, Some(a), Some(b)) | (Some(a), None, Some(b)) => {
return Err(ExclusiveOptions(Box::new((a, b))));
} }
(None, None, None) => embedder_name
.map(|embedder_token| VectorFilterInner::FullEmbedder { embedder_token }),
};
if let Some(next) = split.next() { if let Some(next) = split.next() {
return Err(LeftoverToken(next))?; return Err(LeftoverToken(next))?;
} }
Ok(Self { Ok(Self { inner, regenerate: regenerate_token.is_some() })
embedder_token: embedder_name,
fragment_token: fragment_name,
user_provided: user_provided_token.is_some(),
document_template: document_template_token.is_some(),
})
} }
pub(super) fn evaluate( pub(super) fn evaluate(
@ -194,100 +299,19 @@ impl<'a> VectorFilter<'a> {
let index_embedding_configs = index.embedding_configs(); let index_embedding_configs = index.embedding_configs();
let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?; let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?;
let mut embedders = Vec::new(); let inners = dbg!(match self.inner {
if let Some(embedder_token) = &self.embedder_token { Some(inner) => vec![inner],
let embedder_name = embedder_token.value(); None => embedding_configs
.iter()
let Some(embedding_config) = .map(|config| VectorFilterInner::FullEmbedder {
embedding_configs.iter().find(|config| config.name == embedder_name) embedder_token: Token::from(config.name.as_str()),
else { })
return Err(EmbedderDoesNotExist { .collect(),
embedder: embedder_token, });
available: embedding_configs.iter().map(|c| c.name.clone()).collect(),
})?;
};
let Some(embedder_info) = index_embedding_configs.embedder_info(rtxn, embedder_name)?
else {
return Err(EmbedderDoesNotExist {
embedder: embedder_token,
available: embedding_configs.iter().map(|c| c.name.clone()).collect(),
})?;
};
if self.document_template && !embedding_config.fragments.as_slice().is_empty() {
return Ok(RoaringBitmap::new());
}
embedders.push((embedding_config, embedder_info));
} else {
for embedder_config in embedding_configs.iter() {
let Some(embedder_info) =
index_embedding_configs.embedder_info(rtxn, &embedder_config.name)?
else {
continue;
};
embedders.push((embedder_config, embedder_info));
}
};
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
for (embedding_config, embedder_info) in embedders { for inner in inners.iter() {
let arroy_wrapper = ArroyWrapper::new( docids |= inner.evaluate_inner(rtxn, index, &embedding_configs, self.regenerate)?;
index.vector_arroy,
embedder_info.embedder_id,
embedding_config.config.quantized(),
);
docids |= if let Some(fragment_token) = &self.fragment_token {
let fragment_name = fragment_token.value();
let Some(fragment_config) = embedding_config
.fragments
.as_slice()
.iter()
.find(|fragment| fragment.name == fragment_name)
else {
return Err(FragmentDoesNotExist {
embedder: self
.embedder_token
.as_ref()
.expect("there can't be a fragment without an embedder"),
fragment: fragment_token,
available: embedding_config
.fragments
.as_slice()
.iter()
.map(|f| f.name.clone())
.collect(),
})?;
};
if let Some(universe) = universe {
arroy_wrapper
.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap & universe)?
} else {
arroy_wrapper
.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())?
}
} else {
let mut universe = universe.cloned();
if self.user_provided {
let user_provided_docsids =
embedder_info.embedding_status.user_provided_docids();
match &mut universe {
Some(universe) => *universe &= user_provided_docsids,
None => universe = Some(user_provided_docsids.clone()),
}
}
let mut stats = ArroyStats::default();
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
if let Some(universe) = &universe {
stats.documents & universe
} else {
stats.documents
}
};
} }
if let Some(universe) = universe { if let Some(universe) = universe {

View File

@ -128,6 +128,7 @@ impl EmbeddingStatus {
pub fn is_user_provided(&self, docid: DocumentId) -> bool { pub fn is_user_provided(&self, docid: DocumentId) -> bool {
self.user_provided.contains(docid) self.user_provided.contains(docid)
} }
/// Whether vectors should be regenerated for that document and that embedder. /// Whether vectors should be regenerated for that document and that embedder.
pub fn must_regenerate(&self, docid: DocumentId) -> bool { pub fn must_regenerate(&self, docid: DocumentId) -> bool {
let invert = self.skip_regenerate_different_from_user_provided.contains(docid); let invert = self.skip_regenerate_different_from_user_provided.contains(docid);