This commit is contained in:
Mubelotix
2025-07-22 11:18:41 +02:00
parent 982e989886
commit 6d93b36279
2 changed files with 122 additions and 170 deletions

View File

@ -1152,7 +1152,6 @@ async fn vector_filter_or_combination() {
"#);
}
#[actix_rt::test]
async fn vector_filter_regenerate() {
let index = shared_index_for_fragments().await;
@ -1186,4 +1185,3 @@ async fn vector_filter_regenerate() {
"#);
}
}

View File

@ -14,108 +14,49 @@ enum VectorFilterInner<'a> {
FullEmbedder { embedder_token: Token<'a> },
}
impl VectorFilterInner<'_> {
    /// Resolves this filter against `index` and returns the selected document ids.
    ///
    /// The embedder named by the filter must be present both in `embedding_configs`
    /// and in the index's stored embedder info; otherwise `EmbedderDoesNotExist` is
    /// returned, listing the names found in `embedding_configs`.
    ///
    /// Per variant:
    /// - `Fragment`: the items stored for the named fragment, or `FragmentDoesNotExist`
    ///   when the embedder has no fragment with that name.
    /// - `DocumentTemplate`: empty when the embedder has any fragment configured;
    ///   otherwise the aggregated `stats.documents` minus the user-provided docids.
    /// - `UserProvided`: a copy of the user-provided docids.
    /// - `FullEmbedder`: the aggregated `stats.documents`.
    ///
    /// When `regenerate` is `true`, the embedder's `skip_regenerate_docids()` are
    /// removed from the result. The early empty return of `DocumentTemplate` is
    /// unaffected by that flag.
    fn evaluate_inner(
        &self,
        rtxn: &heed::RoTxn<'_>,
        index: &Index,
        embedding_configs: &[IndexEmbeddingConfig],
        regenerate: bool,
    ) -> crate::Result<RoaringBitmap> {
        // Every variant carries an embedder token; extract it once up front.
        let embedder = match self {
            VectorFilterInner::Fragment { embedder_token, .. }
            | VectorFilterInner::DocumentTemplate { embedder_token }
            | VectorFilterInner::UserProvided { embedder_token }
            | VectorFilterInner::FullEmbedder { embedder_token } => embedder_token,
        };
        let embedder_name = embedder.value();

        // Shared by both lookups below so that they report the same error.
        let unknown_embedder = || EmbedderDoesNotExist {
            embedder,
            available: embedding_configs
                .iter()
                .map(|config| config.name.clone())
                .collect::<Vec<_>>(),
        };

        let embedding_config = embedding_configs
            .iter()
            .find(|config| config.name == embedder_name)
            .ok_or_else(&unknown_embedder)?;
        let embedder_info = index
            .embedding_configs()
            .embedder_info(rtxn, embedder_name)?
            .ok_or_else(&unknown_embedder)?;

        let vector_store = ArroyWrapper::new(
            index.vector_arroy,
            embedder_info.embedder_id,
            embedding_config.config.quantized(),
        );

        let mut docids = match self {
            VectorFilterInner::Fragment { fragment_token, .. } => {
                let fragments = embedding_config.fragments.as_slice();
                let wanted = fragment_token.value();
                let fragment_config = fragments
                    .iter()
                    .find(|candidate| candidate.name == wanted)
                    .ok_or_else(|| FragmentDoesNotExist {
                        embedder,
                        fragment: fragment_token,
                        available: fragments.iter().map(|f| f.name.clone()).collect(),
                    })?;
                vector_store.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())?
            }
            VectorFilterInner::DocumentTemplate { .. } => {
                // An embedder with fragments yields nothing for this selector.
                if !embedding_config.fragments.as_slice().is_empty() {
                    return Ok(RoaringBitmap::new());
                }
                let provided_by_user = embedder_info.embedding_status.user_provided_docids();
                let mut stats = ArroyStats::default();
                vector_store.aggregate_stats(rtxn, &mut stats)?;
                stats.documents - provided_by_user.clone()
            }
            VectorFilterInner::UserProvided { .. } => {
                embedder_info.embedding_status.user_provided_docids().clone()
            }
            VectorFilterInner::FullEmbedder { .. } => {
                let mut stats = ArroyStats::default();
                vector_store.aggregate_stats(rtxn, &mut stats)?;
                stats.documents
            }
        };

        if regenerate {
            docids -= embedder_info.embedding_status.skip_regenerate_docids();
        }

        Ok(docids)
    }
}
/// A vector filter: an optional target selector paired with a `regenerate` flag.
#[derive(Debug)]
pub(super) struct VectorFilter<'a> {
/// Which embedder (and which part of it) the filter targets.
/// NOTE(review): how `None` is interpreted is decided by code outside this view — confirm.
inner: Option<VectorFilterInner<'a>>,
/// NOTE(review): presumably forwarded to `VectorFilterInner::evaluate_inner`, where `true`
/// removes the embedder's `skip_regenerate_docids()` from the result — confirm at the call site.
regenerate: bool,
}
#[derive(Debug)]
#[derive(Debug, thiserror::Error)]
pub enum VectorFilterError<'a> {
#[error("Vector filter cannot be empty.")]
EmptyFilter,
#[error("Vector filter must start with `_vectors` but found `{}`.", _0.value())]
InvalidPrefix(Token<'a>),
#[error("Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.")]
MissingFragmentName(Token<'a>),
#[error("Vector filter cannot have both `{}` and `{}`.", _0.0.value(), _0.1.value())]
ExclusiveOptions(Box<(Token<'a>, Token<'a>)>),
#[error("Vector filter has leftover token: `{}`.", _0.value())]
LeftoverToken(Token<'a>),
EmbedderDoesNotExist {
embedder: &'a Token<'a>,
available: Vec<String>,
},
#[error("The embedder `{}` does not exist. {}", embedder.value(), {
if available.is_empty() {
String::from("This index does not have any configured embedders.")
} else {
let mut available = available.clone();
available.sort_unstable();
format!("Available embedders are: {}.", available.iter().map(|e| format!("`{e}`")).collect::<Vec<_>>().join(", "))
}
})]
EmbedderDoesNotExist { embedder: &'a Token<'a>, available: Vec<String> },
#[error("The fragment `{}` does not exist on embedder `{}`. {}", fragment.value(), embedder.value(), {
if available.is_empty() {
String::from("This embedder does not have any configured fragments.")
} else {
let mut available = available.clone();
available.sort_unstable();
format!("Available fragments on this embedder are: {}.", available.iter().map(|f| format!("`{f}`")).collect::<Vec<_>>().join(", "))
}
})]
FragmentDoesNotExist {
embedder: &'a Token<'a>,
fragment: &'a Token<'a>,
@ -125,78 +66,6 @@ pub enum VectorFilterError<'a> {
use VectorFilterError::*;
// Empty impl body: every `std::error::Error` method keeps its default implementation.
impl std::error::Error for VectorFilterError<'_> {}
impl std::fmt::Display for VectorFilterError<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
EmptyFilter => {
write!(f, "Vector filter cannot be empty.")
}
InvalidPrefix(prefix) => {
write!(
f,
"Vector filter must start with `_vectors` but found `{}`.",
prefix.value()
)
}
MissingFragmentName(_token) => {
write!(f, "Vector filter is inconsistent: either specify a fragment name or remove the `fragments` part.")
}
ExclusiveOptions(tokens) => {
write!(
f,
"Vector filter cannot have both `{}` and `{}`.",
tokens.0.value(),
tokens.1.value()
)
}
LeftoverToken(token) => {
write!(f, "Vector filter has leftover token: `{}`.", token.value())
}
EmbedderDoesNotExist { embedder, available } => {
write!(f, "The embedder `{}` does not exist.", embedder.value())?;
if available.is_empty() {
write!(f, " This index does not have configured embedders.")
} else {
write!(f, " Available embedders are: ")?;
let mut available = available.clone();
available.sort_unstable();
for (idx, embedder) in available.iter().enumerate() {
write!(f, "`{embedder}`")?;
if idx != available.len() - 1 {
write!(f, ", ")?;
}
}
write!(f, ".")
}
}
FragmentDoesNotExist { embedder, fragment, available } => {
write!(
f,
"The fragment `{}` does not exist on embedder `{}`.",
fragment.value(),
embedder.value(),
)?;
if available.is_empty() {
write!(f, " This embedder does not have configured fragments.")
} else {
write!(f, " Available fragments on this embedder are: ")?;
let mut available = available.clone();
available.sort_unstable();
for (idx, fragment) in available.iter().enumerate() {
write!(f, "`{fragment}`")?;
if idx != available.len() - 1 {
write!(f, ", ")?;
}
}
write!(f, ".")
}
}
}
}
}
impl<'a> From<VectorFilterError<'a>> for Error {
fn from(err: VectorFilterError<'a>) -> Self {
match &err {
@ -320,3 +189,88 @@ impl<'a> VectorFilter<'a> {
Ok(docids)
}
}
impl VectorFilterInner<'_> {
    /// Resolves this filter against `index` and returns the selected document ids.
    ///
    /// The embedder named by the filter must be present both in `embedding_configs`
    /// and in the index's stored embedder info; otherwise `EmbedderDoesNotExist` is
    /// returned, listing the names found in `embedding_configs`.
    ///
    /// Per variant:
    /// - `Fragment`: the items stored for the named fragment, or `FragmentDoesNotExist`
    ///   when the embedder has no fragment with that name.
    /// - `DocumentTemplate`: empty when the embedder has any fragment configured;
    ///   otherwise the aggregated `stats.documents` minus the user-provided docids.
    /// - `UserProvided`: a copy of the user-provided docids.
    /// - `FullEmbedder`: the aggregated `stats.documents`.
    ///
    /// When `regenerate` is `true`, the embedder's `skip_regenerate_docids()` are
    /// removed from the result. The early empty return of `DocumentTemplate` is
    /// unaffected by that flag.
    fn evaluate_inner(
        &self,
        rtxn: &heed::RoTxn<'_>,
        index: &Index,
        embedding_configs: &[IndexEmbeddingConfig],
        regenerate: bool,
    ) -> crate::Result<RoaringBitmap> {
        // Every variant carries an embedder token; extract it once up front.
        let embedder = match self {
            VectorFilterInner::Fragment { embedder_token, .. }
            | VectorFilterInner::DocumentTemplate { embedder_token }
            | VectorFilterInner::UserProvided { embedder_token }
            | VectorFilterInner::FullEmbedder { embedder_token } => embedder_token,
        };
        let embedder_name = embedder.value();

        // Shared by both lookups below so that they report the same error.
        let unknown_embedder = || EmbedderDoesNotExist {
            embedder,
            available: embedding_configs
                .iter()
                .map(|config| config.name.clone())
                .collect::<Vec<_>>(),
        };

        let embedding_config = embedding_configs
            .iter()
            .find(|config| config.name == embedder_name)
            .ok_or_else(&unknown_embedder)?;
        let embedder_info = index
            .embedding_configs()
            .embedder_info(rtxn, embedder_name)?
            .ok_or_else(&unknown_embedder)?;

        let vector_store = ArroyWrapper::new(
            index.vector_arroy,
            embedder_info.embedder_id,
            embedding_config.config.quantized(),
        );

        let mut docids = match self {
            VectorFilterInner::Fragment { fragment_token, .. } => {
                let fragments = embedding_config.fragments.as_slice();
                let wanted = fragment_token.value();
                let fragment_config = fragments
                    .iter()
                    .find(|candidate| candidate.name == wanted)
                    .ok_or_else(|| FragmentDoesNotExist {
                        embedder,
                        fragment: fragment_token,
                        available: fragments.iter().map(|f| f.name.clone()).collect(),
                    })?;
                vector_store.items_in_store(rtxn, fragment_config.id, |bitmap| bitmap.clone())?
            }
            VectorFilterInner::DocumentTemplate { .. } => {
                // An embedder with fragments yields nothing for this selector.
                if !embedding_config.fragments.as_slice().is_empty() {
                    return Ok(RoaringBitmap::new());
                }
                let provided_by_user = embedder_info.embedding_status.user_provided_docids();
                let mut stats = ArroyStats::default();
                vector_store.aggregate_stats(rtxn, &mut stats)?;
                stats.documents - provided_by_user.clone()
            }
            VectorFilterInner::UserProvided { .. } => {
                embedder_info.embedding_status.user_provided_docids().clone()
            }
            VectorFilterInner::FullEmbedder { .. } => {
                let mut stats = ArroyStats::default();
                vector_store.aggregate_stats(rtxn, &mut stats)?;
                stats.documents
            }
        };

        if regenerate {
            docids -= embedder_info.embedding_status.skip_regenerate_docids();
        }

        Ok(docids)
    }
}