mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-09-23 21:26:26 +00:00
Merge branch 'main' into tasks-du-sheitan
This commit is contained in:
@ -639,3 +639,29 @@ fn conditionally_lookup_for_error_message() {
|
||||
assert_eq!(err.to_string(), format!("{} {}", prefix, suffix));
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DidYouMean<'a>(Option<&'a str>);
|
||||
|
||||
impl<'a> DidYouMean<'a> {
|
||||
pub fn new(key: &str, keys: &'a [String]) -> DidYouMean<'a> {
|
||||
let typos = levenshtein_automata::LevenshteinAutomatonBuilder::new(2, true).build_dfa(key);
|
||||
for key in keys.iter() {
|
||||
match typos.eval(key) {
|
||||
levenshtein_automata::Distance::Exact(_) => {
|
||||
return DidYouMean(Some(key));
|
||||
}
|
||||
levenshtein_automata::Distance::AtLeast(_) => continue,
|
||||
}
|
||||
}
|
||||
DidYouMean(None)
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for DidYouMean<'_> {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
if let Some(suggestion) = self.0 {
|
||||
write!(f, " Did you mean `{suggestion}`?")?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
@ -111,7 +111,7 @@ impl FilterableAttributesFeatures {
|
||||
self.filter.is_filterable_null()
|
||||
}
|
||||
|
||||
/// Check if `IS EXISTS` is allowed
|
||||
/// Check if `EXISTS` is allowed
|
||||
pub fn is_filterable_exists(&self) -> bool {
|
||||
self.filter.is_filterable_exists()
|
||||
}
|
||||
|
@ -1457,7 +1457,7 @@ impl Index {
|
||||
.0)
|
||||
}
|
||||
|
||||
pub(crate) fn set_updated_at(
|
||||
pub fn set_updated_at(
|
||||
&self,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
time: &time::OffsetDateTime,
|
||||
|
@ -12,7 +12,7 @@ use roaring::{MultiOps, RoaringBitmap};
|
||||
use serde_json::Value;
|
||||
|
||||
use super::facet_range_search;
|
||||
use crate::constants::RESERVED_GEO_FIELD_NAME;
|
||||
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
|
||||
use crate::error::{Error, UserError};
|
||||
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
@ -228,6 +228,10 @@ impl<'a> Filter<'a> {
|
||||
/// Forwards to the root condition's `use_contains_operator` and returns its result.
pub fn use_contains_operator(&self) -> Option<&Token> {
    let root = &self.condition;
    root.use_contains_operator()
}
|
||||
|
||||
/// Forwards to the root condition's `use_vector_filter` and returns its result.
pub fn use_vector_filter(&self) -> Option<&Token> {
    let root = &self.condition;
    root.use_vector_filter()
}
|
||||
}
|
||||
|
||||
impl<'a> Filter<'a> {
|
||||
@ -235,10 +239,12 @@ impl<'a> Filter<'a> {
|
||||
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
|
||||
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
||||
let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?;
|
||||
|
||||
for fid in self.condition.fids(MAX_FILTER_DEPTH) {
|
||||
let attribute = fid.value();
|
||||
if matching_features(attribute, &filterable_attributes_rules)
|
||||
.is_some_and(|(_, features)| features.is_filterable())
|
||||
|| attribute == RESERVED_VECTORS_FIELD_NAME
|
||||
{
|
||||
continue;
|
||||
}
|
||||
@ -578,7 +584,8 @@ impl<'a> Filter<'a> {
|
||||
.union()
|
||||
}
|
||||
FilterCondition::Condition { fid, op } => {
|
||||
let Some(field_id) = field_ids_map.id(fid.value()) else {
|
||||
let value = fid.value();
|
||||
let Some(field_id) = field_ids_map.id(value) else {
|
||||
return Ok(RoaringBitmap::new());
|
||||
};
|
||||
let Some((rule_index, features)) =
|
||||
@ -635,6 +642,9 @@ impl<'a> Filter<'a> {
|
||||
Ok(RoaringBitmap::new())
|
||||
}
|
||||
}
|
||||
FilterCondition::VectorExists { fid: _, embedder, filter } => {
|
||||
super::filter_vector::evaluate(rtxn, index, universe, embedder.clone(), filter)
|
||||
}
|
||||
FilterCondition::GeoLowerThan { point, radius } => {
|
||||
if index.is_geo_filtering_enabled(rtxn)? {
|
||||
let base_point: [f64; 2] =
|
||||
|
157
crates/milli/src/search/facet/filter_vector.rs
Normal file
157
crates/milli/src/search/facet/filter_vector.rs
Normal file
@ -0,0 +1,157 @@
|
||||
use filter_parser::{Token, VectorFilter};
|
||||
use roaring::{MultiOps, RoaringBitmap};
|
||||
|
||||
use crate::error::{DidYouMean, Error};
|
||||
use crate::vector::db::IndexEmbeddingConfig;
|
||||
use crate::vector::{ArroyStats, ArroyWrapper};
|
||||
use crate::Index;
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum VectorFilterError<'a> {
|
||||
#[error("The embedder `{}` does not exist. {}", embedder.value(), {
|
||||
if available.is_empty() {
|
||||
String::from("This index does not have any configured embedders.")
|
||||
} else {
|
||||
let mut available = available.clone();
|
||||
available.sort_unstable();
|
||||
let did_you_mean = DidYouMean::new(embedder.value(), &available);
|
||||
format!("Available embedders are: {}.{did_you_mean}", available.iter().map(|e| format!("`{e}`")).collect::<Vec<_>>().join(", "))
|
||||
}
|
||||
})]
|
||||
EmbedderDoesNotExist { embedder: &'a Token<'a>, available: Vec<String> },
|
||||
|
||||
#[error("The fragment `{}` does not exist on embedder `{}`. {}", fragment.value(), embedder.value(), {
|
||||
if available.is_empty() {
|
||||
String::from("This embedder does not have any configured fragments.")
|
||||
} else {
|
||||
let mut available = available.clone();
|
||||
available.sort_unstable();
|
||||
let did_you_mean = DidYouMean::new(fragment.value(), &available);
|
||||
format!("Available fragments on this embedder are: {}.{did_you_mean}", available.iter().map(|f| format!("`{f}`")).collect::<Vec<_>>().join(", "))
|
||||
}
|
||||
})]
|
||||
FragmentDoesNotExist {
|
||||
embedder: &'a Token<'a>,
|
||||
fragment: &'a Token<'a>,
|
||||
available: Vec<String>,
|
||||
},
|
||||
}
|
||||
|
||||
use VectorFilterError::*;
|
||||
|
||||
impl<'a> From<VectorFilterError<'a>> for Error {
|
||||
fn from(err: VectorFilterError<'a>) -> Self {
|
||||
match &err {
|
||||
EmbedderDoesNotExist { embedder: token, .. }
|
||||
| FragmentDoesNotExist { fragment: token, .. } => token.as_external_error(err).into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn evaluate(
|
||||
rtxn: &heed::RoTxn<'_>,
|
||||
index: &Index,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
embedder: Option<Token<'_>>,
|
||||
filter: &VectorFilter<'_>,
|
||||
) -> crate::Result<RoaringBitmap> {
|
||||
let index_embedding_configs = index.embedding_configs();
|
||||
let embedding_configs = index_embedding_configs.embedding_configs(rtxn)?;
|
||||
|
||||
let embedders = match embedder {
|
||||
Some(embedder) => vec![embedder],
|
||||
None => embedding_configs.iter().map(|config| Token::from(config.name.as_str())).collect(),
|
||||
};
|
||||
|
||||
let mut docids = embedders
|
||||
.iter()
|
||||
.map(|e| evaluate_inner(rtxn, index, e, &embedding_configs, filter))
|
||||
.union()?;
|
||||
|
||||
if let Some(universe) = universe {
|
||||
docids &= universe;
|
||||
}
|
||||
|
||||
Ok(docids)
|
||||
}
|
||||
|
||||
fn evaluate_inner(
|
||||
rtxn: &heed::RoTxn<'_>,
|
||||
index: &Index,
|
||||
embedder: &Token<'_>,
|
||||
embedding_configs: &[IndexEmbeddingConfig],
|
||||
filter: &VectorFilter<'_>,
|
||||
) -> crate::Result<RoaringBitmap> {
|
||||
let embedder_name = embedder.value();
|
||||
let available_embedders =
|
||||
|| embedding_configs.iter().map(|c| c.name.clone()).collect::<Vec<_>>();
|
||||
|
||||
let embedding_config = embedding_configs
|
||||
.iter()
|
||||
.find(|config| config.name == embedder_name)
|
||||
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
|
||||
|
||||
let embedder_info = index
|
||||
.embedding_configs()
|
||||
.embedder_info(rtxn, embedder_name)?
|
||||
.ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?;
|
||||
|
||||
let arroy_wrapper = ArroyWrapper::new(
|
||||
index.vector_arroy,
|
||||
embedder_info.embedder_id,
|
||||
embedding_config.config.quantized(),
|
||||
);
|
||||
|
||||
let docids = match filter {
|
||||
VectorFilter::Fragment(fragment) => {
|
||||
let fragment_name = fragment.value();
|
||||
let fragment_config = embedding_config
|
||||
.fragments
|
||||
.as_slice()
|
||||
.iter()
|
||||
.find(|fragment| fragment.name == fragment_name)
|
||||
.ok_or_else(|| FragmentDoesNotExist {
|
||||
embedder,
|
||||
fragment,
|
||||
available: embedding_config
|
||||
.fragments
|
||||
.as_slice()
|
||||
.iter()
|
||||
.map(|f| f.name.clone())
|
||||
.collect(),
|
||||
})?;
|
||||
|
||||
let user_provided_docids = embedder_info.embedding_status.user_provided_docids();
|
||||
arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| {
|
||||
bitmap.clone() - user_provided_docids
|
||||
})?
|
||||
}
|
||||
VectorFilter::DocumentTemplate => {
|
||||
if !embedding_config.fragments.as_slice().is_empty() {
|
||||
return Ok(RoaringBitmap::new());
|
||||
}
|
||||
|
||||
let user_provided_docids = embedder_info.embedding_status.user_provided_docids();
|
||||
let mut stats = ArroyStats::default();
|
||||
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
|
||||
stats.documents - user_provided_docids.clone()
|
||||
}
|
||||
VectorFilter::UserProvided => {
|
||||
let user_provided_docids = embedder_info.embedding_status.user_provided_docids();
|
||||
user_provided_docids.clone()
|
||||
}
|
||||
VectorFilter::Regenerate => {
|
||||
let mut stats = ArroyStats::default();
|
||||
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
|
||||
let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids();
|
||||
stats.documents - skip_regenerate
|
||||
}
|
||||
VectorFilter::None => {
|
||||
let mut stats = ArroyStats::default();
|
||||
arroy_wrapper.aggregate_stats(rtxn, &mut stats)?;
|
||||
stats.documents
|
||||
}
|
||||
};
|
||||
|
||||
Ok(docids)
|
||||
}
|
@ -17,6 +17,7 @@ mod facet_range_search;
|
||||
mod facet_sort_ascending;
|
||||
mod facet_sort_descending;
|
||||
mod filter;
|
||||
mod filter_vector;
|
||||
mod search;
|
||||
|
||||
fn facet_extreme_value<'t>(
|
||||
|
@ -7,7 +7,7 @@ use roaring::RoaringBitmap;
|
||||
use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
|
||||
use crate::search::new::{distinct_fid, distinct_single_docid};
|
||||
use crate::search::SemanticSearch;
|
||||
use crate::vector::SearchQuery;
|
||||
use crate::vector::{Embedding, SearchQuery};
|
||||
use crate::{Index, MatchingWords, Result, Search, SearchResult};
|
||||
|
||||
struct ScoreWithRatioResult {
|
||||
@ -16,6 +16,7 @@ struct ScoreWithRatioResult {
|
||||
document_scores: Vec<(u32, ScoreWithRatio)>,
|
||||
degraded: bool,
|
||||
used_negative_operator: bool,
|
||||
query_vector: Option<Embedding>,
|
||||
}
|
||||
|
||||
type ScoreWithRatio = (Vec<ScoreDetails>, f32);
|
||||
@ -85,6 +86,7 @@ impl ScoreWithRatioResult {
|
||||
document_scores,
|
||||
degraded: results.degraded,
|
||||
used_negative_operator: results.used_negative_operator,
|
||||
query_vector: results.query_vector,
|
||||
}
|
||||
}
|
||||
|
||||
@ -186,6 +188,7 @@ impl ScoreWithRatioResult {
|
||||
degraded: vector_results.degraded | keyword_results.degraded,
|
||||
used_negative_operator: vector_results.used_negative_operator
|
||||
| keyword_results.used_negative_operator,
|
||||
query_vector: vector_results.query_vector,
|
||||
},
|
||||
semantic_hit_count,
|
||||
))
|
||||
@ -209,6 +212,7 @@ impl Search<'_> {
|
||||
terms_matching_strategy: self.terms_matching_strategy,
|
||||
scoring_strategy: ScoringStrategy::Detailed,
|
||||
words_limit: self.words_limit,
|
||||
retrieve_vectors: self.retrieve_vectors,
|
||||
exhaustive_number_hits: self.exhaustive_number_hits,
|
||||
max_total_hits: self.max_total_hits,
|
||||
rtxn: self.rtxn,
|
||||
@ -265,7 +269,7 @@ impl Search<'_> {
|
||||
};
|
||||
|
||||
search.semantic = Some(SemanticSearch {
|
||||
vector: Some(vector_query),
|
||||
vector: Some(vector_query.clone()),
|
||||
embedder_name,
|
||||
embedder,
|
||||
quantized,
|
||||
@ -322,6 +326,7 @@ fn return_keyword_results(
|
||||
mut document_scores,
|
||||
degraded,
|
||||
used_negative_operator,
|
||||
query_vector,
|
||||
}: SearchResult,
|
||||
) -> (SearchResult, Option<u32>) {
|
||||
let (documents_ids, document_scores) = if offset >= documents_ids.len() ||
|
||||
@ -348,6 +353,7 @@ fn return_keyword_results(
|
||||
document_scores,
|
||||
degraded,
|
||||
used_negative_operator,
|
||||
query_vector,
|
||||
},
|
||||
Some(0),
|
||||
)
|
||||
|
@ -52,6 +52,7 @@ pub struct Search<'a> {
|
||||
terms_matching_strategy: TermsMatchingStrategy,
|
||||
scoring_strategy: ScoringStrategy,
|
||||
words_limit: usize,
|
||||
retrieve_vectors: bool,
|
||||
exhaustive_number_hits: bool,
|
||||
max_total_hits: Option<usize>,
|
||||
rtxn: &'a heed::RoTxn<'a>,
|
||||
@ -75,6 +76,7 @@ impl<'a> Search<'a> {
|
||||
geo_param: GeoSortParameter::default(),
|
||||
terms_matching_strategy: TermsMatchingStrategy::default(),
|
||||
scoring_strategy: Default::default(),
|
||||
retrieve_vectors: false,
|
||||
exhaustive_number_hits: false,
|
||||
max_total_hits: None,
|
||||
words_limit: 10,
|
||||
@ -161,6 +163,11 @@ impl<'a> Search<'a> {
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the `retrieve_vectors` flag of this search and returns the search
/// itself so that further setters can be chained.
pub fn retrieve_vectors(&mut self, retrieve_vectors: bool) -> &mut Search<'a> {
    let search = self;
    search.retrieve_vectors = retrieve_vectors;
    search
}
|
||||
|
||||
/// Forces the search to exhaustively compute the number of candidates,
|
||||
/// this will increase the search time but allows finite pagination.
|
||||
pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> {
|
||||
@ -233,6 +240,7 @@ impl<'a> Search<'a> {
|
||||
}
|
||||
|
||||
let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;
|
||||
let mut query_vector = None;
|
||||
let PartialSearchResult {
|
||||
located_query_terms,
|
||||
candidates,
|
||||
@ -247,24 +255,29 @@ impl<'a> Search<'a> {
|
||||
embedder,
|
||||
quantized,
|
||||
media: _,
|
||||
}) => execute_vector_search(
|
||||
&mut ctx,
|
||||
vector,
|
||||
self.scoring_strategy,
|
||||
self.exhaustive_number_hits,
|
||||
self.max_total_hits,
|
||||
universe,
|
||||
&self.sort_criteria,
|
||||
&self.distinct,
|
||||
self.geo_param,
|
||||
self.offset,
|
||||
self.limit,
|
||||
embedder_name,
|
||||
embedder,
|
||||
*quantized,
|
||||
self.time_budget.clone(),
|
||||
self.ranking_score_threshold,
|
||||
)?,
|
||||
}) => {
|
||||
if self.retrieve_vectors {
|
||||
query_vector = Some(vector.clone());
|
||||
}
|
||||
execute_vector_search(
|
||||
&mut ctx,
|
||||
vector,
|
||||
self.scoring_strategy,
|
||||
self.exhaustive_number_hits,
|
||||
self.max_total_hits,
|
||||
universe,
|
||||
&self.sort_criteria,
|
||||
&self.distinct,
|
||||
self.geo_param,
|
||||
self.offset,
|
||||
self.limit,
|
||||
embedder_name,
|
||||
embedder,
|
||||
*quantized,
|
||||
self.time_budget.clone(),
|
||||
self.ranking_score_threshold,
|
||||
)?
|
||||
}
|
||||
_ => execute_search(
|
||||
&mut ctx,
|
||||
self.query.as_deref(),
|
||||
@ -306,6 +319,7 @@ impl<'a> Search<'a> {
|
||||
documents_ids,
|
||||
degraded,
|
||||
used_negative_operator,
|
||||
query_vector,
|
||||
})
|
||||
}
|
||||
}
|
||||
@ -324,6 +338,7 @@ impl fmt::Debug for Search<'_> {
|
||||
terms_matching_strategy,
|
||||
scoring_strategy,
|
||||
words_limit,
|
||||
retrieve_vectors,
|
||||
exhaustive_number_hits,
|
||||
max_total_hits,
|
||||
rtxn: _,
|
||||
@ -344,6 +359,7 @@ impl fmt::Debug for Search<'_> {
|
||||
.field("searchable_attributes", searchable_attributes)
|
||||
.field("terms_matching_strategy", terms_matching_strategy)
|
||||
.field("scoring_strategy", scoring_strategy)
|
||||
.field("retrieve_vectors", retrieve_vectors)
|
||||
.field("exhaustive_number_hits", exhaustive_number_hits)
|
||||
.field("max_total_hits", max_total_hits)
|
||||
.field("words_limit", words_limit)
|
||||
@ -366,6 +382,7 @@ pub struct SearchResult {
|
||||
pub document_scores: Vec<Vec<ScoreDetails>>,
|
||||
pub degraded: bool,
|
||||
pub used_negative_operator: bool,
|
||||
pub query_vector: Option<Embedding>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
|
@ -130,6 +130,7 @@ impl<'a> Similar<'a> {
|
||||
document_scores,
|
||||
degraded: false,
|
||||
used_negative_operator: false,
|
||||
query_vector: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -1097,6 +1097,7 @@ fn bug_3021_fourth() {
|
||||
mut documents_ids,
|
||||
degraded: _,
|
||||
used_negative_operator: _,
|
||||
query_vector: _,
|
||||
} = search.execute().unwrap();
|
||||
let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
|
||||
documents_ids.sort_unstable();
|
||||
@ -1338,10 +1339,9 @@ fn vectors_are_never_indexed_as_searchable_or_filterable() {
|
||||
assert!(results.candidates.is_empty());
|
||||
|
||||
let mut search = index.search(&rtxn);
|
||||
let results = search
|
||||
.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap())
|
||||
.execute()
|
||||
.unwrap();
|
||||
let results =
|
||||
dbg!(search.filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()).execute())
|
||||
.unwrap();
|
||||
assert!(results.candidates.is_empty());
|
||||
|
||||
index
|
||||
|
@ -128,6 +128,7 @@ impl EmbeddingStatus {
|
||||
pub fn is_user_provided(&self, docid: DocumentId) -> bool {
    // Membership test against the set of user-provided document ids.
    let user_provided = &self.user_provided;
    user_provided.contains(docid)
}
|
||||
|
||||
/// Whether vectors should be regenerated for that document and that embedder.
|
||||
pub fn must_regenerate(&self, docid: DocumentId) -> bool {
|
||||
let invert = self.skip_regenerate_different_from_user_provided.contains(docid);
|
||||
|
@ -556,9 +556,6 @@ impl ArroyWrapper {
|
||||
for reader in self.readers(rtxn, self.quantized_db()) {
|
||||
let reader = reader?;
|
||||
let documents = reader.item_ids();
|
||||
if documents.is_empty() {
|
||||
break;
|
||||
}
|
||||
stats.documents |= documents;
|
||||
stats.number_of_embeddings += documents.len();
|
||||
}
|
||||
@ -566,9 +563,6 @@ impl ArroyWrapper {
|
||||
for reader in self.readers(rtxn, self.angular_db()) {
|
||||
let reader = reader?;
|
||||
let documents = reader.item_ids();
|
||||
if documents.is_empty() {
|
||||
break;
|
||||
}
|
||||
stats.documents |= documents;
|
||||
stats.number_of_embeddings += documents.len();
|
||||
}
|
||||
|
Reference in New Issue
Block a user