mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-09-07 21:26:31 +00:00)
Merge branch 'main' into fragment-filters
@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::collections::BTreeSet;
use std::fmt::{Debug, Display};
use std::ops::Bound::{self, Excluded, Included, Unbounded};
@@ -14,10 +15,9 @@ use super::facet_range_search;
use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME};
use crate::error::{Error, UserError};
use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::index::db_name::FACET_ID_STRING_DOCIDS;
use crate::search::facet::facet_range_search::find_docids_of_facet_within_bounds;
use crate::{
distance_between_two_points, lat_lng_to_xyz, FieldId, FieldsIdsMap,
FilterableAttributesFeatures, FilterableAttributesRule, Index, InternalError, Result,
@@ -422,20 +422,56 @@ impl<'a> Filter<'a> {
return Ok(docids);
}
Condition::StartsWith { keyword: _, word } => {
// The idea here is that "STARTS WITH baba" is the same as "baba <= value < babb".
// We just incremented the last letter to find the upper bound.
// The upper bound may not be valid utf8, but lmdb doesn't care as it works over bytes.

let value = crate::normalize_facet(word.value());
let base = FacetGroupKey { field_id, level: 0, left_bound: value.as_str() };
let docids = strings_db
.prefix_iter(rtxn, &base)?
.map(|result| -> Result<RoaringBitmap> {
match result {
Ok((_facet_group_key, FacetGroupValue { bitmap, .. })) => Ok(bitmap),
Err(_e) => Err(InternalError::from(SerializationError::Decoding {
db_name: Some(FACET_ID_STRING_DOCIDS),
})
.into()),
}
})
.union()?;
let mut value2 = value.as_bytes().to_owned();

let last = match value2.last_mut() {
Some(last) => last,
None => {
// The prefix is empty, so all documents that have the field will match.
return index
.exists_faceted_documents_ids(rtxn, field_id)
.map_err(|e| e.into());
}
};

if *last == u8::MAX {
// u8::MAX is a forbidden UTF-8 byte, we're guaranteed it cannot be sent through a filter to meilisearch, but just in case, we're going to return something
tracing::warn!(
"Found non utf-8 character in filter. That shouldn't be possible"
);
return Ok(RoaringBitmap::new());
}
*last += 1;

// This is very similar to `heed::Bytes` but its `EItem` is `&[u8]` instead of `[u8]`
struct BytesRef;
impl<'a> BytesEncode<'a> for BytesRef {
type EItem = &'a [u8];

fn bytes_encode(
item: &'a Self::EItem,
) -> std::result::Result<Cow<'a, [u8]>, heed::BoxedError> {
Ok(Cow::Borrowed(item))
}
}

let mut docids = RoaringBitmap::new();
let bytes_db =
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<BytesRef>>();
find_docids_of_facet_within_bounds::<BytesRef>(
rtxn,
bytes_db,
field_id,
&Included(value.as_bytes()),
&Excluded(value2.as_slice()),
universe,
&mut docids,
)?;

return Ok(docids);
}
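
Aside for readers of the new StartsWith arm: the comments above already give the whole idea, a prefix filter becomes the half-open byte range [prefix, prefix with its last byte incremented). A minimal standalone sketch of just that bound computation (illustration only, not meilisearch code; the function name is invented):

    fn prefix_upper_bound(prefix: &[u8]) -> Option<Vec<u8>> {
        // "baba" <= value < "babb" matches exactly the strings starting with "baba".
        let mut upper = prefix.to_vec();
        // Empty prefix: no finite upper bound; the arm above instead returns
        // every document that has the field at all.
        let last = upper.last_mut()?;
        if *last == u8::MAX {
            // Cannot increment 0xFF; the arm above bails out with an empty bitmap.
            return None;
        }
        // b"baba" -> b"babb"; the result may not be valid UTF-8, which is fine
        // for byte-ordered LMDB keys.
        *last += 1;
        Some(upper)
    }

    fn main() {
        assert_eq!(prefix_upper_bound(b"baba"), Some(b"babb".to_vec()));
        assert_eq!(prefix_upper_bound(b""), None);
        assert_eq!(prefix_upper_bound(&[0x62, 0xFF]), None);
    }

The arm then passes Included(value.as_bytes()) and Excluded(value2.as_slice()) to find_docids_of_facet_within_bounds, exactly as shown in the hunk above.
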
@@ -7,7 +7,7 @@ use roaring::RoaringBitmap;
use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy};
use crate::search::new::{distinct_fid, distinct_single_docid};
use crate::search::SemanticSearch;
use crate::vector::SearchQuery;
use crate::vector::{Embedding, SearchQuery};
use crate::{Index, MatchingWords, Result, Search, SearchResult};

struct ScoreWithRatioResult {
@@ -16,6 +16,7 @@ struct ScoreWithRatioResult {
document_scores: Vec<(u32, ScoreWithRatio)>,
degraded: bool,
used_negative_operator: bool,
query_vector: Option<Embedding>,
}

type ScoreWithRatio = (Vec<ScoreDetails>, f32);
@@ -85,6 +86,7 @@ impl ScoreWithRatioResult {
document_scores,
degraded: results.degraded,
used_negative_operator: results.used_negative_operator,
query_vector: results.query_vector,
}
}

@@ -186,6 +188,7 @@ impl ScoreWithRatioResult {
degraded: vector_results.degraded | keyword_results.degraded,
used_negative_operator: vector_results.used_negative_operator
| keyword_results.used_negative_operator,
query_vector: vector_results.query_vector,
},
semantic_hit_count,
))
@@ -209,6 +212,7 @@ impl Search<'_> {
terms_matching_strategy: self.terms_matching_strategy,
scoring_strategy: ScoringStrategy::Detailed,
words_limit: self.words_limit,
retrieve_vectors: self.retrieve_vectors,
exhaustive_number_hits: self.exhaustive_number_hits,
max_total_hits: self.max_total_hits,
rtxn: self.rtxn,
@@ -265,7 +269,7 @@ impl Search<'_> {
};

search.semantic = Some(SemanticSearch {
vector: Some(vector_query),
vector: Some(vector_query.clone()),
embedder_name,
embedder,
quantized,
@@ -322,6 +326,7 @@ fn return_keyword_results(
mut document_scores,
degraded,
used_negative_operator,
query_vector,
}: SearchResult,
) -> (SearchResult, Option<u32>) {
let (documents_ids, document_scores) = if offset >= documents_ids.len() ||
@@ -348,6 +353,7 @@ fn return_keyword_results(
document_scores,
degraded,
used_negative_operator,
query_vector,
},
Some(0),
)
@@ -52,6 +52,7 @@ pub struct Search<'a> {
terms_matching_strategy: TermsMatchingStrategy,
scoring_strategy: ScoringStrategy,
words_limit: usize,
retrieve_vectors: bool,
exhaustive_number_hits: bool,
max_total_hits: Option<usize>,
rtxn: &'a heed::RoTxn<'a>,
@@ -75,6 +76,7 @@ impl<'a> Search<'a> {
geo_param: GeoSortParameter::default(),
terms_matching_strategy: TermsMatchingStrategy::default(),
scoring_strategy: Default::default(),
retrieve_vectors: false,
exhaustive_number_hits: false,
max_total_hits: None,
words_limit: 10,
@@ -161,6 +163,11 @@ impl<'a> Search<'a> {
self
}

pub fn retrieve_vectors(&mut self, retrieve_vectors: bool) -> &mut Search<'a> {
self.retrieve_vectors = retrieve_vectors;
self
}

/// Forces the search to exhaustively compute the number of candidates,
/// this will increase the search time but allows finite pagination.
pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> {
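
Aside: the setters in the hunk above are plain builder toggles on Search. A hedged usage sketch, assuming milli's pre-existing builder API (Index::read_txn, Search::new, query, limit, and execute are assumptions from outside this diff; only retrieve_vectors and exhaustive_number_hits appear in it):

    // Hedged sketch, not code from this commit.
    use milli::{Index, Search, SearchResult};

    fn paginated_search(index: &Index) -> milli::Result<SearchResult> {
        let rtxn = index.read_txn()?;
        let mut search = Search::new(&rtxn, index);
        search.query("baba");
        search.limit(20);
        // Exact candidate count: slower, but offsets and page counts stay stable
        // (the "finite pagination" mentioned in the doc comment above).
        search.exhaustive_number_hits(true);
        // New in this diff: when a semantic query runs, its embedding is returned as
        // SearchResult::query_vector; for a plain keyword query it stays None.
        search.retrieve_vectors(true);
        search.execute()
    }
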
@@ -233,6 +240,7 @@ impl<'a> Search<'a> {
}

let universe = filtered_universe(ctx.index, ctx.txn, &self.filter)?;
let mut query_vector = None;
let PartialSearchResult {
located_query_terms,
candidates,
@@ -247,24 +255,29 @@ impl<'a> Search<'a> {
embedder,
quantized,
media: _,
}) => execute_vector_search(
&mut ctx,
vector,
self.scoring_strategy,
self.exhaustive_number_hits,
self.max_total_hits,
universe,
&self.sort_criteria,
&self.distinct,
self.geo_param,
self.offset,
self.limit,
embedder_name,
embedder,
*quantized,
self.time_budget.clone(),
self.ranking_score_threshold,
)?,
}) => {
if self.retrieve_vectors {
query_vector = Some(vector.clone());
}
execute_vector_search(
&mut ctx,
vector,
self.scoring_strategy,
self.exhaustive_number_hits,
self.max_total_hits,
universe,
&self.sort_criteria,
&self.distinct,
self.geo_param,
self.offset,
self.limit,
embedder_name,
embedder,
*quantized,
self.time_budget.clone(),
self.ranking_score_threshold,
)?
}
_ => execute_search(
&mut ctx,
self.query.as_deref(),
@@ -306,6 +319,7 @@ impl<'a> Search<'a> {
documents_ids,
degraded,
used_negative_operator,
query_vector,
})
}
}
@@ -324,6 +338,7 @@ impl fmt::Debug for Search<'_> {
terms_matching_strategy,
scoring_strategy,
words_limit,
retrieve_vectors,
exhaustive_number_hits,
max_total_hits,
rtxn: _,
@@ -344,6 +359,7 @@ impl fmt::Debug for Search<'_> {
.field("searchable_attributes", searchable_attributes)
.field("terms_matching_strategy", terms_matching_strategy)
.field("scoring_strategy", scoring_strategy)
.field("retrieve_vectors", retrieve_vectors)
.field("exhaustive_number_hits", exhaustive_number_hits)
.field("max_total_hits", max_total_hits)
.field("words_limit", words_limit)
@@ -366,6 +382,7 @@ pub struct SearchResult {
pub document_scores: Vec<Vec<ScoreDetails>>,
pub degraded: bool,
pub used_negative_operator: bool,
pub query_vector: Option<Embedding>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -17,7 +17,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
let path = tempfile::tempdir().unwrap();
let options = EnvOpenOptions::new();
let mut options = options.read_txn_without_tls();
options.map_size(10 * 1024 * 1024); // 10 MB
options.map_size(10 * 1024 * 1024); // 10 MiB
let index = Index::new(options, &path, true).unwrap();

let mut wtxn = index.write_txn().unwrap();
@@ -130,6 +130,7 @@ impl<'a> Similar<'a> {
document_scores,
degraded: false,
used_negative_operator: false,
query_vector: None,
})
}
}