Merge branch 'main' into enhance-language-detection

This commit is contained in:
ManyTheFish
2023-02-20 18:14:34 +01:00
202 changed files with 10490 additions and 3066 deletions

View File

@ -123,7 +123,7 @@ impl<'t> Criterion for Attribute<'t> {
None => {
return Ok(Some(CriterionResult {
query_tree: Some(query_tree),
candidates: Some(RoaringBitmap::new()),
candidates: Some(allowed_candidates),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));

View File

@ -182,15 +182,15 @@ impl<'t> Criterion for Proximity<'t> {
}
}
fn resolve_candidates<'t>(
ctx: &'t dyn Context,
fn resolve_candidates(
ctx: &dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
fn resolve_operation<'t>(
ctx: &'t dyn Context,
fn resolve_operation(
ctx: &dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
@ -243,8 +243,8 @@ fn resolve_candidates<'t>(
Ok(result)
}
fn mdfs_pair<'t>(
ctx: &'t dyn Context,
fn mdfs_pair(
ctx: &dyn Context,
left: &Operation,
right: &Operation,
proximity: u8,
@ -298,8 +298,8 @@ fn resolve_candidates<'t>(
Ok(output)
}
fn mdfs<'t>(
ctx: &'t dyn Context,
fn mdfs(
ctx: &dyn Context,
branches: &[Operation],
proximity: u8,
cache: &mut Cache,

View File

@ -239,15 +239,15 @@ fn alterate_query_tree(
Ok(query_tree)
}
fn resolve_candidates<'t>(
ctx: &'t dyn Context,
fn resolve_candidates(
ctx: &dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
fn resolve_operation<'t>(
ctx: &'t dyn Context,
fn resolve_operation(
ctx: &dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
@ -276,8 +276,8 @@ fn resolve_candidates<'t>(
}
}
fn mdfs<'t>(
ctx: &'t dyn Context,
fn mdfs(
ctx: &dyn Context,
branches: &[Operation],
mana: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,

View File

@ -291,6 +291,7 @@ impl<'a> FacetDistribution<'a> {
if !invalid_fields.is_empty() {
return Err(UserError::InvalidFacetsDistribution {
invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
valid_facets_name: filterable_fields.into_iter().collect(),
}
.into());
} else {

View File

@ -21,17 +21,51 @@ pub struct Filter<'a> {
condition: FilterCondition<'a>,
}
#[derive(Debug)]
pub enum BadGeoError {
Lat(f64),
Lng(f64),
BoundingBoxTopIsBelowBottom(f64, f64),
}
impl std::error::Error for BadGeoError {}
impl Display for BadGeoError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::BoundingBoxTopIsBelowBottom(top, bottom) => {
write!(f, "The top latitude `{top}` is below the bottom latitude `{bottom}`.")
}
Self::Lat(lat) => write!(
f,
"Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ",
lat
),
Self::Lng(lng) => write!(
f,
"Bad longitude `{}`. Longitude must be contained between -180 and 180 degrees. ",
lng
),
}
}
}
#[derive(Debug)]
enum FilterError<'a> {
AttributeNotFilterable { attribute: &'a str, filterable_fields: HashSet<String> },
BadGeo(&'a str),
BadGeoLat(f64),
BadGeoLng(f64),
ParseGeoError(BadGeoError),
ReservedGeo(&'a str),
Reserved(&'a str),
TooDeep,
}
impl<'a> std::error::Error for FilterError<'a> {}
impl<'a> From<BadGeoError> for FilterError<'a> {
fn from(geo_error: BadGeoError) -> Self {
FilterError::ParseGeoError(geo_error)
}
}
impl<'a> Display for FilterError<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
@ -43,7 +77,11 @@ impl<'a> Display for FilterError<'a> {
attribute,
)
} else {
let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(" ");
let filterables_list = filterable_fields
.iter()
.map(AsRef::as_ref)
.collect::<Vec<&str>>()
.join(" ");
write!(
f,
@ -52,19 +90,19 @@ impl<'a> Display for FilterError<'a> {
filterables_list,
)
}
},
Self::TooDeep => write!(f,
}
Self::TooDeep => write!(
f,
"Too many filter conditions, can't process more than {} filters.",
MAX_FILTER_DEPTH
),
Self::ReservedGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` field coordinates.", keyword),
Self::Reserved(keyword) => write!(
f,
"`{}` is a reserved keyword and thus can't be used as a filter expression.",
keyword
),
Self::BadGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.", keyword),
Self::BadGeoLat(lat) => write!(f, "Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ", lat),
Self::BadGeoLng(lng) => write!(f, "Bad longitude `{}`. Longitude must be contained between -180 and 180 degrees. ", lng),
Self::ParseGeoError(error) => write!(f, "{}", error),
}
}
}
@ -294,12 +332,12 @@ impl<'a> Filter<'a> {
Ok(RoaringBitmap::new())
}
} else {
match fid.lexeme() {
match fid.value() {
attribute @ "_geo" => {
Err(fid.as_external_error(FilterError::BadGeo(attribute)))?
Err(fid.as_external_error(FilterError::ReservedGeo(attribute)))?
}
attribute if attribute.starts_with("_geoPoint(") => {
Err(fid.as_external_error(FilterError::BadGeo("_geoPoint")))?
Err(fid.as_external_error(FilterError::ReservedGeo("_geoPoint")))?
}
attribute @ "_geoDistance" => {
Err(fid.as_external_error(FilterError::Reserved(attribute)))?
@ -351,14 +389,10 @@ impl<'a> Filter<'a> {
let base_point: [f64; 2] =
[point[0].parse_finite_float()?, point[1].parse_finite_float()?];
if !(-90.0..=90.0).contains(&base_point[0]) {
return Err(
point[0].as_external_error(FilterError::BadGeoLat(base_point[0]))
)?;
return Err(point[0].as_external_error(BadGeoError::Lat(base_point[0])))?;
}
if !(-180.0..=180.0).contains(&base_point[1]) {
return Err(
point[1].as_external_error(FilterError::BadGeoLng(base_point[1]))
)?;
return Err(point[1].as_external_error(BadGeoError::Lng(base_point[1])))?;
}
let radius = radius.parse_finite_float()?;
let rtree = match index.geo_rtree(rtxn)? {
@ -385,6 +419,130 @@ impl<'a> Filter<'a> {
}))?
}
}
FilterCondition::GeoBoundingBox { top_left_point, bottom_right_point } => {
if filterable_fields.contains("_geo") {
let top_left: [f64; 2] = [
top_left_point[0].parse_finite_float()?,
top_left_point[1].parse_finite_float()?,
];
let bottom_right: [f64; 2] = [
bottom_right_point[0].parse_finite_float()?,
bottom_right_point[1].parse_finite_float()?,
];
if !(-90.0..=90.0).contains(&top_left[0]) {
return Err(
top_left_point[0].as_external_error(BadGeoError::Lat(top_left[0]))
)?;
}
if !(-180.0..=180.0).contains(&top_left[1]) {
return Err(
top_left_point[1].as_external_error(BadGeoError::Lng(top_left[1]))
)?;
}
if !(-90.0..=90.0).contains(&bottom_right[0]) {
return Err(bottom_right_point[0]
.as_external_error(BadGeoError::Lat(bottom_right[0])))?;
}
if !(-180.0..=180.0).contains(&bottom_right[1]) {
return Err(bottom_right_point[1]
.as_external_error(BadGeoError::Lng(bottom_right[1])))?;
}
if top_left[0] < bottom_right[0] {
return Err(bottom_right_point[1].as_external_error(
BadGeoError::BoundingBoxTopIsBelowBottom(top_left[0], bottom_right[0]),
))?;
}
// Instead of writing a custom `GeoBoundingBox` filter we're simply going to re-use the range
// filter to create the following filter;
// `_geo.lat {top_left[0]} TO {bottom_right[0]} AND _geo.lng {top_left[1]} TO {bottom_right[1]}`
// As we can see, we need to use a bunch of tokens that don't exist in the original filter,
// thus we're going to create tokens that point to a random span but contain our text.
let geo_lat_token =
Token::new(top_left_point[0].original_span(), Some("_geo.lat".to_string()));
let condition_lat = FilterCondition::Condition {
fid: geo_lat_token,
op: Condition::Between {
from: bottom_right_point[0].clone(),
to: top_left_point[0].clone(),
},
};
let selected_lat = Filter { condition: condition_lat }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?;
let geo_lng_token =
Token::new(top_left_point[1].original_span(), Some("_geo.lng".to_string()));
let selected_lng = if top_left[1] > bottom_right[1] {
// In this case the bounding box is wrapping around the earth (going from 180 to -180).
// We need to update the lng part of the filter from;
// `_geo.lng {top_left[1]} TO {bottom_right[1]}` to
// `_geo.lng {top_left[1]} TO 180 AND _geo.lng -180 TO {bottom_right[1]}`
let min_lng_token = Token::new(
top_left_point[1].original_span(),
Some("-180.0".to_string()),
);
let max_lng_token = Token::new(
top_left_point[1].original_span(),
Some("180.0".to_string()),
);
let condition_left = FilterCondition::Condition {
fid: geo_lng_token.clone(),
op: Condition::Between {
from: top_left_point[1].clone(),
to: max_lng_token,
},
};
let left = Filter { condition: condition_left }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?;
let condition_right = FilterCondition::Condition {
fid: geo_lng_token,
op: Condition::Between {
from: min_lng_token,
to: bottom_right_point[1].clone(),
},
};
let right = Filter { condition: condition_right }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?;
left | right
} else {
let condition_lng = FilterCondition::Condition {
fid: geo_lng_token,
op: Condition::Between {
from: top_left_point[1].clone(),
to: bottom_right_point[1].clone(),
},
};
Filter { condition: condition_lng }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?
};
Ok(selected_lat & selected_lng)
} else {
Err(top_left_point[0].as_external_error(FilterError::AttributeNotFilterable {
attribute: "_geo",
filterable_fields: filterable_fields.clone(),
}))?
}
}
}
}
}
@ -502,6 +660,12 @@ mod tests {
"Attribute `_geo` is not filterable. This index does not have configured filterable attributes."
));
let filter = Filter::from_str("_geoBoundingBox([42, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
"Attribute `_geo` is not filterable. This index does not have configured filterable attributes."
));
let filter = Filter::from_str("dog = \"bernese mountain\"").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
@ -524,6 +688,12 @@ mod tests {
"Attribute `_geo` is not filterable. Available filterable attributes are: `title`."
));
let filter = Filter::from_str("_geoBoundingBox([42, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
"Attribute `_geo` is not filterable. Available filterable attributes are: `title`."
));
let filter = Filter::from_str("name = 12").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
@ -675,6 +845,92 @@ mod tests {
));
}
#[test]
fn geo_bounding_box_error() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order
settings.set_filterable_fields(hashset! { S("_geo"), S("price") });
})
.unwrap();
let rtxn = index.read_txn().unwrap();
// geoboundingbox top left coord have a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([-90.0000001, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(
error.to_string().starts_with(
"Bad latitude `-90.0000001`. Latitude must be contained between -90 and 90 degrees."
),
"{}",
error.to_string()
);
// geoboundingbox top left coord have a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([90.0000001, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(
error.to_string().starts_with(
"Bad latitude `90.0000001`. Latitude must be contained between -90 and 90 degrees."
),
"{}",
error.to_string()
);
// geoboundingbox bottom right coord have a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [-90.0000001, 150])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad latitude `-90.0000001`. Latitude must be contained between -90 and 90 degrees."
));
// geoboundingbox bottom right coord have a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [90.0000001, 150])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad latitude `90.0000001`. Latitude must be contained between -90 and 90 degrees."
));
// geoboundingbox top left coord have a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([-10, 180.000001], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `180.000001`. Longitude must be contained between -180 and 180 degrees."
));
// geoboundingbox top left coord have a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([-10, -180.000001], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `-180.000001`. Longitude must be contained between -180 and 180 degrees."
));
// geoboundingbox bottom right coord have a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [-10, -180.000001])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `-180.000001`. Longitude must be contained between -180 and 180 degrees."
));
// geoboundingbox bottom right coord have a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [-10, 180.000001])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `180.000001`. Longitude must be contained between -180 and 180 degrees."
));
}
#[test]
fn filter_depth() {
// generates a big (2 MiB) filter with too much of ORs.

View File

@ -4,7 +4,7 @@ use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesDecode, RoTxn};
pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
pub use self::filter::Filter;
pub use self::filter::{BadGeoError, Filter};
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
mod facet_distribution;

View File

@ -324,7 +324,7 @@ impl fmt::Debug for Search<'_> {
}
}
#[derive(Default)]
#[derive(Default, Debug)]
pub struct SearchResult {
pub matching_words: MatchingWords,
pub candidates: RoaringBitmap,