Mirror of https://github.com/meilisearch/meilisearch.git (synced 2025-07-28 01:01:00 +00:00)
Merge branch 'main' into enhance-language-detection
@@ -7,45 +7,31 @@ use serde::{Deserialize, Serialize};
use thiserror::Error;

use crate::error::is_reserved_keyword;
use crate::search::facet::BadGeoError;
use crate::{CriterionError, Error, UserError};

/// This error type is never supposed to be shown to the end user.
/// You must always cast it to a sort error or a criterion error.
#[derive(Debug)]
#[derive(Error, Debug)]
pub enum AscDescError {
InvalidLatitude,
InvalidLongitude,
#[error(transparent)]
GeoError(BadGeoError),
#[error("Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{name}`.")]
InvalidSyntax { name: String },
#[error("`{name}` is a reserved keyword and thus can't be used as a asc/desc rule.")]
ReservedKeyword { name: String },
}

impl fmt::Display for AscDescError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Self::InvalidLatitude => {
write!(f, "Latitude must be contained between -90 and 90 degrees.",)
}
Self::InvalidLongitude => {
write!(f, "Longitude must be contained between -180 and 180 degrees.",)
}
Self::InvalidSyntax { name } => {
write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
}
Self::ReservedKeyword { name } => {
write!(
f,
"`{}` is a reserved keyword and thus can't be used as a asc/desc rule.",
name
)
}
}
impl From<BadGeoError> for AscDescError {
fn from(geo_error: BadGeoError) -> Self {
AscDescError::GeoError(geo_error)
}
}

impl From<AscDescError> for CriterionError {
fn from(error: AscDescError) -> Self {
match error {
AscDescError::InvalidLatitude | AscDescError::InvalidLongitude => {
AscDescError::GeoError(_) => {
CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() }
}
AscDescError::InvalidSyntax { name } => CriterionError::InvalidName { name },
@@ -55,6 +41,9 @@ impl From<AscDescError> for CriterionError {
AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => {
CriterionError::ReservedNameForFilter { name: "_geoRadius".to_string() }
}
AscDescError::ReservedKeyword { name } if name.starts_with("_geoBoundingBox") => {
CriterionError::ReservedNameForFilter { name: "_geoBoundingBox".to_string() }
}
AscDescError::ReservedKeyword { name } => CriterionError::ReservedName { name },
}
}
@@ -82,14 +71,17 @@ impl FromStr for Member {
.map_err(|_| AscDescError::ReservedKeyword { name: text.to_string() })
})?;
if !(-90.0..=90.0).contains(&lat) {
return Err(AscDescError::InvalidLatitude)?;
return Err(BadGeoError::Lat(lat))?;
} else if !(-180.0..=180.0).contains(&lng) {
return Err(AscDescError::InvalidLongitude)?;
return Err(BadGeoError::Lng(lng))?;
}
Ok(Member::Geo([lat, lng]))
}
None => {
if is_reserved_keyword(text) || text.starts_with("_geoRadius(") {
if is_reserved_keyword(text)
|| text.starts_with("_geoRadius(")
|| text.starts_with("_geoBoundingBox(")
{
return Err(AscDescError::ReservedKeyword { name: text.to_string() })?;
}
Ok(Member::Field(text.to_string()))
@@ -156,10 +148,8 @@ impl FromStr for AscDesc {

#[derive(Error, Debug)]
pub enum SortError {
#[error("{}", AscDescError::InvalidLatitude)]
InvalidLatitude,
#[error("{}", AscDescError::InvalidLongitude)]
InvalidLongitude,
#[error(transparent)]
ParseGeoError { error: BadGeoError },
#[error("Invalid syntax for the geo parameter: expected expression formated like \
`_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{name}`.")]
BadGeoPointUsage { name: String },
@@ -178,8 +168,7 @@ pub enum SortError {
impl From<AscDescError> for SortError {
fn from(error: AscDescError) -> Self {
match error {
AscDescError::InvalidLatitude => SortError::InvalidLatitude,
AscDescError::InvalidLongitude => SortError::InvalidLongitude,
AscDescError::GeoError(error) => SortError::ParseGeoError { error },
AscDescError::InvalidSyntax { name } => SortError::InvalidName { name },
AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => {
SortError::BadGeoPointUsage { name }
@@ -190,6 +179,9 @@ impl From<AscDescError> for SortError {
AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => {
SortError::ReservedNameForFilter { name: String::from("_geoRadius") }
}
AscDescError::ReservedKeyword { name } if name.starts_with("_geoBoundingBox") => {
SortError::ReservedNameForFilter { name: String::from("_geoBoundingBox") }
}
AscDescError::ReservedKeyword { name } => SortError::ReservedName { name },
}
}
@@ -268,11 +260,11 @@ mod tests {
),
("_geoPoint(35, 85, 75):asc", ReservedKeyword { name: S("_geoPoint(35, 85, 75)") }),
("_geoPoint(18):asc", ReservedKeyword { name: S("_geoPoint(18)") }),
("_geoPoint(200, 200):asc", InvalidLatitude),
("_geoPoint(90.000001, 0):asc", InvalidLatitude),
("_geoPoint(0, -180.000001):desc", InvalidLongitude),
("_geoPoint(159.256, 130):asc", InvalidLatitude),
("_geoPoint(12, -2021):desc", InvalidLongitude),
("_geoPoint(200, 200):asc", GeoError(BadGeoError::Lat(200.))),
("_geoPoint(90.000001, 0):asc", GeoError(BadGeoError::Lat(90.000001))),
("_geoPoint(0, -180.000001):desc", GeoError(BadGeoError::Lng(-180.000001))),
("_geoPoint(159.256, 130):asc", GeoError(BadGeoError::Lat(159.256))),
("_geoPoint(12, -2021):desc", GeoError(BadGeoError::Lng(-2021.))),
];

for (req, expected_error) in invalid_req {
@@ -159,6 +159,11 @@ mod tests {
("_geoPoint(42, 75):asc", ReservedNameForSort { name: S("_geoPoint") }),
("_geoRadius:asc", ReservedNameForFilter { name: S("_geoRadius") }),
("_geoRadius(42, 75, 59):asc", ReservedNameForFilter { name: S("_geoRadius") }),
("_geoBoundingBox:asc", ReservedNameForFilter { name: S("_geoBoundingBox") }),
(
"_geoBoundingBox([42, 75], [75, 59]):asc",
ReservedNameForFilter { name: S("_geoBoundingBox") },
),
];

for (input, expected) in invalid_criteria {
@@ -1,5 +1,6 @@
use std::collections::BTreeSet;
use std::convert::Infallible;
use std::fmt::Write;
use std::{io, str};

use heed::{Error as HeedError, MdbError};
@@ -11,7 +12,7 @@ use crate::documents::{self, DocumentsBatchCursorError};
use crate::{CriterionError, DocumentId, FieldId, Object, SortError};

pub fn is_reserved_keyword(keyword: &str) -> bool {
["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword)
["_geo", "_geoDistance", "_geoPoint", "_geoRadius", "_geoBoundingBox"].contains(&keyword)
}

#[derive(Error, Debug)]
@@ -100,10 +101,11 @@ A document identifier can be of type integer or string, \
only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", .document_id.to_string()
)]
InvalidDocumentId { document_id: Value },
#[error("Invalid facet distribution, the fields `{}` are not set as filterable.",
.invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
)]
InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
#[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))]
InvalidFacetsDistribution {
invalid_facets_name: BTreeSet<String>,
valid_facets_name: BTreeSet<String>,
},
#[error(transparent)]
InvalidGeoField(#[from] GeoError),
#[error("{0}")]
@@ -152,6 +154,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
pub enum GeoError {
#[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")]
NotAnObject { document_id: Value, value: Value },
#[error("The `_geo` field in the document with the id: `{document_id}` contains the following unexpected fields: `{value}`.")]
UnexpectedExtraFields { document_id: Value, value: Value },
#[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")]
MissingLatitudeAndLongitude { document_id: Value },
#[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")]
@@ -166,6 +170,50 @@ pub enum GeoError {
BadLongitude { document_id: Value, value: Value },
}

fn format_invalid_filter_distribution(
invalid_facets_name: &BTreeSet<String>,
valid_facets_name: &BTreeSet<String>,
) -> String {
if valid_facets_name.is_empty() {
return "this index does not have configured filterable attributes.".into();
}

let mut result = String::new();

match invalid_facets_name.len() {
0 => (),
1 => write!(
result,
"attribute `{}` is not filterable.",
invalid_facets_name.first().unwrap()
)
.unwrap(),
_ => write!(
result,
"attributes `{}` are not filterable.",
invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
)
.unwrap(),
};

match valid_facets_name.len() {
1 => write!(
result,
" The available filterable attribute is `{}`.",
valid_facets_name.first().unwrap()
)
.unwrap(),
_ => write!(
result,
" The available filterable attributes are `{}`.",
valid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
)
.unwrap(),
}

result
}
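
For illustration only (not part of the patch; the facet names below are made up), this is roughly what the new helper produces when called from the same module:

// Illustrative sketch: expected output of format_invalid_filter_distribution above.
use std::collections::BTreeSet;

fn example_messages() {
    let invalid: BTreeSet<String> = ["doggo".to_string()].into_iter().collect();
    let valid: BTreeSet<String> =
        ["price".to_string(), "title".to_string()].into_iter().collect();

    // -> "attribute `doggo` is not filterable. The available filterable attributes are `price, title`."
    println!("{}", format_invalid_filter_distribution(&invalid, &valid));

    // With no configured filterable attribute the helper short-circuits:
    // -> "this index does not have configured filterable attributes."
    println!("{}", format_invalid_filter_distribution(&invalid, &BTreeSet::new()));
}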

/// A little macro helper to autogenerate From implementation that needs two `Into`.
/// Given the following parameters: `error_from_sub_error!(FieldIdMapMissingEntry => InternalError)`
/// the macro will create the following code:
@@ -355,10 +355,10 @@ impl Index {
/* external documents ids */

/// Writes the external documents ids and internal ids (i.e. `u32`).
pub(crate) fn put_external_documents_ids<'a>(
pub(crate) fn put_external_documents_ids(
&self,
wtxn: &mut RwTxn,
external_documents_ids: &ExternalDocumentsIds<'a>,
external_documents_ids: &ExternalDocumentsIds<'_>,
) -> heed::Result<()> {
let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids;
let hard = hard.as_fst().as_bytes();
@@ -433,7 +433,7 @@ impl Index {
}

/// Returns the `rtree` which associates coordinates to documents ids.
pub fn geo_rtree<'t>(&self, rtxn: &'t RoTxn) -> Result<Option<RTree<GeoPoint>>> {
pub fn geo_rtree(&self, rtxn: &RoTxn) -> Result<Option<RTree<GeoPoint>>> {
match self
.main
.get::<_, Str, SerdeBincode<RTree<GeoPoint>>>(rtxn, main_key::GEO_RTREE_KEY)?
@@ -1245,7 +1245,7 @@ pub(crate) mod tests {
self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig,
IndexDocumentsMethod, IndexerConfig, Settings,
};
use crate::{db_snap, obkv_to_json, Index, Search, SearchResult};
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};

pub(crate) struct TempIndex {
pub inner: Index,
@@ -1543,6 +1543,108 @@ pub(crate) mod tests {
assert_eq!(user_defined, &["doggo", "name"]);
}

#[test]
fn test_basic_geo_bounding_box() {
let index = TempIndex::new();

index
.update_settings(|settings| {
settings.set_filterable_fields(hashset! { S("_geo") });
})
.unwrap();
index
.add_documents(documents!([
{ "id": 0, "_geo": { "lat": 0, "lng": 0 } },
{ "id": 1, "_geo": { "lat": 0, "lng": -175 } },
{ "id": 2, "_geo": { "lat": 0, "lng": 175 } },
{ "id": 3, "_geo": { "lat": 85, "lng": 0 } },
{ "id": 4, "_geo": { "lat": -85, "lng": 0 } },
]))
.unwrap();

// ensure we get the right real searchable fields + user defined searchable fields
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);

// exact match a document
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([0, 0], [0, 0])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[0]>");

// match a document in the middle of the rectangle
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([10, -10], [-10, 10])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[0]>");

// select everything
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([90, -180], [-90, 180])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[0, 1, 2, 3, 4]>");

// go on the edge of the longitude
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([0, 180], [0, -170])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[1]>");

// go on the other edge of the longitude
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([0, 170], [0, -180])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[2]>");

// wrap around the longitude
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([0, 170], [0, -170])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[1, 2]>");

// go on the edge of the latitude
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([90, 0], [80, 0])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[3]>");

// go on the edge of the latitude
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([-80, 0], [-90, 0])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[4]>");

// the requests that don't make sense

// try to wrap around the latitude
let error = search
.filter(Filter::from_str("_geoBoundingBox([-80, 0], [80, 0])").unwrap().unwrap())
.execute()
.unwrap_err();
insta::assert_display_snapshot!(error, @r###"
The top latitude `-80` is below the bottom latitude `80`.
32:33 _geoBoundingBox([-80, 0], [80, 0])
"###);

// send a top latitude lower than the bottow latitude
let error = search
.filter(Filter::from_str("_geoBoundingBox([-10, 0], [10, 0])").unwrap().unwrap())
.execute()
.unwrap_err();
insta::assert_display_snapshot!(error, @r###"
The top latitude `-10` is below the bottom latitude `10`.
32:33 _geoBoundingBox([-10, 0], [10, 0])
"###);
}

#[test]
fn replace_documents_external_ids_and_soft_deletion_check() {
use big_s::S;
@@ -2331,4 +2433,69 @@ pub(crate) mod tests {
assert!(all_ids.insert(id));
}
}

#[test]
fn bug_3007() {
// https://github.com/meilisearch/meilisearch/issues/3007

use crate::error::{GeoError, UserError};
let index = TempIndex::new();

// Given is an index with a geo field NOT contained in the sortable_fields of the settings
index
.update_settings(|settings| {
settings.set_primary_key("id".to_string());
settings.set_filterable_fields(HashSet::from(["_geo".to_string()]));
})
.unwrap();

// happy path
index.add_documents(documents!({ "id" : 5, "_geo": {"lat": 12.0, "lng": 11.0}})).unwrap();

db_snap!(index, geo_faceted_documents_ids);

// both are unparseable, we expect GeoError::BadLatitudeAndLongitude
let err1 = index
.add_documents(
documents!({ "id" : 6, "_geo": {"lat": "unparseable", "lng": "unparseable"}}),
)
.unwrap_err();
assert!(matches!(
err1,
Error::UserError(UserError::InvalidGeoField(GeoError::BadLatitudeAndLongitude { .. }))
));

db_snap!(index, geo_faceted_documents_ids); // ensure that no more document was inserted
}

#[test]
fn unexpected_extra_fields_in_geo_field() {
let index = TempIndex::new();

index
.update_settings(|settings| {
settings.set_primary_key("id".to_string());
settings.set_filterable_fields(HashSet::from(["_geo".to_string()]));
})
.unwrap();

let err = index
.add_documents(
documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best" }}),
)
.unwrap_err();
insta::assert_display_snapshot!(err, @r###"The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"doggo":"are the best"}`."###);

db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted

// multiple fields and complex values
let err = index
.add_documents(
documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best", "and": { "all": ["cats", { "are": "beautiful" } ] } } }),
)
.unwrap_err();
insta::assert_display_snapshot!(err, @r###"The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"and":{"all":["cats",{"are":"beautiful"}]},"doggo":"are the best"}`."###);

db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted
}
}
@@ -123,7 +123,7 @@ impl<'t> Criterion for Attribute<'t> {
None => {
return Ok(Some(CriterionResult {
query_tree: Some(query_tree),
candidates: Some(RoaringBitmap::new()),
candidates: Some(allowed_candidates),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));
@@ -182,15 +182,15 @@ impl<'t> Criterion for Proximity<'t> {
}
}

fn resolve_candidates<'t>(
ctx: &'t dyn Context,
fn resolve_candidates(
ctx: &dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
fn resolve_operation<'t>(
ctx: &'t dyn Context,
fn resolve_operation(
ctx: &dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
@@ -243,8 +243,8 @@ fn resolve_candidates<'t>(
Ok(result)
}

fn mdfs_pair<'t>(
ctx: &'t dyn Context,
fn mdfs_pair(
ctx: &dyn Context,
left: &Operation,
right: &Operation,
proximity: u8,
@@ -298,8 +298,8 @@ fn resolve_candidates<'t>(
Ok(output)
}

fn mdfs<'t>(
ctx: &'t dyn Context,
fn mdfs(
ctx: &dyn Context,
branches: &[Operation],
proximity: u8,
cache: &mut Cache,
@@ -239,15 +239,15 @@ fn alterate_query_tree(
Ok(query_tree)
}

fn resolve_candidates<'t>(
ctx: &'t dyn Context,
fn resolve_candidates(
ctx: &dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
fn resolve_operation<'t>(
ctx: &'t dyn Context,
fn resolve_operation(
ctx: &dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
@@ -276,8 +276,8 @@ fn resolve_candidates<'t>(
}
}

fn mdfs<'t>(
ctx: &'t dyn Context,
fn mdfs(
ctx: &dyn Context,
branches: &[Operation],
mana: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
@@ -291,6 +291,7 @@ impl<'a> FacetDistribution<'a> {
if !invalid_fields.is_empty() {
return Err(UserError::InvalidFacetsDistribution {
invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
valid_facets_name: filterable_fields.into_iter().collect(),
}
.into());
} else {
@@ -21,17 +21,51 @@ pub struct Filter<'a> {
condition: FilterCondition<'a>,
}

#[derive(Debug)]
pub enum BadGeoError {
Lat(f64),
Lng(f64),
BoundingBoxTopIsBelowBottom(f64, f64),
}

impl std::error::Error for BadGeoError {}

impl Display for BadGeoError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::BoundingBoxTopIsBelowBottom(top, bottom) => {
write!(f, "The top latitude `{top}` is below the bottom latitude `{bottom}`.")
}
Self::Lat(lat) => write!(
f,
"Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ",
lat
),
Self::Lng(lng) => write!(
f,
"Bad longitude `{}`. Longitude must be contained between -180 and 180 degrees. ",
lng
),
}
}
}

#[derive(Debug)]
enum FilterError<'a> {
AttributeNotFilterable { attribute: &'a str, filterable_fields: HashSet<String> },
BadGeo(&'a str),
BadGeoLat(f64),
BadGeoLng(f64),
ParseGeoError(BadGeoError),
ReservedGeo(&'a str),
Reserved(&'a str),
TooDeep,
}
impl<'a> std::error::Error for FilterError<'a> {}

impl<'a> From<BadGeoError> for FilterError<'a> {
fn from(geo_error: BadGeoError) -> Self {
FilterError::ParseGeoError(geo_error)
}
}

impl<'a> Display for FilterError<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
@@ -43,7 +77,11 @@ impl<'a> Display for FilterError<'a> {
attribute,
)
} else {
let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(" ");
let filterables_list = filterable_fields
.iter()
.map(AsRef::as_ref)
.collect::<Vec<&str>>()
.join(" ");

write!(
f,
@@ -52,19 +90,19 @@ impl<'a> Display for FilterError<'a> {
filterables_list,
)
}
},
Self::TooDeep => write!(f,
}
Self::TooDeep => write!(
f,
"Too many filter conditions, can't process more than {} filters.",
MAX_FILTER_DEPTH
),
Self::ReservedGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` field coordinates.", keyword),
Self::Reserved(keyword) => write!(
f,
"`{}` is a reserved keyword and thus can't be used as a filter expression.",
keyword
),
Self::BadGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.", keyword),
Self::BadGeoLat(lat) => write!(f, "Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ", lat),
Self::BadGeoLng(lng) => write!(f, "Bad longitude `{}`. Longitude must be contained between -180 and 180 degrees. ", lng),
Self::ParseGeoError(error) => write!(f, "{}", error),
}
}
}
@@ -294,12 +332,12 @@ impl<'a> Filter<'a> {
Ok(RoaringBitmap::new())
}
} else {
match fid.lexeme() {
match fid.value() {
attribute @ "_geo" => {
Err(fid.as_external_error(FilterError::BadGeo(attribute)))?
Err(fid.as_external_error(FilterError::ReservedGeo(attribute)))?
}
attribute if attribute.starts_with("_geoPoint(") => {
Err(fid.as_external_error(FilterError::BadGeo("_geoPoint")))?
Err(fid.as_external_error(FilterError::ReservedGeo("_geoPoint")))?
}
attribute @ "_geoDistance" => {
Err(fid.as_external_error(FilterError::Reserved(attribute)))?
@@ -351,14 +389,10 @@ impl<'a> Filter<'a> {
let base_point: [f64; 2] =
[point[0].parse_finite_float()?, point[1].parse_finite_float()?];
if !(-90.0..=90.0).contains(&base_point[0]) {
return Err(
point[0].as_external_error(FilterError::BadGeoLat(base_point[0]))
)?;
return Err(point[0].as_external_error(BadGeoError::Lat(base_point[0])))?;
}
if !(-180.0..=180.0).contains(&base_point[1]) {
return Err(
point[1].as_external_error(FilterError::BadGeoLng(base_point[1]))
)?;
return Err(point[1].as_external_error(BadGeoError::Lng(base_point[1])))?;
}
let radius = radius.parse_finite_float()?;
let rtree = match index.geo_rtree(rtxn)? {
@@ -385,6 +419,130 @@ impl<'a> Filter<'a> {
}))?
}
}
FilterCondition::GeoBoundingBox { top_left_point, bottom_right_point } => {
if filterable_fields.contains("_geo") {
let top_left: [f64; 2] = [
top_left_point[0].parse_finite_float()?,
top_left_point[1].parse_finite_float()?,
];
let bottom_right: [f64; 2] = [
bottom_right_point[0].parse_finite_float()?,
bottom_right_point[1].parse_finite_float()?,
];
if !(-90.0..=90.0).contains(&top_left[0]) {
return Err(
top_left_point[0].as_external_error(BadGeoError::Lat(top_left[0]))
)?;
}
if !(-180.0..=180.0).contains(&top_left[1]) {
return Err(
top_left_point[1].as_external_error(BadGeoError::Lng(top_left[1]))
)?;
}
if !(-90.0..=90.0).contains(&bottom_right[0]) {
return Err(bottom_right_point[0]
.as_external_error(BadGeoError::Lat(bottom_right[0])))?;
}
if !(-180.0..=180.0).contains(&bottom_right[1]) {
return Err(bottom_right_point[1]
.as_external_error(BadGeoError::Lng(bottom_right[1])))?;
}
if top_left[0] < bottom_right[0] {
return Err(bottom_right_point[1].as_external_error(
BadGeoError::BoundingBoxTopIsBelowBottom(top_left[0], bottom_right[0]),
))?;
}

// Instead of writing a custom `GeoBoundingBox` filter we're simply going to re-use the range
// filter to create the following filter;
// `_geo.lat {top_left[0]} TO {bottom_right[0]} AND _geo.lng {top_left[1]} TO {bottom_right[1]}`
// As we can see, we need to use a bunch of tokens that don't exist in the original filter,
// thus we're going to create tokens that point to a random span but contain our text.

let geo_lat_token =
Token::new(top_left_point[0].original_span(), Some("_geo.lat".to_string()));

let condition_lat = FilterCondition::Condition {
fid: geo_lat_token,
op: Condition::Between {
from: bottom_right_point[0].clone(),
to: top_left_point[0].clone(),
},
};

let selected_lat = Filter { condition: condition_lat }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?;

let geo_lng_token =
Token::new(top_left_point[1].original_span(), Some("_geo.lng".to_string()));
let selected_lng = if top_left[1] > bottom_right[1] {
// In this case the bounding box is wrapping around the earth (going from 180 to -180).
// We need to update the lng part of the filter from;
// `_geo.lng {top_left[1]} TO {bottom_right[1]}` to
// `_geo.lng {top_left[1]} TO 180 AND _geo.lng -180 TO {bottom_right[1]}`

let min_lng_token = Token::new(
top_left_point[1].original_span(),
Some("-180.0".to_string()),
);
let max_lng_token = Token::new(
top_left_point[1].original_span(),
Some("180.0".to_string()),
);

let condition_left = FilterCondition::Condition {
fid: geo_lng_token.clone(),
op: Condition::Between {
from: top_left_point[1].clone(),
to: max_lng_token,
},
};
let left = Filter { condition: condition_left }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?;

let condition_right = FilterCondition::Condition {
fid: geo_lng_token,
op: Condition::Between {
from: min_lng_token,
to: bottom_right_point[1].clone(),
},
};
let right = Filter { condition: condition_right }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?;

left | right
} else {
let condition_lng = FilterCondition::Condition {
fid: geo_lng_token,
op: Condition::Between {
from: top_left_point[1].clone(),
to: bottom_right_point[1].clone(),
},
};
Filter { condition: condition_lng }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?
};

Ok(selected_lat & selected_lng)
} else {
Err(top_left_point[0].as_external_error(FilterError::AttributeNotFilterable {
attribute: "_geo",
filterable_fields: filterable_fields.clone(),
}))?
}
}
}
}
}
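
To make the rewrite described in the comments above concrete, here is a rough, hypothetical sketch (not part of the patch; the real code builds `FilterCondition` values directly) of the filter a `_geoBoundingBox` is decomposed into:

// Illustrative only: the bounding box becomes a latitude range ANDed with a
// longitude range, and the longitude range is split in two when the box
// crosses the 180th meridian (mirroring the `left | right` union above).
fn bounding_box_as_filter_string(top_left: [f64; 2], bottom_right: [f64; 2]) -> String {
    let lat = format!("_geo.lat {} TO {}", bottom_right[0], top_left[0]);
    let lng = if top_left[1] > bottom_right[1] {
        // Wrap-around case, e.g. a box going from lng 170 to lng -170.
        format!("(_geo.lng {} TO 180 OR _geo.lng -180 TO {})", top_left[1], bottom_right[1])
    } else {
        format!("_geo.lng {} TO {}", top_left[1], bottom_right[1])
    };
    // e.g. `_geoBoundingBox([10, 170], [-10, -170])` maps to
    // `_geo.lat -10 TO 10 AND (_geo.lng 170 TO 180 OR _geo.lng -180 TO -170)`
    format!("{lat} AND {lng}")
}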
@@ -502,6 +660,12 @@ mod tests {
"Attribute `_geo` is not filterable. This index does not have configured filterable attributes."
));

let filter = Filter::from_str("_geoBoundingBox([42, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
"Attribute `_geo` is not filterable. This index does not have configured filterable attributes."
));

let filter = Filter::from_str("dog = \"bernese mountain\"").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
@@ -524,6 +688,12 @@ mod tests {
"Attribute `_geo` is not filterable. Available filterable attributes are: `title`."
));

let filter = Filter::from_str("_geoBoundingBox([42, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
"Attribute `_geo` is not filterable. Available filterable attributes are: `title`."
));

let filter = Filter::from_str("name = 12").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
@@ -675,6 +845,92 @@ mod tests {
));
}

#[test]
fn geo_bounding_box_error() {
let index = TempIndex::new();

index
.update_settings(|settings| {
settings.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order
settings.set_filterable_fields(hashset! { S("_geo"), S("price") });
})
.unwrap();

let rtxn = index.read_txn().unwrap();

// geoboundingbox top left coord have a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([-90.0000001, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(
error.to_string().starts_with(
"Bad latitude `-90.0000001`. Latitude must be contained between -90 and 90 degrees."
),
"{}",
error.to_string()
);

// geoboundingbox top left coord have a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([90.0000001, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(
error.to_string().starts_with(
"Bad latitude `90.0000001`. Latitude must be contained between -90 and 90 degrees."
),
"{}",
error.to_string()
);

// geoboundingbox bottom right coord have a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [-90.0000001, 150])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad latitude `-90.0000001`. Latitude must be contained between -90 and 90 degrees."
));

// geoboundingbox bottom right coord have a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [90.0000001, 150])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad latitude `90.0000001`. Latitude must be contained between -90 and 90 degrees."
));

// geoboundingbox top left coord have a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([-10, 180.000001], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `180.000001`. Longitude must be contained between -180 and 180 degrees."
));

// geoboundingbox top left coord have a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([-10, -180.000001], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `-180.000001`. Longitude must be contained between -180 and 180 degrees."
));

// geoboundingbox bottom right coord have a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [-10, -180.000001])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `-180.000001`. Longitude must be contained between -180 and 180 degrees."
));

// geoboundingbox bottom right coord have a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [-10, 180.000001])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `180.000001`. Longitude must be contained between -180 and 180 degrees."
));
}

#[test]
fn filter_depth() {
// generates a big (2 MiB) filter with too much of ORs.
@@ -4,7 +4,7 @@ use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesDecode, RoTxn};

pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
pub use self::filter::Filter;
pub use self::filter::{BadGeoError, Filter};
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
mod facet_distribution;
@@ -324,7 +324,7 @@ impl fmt::Debug for Search<'_> {
}
}

#[derive(Default)]
#[derive(Default, Debug)]
pub struct SearchResult {
pub matching_words: MatchingWords,
pub candidates: RoaringBitmap,
@@ -6,7 +6,7 @@ use roaring::RoaringBitmap;

use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index};
use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index};

#[track_caller]
pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) {
@@ -427,8 +427,26 @@ pub fn snap_settings(index: &Index) -> String {
snap
}

pub fn snap_documents(index: &Index) -> String {
let mut snap = String::new();
let rtxn = index.read_txn().unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let display = fields_ids_map.ids().collect::<Vec<_>>();

for document in index.all_documents(&rtxn).unwrap() {
let doc = obkv_to_json(&display, &fields_ids_map, document.unwrap().1).unwrap();
snap.push_str(&serde_json::to_string(&doc).unwrap());
snap.push('\n');
}

snap
}

#[macro_export]
macro_rules! full_snap_of_db {
($index:ident, documents) => {{
$crate::snapshot_tests::snap_documents(&$index)
}};
($index:ident, settings) => {{
$crate::snapshot_tests::snap_settings(&$index)
}};
@@ -0,0 +1,4 @@
---
source: milli/src/index.rs
---
[0, ]
@@ -0,0 +1,4 @@
---
source: milli/src/index.rs
---
[]
@@ -591,9 +591,9 @@ fn remove_from_word_docids(
Ok(())
}

fn remove_docids_from_field_id_docid_facet_value<'i, 'a>(
index: &'i Index,
wtxn: &'a mut heed::RwTxn,
fn remove_docids_from_field_id_docid_facet_value(
index: &Index,
wtxn: &mut heed::RwTxn,
facet_type: FacetType,
field_id: FieldId,
to_remove: &RoaringBitmap,
@@ -157,9 +157,9 @@ impl FacetsUpdateIncrementalInner {
///
/// ## Return
/// See documentation of `insert_in_level`
fn insert_in_level_0<'t>(
fn insert_in_level_0(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
@@ -211,9 +211,9 @@ impl FacetsUpdateIncrementalInner {
/// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted
/// in the addition of a new key in that level, and that therefore the number of children
/// of the parent node should be incremented.
fn insert_in_level<'t>(
fn insert_in_level(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
level: u8,
facet_value: &[u8],
@@ -348,9 +348,9 @@ impl FacetsUpdateIncrementalInner {
}

/// Insert the given facet value and corresponding document ids in the database.
pub fn insert<'t>(
pub fn insert(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
@@ -470,9 +470,9 @@ impl FacetsUpdateIncrementalInner {
/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4).
/// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust
/// its left bound as well.
fn delete_in_level<'t>(
fn delete_in_level(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
level: u8,
facet_value: &[u8],
@@ -529,9 +529,9 @@ impl FacetsUpdateIncrementalInner {
}
}

fn delete_in_level_0<'t>(
fn delete_in_level_0(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
@@ -557,9 +557,9 @@ impl FacetsUpdateIncrementalInner {
}
}

pub fn delete<'t>(
pub fn delete(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
@@ -98,7 +98,12 @@ pub fn enrich_documents_batch<R: Read + Seek>(
// If the settings specifies that a _geo field must be used therefore we must check the
// validity of it in all the documents of this batch and this is when we return `Some`.
let geo_field_id = match documents_batch_index.id("_geo") {
Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id),
Some(geo_field_id)
if index.sortable_fields(rtxn)?.contains("_geo")
|| index.filterable_fields(rtxn)?.contains("_geo") =>
{
Some(geo_field_id)
}
_otherwise => None,
};

@@ -367,11 +372,17 @@ pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {

pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
use GeoError::*;
let debug_id = || Value::from(id.debug());
let debug_id = || {
serde_json::from_slice(id.value().as_bytes()).unwrap_or_else(|_| Value::from(id.debug()))
};
match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) {
(Some(lat), Some(lng)) => {
match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) {
(Ok(_), Ok(_)) if !object.is_empty() => Ok(Err(UnexpectedExtraFields {
document_id: debug_id(),
value: object.into(),
})),
(Ok(_), Ok(_)) => Ok(Ok(())),
(Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })),
(Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })),
@@ -384,6 +395,7 @@ pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult
(Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })),
(None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })),
},
Value::Null => Ok(Ok(())),
value => Ok(Err(NotAnObject { document_id: debug_id(), value })),
}
}
@@ -59,6 +59,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
} else if lat.is_some() && lng.is_none() {
return Err(GeoError::MissingLongitude { document_id: document_id() })?;
}
// else => the _geo object was `null`, there is nothing to do
}

writer_into_reader(writer)
@@ -1,6 +1,6 @@
use std::borrow::Cow;
use std::fs::File;
use std::io::{self, Seek, SeekFrom};
use std::io::{self, Seek};
use std::time::Instant;

use grenad::{CompressionType, Sorter};
@@ -66,7 +66,7 @@ pub fn sorter_into_reader(

pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
let mut file = writer.into_inner()?;
file.seek(SeekFrom::Start(0))?;
file.rewind()?;
grenad::Reader::new(file).map_err(Into::into)
}

@@ -6,6 +6,7 @@ use roaring::RoaringBitmap;

use super::read_u32_ne_bytes;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::index_documents::transform::Operation;
use crate::Result;

pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
@@ -57,21 +58,6 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<
Ok(obkvs.last().unwrap().clone())
}

/// Merge all the obks in the order we see them.
pub fn merge_obkvs<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
Ok(obkvs
.iter()
.cloned()
.reduce(|acc, current| {
let first = obkv::KvReader::new(&acc);
let second = obkv::KvReader::new(&current);
let mut buffer = Vec::new();
merge_two_obkvs(first, second, &mut buffer);
Cow::from(buffer)
})
.unwrap())
}

pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
@@ -88,6 +74,41 @@ pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffe
writer.finish().unwrap();
}

/// Merge all the obks in the order we see them.
pub fn merge_obkvs_and_operations<'a>(
_key: &[u8],
obkvs: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
// [add, add, delete, add, add]
// we can ignore everything that happened before the last delete.
let starting_position =
obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0);

// [add, add, delete]
// if the last operation was a deletion then we simply return the deletion
if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8
{
return Ok(obkvs[obkvs.len() - 1].clone());
}
let mut buffer = Vec::new();

// (add, add, delete) [add, add]
// in the other case, no deletion will be encountered during the merge
let mut ret =
obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| {
let first = obkv::KvReader::new(&acc);
let second = obkv::KvReader::new(&current[1..]);
merge_two_obkvs(first, second, &mut buffer);

// we want the result of the merge into our accumulator
std::mem::swap(&mut acc, &mut buffer);
acc
});

ret.insert(0, Operation::Addition as u8);
Ok(Cow::from(ret))
}
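
As a rough sketch of the convention `merge_obkvs_and_operations` relies on (illustrative only; `Operation` is the add/delete marker imported from `transform` above, and this helper is hypothetical), each entry handed to the merge function carries a one-byte operation tag in front of the serialized obkv:

// Illustrative only: how an entry fed to merge_obkvs_and_operations is laid out.
fn tag_obkv(operation: Operation, obkv_bytes: &[u8]) -> Vec<u8> {
    let mut entry = Vec::with_capacity(1 + obkv_bytes.len());
    entry.push(operation as u8);          // read back as `obkv[0]` in the merge above
    entry.extend_from_slice(obkv_bytes);  // read back as `&current[1..]`
    entry
}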
|
||||
|
||||
pub fn merge_cbo_roaring_bitmaps<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
|
@ -13,9 +13,9 @@ pub use grenad_helpers::{
|
||||
GrenadParameters, MergeableReader,
|
||||
};
|
||||
pub use merge_functions::{
|
||||
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs,
|
||||
merge_roaring_bitmaps, merge_two_obkvs, roaring_bitmap_from_u32s_array,
|
||||
serialize_roaring_bitmap, MergeFn,
|
||||
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
|
||||
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
|
||||
roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn,
|
||||
};
|
||||
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
|
@ -79,6 +79,7 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a, FP, FA> {
|
||||
progress: FP,
|
||||
should_abort: FA,
|
||||
added_documents: u64,
|
||||
deleted_documents: u64,
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, Clone)]
|
||||
@ -122,6 +123,7 @@ where
|
||||
wtxn,
|
||||
index,
|
||||
added_documents: 0,
|
||||
deleted_documents: 0,
|
||||
})
|
||||
}
|
||||
|
||||
@ -166,6 +168,30 @@ where
|
||||
Ok((self, Ok(indexed_documents)))
|
||||
}
|
||||
|
||||
/// Remove a batch of documents from the current builder.
|
||||
///
|
||||
/// Returns the number of documents deleted from the builder.
|
||||
pub fn remove_documents(
|
||||
mut self,
|
||||
to_delete: Vec<String>,
|
||||
) -> Result<(Self, StdResult<u64, UserError>)> {
|
||||
// Early return when there is no document to add
|
||||
if to_delete.is_empty() {
|
||||
return Ok((self, Ok(0)));
|
||||
}
|
||||
|
||||
let deleted_documents = self
|
||||
.transform
|
||||
.as_mut()
|
||||
.expect("Invalid document deletion state")
|
||||
.remove_documents(to_delete, self.wtxn, &self.should_abort)?
|
||||
as u64;
|
||||
|
||||
self.deleted_documents += deleted_documents;
|
||||
|
||||
Ok((self, Ok(deleted_documents)))
|
||||
}
|
||||
|
||||
#[logging_timer::time("IndexDocuments::{}")]
|
||||
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
|
||||
if self.added_documents == 0 {
|
||||
@ -965,34 +991,6 @@ mod tests {
|
||||
.unwrap();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn index_all_flavour_of_geo() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
|
||||
|
||||
index
|
||||
.update_settings(|settings| {
|
||||
settings.set_filterable_fields(hashset!(S("_geo")));
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
index
|
||||
.add_documents(documents!([
|
||||
{ "id": 0, "_geo": { "lat": 31, "lng": [42] } },
|
||||
{ "id": 1, "_geo": { "lat": "31" }, "_geo.lng": 42 },
|
||||
{ "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" },
|
||||
{ "id": 3, "_geo.lat": 31, "_geo.lng": "42" },
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
let rtxn = index.read_txn().unwrap();
|
||||
|
||||
let mut search = crate::Search::new(&rtxn, &index);
|
||||
search.filter(crate::Filter::from_str("_geoRadius(31, 42, 0.000001)").unwrap().unwrap());
|
||||
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
|
||||
assert_eq!(documents_ids, vec![0, 1, 2, 3]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn geo_error() {
|
||||
let mut index = TempIndex::new();
|
||||
@ -1934,4 +1932,328 @@ mod tests {
|
||||
let expected_cj_cmn_docids = [1, 5].iter().collect();
|
||||
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_and_delete_documents_in_single_transform() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let builder = IndexDocuments::new(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&index.indexer_config,
|
||||
index.index_documents_config.clone(),
|
||||
|_| (),
|
||||
|| false,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let documents = documents!([
|
||||
{ "id": 1, "doggo": "kevin" },
|
||||
{ "id": 2, "doggo": { "name": "bob", "age": 20 } },
|
||||
{ "id": 3, "name": "jean", "age": 25 },
|
||||
]);
|
||||
let (builder, added) = builder.add_documents(documents).unwrap();
|
||||
insta::assert_display_snapshot!(added.unwrap(), @"3");
|
||||
|
||||
let (builder, removed) = builder.remove_documents(vec![S("2")]).unwrap();
|
||||
insta::assert_display_snapshot!(removed.unwrap(), @"1");
|
||||
|
||||
let addition = builder.execute().unwrap();
|
||||
insta::assert_debug_snapshot!(addition, @r###"
|
||||
DocumentAdditionResult {
|
||||
indexed_documents: 3,
|
||||
number_of_documents: 2,
|
||||
}
|
||||
"###);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents, @r###"
|
||||
{"id":1,"doggo":"kevin"}
|
||||
{"id":3,"name":"jean","age":25}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_update_and_delete_documents_in_single_transform() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let builder = IndexDocuments::new(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&index.indexer_config,
|
||||
index.index_documents_config.clone(),
|
||||
|_| (),
|
||||
|| false,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let documents = documents!([
|
||||
{ "id": 1, "doggo": "kevin" },
|
||||
{ "id": 2, "doggo": { "name": "bob", "age": 20 } },
|
||||
{ "id": 3, "name": "jean", "age": 25 },
|
||||
]);
|
||||
let (builder, added) = builder.add_documents(documents).unwrap();
|
||||
insta::assert_display_snapshot!(added.unwrap(), @"3");
|
||||
|
||||
let documents = documents!([
|
||||
{ "id": 2, "catto": "jorts" },
|
||||
{ "id": 3, "legs": 4 },
|
||||
]);
|
||||
let (builder, added) = builder.add_documents(documents).unwrap();
|
||||
insta::assert_display_snapshot!(added.unwrap(), @"2");
|
||||
|
||||
let (builder, removed) = builder.remove_documents(vec![S("1"), S("2")]).unwrap();
|
||||
insta::assert_display_snapshot!(removed.unwrap(), @"2");
|
||||
|
||||
let addition = builder.execute().unwrap();
|
||||
insta::assert_debug_snapshot!(addition, @r###"
|
||||
DocumentAdditionResult {
|
||||
indexed_documents: 5,
|
||||
number_of_documents: 1,
|
||||
}
|
||||
"###);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents, @r###"
|
||||
{"id":3,"name":"jean","age":25,"legs":4}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn add_document_and_in_another_transform_update_and_delete_documents() {
|
||||
let mut index = TempIndex::new();
|
||||
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let builder = IndexDocuments::new(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&index.indexer_config,
|
||||
index.index_documents_config.clone(),
|
||||
|_| (),
|
||||
|| false,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let documents = documents!([
|
||||
{ "id": 1, "doggo": "kevin" },
|
||||
{ "id": 2, "doggo": { "name": "bob", "age": 20 } },
|
||||
{ "id": 3, "name": "jean", "age": 25 },
|
||||
]);
|
||||
let (builder, added) = builder.add_documents(documents).unwrap();
|
||||
insta::assert_display_snapshot!(added.unwrap(), @"3");
|
||||
|
||||
let addition = builder.execute().unwrap();
|
||||
insta::assert_debug_snapshot!(addition, @r###"
|
||||
DocumentAdditionResult {
|
||||
indexed_documents: 3,
|
||||
number_of_documents: 3,
|
||||
}
|
||||
"###);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents, @r###"
|
||||
{"id":1,"doggo":"kevin"}
|
||||
{"id":2,"doggo":{"name":"bob","age":20}}
|
||||
{"id":3,"name":"jean","age":25}
|
||||
"###);
|
||||
|
||||
// A first batch of documents has been inserted
|
||||
|
||||
let mut wtxn = index.write_txn().unwrap();
|
||||
let builder = IndexDocuments::new(
|
||||
&mut wtxn,
|
||||
&index,
|
||||
&index.indexer_config,
|
||||
index.index_documents_config.clone(),
|
||||
|_| (),
|
||||
|| false,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let documents = documents!([
|
||||
{ "id": 2, "catto": "jorts" },
|
||||
{ "id": 3, "legs": 4 },
|
||||
]);
|
||||
let (builder, added) = builder.add_documents(documents).unwrap();
|
||||
insta::assert_display_snapshot!(added.unwrap(), @"2");
|
||||
|
||||
let (builder, removed) = builder.remove_documents(vec![S("1"), S("2")]).unwrap();
|
||||
insta::assert_display_snapshot!(removed.unwrap(), @"2");
|
||||
|
||||
let addition = builder.execute().unwrap();
|
||||
insta::assert_debug_snapshot!(addition, @r###"
|
||||
DocumentAdditionResult {
|
||||
indexed_documents: 2,
|
||||
number_of_documents: 1,
|
||||
}
|
||||
"###);
|
||||
wtxn.commit().unwrap();
|
||||
|
||||
db_snap!(index, documents, @r###"
|
||||
{"id":3,"name":"jean","age":25,"legs":4}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
fn delete_document_and_then_add_documents_in_the_same_transform() {
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();

let (builder, removed) = builder.remove_documents(vec![S("1"), S("2")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"0");

let documents = documents!([
{ "id": 2, "doggo": { "name": "jean", "age": 20 } },
{ "id": 3, "name": "bob", "age": 25 },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"2");

let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 2,
number_of_documents: 2,
}
"###);
wtxn.commit().unwrap();

db_snap!(index, documents, @r###"
{"id":2,"doggo":{"name":"jean","age":20}}
{"id":3,"name":"bob","age":25}
"###);
}

#[test]
fn delete_the_same_document_multiple_time() {
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();

let (builder, removed) =
builder.remove_documents(vec![S("1"), S("2"), S("1"), S("2")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"0");

let documents = documents!([
{ "id": 1, "doggo": "kevin" },
{ "id": 2, "doggo": { "name": "jean", "age": 20 } },
{ "id": 3, "name": "bob", "age": 25 },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"3");

let (builder, removed) =
builder.remove_documents(vec![S("1"), S("2"), S("1"), S("2")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"2");

let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 3,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();

db_snap!(index, documents, @r###"
{"id":3,"name":"bob","age":25}
"###);
}

#[test]
fn add_document_and_in_another_transform_delete_the_document_then_add_it_again() {
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;

let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();

let documents = documents!([
{ "id": 1, "doggo": "kevin" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");

let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();

db_snap!(index, documents, @r###"
{"id":1,"doggo":"kevin"}
"###);

// A first batch of documents has been inserted

let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();

let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"1");

let documents = documents!([
{ "id": 1, "catto": "jorts" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");

let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();

db_snap!(index, documents, @r###"
{"id":1,"catto":"jorts"}
"###);
}
}

@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::io::{Read, Seek};

use fxhash::FxHashMap;
use heed::RoTxn;
@ -12,7 +12,9 @@ use roaring::RoaringBitmap;
use serde_json::Value;
use smartstring::SmartString;

use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
use super::helpers::{
create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn,
};
use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
@ -50,8 +52,12 @@ pub struct Transform<'a, 'i> {
pub index_documents_method: IndexDocumentsMethod,
available_documents_ids: AvailableDocumentsIds,

// Both grenads follow the same format:
// key | value
// u32 | 1 byte for the Operation byte, the rest is the obkv of the document stored
original_sorter: grenad::Sorter<MergeFn>,
flattened_sorter: grenad::Sorter<MergeFn>,

replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap,
// To increase the cache locality and decrease the heap usage we use compact smartstring.
@ -59,6 +65,14 @@ pub struct Transform<'a, 'i> {
documents_count: usize,
}

/// This enum is specific to the grenad sorter stored in the transform.
/// It's used as the first byte of the grenads and tells you whether the document was an addition or a deletion.
#[repr(u8)]
pub enum Operation {
Addition,
Deletion,
}
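As a reading aid rather than part of the change: the value layout described on the struct above — one `Operation` byte followed by the document's raw obkv — could be sketched with two small helpers. `encode_entry` and `decode_entry` are hypothetical names used only for illustration, not functions from this codebase.

// Minimal sketch of the sorter value layout: `[Operation byte][obkv bytes...]`.
fn encode_entry(op: Operation, obkv: &[u8]) -> Vec<u8> {
    let mut value = Vec::with_capacity(1 + obkv.len());
    value.push(op as u8); // first byte: Addition or Deletion
    value.extend_from_slice(obkv); // the rest: the document's obkv, untouched
    value
}

fn decode_entry(value: &[u8]) -> (u8, &[u8]) {
    // mirrors the reads done later in the diff: `value[0]` is the Operation byte,
    // `&value[1..]` is the stored obkv
    (value[0], &value[1..])
}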

/// Create a mapping between the field ids found in the document batch and the ones that were
/// already present in the index.
///
@ -94,7 +108,7 @@ impl<'a, 'i> Transform<'a, 'i> {
// with the same user id must be merged or fully replaced in the same batch.
let merge_function = match index_documents_method {
IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv,
IndexDocumentsMethod::UpdateDocuments => merge_obkvs,
IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations,
};

// We initialize the sorter with the user indexing settings.
@ -151,9 +165,7 @@ impl<'a, 'i> Transform<'a, 'i> {
FA: Fn() -> bool + Sync,
{
let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();

let external_documents_ids = self.index.external_documents_ids(wtxn)?;

let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;

let primary_key = cursor.primary_key().to_string();
@ -161,6 +173,7 @@ impl<'a, 'i> Transform<'a, 'i> {
self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;

let mut obkv_buffer = Vec::new();
let mut document_sorter_buffer = Vec::new();
let mut documents_count = 0;
let mut docid_buffer: Vec<u8> = Vec::new();
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
@ -212,10 +225,13 @@ impl<'a, 'i> Transform<'a, 'i> {
Entry::Occupied(entry) => *entry.get() as u32,
Entry::Vacant(entry) => {
// If the document was already in the db we mark it as a replaced document.
// It'll be deleted later. We keep its original docid to insert it in the grenad.
// It'll be deleted later.
if let Some(docid) = external_documents_ids.get(entry.key()) {
self.replaced_documents_ids.insert(docid);
original_docid = Some(docid);
// If it was already in the list of replaced documents it means it was deleted
// by the remove_documents method. We should start as if it never existed.
if self.replaced_documents_ids.insert(docid) {
original_docid = Some(docid);
}
}
let docid = self
.available_documents_ids
@ -248,26 +264,46 @@ impl<'a, 'i> Transform<'a, 'i> {
skip_insertion = true;
} else {
// we associate the base document with the new key, everything will get merged later.
self.original_sorter.insert(docid.to_be_bytes(), base_obkv)?;
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(base_obkv);
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
Some(buffer) => {
self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?
Some(flattened_obkv) => {
// we recreate our buffer with the flattened documents
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&flattened_obkv);
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?,
None => self
.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
}
}
}

if !skip_insertion {
self.new_documents_ids.insert(docid);

document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&obkv_buffer);
// We use the extracted/generated user id as the key for this document.
self.original_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?;
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;

match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
None => {
self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?
Some(flattened_obkv) => {
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&flattened_obkv);
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
None => self
.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
}
}
documents_count += 1;
@ -293,6 +329,73 @@ impl<'a, 'i> Transform<'a, 'i> {
Ok(documents_count)
}

/// The counterpart of `read_documents` that removes documents either from the transform or the database.
/// It can be called before, after, or in between two calls to `read_documents`.
///
/// It needs to update all the internal data structures in the transform.
/// - If the document is coming from the database -> it's marked as a to_delete document
/// - If the document to remove was inserted by the `read_documents` method before AND was present in the db,
/// it's marked as `to_delete` + added into the grenad to ensure we don't reinsert it.
/// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db,
/// it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids.
/// - If the document to remove was not present in either the db or the transform we do nothing
/// (these rules are restated in the small sketch after this method).
pub fn remove_documents<FA>(
&mut self,
mut to_remove: Vec<String>,
wtxn: &mut heed::RwTxn,
should_abort: FA,
) -> Result<usize>
where
FA: Fn() -> bool + Sync,
{
// there may be duplicates in the documents to remove.
to_remove.sort_unstable();
to_remove.dedup();

let external_documents_ids = self.index.external_documents_ids(wtxn)?;

let mut documents_deleted = 0;
for to_remove in to_remove {
if should_abort() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}

match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
// if the document was added in a previous iteration of the transform we mark it as deleted in the sorters.
Entry::Occupied(entry) => {
let doc_id = *entry.get() as u32;
self.original_sorter
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
self.flattened_sorter
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;

// we must NOT update the list of replaced_documents_ids
// Either:
// 1. It's already in it and there is nothing to do
// 2. It wasn't in it because the document was created by a previous batch and since
// we're removing it there is nothing to do.
self.new_documents_ids.remove(doc_id);
entry.remove_entry();
}
Entry::Vacant(entry) => {
// If the document was already in the db we mark it as a `to_delete` document.
// It'll be deleted later. We don't need to push anything to the sorters.
if let Some(docid) = external_documents_ids.get(entry.key()) {
self.replaced_documents_ids.insert(docid);
} else {
// if the document is nowhere to be found, there is nothing to do and we must NOT
// increment the count of documents_deleted
continue;
}
}
};

documents_deleted += 1;
}

Ok(documents_deleted)
}
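The sketch below is a hedged restatement of the four cases documented on `remove_documents` above, not part of the diff; `RemovalOutcome` and `removal_outcome` are hypothetical names used purely for illustration.

// Hypothetical decision table mirroring the doc comment on `remove_documents`.
#[derive(Debug, PartialEq)]
enum RemovalOutcome {
    // only in the db: flag it as a `to_delete`/replaced document
    MarkToDelete,
    // in the transform and in the db: already flagged, plus a Deletion entry so it isn't reinserted
    MarkAndTombstone,
    // only in the transform: write a Deletion entry and forget its freshly assigned docid
    TombstoneOnly,
    // unknown everywhere: not even counted in `documents_deleted`
    Nothing,
}

fn removal_outcome(in_transform: bool, in_db: bool) -> RemovalOutcome {
    match (in_transform, in_db) {
        (true, true) => RemovalOutcome::MarkAndTombstone,
        (true, false) => RemovalOutcome::TombstoneOnly,
        (false, true) => RemovalOutcome::MarkToDelete,
        (false, false) => RemovalOutcome::Nothing,
    }
}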

// Flatten a document from the fields ids map contained in self and insert the newly
// created fields. Returns `None` if the document doesn't need to be flattened.
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
@ -487,6 +590,11 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut documents_count = 0;

while let Some((key, val)) = iter.next()? {
if val[0] == Operation::Deletion as u8 {
continue;
}
let val = &val[1..];

// send a callback to show at which step we are
documents_count += 1;
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
@ -510,7 +618,7 @@ impl<'a, 'i> Transform<'a, 'i> {

let mut original_documents = writer.into_inner()?;
// We then extract the file and reset the seek to be able to read it again.
original_documents.seek(SeekFrom::Start(0))?;
original_documents.rewind()?;

// We create a final writer to write the new documents in order from the sorter.
let mut writer = create_writer(
@ -518,11 +626,20 @@ impl<'a, 'i> Transform<'a, 'i> {
self.indexer_settings.chunk_compression_level,
tempfile::tempfile()?,
);
// Once we have written all the documents into the final sorter, we write the documents
// into this writer, extract the file and reset the seek to be able to read it again.
self.flattened_sorter.write_into_stream_writer(&mut writer)?;

// Once we have written all the documents into the final sorter, we write the nested documents
// into this writer.
// We get rid of the `Operation` byte and skip the deleted documents as well.
let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
while let Some((key, val)) = iter.next()? {
if val[0] == Operation::Deletion as u8 {
continue;
}
let val = &val[1..];
writer.insert(key, val)?;
}
let mut flattened_documents = writer.into_inner()?;
flattened_documents.seek(SeekFrom::Start(0))?;
flattened_documents.rewind()?;

let mut new_external_documents_ids_builder: Vec<_> =
self.new_external_documents_ids_builder.into_iter().collect();
@ -650,10 +767,10 @@ impl<'a, 'i> Transform<'a, 'i> {
// Once we have written all the documents, we extract
// the file and reset the seek to be able to read it again.
let mut original_documents = original_writer.into_inner()?;
original_documents.seek(SeekFrom::Start(0))?;
original_documents.rewind()?;

let mut flattened_documents = flattened_writer.into_inner()?;
flattened_documents.seek(SeekFrom::Start(0))?;
flattened_documents.rewind()?;

let output = TransformOutput {
primary_key,
@ -701,3 +818,45 @@ impl TransformOutput {
.collect())
}
}

#[cfg(test)]
mod test {
use super::*;

#[test]
fn merge_obkvs() {
let mut doc_0 = Vec::new();
let mut kv_writer = KvWriter::new(&mut doc_0);
kv_writer.insert(0_u8, [0]).unwrap();
kv_writer.finish().unwrap();
doc_0.insert(0, Operation::Addition as u8);

let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap();
assert_eq!(*ret, doc_0);

let ret = merge_obkvs_and_operations(
&[],
&[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, doc_0);

let ret = merge_obkvs_and_operations(
&[],
&[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())],
)
.unwrap();
assert_eq!(*ret, [Operation::Deletion as u8]);

let ret = merge_obkvs_and_operations(
&[],
&[
Cow::from([Operation::Addition as u8, 1].as_slice()),
Cow::from([Operation::Deletion as u8].as_slice()),
Cow::from(doc_0.as_slice()),
],
)
.unwrap();
assert_eq!(*ret, doc_0);
}
}

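The assertions in the test above all follow one rule: the newest operation wins, and a deletion erases everything older than it. A rough model of that behaviour is sketched below; it is toy code only — the real `merge_obkvs_and_operations` also merges the obkv fields of the remaining additions — and `merge_ops_toy` is a hypothetical name.

// Toy model: each value is `[op_byte, payload...]`, ordered from oldest to newest.
fn merge_ops_toy(values: &[Vec<u8>]) -> Vec<u8> {
    // keep only the entries that are newer than the last Deletion marker
    let start = values
        .iter()
        .rposition(|v| v[0] == Operation::Deletion as u8)
        .map_or(0, |i| i + 1);
    match values[start..].last() {
        // the real merge combines these additions field by field; the toy keeps the newest one
        Some(newest_addition) => newest_addition.clone(),
        // nothing newer than the deletion: the deletion itself is the result
        None => vec![Operation::Deletion as u8],
    }
}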
@ -2,7 +2,7 @@ use std::collections::{BTreeSet, HashMap, HashSet};
use std::result::Result as StdResult;

use charabia::{Tokenizer, TokenizerBuilder};
use deserr::{DeserializeError, DeserializeFromValue};
use deserr::{DeserializeError, Deserr};
use itertools::Itertools;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime;
@ -23,9 +23,9 @@ pub enum Setting<T> {
NotSet,
}

impl<T, E> DeserializeFromValue<E> for Setting<T>
impl<T, E> Deserr<E> for Setting<T>
where
T: DeserializeFromValue<E>,
T: Deserr<E>,
E: DeserializeError,
{
fn deserialize_from_value<V: deserr::IntoValue>(
@ -37,9 +37,6 @@ where
_ => T::deserialize_from_value(value, location).map(Setting::Set),
}
}
fn default() -> Option<Self> {
Some(Self::NotSet)
}
}

impl<T> Default for Setting<T> {

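For context (my summary, not part of the change): `Setting<T>` is the three-state wrapper used for settings payloads, and the `default()` added above is what makes a field that is absent from the payload come out as `NotSet`. A small hedged sketch of how the three states are usually acted on — it assumes the customary `Set`/`Reset` variants alongside the `NotSet` shown in the hunk:

// Illustrative only: how a consumer typically reacts to each state of a settings field.
fn describe_setting<T: std::fmt::Debug>(field: &Setting<T>) -> String {
    match field {
        // an explicit value was provided: apply it
        Setting::Set(value) => format!("update the setting to {value:?}"),
        // an explicit `null` was provided: go back to the default value
        Setting::Reset => "reset the setting to its default".to_string(),
        // the field was absent from the payload: leave the setting untouched
        Setting::NotSet => "keep the current value".to_string(),
    }
}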
@ -140,16 +140,20 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {

// We remove all the entries that are no longer required in this word prefix position
// docids database.
let mut iter =
self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data();
while let Some(((prefix, _), _)) = iter.next().transpose()? {
if del_prefix_fst_words.contains(prefix.as_bytes()) {
unsafe { iter.del_current()? };
// We also avoid iterating over the whole `word_prefix_position_docids` database if we know in
// advance that the `if del_prefix_fst_words.contains(prefix.as_bytes()) {` condition below
// will always be false (i.e. if `del_prefix_fst_words` is empty).
if !del_prefix_fst_words.is_empty() {
let mut iter =
self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data();
while let Some(((prefix, _), _)) = iter.next().transpose()? {
if del_prefix_fst_words.contains(prefix.as_bytes()) {
unsafe { iter.del_current()? };
}
}
drop(iter);
}

drop(iter);

// We finally write all the word prefix position docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,