Merge branch 'main' into enhance-language-detection

ManyTheFish
2023-02-20 18:14:34 +01:00
202 changed files with 10490 additions and 3066 deletions

View File

@ -7,45 +7,31 @@ use serde::{Deserialize, Serialize};
use thiserror::Error;
use crate::error::is_reserved_keyword;
use crate::search::facet::BadGeoError;
use crate::{CriterionError, Error, UserError};
/// This error type is never supposed to be shown to the end user.
/// You must always cast it to a sort error or a criterion error.
#[derive(Debug)]
#[derive(Error, Debug)]
pub enum AscDescError {
InvalidLatitude,
InvalidLongitude,
#[error(transparent)]
GeoError(BadGeoError),
#[error("Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{name}`.")]
InvalidSyntax { name: String },
#[error("`{name}` is a reserved keyword and thus can't be used as a asc/desc rule.")]
ReservedKeyword { name: String },
}
impl fmt::Display for AscDescError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
Self::InvalidLatitude => {
write!(f, "Latitude must be contained between -90 and 90 degrees.",)
}
Self::InvalidLongitude => {
write!(f, "Longitude must be contained between -180 and 180 degrees.",)
}
Self::InvalidSyntax { name } => {
write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
}
Self::ReservedKeyword { name } => {
write!(
f,
"`{}` is a reserved keyword and thus can't be used as a asc/desc rule.",
name
)
}
}
impl From<BadGeoError> for AscDescError {
fn from(geo_error: BadGeoError) -> Self {
AscDescError::GeoError(geo_error)
}
}
impl From<AscDescError> for CriterionError {
fn from(error: AscDescError) -> Self {
match error {
AscDescError::InvalidLatitude | AscDescError::InvalidLongitude => {
AscDescError::GeoError(_) => {
CriterionError::ReservedNameForSort { name: "_geoPoint".to_string() }
}
AscDescError::InvalidSyntax { name } => CriterionError::InvalidName { name },
@ -55,6 +41,9 @@ impl From<AscDescError> for CriterionError {
AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => {
CriterionError::ReservedNameForFilter { name: "_geoRadius".to_string() }
}
AscDescError::ReservedKeyword { name } if name.starts_with("_geoBoundingBox") => {
CriterionError::ReservedNameForFilter { name: "_geoBoundingBox".to_string() }
}
AscDescError::ReservedKeyword { name } => CriterionError::ReservedName { name },
}
}
@ -82,14 +71,17 @@ impl FromStr for Member {
.map_err(|_| AscDescError::ReservedKeyword { name: text.to_string() })
})?;
if !(-90.0..=90.0).contains(&lat) {
return Err(AscDescError::InvalidLatitude)?;
return Err(BadGeoError::Lat(lat))?;
} else if !(-180.0..=180.0).contains(&lng) {
return Err(AscDescError::InvalidLongitude)?;
return Err(BadGeoError::Lng(lng))?;
}
Ok(Member::Geo([lat, lng]))
}
None => {
if is_reserved_keyword(text) || text.starts_with("_geoRadius(") {
if is_reserved_keyword(text)
|| text.starts_with("_geoRadius(")
|| text.starts_with("_geoBoundingBox(")
{
return Err(AscDescError::ReservedKeyword { name: text.to_string() })?;
}
Ok(Member::Field(text.to_string()))
@ -156,10 +148,8 @@ impl FromStr for AscDesc {
#[derive(Error, Debug)]
pub enum SortError {
#[error("{}", AscDescError::InvalidLatitude)]
InvalidLatitude,
#[error("{}", AscDescError::InvalidLongitude)]
InvalidLongitude,
#[error(transparent)]
ParseGeoError { error: BadGeoError },
#[error("Invalid syntax for the geo parameter: expected expression formated like \
`_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{name}`.")]
BadGeoPointUsage { name: String },
@ -178,8 +168,7 @@ pub enum SortError {
impl From<AscDescError> for SortError {
fn from(error: AscDescError) -> Self {
match error {
AscDescError::InvalidLatitude => SortError::InvalidLatitude,
AscDescError::InvalidLongitude => SortError::InvalidLongitude,
AscDescError::GeoError(error) => SortError::ParseGeoError { error },
AscDescError::InvalidSyntax { name } => SortError::InvalidName { name },
AscDescError::ReservedKeyword { name } if name.starts_with("_geoPoint") => {
SortError::BadGeoPointUsage { name }
@ -190,6 +179,9 @@ impl From<AscDescError> for SortError {
AscDescError::ReservedKeyword { name } if name.starts_with("_geoRadius") => {
SortError::ReservedNameForFilter { name: String::from("_geoRadius") }
}
AscDescError::ReservedKeyword { name } if name.starts_with("_geoBoundingBox") => {
SortError::ReservedNameForFilter { name: String::from("_geoBoundingBox") }
}
AscDescError::ReservedKeyword { name } => SortError::ReservedName { name },
}
}
@ -268,11 +260,11 @@ mod tests {
),
("_geoPoint(35, 85, 75):asc", ReservedKeyword { name: S("_geoPoint(35, 85, 75)") }),
("_geoPoint(18):asc", ReservedKeyword { name: S("_geoPoint(18)") }),
("_geoPoint(200, 200):asc", InvalidLatitude),
("_geoPoint(90.000001, 0):asc", InvalidLatitude),
("_geoPoint(0, -180.000001):desc", InvalidLongitude),
("_geoPoint(159.256, 130):asc", InvalidLatitude),
("_geoPoint(12, -2021):desc", InvalidLongitude),
("_geoPoint(200, 200):asc", GeoError(BadGeoError::Lat(200.))),
("_geoPoint(90.000001, 0):asc", GeoError(BadGeoError::Lat(90.000001))),
("_geoPoint(0, -180.000001):desc", GeoError(BadGeoError::Lng(-180.000001))),
("_geoPoint(159.256, 130):asc", GeoError(BadGeoError::Lat(159.256))),
("_geoPoint(12, -2021):desc", GeoError(BadGeoError::Lng(-2021.))),
];
for (req, expected_error) in invalid_req {

View File

@ -159,6 +159,11 @@ mod tests {
("_geoPoint(42, 75):asc", ReservedNameForSort { name: S("_geoPoint") }),
("_geoRadius:asc", ReservedNameForFilter { name: S("_geoRadius") }),
("_geoRadius(42, 75, 59):asc", ReservedNameForFilter { name: S("_geoRadius") }),
("_geoBoundingBox:asc", ReservedNameForFilter { name: S("_geoBoundingBox") }),
(
"_geoBoundingBox([42, 75], [75, 59]):asc",
ReservedNameForFilter { name: S("_geoBoundingBox") },
),
];
for (input, expected) in invalid_criteria {

View File

@ -1,5 +1,6 @@
use std::collections::BTreeSet;
use std::convert::Infallible;
use std::fmt::Write;
use std::{io, str};
use heed::{Error as HeedError, MdbError};
@ -11,7 +12,7 @@ use crate::documents::{self, DocumentsBatchCursorError};
use crate::{CriterionError, DocumentId, FieldId, Object, SortError};
pub fn is_reserved_keyword(keyword: &str) -> bool {
["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword)
["_geo", "_geoDistance", "_geoPoint", "_geoRadius", "_geoBoundingBox"].contains(&keyword)
}
#[derive(Error, Debug)]
@ -100,10 +101,11 @@ A document identifier can be of type integer or string, \
only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", .document_id.to_string()
)]
InvalidDocumentId { document_id: Value },
#[error("Invalid facet distribution, the fields `{}` are not set as filterable.",
.invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
)]
InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
#[error("Invalid facet distribution, {}", format_invalid_filter_distribution(.invalid_facets_name, .valid_facets_name))]
InvalidFacetsDistribution {
invalid_facets_name: BTreeSet<String>,
valid_facets_name: BTreeSet<String>,
},
#[error(transparent)]
InvalidGeoField(#[from] GeoError),
#[error("{0}")]
@ -152,6 +154,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
pub enum GeoError {
#[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")]
NotAnObject { document_id: Value, value: Value },
#[error("The `_geo` field in the document with the id: `{document_id}` contains the following unexpected fields: `{value}`.")]
UnexpectedExtraFields { document_id: Value, value: Value },
#[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")]
MissingLatitudeAndLongitude { document_id: Value },
#[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")]
@ -166,6 +170,50 @@ pub enum GeoError {
BadLongitude { document_id: Value, value: Value },
}
fn format_invalid_filter_distribution(
invalid_facets_name: &BTreeSet<String>,
valid_facets_name: &BTreeSet<String>,
) -> String {
if valid_facets_name.is_empty() {
return "this index does not have configured filterable attributes.".into();
}
let mut result = String::new();
match invalid_facets_name.len() {
0 => (),
1 => write!(
result,
"attribute `{}` is not filterable.",
invalid_facets_name.first().unwrap()
)
.unwrap(),
_ => write!(
result,
"attributes `{}` are not filterable.",
invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
)
.unwrap(),
};
match valid_facets_name.len() {
1 => write!(
result,
" The available filterable attribute is `{}`.",
valid_facets_name.first().unwrap()
)
.unwrap(),
_ => write!(
result,
" The available filterable attributes are `{}`.",
valid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ")
)
.unwrap(),
}
result
}
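For illustration, with hypothetical facet names, the messages this helper produces (wrapped by the `InvalidFacetsDistribution` error above) look like:

Invalid facet distribution, attribute `doggo` is not filterable. The available filterable attribute is `title`.
Invalid facet distribution, attributes `catto, doggo` are not filterable. The available filterable attributes are `price, title`.
Invalid facet distribution, this index does not have configured filterable attributes.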
/// A little macro helper to autogenerate From implementation that needs two `Into`.
/// Given the following parameters: `error_from_sub_error!(FieldIdMapMissingEntry => InternalError)`
/// the macro will create the following code:

View File

@ -355,10 +355,10 @@ impl Index {
/* external documents ids */
/// Writes the external documents ids and internal ids (i.e. `u32`).
pub(crate) fn put_external_documents_ids<'a>(
pub(crate) fn put_external_documents_ids(
&self,
wtxn: &mut RwTxn,
external_documents_ids: &ExternalDocumentsIds<'a>,
external_documents_ids: &ExternalDocumentsIds<'_>,
) -> heed::Result<()> {
let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids;
let hard = hard.as_fst().as_bytes();
@ -433,7 +433,7 @@ impl Index {
}
/// Returns the `rtree` which associates coordinates to documents ids.
pub fn geo_rtree<'t>(&self, rtxn: &'t RoTxn) -> Result<Option<RTree<GeoPoint>>> {
pub fn geo_rtree(&self, rtxn: &RoTxn) -> Result<Option<RTree<GeoPoint>>> {
match self
.main
.get::<_, Str, SerdeBincode<RTree<GeoPoint>>>(rtxn, main_key::GEO_RTREE_KEY)?
@ -1245,7 +1245,7 @@ pub(crate) mod tests {
self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig,
IndexDocumentsMethod, IndexerConfig, Settings,
};
use crate::{db_snap, obkv_to_json, Index, Search, SearchResult};
use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
pub(crate) struct TempIndex {
pub inner: Index,
@ -1543,6 +1543,108 @@ pub(crate) mod tests {
assert_eq!(user_defined, &["doggo", "name"]);
}
#[test]
fn test_basic_geo_bounding_box() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_filterable_fields(hashset! { S("_geo") });
})
.unwrap();
index
.add_documents(documents!([
{ "id": 0, "_geo": { "lat": 0, "lng": 0 } },
{ "id": 1, "_geo": { "lat": 0, "lng": -175 } },
{ "id": 2, "_geo": { "lat": 0, "lng": 175 } },
{ "id": 3, "_geo": { "lat": 85, "lng": 0 } },
{ "id": 4, "_geo": { "lat": -85, "lng": 0 } },
]))
.unwrap();
// run a handful of bounding box filters and check the candidates they select
let rtxn = index.read_txn().unwrap();
let mut search = index.search(&rtxn);
// exact match a document
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([0, 0], [0, 0])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[0]>");
// match a document in the middle of the rectangle
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([10, -10], [-10, 10])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[0]>");
// select everything
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([90, -180], [-90, 180])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[0, 1, 2, 3, 4]>");
// go on the edge of the longitude
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([0, 180], [0, -170])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[1]>");
// go on the other edge of the longitude
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([0, 170], [0, -180])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[2]>");
// wrap around the longitude
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([0, 170], [0, -170])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[1, 2]>");
// go on the edge of the latitude
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([90, 0], [80, 0])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[3]>");
// go on the edge of the latitude
let search_result = search
.filter(Filter::from_str("_geoBoundingBox([-80, 0], [-90, 0])").unwrap().unwrap())
.execute()
.unwrap();
insta::assert_debug_snapshot!(search_result.candidates, @"RoaringBitmap<[4]>");
// the requests that don't make sense
// try to wrap around the latitude
let error = search
.filter(Filter::from_str("_geoBoundingBox([-80, 0], [80, 0])").unwrap().unwrap())
.execute()
.unwrap_err();
insta::assert_display_snapshot!(error, @r###"
The top latitude `-80` is below the bottom latitude `80`.
32:33 _geoBoundingBox([-80, 0], [80, 0])
"###);
// send a top latitude lower than the bottom latitude
let error = search
.filter(Filter::from_str("_geoBoundingBox([-10, 0], [10, 0])").unwrap().unwrap())
.execute()
.unwrap_err();
insta::assert_display_snapshot!(error, @r###"
The top latitude `-10` is below the bottom latitude `10`.
32:33 _geoBoundingBox([-10, 0], [10, 0])
"###);
}
#[test]
fn replace_documents_external_ids_and_soft_deletion_check() {
use big_s::S;
@ -2331,4 +2433,69 @@ pub(crate) mod tests {
assert!(all_ids.insert(id));
}
}
#[test]
fn bug_3007() {
// https://github.com/meilisearch/meilisearch/issues/3007
use crate::error::{GeoError, UserError};
let index = TempIndex::new();
// Given: an index with a geo field NOT contained in the sortable_fields of the settings
index
.update_settings(|settings| {
settings.set_primary_key("id".to_string());
settings.set_filterable_fields(HashSet::from(["_geo".to_string()]));
})
.unwrap();
// happy path
index.add_documents(documents!({ "id" : 5, "_geo": {"lat": 12.0, "lng": 11.0}})).unwrap();
db_snap!(index, geo_faceted_documents_ids);
// both are unparseable, we expect GeoError::BadLatitudeAndLongitude
let err1 = index
.add_documents(
documents!({ "id" : 6, "_geo": {"lat": "unparseable", "lng": "unparseable"}}),
)
.unwrap_err();
assert!(matches!(
err1,
Error::UserError(UserError::InvalidGeoField(GeoError::BadLatitudeAndLongitude { .. }))
));
db_snap!(index, geo_faceted_documents_ids); // ensure that no more document was inserted
}
#[test]
fn unexpected_extra_fields_in_geo_field() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_primary_key("id".to_string());
settings.set_filterable_fields(HashSet::from(["_geo".to_string()]));
})
.unwrap();
let err = index
.add_documents(
documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best" }}),
)
.unwrap_err();
insta::assert_display_snapshot!(err, @r###"The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"doggo":"are the best"}`."###);
db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted
// multiple fields and complex values
let err = index
.add_documents(
documents!({ "id" : "doggo", "_geo": { "lat": 1, "lng": 2, "doggo": "are the best", "and": { "all": ["cats", { "are": "beautiful" } ] } } }),
)
.unwrap_err();
insta::assert_display_snapshot!(err, @r###"The `_geo` field in the document with the id: `"\"doggo\""` contains the following unexpected fields: `{"and":{"all":["cats",{"are":"beautiful"}]},"doggo":"are the best"}`."###);
db_snap!(index, geo_faceted_documents_ids); // ensure that no documents were inserted
}
}

View File

@ -123,7 +123,7 @@ impl<'t> Criterion for Attribute<'t> {
None => {
return Ok(Some(CriterionResult {
query_tree: Some(query_tree),
candidates: Some(RoaringBitmap::new()),
candidates: Some(allowed_candidates),
filtered_candidates: None,
initial_candidates: Some(self.initial_candidates.take()),
}));

View File

@ -182,15 +182,15 @@ impl<'t> Criterion for Proximity<'t> {
}
}
fn resolve_candidates<'t>(
ctx: &'t dyn Context,
fn resolve_candidates(
ctx: &dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
fn resolve_operation<'t>(
ctx: &'t dyn Context,
fn resolve_operation(
ctx: &dyn Context,
query_tree: &Operation,
proximity: u8,
cache: &mut Cache,
@ -243,8 +243,8 @@ fn resolve_candidates<'t>(
Ok(result)
}
fn mdfs_pair<'t>(
ctx: &'t dyn Context,
fn mdfs_pair(
ctx: &dyn Context,
left: &Operation,
right: &Operation,
proximity: u8,
@ -298,8 +298,8 @@ fn resolve_candidates<'t>(
Ok(output)
}
fn mdfs<'t>(
ctx: &'t dyn Context,
fn mdfs(
ctx: &dyn Context,
branches: &[Operation],
proximity: u8,
cache: &mut Cache,

View File

@ -239,15 +239,15 @@ fn alterate_query_tree(
Ok(query_tree)
}
fn resolve_candidates<'t>(
ctx: &'t dyn Context,
fn resolve_candidates(
ctx: &dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache,
) -> Result<RoaringBitmap> {
fn resolve_operation<'t>(
ctx: &'t dyn Context,
fn resolve_operation(
ctx: &dyn Context,
query_tree: &Operation,
number_typos: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
@ -276,8 +276,8 @@ fn resolve_candidates<'t>(
}
}
fn mdfs<'t>(
ctx: &'t dyn Context,
fn mdfs(
ctx: &dyn Context,
branches: &[Operation],
mana: u8,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,

View File

@ -291,6 +291,7 @@ impl<'a> FacetDistribution<'a> {
if !invalid_fields.is_empty() {
return Err(UserError::InvalidFacetsDistribution {
invalid_facets_name: invalid_fields.into_iter().cloned().collect(),
valid_facets_name: filterable_fields.into_iter().collect(),
}
.into());
} else {

View File

@ -21,17 +21,51 @@ pub struct Filter<'a> {
condition: FilterCondition<'a>,
}
#[derive(Debug)]
pub enum BadGeoError {
Lat(f64),
Lng(f64),
BoundingBoxTopIsBelowBottom(f64, f64),
}
impl std::error::Error for BadGeoError {}
impl Display for BadGeoError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::BoundingBoxTopIsBelowBottom(top, bottom) => {
write!(f, "The top latitude `{top}` is below the bottom latitude `{bottom}`.")
}
Self::Lat(lat) => write!(
f,
"Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ",
lat
),
Self::Lng(lng) => write!(
f,
"Bad longitude `{}`. Longitude must be contained between -180 and 180 degrees. ",
lng
),
}
}
}
#[derive(Debug)]
enum FilterError<'a> {
AttributeNotFilterable { attribute: &'a str, filterable_fields: HashSet<String> },
BadGeo(&'a str),
BadGeoLat(f64),
BadGeoLng(f64),
ParseGeoError(BadGeoError),
ReservedGeo(&'a str),
Reserved(&'a str),
TooDeep,
}
impl<'a> std::error::Error for FilterError<'a> {}
impl<'a> From<BadGeoError> for FilterError<'a> {
fn from(geo_error: BadGeoError) -> Self {
FilterError::ParseGeoError(geo_error)
}
}
impl<'a> Display for FilterError<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
@ -43,7 +77,11 @@ impl<'a> Display for FilterError<'a> {
attribute,
)
} else {
let filterables_list = filterable_fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(" ");
let filterables_list = filterable_fields
.iter()
.map(AsRef::as_ref)
.collect::<Vec<&str>>()
.join(" ");
write!(
f,
@ -52,19 +90,19 @@ impl<'a> Display for FilterError<'a> {
filterables_list,
)
}
},
Self::TooDeep => write!(f,
}
Self::TooDeep => write!(
f,
"Too many filter conditions, can't process more than {} filters.",
MAX_FILTER_DEPTH
),
Self::ReservedGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` or `_geoBoundingBox([latitude, longitude], [latitude, longitude])` built-in rules to filter on `_geo` field coordinates.", keyword),
Self::Reserved(keyword) => write!(
f,
"`{}` is a reserved keyword and thus can't be used as a filter expression.",
keyword
),
Self::BadGeo(keyword) => write!(f, "`{}` is a reserved keyword and thus can't be used as a filter expression. Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.", keyword),
Self::BadGeoLat(lat) => write!(f, "Bad latitude `{}`. Latitude must be contained between -90 and 90 degrees. ", lat),
Self::BadGeoLng(lng) => write!(f, "Bad longitude `{}`. Longitude must be contained between -180 and 180 degrees. ", lng),
Self::ParseGeoError(error) => write!(f, "{}", error),
}
}
}
@ -294,12 +332,12 @@ impl<'a> Filter<'a> {
Ok(RoaringBitmap::new())
}
} else {
match fid.lexeme() {
match fid.value() {
attribute @ "_geo" => {
Err(fid.as_external_error(FilterError::BadGeo(attribute)))?
Err(fid.as_external_error(FilterError::ReservedGeo(attribute)))?
}
attribute if attribute.starts_with("_geoPoint(") => {
Err(fid.as_external_error(FilterError::BadGeo("_geoPoint")))?
Err(fid.as_external_error(FilterError::ReservedGeo("_geoPoint")))?
}
attribute @ "_geoDistance" => {
Err(fid.as_external_error(FilterError::Reserved(attribute)))?
@ -351,14 +389,10 @@ impl<'a> Filter<'a> {
let base_point: [f64; 2] =
[point[0].parse_finite_float()?, point[1].parse_finite_float()?];
if !(-90.0..=90.0).contains(&base_point[0]) {
return Err(
point[0].as_external_error(FilterError::BadGeoLat(base_point[0]))
)?;
return Err(point[0].as_external_error(BadGeoError::Lat(base_point[0])))?;
}
if !(-180.0..=180.0).contains(&base_point[1]) {
return Err(
point[1].as_external_error(FilterError::BadGeoLng(base_point[1]))
)?;
return Err(point[1].as_external_error(BadGeoError::Lng(base_point[1])))?;
}
let radius = radius.parse_finite_float()?;
let rtree = match index.geo_rtree(rtxn)? {
@ -385,6 +419,130 @@ impl<'a> Filter<'a> {
}))?
}
}
FilterCondition::GeoBoundingBox { top_left_point, bottom_right_point } => {
if filterable_fields.contains("_geo") {
let top_left: [f64; 2] = [
top_left_point[0].parse_finite_float()?,
top_left_point[1].parse_finite_float()?,
];
let bottom_right: [f64; 2] = [
bottom_right_point[0].parse_finite_float()?,
bottom_right_point[1].parse_finite_float()?,
];
if !(-90.0..=90.0).contains(&top_left[0]) {
return Err(
top_left_point[0].as_external_error(BadGeoError::Lat(top_left[0]))
)?;
}
if !(-180.0..=180.0).contains(&top_left[1]) {
return Err(
top_left_point[1].as_external_error(BadGeoError::Lng(top_left[1]))
)?;
}
if !(-90.0..=90.0).contains(&bottom_right[0]) {
return Err(bottom_right_point[0]
.as_external_error(BadGeoError::Lat(bottom_right[0])))?;
}
if !(-180.0..=180.0).contains(&bottom_right[1]) {
return Err(bottom_right_point[1]
.as_external_error(BadGeoError::Lng(bottom_right[1])))?;
}
if top_left[0] < bottom_right[0] {
return Err(bottom_right_point[1].as_external_error(
BadGeoError::BoundingBoxTopIsBelowBottom(top_left[0], bottom_right[0]),
))?;
}
// Instead of writing a custom `GeoBoundingBox` filter we're simply going to re-use the range
// filter to create the following filter:
// `_geo.lat {bottom_right[0]} TO {top_left[0]} AND _geo.lng {top_left[1]} TO {bottom_right[1]}`
// As we can see, we need to use a bunch of tokens that don't exist in the original filter,
// thus we're going to create tokens that point to a random span but contain our text.
let geo_lat_token =
Token::new(top_left_point[0].original_span(), Some("_geo.lat".to_string()));
let condition_lat = FilterCondition::Condition {
fid: geo_lat_token,
op: Condition::Between {
from: bottom_right_point[0].clone(),
to: top_left_point[0].clone(),
},
};
let selected_lat = Filter { condition: condition_lat }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?;
let geo_lng_token =
Token::new(top_left_point[1].original_span(), Some("_geo.lng".to_string()));
let selected_lng = if top_left[1] > bottom_right[1] {
// In this case the bounding box is wrapping around the earth (going from 180 to -180).
// We need to update the lng part of the filter from:
// `_geo.lng {top_left[1]} TO {bottom_right[1]}` to
// `_geo.lng {top_left[1]} TO 180 OR _geo.lng -180 TO {bottom_right[1]}`
let min_lng_token = Token::new(
top_left_point[1].original_span(),
Some("-180.0".to_string()),
);
let max_lng_token = Token::new(
top_left_point[1].original_span(),
Some("180.0".to_string()),
);
let condition_left = FilterCondition::Condition {
fid: geo_lng_token.clone(),
op: Condition::Between {
from: top_left_point[1].clone(),
to: max_lng_token,
},
};
let left = Filter { condition: condition_left }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?;
let condition_right = FilterCondition::Condition {
fid: geo_lng_token,
op: Condition::Between {
from: min_lng_token,
to: bottom_right_point[1].clone(),
},
};
let right = Filter { condition: condition_right }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?;
left | right
} else {
let condition_lng = FilterCondition::Condition {
fid: geo_lng_token,
op: Condition::Between {
from: top_left_point[1].clone(),
to: bottom_right_point[1].clone(),
},
};
Filter { condition: condition_lng }.inner_evaluate(
rtxn,
index,
filterable_fields,
)?
};
Ok(selected_lat & selected_lng)
} else {
Err(top_left_point[0].as_external_error(FilterError::AttributeNotFilterable {
attribute: "_geo",
filterable_fields: filterable_fields.clone(),
}))?
}
}
}
}
}
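To make the decomposition above concrete, here is an informal sketch, with arbitrary coordinates, of how a wrap-around bounding box is evaluated through the existing range filters:

_geoBoundingBox([10, 170], [-10, -170])
=> _geo.lat -10 TO 10
   AND (_geo.lng 170 TO 180 OR _geo.lng -180 TO -170)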
@ -502,6 +660,12 @@ mod tests {
"Attribute `_geo` is not filterable. This index does not have configured filterable attributes."
));
let filter = Filter::from_str("_geoBoundingBox([42, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
"Attribute `_geo` is not filterable. This index does not have configured filterable attributes."
));
let filter = Filter::from_str("dog = \"bernese mountain\"").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
@ -524,6 +688,12 @@ mod tests {
"Attribute `_geo` is not filterable. Available filterable attributes are: `title`."
));
let filter = Filter::from_str("_geoBoundingBox([42, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
"Attribute `_geo` is not filterable. Available filterable attributes are: `title`."
));
let filter = Filter::from_str("name = 12").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().starts_with(
@ -675,6 +845,92 @@ mod tests {
));
}
#[test]
fn geo_bounding_box_error() {
let index = TempIndex::new();
index
.update_settings(|settings| {
settings.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order
settings.set_filterable_fields(hashset! { S("_geo"), S("price") });
})
.unwrap();
let rtxn = index.read_txn().unwrap();
// _geoBoundingBox top-left coordinate has a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([-90.0000001, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(
error.to_string().starts_with(
"Bad latitude `-90.0000001`. Latitude must be contained between -90 and 90 degrees."
),
"{}",
error.to_string()
);
// _geoBoundingBox top-left coordinate has a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([90.0000001, 150], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(
error.to_string().starts_with(
"Bad latitude `90.0000001`. Latitude must be contained between -90 and 90 degrees."
),
"{}",
error.to_string()
);
// _geoBoundingBox bottom-right coordinate has a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [-90.0000001, 150])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad latitude `-90.0000001`. Latitude must be contained between -90 and 90 degrees."
));
// _geoBoundingBox bottom-right coordinate has a bad latitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [90.0000001, 150])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad latitude `90.0000001`. Latitude must be contained between -90 and 90 degrees."
));
// _geoBoundingBox top-left coordinate has a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([-10, 180.000001], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `180.000001`. Longitude must be contained between -180 and 180 degrees."
));
// _geoBoundingBox top-left coordinate has a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([-10, -180.000001], [30, 10])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `-180.000001`. Longitude must be contained between -180 and 180 degrees."
));
// _geoBoundingBox bottom-right coordinate has a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [-10, -180.000001])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `-180.000001`. Longitude must be contained between -180 and 180 degrees."
));
// _geoBoundingBox bottom-right coordinate has a bad longitude
let filter =
Filter::from_str("_geoBoundingBox([30, 10], [-10, 180.000001])").unwrap().unwrap();
let error = filter.evaluate(&rtxn, &index).unwrap_err();
assert!(error.to_string().contains(
"Bad longitude `180.000001`. Longitude must be contained between -180 and 180 degrees."
));
}
#[test]
fn filter_depth() {
// generates a big (2 MiB) filter with too much of ORs.

View File

@ -4,7 +4,7 @@ use heed::types::{ByteSlice, DecodeIgnore};
use heed::{BytesDecode, RoTxn};
pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
pub use self::filter::Filter;
pub use self::filter::{BadGeoError, Filter};
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
mod facet_distribution;

View File

@ -324,7 +324,7 @@ impl fmt::Debug for Search<'_> {
}
}
#[derive(Default)]
#[derive(Default, Debug)]
pub struct SearchResult {
pub matching_words: MatchingWords,
pub candidates: RoaringBitmap,

View File

@ -6,7 +6,7 @@ use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index};
use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index};
#[track_caller]
pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) {
@ -427,8 +427,26 @@ pub fn snap_settings(index: &Index) -> String {
snap
}
pub fn snap_documents(index: &Index) -> String {
let mut snap = String::new();
let rtxn = index.read_txn().unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let display = fields_ids_map.ids().collect::<Vec<_>>();
for document in index.all_documents(&rtxn).unwrap() {
let doc = obkv_to_json(&display, &fields_ids_map, document.unwrap().1).unwrap();
snap.push_str(&serde_json::to_string(&doc).unwrap());
snap.push('\n');
}
snap
}
#[macro_export]
macro_rules! full_snap_of_db {
($index:ident, documents) => {{
$crate::snapshot_tests::snap_documents(&$index)
}};
($index:ident, settings) => {{
$crate::snapshot_tests::snap_settings(&$index)
}};

View File

@ -0,0 +1,4 @@
---
source: milli/src/index.rs
---
[0, ]

View File

@ -0,0 +1,4 @@
---
source: milli/src/index.rs
---
[]

View File

@ -591,9 +591,9 @@ fn remove_from_word_docids(
Ok(())
}
fn remove_docids_from_field_id_docid_facet_value<'i, 'a>(
index: &'i Index,
wtxn: &'a mut heed::RwTxn,
fn remove_docids_from_field_id_docid_facet_value(
index: &Index,
wtxn: &mut heed::RwTxn,
facet_type: FacetType,
field_id: FieldId,
to_remove: &RoaringBitmap,

View File

@ -157,9 +157,9 @@ impl FacetsUpdateIncrementalInner {
///
/// ## Return
/// See documentation of `insert_in_level`
fn insert_in_level_0<'t>(
fn insert_in_level_0(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
@ -211,9 +211,9 @@ impl FacetsUpdateIncrementalInner {
/// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted
/// in the addition of a new key in that level, and that therefore the number of children
/// of the parent node should be incremented.
fn insert_in_level<'t>(
fn insert_in_level(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
level: u8,
facet_value: &[u8],
@ -348,9 +348,9 @@ impl FacetsUpdateIncrementalInner {
}
/// Insert the given facet value and corresponding document ids in the database.
pub fn insert<'t>(
pub fn insert(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
@ -470,9 +470,9 @@ impl FacetsUpdateIncrementalInner {
/// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4).
/// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust
/// its left bound as well.
fn delete_in_level<'t>(
fn delete_in_level(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
level: u8,
facet_value: &[u8],
@ -529,9 +529,9 @@ impl FacetsUpdateIncrementalInner {
}
}
fn delete_in_level_0<'t>(
fn delete_in_level_0(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,
@ -557,9 +557,9 @@ impl FacetsUpdateIncrementalInner {
}
}
pub fn delete<'t>(
pub fn delete(
&self,
txn: &'t mut RwTxn,
txn: &mut RwTxn,
field_id: u16,
facet_value: &[u8],
docids: &RoaringBitmap,

View File

@ -98,7 +98,12 @@ pub fn enrich_documents_batch<R: Read + Seek>(
// If the settings specify that a `_geo` field must be used, we must check its
// validity in all the documents of this batch, and this is when we return `Some`.
let geo_field_id = match documents_batch_index.id("_geo") {
Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id),
Some(geo_field_id)
if index.sortable_fields(rtxn)?.contains("_geo")
|| index.filterable_fields(rtxn)?.contains("_geo") =>
{
Some(geo_field_id)
}
_otherwise => None,
};
@ -367,11 +372,17 @@ pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
use GeoError::*;
let debug_id = || Value::from(id.debug());
let debug_id = || {
serde_json::from_slice(id.value().as_bytes()).unwrap_or_else(|_| Value::from(id.debug()))
};
match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) {
(Some(lat), Some(lng)) => {
match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) {
(Ok(_), Ok(_)) if !object.is_empty() => Ok(Err(UnexpectedExtraFields {
document_id: debug_id(),
value: object.into(),
})),
(Ok(_), Ok(_)) => Ok(Ok(())),
(Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })),
(Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })),
@ -384,6 +395,7 @@ pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result<StdResult
(Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })),
(None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })),
},
Value::Null => Ok(Ok(())),
value => Ok(Err(NotAnObject { document_id: debug_id(), value })),
}
}
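As an informal summary of this validation, with hypothetical values, the possible `_geo` payloads map to the following outcomes:

{ "lat": 12.0, "lng": 11.0 }                  => Ok
{ "lat": 12.0, "lng": 11.0, "doggo": "bork" } => Err(UnexpectedExtraFields)
{ "lat": "unparseable", "lng": 11.0 }         => Err(BadLatitude)
{ "lat": 12.0 }                               => Err(MissingLongitude)
{ }                                           => Err(MissingLatitudeAndLongitude)
null                                          => Ok (nothing to index)
"not an object"                               => Err(NotAnObject)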

View File

@ -59,6 +59,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
} else if lat.is_some() && lng.is_none() {
return Err(GeoError::MissingLongitude { document_id: document_id() })?;
}
// else => the _geo object was `null`, there is nothing to do
}
writer_into_reader(writer)

View File

@ -1,6 +1,6 @@
use std::borrow::Cow;
use std::fs::File;
use std::io::{self, Seek, SeekFrom};
use std::io::{self, Seek};
use std::time::Instant;
use grenad::{CompressionType, Sorter};
@ -66,7 +66,7 @@ pub fn sorter_into_reader(
pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
let mut file = writer.into_inner()?;
file.seek(SeekFrom::Start(0))?;
file.rewind()?;
grenad::Reader::new(file).map_err(Into::into)
}

View File

@ -6,6 +6,7 @@ use roaring::RoaringBitmap;
use super::read_u32_ne_bytes;
use crate::heed_codec::CboRoaringBitmapCodec;
use crate::update::index_documents::transform::Operation;
use crate::Result;
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
@ -57,21 +58,6 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<
Ok(obkvs.last().unwrap().clone())
}
/// Merge all the obkvs in the order we see them.
pub fn merge_obkvs<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
Ok(obkvs
.iter()
.cloned()
.reduce(|acc, current| {
let first = obkv::KvReader::new(&acc);
let second = obkv::KvReader::new(&current);
let mut buffer = Vec::new();
merge_two_obkvs(first, second, &mut buffer);
Cow::from(buffer)
})
.unwrap())
}
pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
use itertools::merge_join_by;
use itertools::EitherOrBoth::{Both, Left, Right};
@ -88,6 +74,41 @@ pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffe
writer.finish().unwrap();
}
/// Merge all the obkvs in the order we see them.
pub fn merge_obkvs_and_operations<'a>(
_key: &[u8],
obkvs: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
// [add, add, delete, add, add]
// we can ignore everything that happened before the last delete.
let starting_position =
obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0);
// [add, add, delete]
// if the last operation was a deletion then we simply return the deletion
if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8
{
return Ok(obkvs[obkvs.len() - 1].clone());
}
let mut buffer = Vec::new();
// (add, add, delete) [add, add]
// in the other case, no deletion will be encountered during the merge
let mut ret =
obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| {
let first = obkv::KvReader::new(&acc);
let second = obkv::KvReader::new(&current[1..]);
merge_two_obkvs(first, second, &mut buffer);
// we want the result of the merge into our accumulator
std::mem::swap(&mut acc, &mut buffer);
acc
});
ret.insert(0, Operation::Addition as u8);
Ok(Cow::from(ret))
}
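As a rough illustration of the rule above, with abbreviated obkv contents, a sequence of operation-tagged values for the same key merges as follows:

[Add {a:1}, Add {b:2}, Del, Add {c:3}, Add {d:4}] => Add {c:3, d:4}   (everything before the last deletion is dropped)
[Add {a:1}, Add {b:2}, Del]                       => Del              (a trailing deletion is returned as-is)
[Add {a:1}, Add {b:2}]                            => Add {a:1, b:2}   (plain obkv field-by-field merge of the additions)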
pub fn merge_cbo_roaring_bitmaps<'a>(
_key: &[u8],
values: &[Cow<'a, [u8]>],

View File

@ -13,9 +13,9 @@ pub use grenad_helpers::{
GrenadParameters, MergeableReader,
};
pub use merge_functions::{
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs,
merge_roaring_bitmaps, merge_two_obkvs, roaring_bitmap_from_u32s_array,
serialize_roaring_bitmap, MergeFn,
concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps,
merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn,
};
use crate::MAX_WORD_LENGTH;

View File

@ -79,6 +79,7 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a, FP, FA> {
progress: FP,
should_abort: FA,
added_documents: u64,
deleted_documents: u64,
}
#[derive(Default, Debug, Clone)]
@ -122,6 +123,7 @@ where
wtxn,
index,
added_documents: 0,
deleted_documents: 0,
})
}
@ -166,6 +168,30 @@ where
Ok((self, Ok(indexed_documents)))
}
/// Remove a batch of documents from the current builder.
///
/// Returns the number of documents deleted from the builder.
pub fn remove_documents(
mut self,
to_delete: Vec<String>,
) -> Result<(Self, StdResult<u64, UserError>)> {
// Early return when there is no document to remove
if to_delete.is_empty() {
return Ok((self, Ok(0)));
}
let deleted_documents = self
.transform
.as_mut()
.expect("Invalid document deletion state")
.remove_documents(to_delete, self.wtxn, &self.should_abort)?
as u64;
self.deleted_documents += deleted_documents;
Ok((self, Ok(deleted_documents)))
}
#[logging_timer::time("IndexDocuments::{}")]
pub fn execute(mut self) -> Result<DocumentAdditionResult> {
if self.added_documents == 0 {
@ -965,34 +991,6 @@ mod tests {
.unwrap();
}
#[test]
fn index_all_flavour_of_geo() {
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
index
.update_settings(|settings| {
settings.set_filterable_fields(hashset!(S("_geo")));
})
.unwrap();
index
.add_documents(documents!([
{ "id": 0, "_geo": { "lat": 31, "lng": [42] } },
{ "id": 1, "_geo": { "lat": "31" }, "_geo.lng": 42 },
{ "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" },
{ "id": 3, "_geo.lat": 31, "_geo.lng": "42" },
]))
.unwrap();
let rtxn = index.read_txn().unwrap();
let mut search = crate::Search::new(&rtxn, &index);
search.filter(crate::Filter::from_str("_geoRadius(31, 42, 0.000001)").unwrap().unwrap());
let crate::SearchResult { documents_ids, .. } = search.execute().unwrap();
assert_eq!(documents_ids, vec![0, 1, 2, 3]);
}
#[test]
fn geo_error() {
let mut index = TempIndex::new();
@ -1934,4 +1932,328 @@ mod tests {
let expected_cj_cmn_docids = [1, 5].iter().collect();
assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
}
#[test]
fn add_and_delete_documents_in_single_transform() {
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let documents = documents!([
{ "id": 1, "doggo": "kevin" },
{ "id": 2, "doggo": { "name": "bob", "age": 20 } },
{ "id": 3, "name": "jean", "age": 25 },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"3");
let (builder, removed) = builder.remove_documents(vec![S("2")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"1");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 3,
number_of_documents: 2,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":1,"doggo":"kevin"}
{"id":3,"name":"jean","age":25}
"###);
}
#[test]
fn add_update_and_delete_documents_in_single_transform() {
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let documents = documents!([
{ "id": 1, "doggo": "kevin" },
{ "id": 2, "doggo": { "name": "bob", "age": 20 } },
{ "id": 3, "name": "jean", "age": 25 },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"3");
let documents = documents!([
{ "id": 2, "catto": "jorts" },
{ "id": 3, "legs": 4 },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"2");
let (builder, removed) = builder.remove_documents(vec![S("1"), S("2")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"2");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 5,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":3,"name":"jean","age":25,"legs":4}
"###);
}
#[test]
fn add_document_and_in_another_transform_update_and_delete_documents() {
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let documents = documents!([
{ "id": 1, "doggo": "kevin" },
{ "id": 2, "doggo": { "name": "bob", "age": 20 } },
{ "id": 3, "name": "jean", "age": 25 },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"3");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 3,
number_of_documents: 3,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":1,"doggo":"kevin"}
{"id":2,"doggo":{"name":"bob","age":20}}
{"id":3,"name":"jean","age":25}
"###);
// A first batch of documents has been inserted
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let documents = documents!([
{ "id": 2, "catto": "jorts" },
{ "id": 3, "legs": 4 },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"2");
let (builder, removed) = builder.remove_documents(vec![S("1"), S("2")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"2");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 2,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":3,"name":"jean","age":25,"legs":4}
"###);
}
#[test]
fn delete_document_and_then_add_documents_in_the_same_transform() {
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let (builder, removed) = builder.remove_documents(vec![S("1"), S("2")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"0");
let documents = documents!([
{ "id": 2, "doggo": { "name": "jean", "age": 20 } },
{ "id": 3, "name": "bob", "age": 25 },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"2");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 2,
number_of_documents: 2,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":2,"doggo":{"name":"jean","age":20}}
{"id":3,"name":"bob","age":25}
"###);
}
#[test]
fn delete_the_same_document_multiple_time() {
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let (builder, removed) =
builder.remove_documents(vec![S("1"), S("2"), S("1"), S("2")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"0");
let documents = documents!([
{ "id": 1, "doggo": "kevin" },
{ "id": 2, "doggo": { "name": "jean", "age": 20 } },
{ "id": 3, "name": "bob", "age": 25 },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"3");
let (builder, removed) =
builder.remove_documents(vec![S("1"), S("2"), S("1"), S("2")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"2");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 3,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":3,"name":"bob","age":25}
"###);
}
#[test]
fn add_document_and_in_another_transform_delete_the_document_then_add_it_again() {
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let documents = documents!([
{ "id": 1, "doggo": "kevin" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":1,"doggo":"kevin"}
"###);
// A first batch of documents has been inserted
let mut wtxn = index.write_txn().unwrap();
let builder = IndexDocuments::new(
&mut wtxn,
&index,
&index.indexer_config,
index.index_documents_config.clone(),
|_| (),
|| false,
)
.unwrap();
let (builder, removed) = builder.remove_documents(vec![S("1")]).unwrap();
insta::assert_display_snapshot!(removed.unwrap(), @"1");
let documents = documents!([
{ "id": 1, "catto": "jorts" },
]);
let (builder, added) = builder.add_documents(documents).unwrap();
insta::assert_display_snapshot!(added.unwrap(), @"1");
let addition = builder.execute().unwrap();
insta::assert_debug_snapshot!(addition, @r###"
DocumentAdditionResult {
indexed_documents: 1,
number_of_documents: 1,
}
"###);
wtxn.commit().unwrap();
db_snap!(index, documents, @r###"
{"id":1,"catto":"jorts"}
"###);
}
}

View File

@ -2,7 +2,7 @@ use std::borrow::Cow;
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
use std::fs::File;
use std::io::{Read, Seek, SeekFrom};
use std::io::{Read, Seek};
use fxhash::FxHashMap;
use heed::RoTxn;
@ -12,7 +12,9 @@ use roaring::RoaringBitmap;
use serde_json::Value;
use smartstring::SmartString;
use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
use super::helpers::{
create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn,
};
use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
@ -50,8 +52,12 @@ pub struct Transform<'a, 'i> {
pub index_documents_method: IndexDocumentsMethod,
available_documents_ids: AvailableDocumentsIds,
// Both grenads follow the same format:
// key | value
// u32 | 1 byte for the Operation byte, the rest is the obkv of the document stored
original_sorter: grenad::Sorter<MergeFn>,
flattened_sorter: grenad::Sorter<MergeFn>,
replaced_documents_ids: RoaringBitmap,
new_documents_ids: RoaringBitmap,
// To increase the cache locality and decrease the heap usage we use compact smartstring.
@ -59,6 +65,14 @@ pub struct Transform<'a, 'i> {
documents_count: usize,
}
/// This enum is specific to the grenad sorter stored in the transform.
/// It's used as the first byte of the grenads and tells you if the document id was an addition or a deletion.
#[repr(u8)]
pub enum Operation {
Addition,
Deletion,
}
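Put differently, as a sketch of the layout described in the struct comment above, every entry pushed into `original_sorter` and `flattened_sorter` is:

key   = the internal document id as a big-endian u32
value = one `Operation` byte (Addition or Deletion) followed by the obkv bytes of the document; a deletion carries the single byte only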
/// Create a mapping between the field ids found in the document batch and the one that were
/// already present in the index.
///
@ -94,7 +108,7 @@ impl<'a, 'i> Transform<'a, 'i> {
// with the same user id must be merged or fully replaced in the same batch.
let merge_function = match index_documents_method {
IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv,
IndexDocumentsMethod::UpdateDocuments => merge_obkvs,
IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations,
};
// We initialize the sorter with the user indexing settings.
@ -151,9 +165,7 @@ impl<'a, 'i> Transform<'a, 'i> {
FA: Fn() -> bool + Sync,
{
let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
let primary_key = cursor.primary_key().to_string();
@ -161,6 +173,7 @@ impl<'a, 'i> Transform<'a, 'i> {
self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
let mut obkv_buffer = Vec::new();
let mut document_sorter_buffer = Vec::new();
let mut documents_count = 0;
let mut docid_buffer: Vec<u8> = Vec::new();
let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
@ -212,10 +225,13 @@ impl<'a, 'i> Transform<'a, 'i> {
Entry::Occupied(entry) => *entry.get() as u32,
Entry::Vacant(entry) => {
// If the document was already in the db we mark it as a replaced document.
// It'll be deleted later. We keep its original docid to insert it in the grenad.
// It'll be deleted later.
if let Some(docid) = external_documents_ids.get(entry.key()) {
self.replaced_documents_ids.insert(docid);
original_docid = Some(docid);
// If it was already in the list of replaced documents it means it was deleted
// by the remove_documents method. We should start as if it never existed.
if self.replaced_documents_ids.insert(docid) {
original_docid = Some(docid);
}
}
let docid = self
.available_documents_ids
@ -248,26 +264,46 @@ impl<'a, 'i> Transform<'a, 'i> {
skip_insertion = true;
} else {
// we associate the base document with the new key, everything will get merged later.
self.original_sorter.insert(docid.to_be_bytes(), base_obkv)?;
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(base_obkv);
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
Some(buffer) => {
self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?
Some(flattened_obkv) => {
// we recreate our buffer with the flattened documents
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&flattened_obkv);
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
None => self.flattened_sorter.insert(docid.to_be_bytes(), base_obkv)?,
None => self
.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
}
}
}
if !skip_insertion {
self.new_documents_ids.insert(docid);
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&obkv_buffer);
// We use the extracted/generated user id as the key for this document.
self.original_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?;
self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?,
None => {
self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())?
Some(flattened_obkv) => {
document_sorter_buffer.clear();
document_sorter_buffer.push(Operation::Addition as u8);
document_sorter_buffer.extend_from_slice(&flattened_obkv);
self.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?
}
None => self
.flattened_sorter
.insert(docid.to_be_bytes(), &document_sorter_buffer)?,
}
}
documents_count += 1;
@ -293,6 +329,73 @@ impl<'a, 'i> Transform<'a, 'i> {
Ok(documents_count)
}
/// The counterpart of `read_documents` that removes documents either from the transform or the database.
/// It can be called before, after, or in between two calls of `read_documents`.
///
/// It needs to update all the internal data structures in the transform.
/// - If the document is coming from the database -> it's marked as a to_delete document
/// - If the document to remove was inserted by the `read_documents` method before AND was present in the db,
/// it's marked as `to_delete` + added into the grenad to ensure we don't reinsert it.
/// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db,
/// it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids.
/// - If the document to remove was not present in either the db or the transform we do nothing.
pub fn remove_documents<FA>(
&mut self,
mut to_remove: Vec<String>,
wtxn: &mut heed::RwTxn,
should_abort: FA,
) -> Result<usize>
where
FA: Fn() -> bool + Sync,
{
// there may be duplicates in the documents to remove.
to_remove.sort_unstable();
to_remove.dedup();
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
let mut documents_deleted = 0;
for to_remove in to_remove {
if should_abort() {
return Err(Error::InternalError(InternalError::AbortedIndexation));
}
match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
// if the document was added in a previous iteration of the transform we mark it as deleted in the sorters.
Entry::Occupied(entry) => {
let doc_id = *entry.get() as u32;
self.original_sorter
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
self.flattened_sorter
.insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
// we must NOT update the list of replaced_documents_ids
// Either:
// 1. It's already in it and there is nothing to do
// 2. It wasn't in it because the document was created by a previous batch and since
// we're removing it there is nothing to do.
self.new_documents_ids.remove(doc_id);
entry.remove_entry();
}
Entry::Vacant(entry) => {
// If the document was already in the db we mark it as a `to_delete` document.
// It'll be deleted later. We don't need to push anything to the sorters.
if let Some(docid) = external_documents_ids.get(entry.key()) {
self.replaced_documents_ids.insert(docid);
} else {
// if the document is nowhere to be found, there is nothing to do and we must NOT
// increment the count of documents_deleted
continue;
}
}
};
documents_deleted += 1;
}
Ok(documents_deleted)
}
// Flatten a document from the fields ids map contained in self and insert the newly
// created fields. Returns `None` if the document doesn't need to be flattened.
fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {
@ -487,6 +590,11 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut documents_count = 0;
while let Some((key, val)) = iter.next()? {
if val[0] == Operation::Deletion as u8 {
continue;
}
let val = &val[1..];
// send a callback to show at which step we are
documents_count += 1;
progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
@ -510,7 +618,7 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut original_documents = writer.into_inner()?;
// We then extract the file and reset the seek to be able to read it again.
original_documents.seek(SeekFrom::Start(0))?;
original_documents.rewind()?;
// We create a final writer to write the new documents in order from the sorter.
let mut writer = create_writer(
@ -518,11 +626,20 @@ impl<'a, 'i> Transform<'a, 'i> {
self.indexer_settings.chunk_compression_level,
tempfile::tempfile()?,
);
// Once we have written all the documents into the final sorter, we write the documents
// into this writer, extract the file and reset the seek to be able to read it again.
self.flattened_sorter.write_into_stream_writer(&mut writer)?;
// Once we have written all the documents into the final sorter, we write the nested documents
// into this writer.
// We get rid of the `Operation` byte and skip the deleted documents as well.
let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
while let Some((key, val)) = iter.next()? {
if val[0] == Operation::Deletion as u8 {
continue;
}
let val = &val[1..];
writer.insert(key, val)?;
}
let mut flattened_documents = writer.into_inner()?;
flattened_documents.seek(SeekFrom::Start(0))?;
flattened_documents.rewind()?;
let mut new_external_documents_ids_builder: Vec<_> =
self.new_external_documents_ids_builder.into_iter().collect();
@ -650,10 +767,10 @@ impl<'a, 'i> Transform<'a, 'i> {
// Once we have written all the documents, we extract
// the file and reset the seek to be able to read it again.
let mut original_documents = original_writer.into_inner()?;
original_documents.seek(SeekFrom::Start(0))?;
original_documents.rewind()?;
let mut flattened_documents = flattened_writer.into_inner()?;
flattened_documents.seek(SeekFrom::Start(0))?;
flattened_documents.rewind()?;
let output = TransformOutput {
primary_key,
@ -701,3 +818,45 @@ impl TransformOutput {
.collect())
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn merge_obkvs() {
let mut doc_0 = Vec::new();
let mut kv_writer = KvWriter::new(&mut doc_0);
kv_writer.insert(0_u8, [0]).unwrap();
kv_writer.finish().unwrap();
doc_0.insert(0, Operation::Addition as u8);
let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap();
assert_eq!(*ret, doc_0);
let ret = merge_obkvs_and_operations(
&[],
&[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())],
)
.unwrap();
assert_eq!(*ret, doc_0);
let ret = merge_obkvs_and_operations(
&[],
&[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())],
)
.unwrap();
assert_eq!(*ret, [Operation::Deletion as u8]);
let ret = merge_obkvs_and_operations(
&[],
&[
Cow::from([Operation::Addition as u8, 1].as_slice()),
Cow::from([Operation::Deletion as u8].as_slice()),
Cow::from(doc_0.as_slice()),
],
)
.unwrap();
assert_eq!(*ret, doc_0);
}
}

View File

@ -2,7 +2,7 @@ use std::collections::{BTreeSet, HashMap, HashSet};
use std::result::Result as StdResult;
use charabia::{Tokenizer, TokenizerBuilder};
use deserr::{DeserializeError, DeserializeFromValue};
use deserr::{DeserializeError, Deserr};
use itertools::Itertools;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use time::OffsetDateTime;
@ -23,9 +23,9 @@ pub enum Setting<T> {
NotSet,
}
impl<T, E> DeserializeFromValue<E> for Setting<T>
impl<T, E> Deserr<E> for Setting<T>
where
T: DeserializeFromValue<E>,
T: Deserr<E>,
E: DeserializeError,
{
fn deserialize_from_value<V: deserr::IntoValue>(
@ -37,9 +37,6 @@ where
_ => T::deserialize_from_value(value, location).map(Setting::Set),
}
}
fn default() -> Option<Self> {
Some(Self::NotSet)
}
}
impl<T> Default for Setting<T> {

View File

@ -140,16 +140,20 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
// We remove all the entries that are no longer required in this word prefix position
// docids database.
let mut iter =
self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data();
while let Some(((prefix, _), _)) = iter.next().transpose()? {
if del_prefix_fst_words.contains(prefix.as_bytes()) {
unsafe { iter.del_current()? };
// We also avoid iterating over the whole `word_prefix_position_docids` database if we know in
// advance that the `if del_prefix_fst_words.contains(prefix.as_bytes()) {` condition below
// will always be false (i.e. if `del_prefix_fst_words` is empty).
if !del_prefix_fst_words.is_empty() {
let mut iter =
self.index.word_prefix_position_docids.iter_mut(self.wtxn)?.lazily_decode_data();
while let Some(((prefix, _), _)) = iter.next().transpose()? {
if del_prefix_fst_words.contains(prefix.as_bytes()) {
unsafe { iter.del_current()? };
}
}
drop(iter);
}
drop(iter);
// We finally write all the word prefix position docids into the LMDB database.
sorter_into_lmdb_database(
self.wtxn,