Try a fix

Add a test
2025-12-11 07:05:43 +00:00 · 2023-08-08 16:52:36 +02:00 · 2023-08-08 16:43:08 +02:00
31 changed files with 219 additions and 649 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -469,7 +469,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"

 [[package]]
 name = "benchmarks"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "anyhow",
 "bytes",
@@ -700,9 +700,9 @@ dependencies = [

 [[package]]
 name = "charabia"
-version = "0.8.3"
+version = "0.8.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "098219a776307414866165a03a9cc68c1578764fe3616fe979e1c280790ddd73"
+checksum = "57aa1b4a8dda126c03ebf2f7e31d16cfc8781c2fe80dedd1a33459efc3e07578"
 dependencies = [
 "aho-corasick",
 "cow-utils",
@@ -1199,7 +1199,7 @@ dependencies = [

 [[package]]
 name = "dump"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "anyhow",
 "big_s",
@@ -1413,7 +1413,7 @@ dependencies = [

 [[package]]
 name = "file-store"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "faux",
 "tempfile",
@@ -1435,12 +1435,11 @@ dependencies = [

 [[package]]
 name = "filter-parser"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "insta",
 "nom",
 "nom_locate",
- "unescaper",
 ]

 [[package]]
@@ -1455,7 +1454,7 @@ dependencies = [

 [[package]]
 name = "flatten-serde-json"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "criterion",
 "serde_json",
@@ -1573,7 +1572,7 @@ dependencies = [

 [[package]]
 name = "fuzzers"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "arbitrary",
 "clap",
@@ -1895,7 +1894,7 @@ dependencies = [

 [[package]]
 name = "index-scheduler"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "anyhow",
 "big_s",
@@ -2082,7 +2081,7 @@ dependencies = [

 [[package]]
 name = "json-depth-checker"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "criterion",
 "serde_json",
@@ -2494,7 +2493,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "meili-snap"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "insta",
 "md5",
@@ -2503,7 +2502,7 @@ dependencies = [

 [[package]]
 name = "meilisearch"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "actix-cors",
 "actix-http",
@@ -2592,7 +2591,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-auth"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "base64 0.21.2",
 "enum-iterator",
@@ -2611,7 +2610,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-types"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "actix-web",
 "anyhow",
@@ -2665,7 +2664,7 @@ dependencies = [

 [[package]]
 name = "milli"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "big_s",
 "bimap",
@@ -2995,7 +2994,7 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"

 [[package]]
 name = "permissive-json-pointer"
-version = "1.3.4"
+version = "1.3.1"
 dependencies = [
 "big_s",
 "serde_json",
@@ -4148,15 +4147,6 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9e79c4d996edb816c91e4308506774452e55e95c3c9de07b6729e17e15a5ef81"

-[[package]]
-name = "unescaper"
-version = "0.1.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a96a44ae11e25afb520af4534fd7b0bd8cd613e35a78def813b8cf41631fa3c8"
-dependencies = [
- "thiserror",
-]
-
 [[package]]
 name = "unicase"
 version = "2.6.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ members = [
 ]

 [workspace.package]
-version = "1.3.4"
+version = "1.3.1"
 authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
 description = "Meilisearch HTTP server"
 homepage = "https://meilisearch.com"
--- a/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-10.snap
+++ b/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-10.snap
@@ -1,24 +0,0 @@
---
-source: dump/src/reader/mod.rs
-expression: spells.settings().unwrap()
---
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [],
-  "sortableAttributes": [],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness"
-  ],
-  "stopWords": [],
-  "synonyms": {},
-  "distinctAttribute": null
-}
--- a/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-4.snap
+++ b/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-4.snap
@@ -1,38 +0,0 @@
---
-source: dump/src/reader/mod.rs
-expression: products.settings().unwrap()
---
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [],
-  "sortableAttributes": [],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness"
-  ],
-  "stopWords": [],
-  "synonyms": {
-    "android": [
-      "phone",
-      "smartphone"
-    ],
-    "iphone": [
-      "phone",
-      "smartphone"
-    ],
-    "phone": [
-      "android",
-      "iphone",
-      "smartphone"
-    ]
-  },
-  "distinctAttribute": null
-}
--- a/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-7.snap
+++ b/dump/src/reader/snapshots/dumpreadertest__import_dump_v1-7.snap
@@ -1,31 +0,0 @@
---
-source: dump/src/reader/mod.rs
-expression: movies.settings().unwrap()
---
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [
-    "genres",
-    "id"
-  ],
-  "sortableAttributes": [
-    "genres",
-    "id"
-  ],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness",
-    "release_date:asc"
-  ],
-  "stopWords": [],
-  "synonyms": {},
-  "distinctAttribute": null
-}
--- a/filter-parser/Cargo.toml
+++ b/filter-parser/Cargo.toml
@@ -14,7 +14,6 @@ license.workspace = true
 [dependencies]
 nom = "7.1.3"
 nom_locate = "4.1.0"
-unescaper = "0.1.2"

 [dev-dependencies]
 insta = "1.29.0"
--- a/filter-parser/src/error.rs
+++ b/filter-parser/src/error.rs
@@ -62,7 +62,6 @@ pub enum ErrorKind<'a> {
    MisusedGeoRadius,
    MisusedGeoBoundingBox,
    InvalidPrimary,
-    InvalidEscapedNumber,
    ExpectedEof,
    ExpectedValue(ExpectedValueKind),
    MalformedValue,
@@ -148,9 +147,6 @@ impl<'a> Display for Error<'a> {
                let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
            }
-            ErrorKind::InvalidEscapedNumber => {
-                writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)?
-            }
            ErrorKind::ExpectedEof => {
                writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
            }
--- a/filter-parser/src/lib.rs
+++ b/filter-parser/src/lib.rs
@@ -545,8 +545,6 @@ impl<'a> std::fmt::Display for Token<'a> {

 #[cfg(test)]
 pub mod tests {
-    use FilterCondition as Fc;
-
    use super::*;

    /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element
@@ -558,22 +556,14 @@ pub mod tests {
        unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into()
    }

-    fn p(s: &str) -> impl std::fmt::Display + '_ {
-        Fc::parse(s).unwrap().unwrap()
-    }
-
-    #[test]
-    fn parse_escaped() {
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
-        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
-        // but it also works with other sequencies
-        insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
-    }
-
    #[test]
    fn parse() {
+        use FilterCondition as Fc;
+
+        fn p(s: &str) -> impl std::fmt::Display + '_ {
+            Fc::parse(s).unwrap().unwrap()
+        }
+
        // Test equal
        insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}");
        insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}");
--- a/filter-parser/src/value.rs
+++ b/filter-parser/src/value.rs
@@ -171,24 +171,7 @@ pub fn parse_value(input: Span) -> IResult<Token> {
        })
    })?;

-    match unescaper::unescape(value.value()) {
-        Ok(content) => {
-            if content.len() != value.value().len() {
-                Ok((input, Token::new(value.original_span(), Some(content))))
-            } else {
-                Ok((input, value))
-            }
-        }
-        Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
-        Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind(
-            value.original_span(),
-            ErrorKind::InvalidEscapedNumber,
-        ))),
-        Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind(
-            value.original_span(),
-            ErrorKind::MalformedValue,
-        ))),
-    }
+    Ok((input, value))
 }

 fn is_value_component(c: char) -> bool {
@@ -335,17 +318,17 @@ pub mod test {
            ("\"cha'nnel\"", "cha'nnel", false),
            ("I'm tamo", "I", false),
            // escaped thing but not quote
-            (r#""\\""#, r#"\"#, true),
-            (r#""\\\\\\""#, r#"\\\"#, true),
-            (r#""aa\\aa""#, r#"aa\aa"#, true),
+            (r#""\\""#, r#"\\"#, false),
+            (r#""\\\\\\""#, r#"\\\\\\"#, false),
+            (r#""aa\\aa""#, r#"aa\\aa"#, false),
            // with double quote
            (r#""Hello \"world\"""#, r#"Hello "world""#, true),
-            (r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
+            (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true),
            (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
            (r#""\"\"""#, r#""""#, true),
            // with simple quote
            (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
-            (r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
+            (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true),
            (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
            (r#"'\'\''"#, r#"''"#, true),
        ];
@@ -367,14 +350,7 @@ pub mod test {
                "Filter `{}` was not supposed to be escaped",
                input
            );
-            assert_eq!(
-                token.value(),
-                expected,
-                "Filter `{}` failed by giving `{}` instead of `{}`.",
-                input,
-                token.value(),
-                expected
-            );
+            assert_eq!(token.value(), expected, "Filter `{}` failed.", input);
        }
    }

--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -67,6 +67,10 @@ pub(crate) enum Batch {
        op: IndexOperation,
        must_create_index: bool,
    },
+    IndexDocumentDeletionByFilter {
+        index_uid: String,
+        task: Task,
+    },
    IndexCreation {
        index_uid: String,
        primary_key: Option<String>,
@@ -110,10 +114,6 @@ pub(crate) enum IndexOperation {
        documents: Vec<Vec<String>>,
        tasks: Vec<Task>,
    },
-    IndexDocumentDeletionByFilter {
-        index_uid: String,
-        task: Task,
-    },
    DocumentClear {
        index_uid: String,
        tasks: Vec<Task>,
@@ -155,6 +155,7 @@ impl Batch {
            | Batch::TaskDeletion(task)
            | Batch::Dump(task)
            | Batch::IndexCreation { task, .. }
+            | Batch::IndexDocumentDeletionByFilter { task, .. }
            | Batch::IndexUpdate { task, .. } => vec![task.uid],
            Batch::SnapshotCreation(tasks) | Batch::IndexDeletion { tasks, .. } => {
                tasks.iter().map(|task| task.uid).collect()
@@ -166,7 +167,6 @@ impl Batch {
                | IndexOperation::DocumentClear { tasks, .. } => {
                    tasks.iter().map(|task| task.uid).collect()
                }
-                IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid],
                IndexOperation::SettingsAndDocumentOperation {
                    document_import_tasks: tasks,
                    settings_tasks: other,
@@ -194,7 +194,8 @@ impl Batch {
            IndexOperation { op, .. } => Some(op.index_uid()),
            IndexCreation { index_uid, .. }
            | IndexUpdate { index_uid, .. }
-            | IndexDeletion { index_uid, .. } => Some(index_uid),
+            | IndexDeletion { index_uid, .. }
+            | IndexDocumentDeletionByFilter { index_uid, .. } => Some(index_uid),
        }
    }
 }
@@ -204,7 +205,6 @@ impl IndexOperation {
        match self {
            IndexOperation::DocumentOperation { index_uid, .. }
            | IndexOperation::DocumentDeletion { index_uid, .. }
-            | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
            | IndexOperation::DocumentClear { index_uid, .. }
            | IndexOperation::Settings { index_uid, .. }
            | IndexOperation::DocumentClearAndSetting { index_uid, .. }
@@ -239,12 +239,9 @@ impl IndexScheduler {
                let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
                match &task.kind {
                    KindWithContent::DocumentDeletionByFilter { index_uid, .. } => {
-                        Ok(Some(Batch::IndexOperation {
-                            op: IndexOperation::IndexDocumentDeletionByFilter {
-                                index_uid: index_uid.clone(),
-                                task,
-                            },
-                            must_create_index: false,
+                        Ok(Some(Batch::IndexDocumentDeletionByFilter {
+                            index_uid: index_uid.clone(),
+                            task,
                        }))
                    }
                    _ => unreachable!(),
@@ -537,9 +534,7 @@ impl IndexScheduler {
        let index_tasks = self.index_tasks(rtxn, index_name)? & enqueued;

        // If autobatching is disabled we only take one task at a time.
-        // Otherwise, we take only a maximum of tasks to create batches.
-        let tasks_limit =
-            if self.autobatching_enabled { self.maximum_number_of_batched_tasks } else { 1 };
+        let tasks_limit = if self.autobatching_enabled { usize::MAX } else { 1 };

        let enqueued = index_tasks
            .into_iter()
@@ -896,6 +891,51 @@ impl IndexScheduler {

                Ok(tasks)
            }
+            Batch::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
+                let (index_uid, filter) =
+                    if let KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } =
+                        &task.kind
+                    {
+                        (index_uid, filter_expr)
+                    } else {
+                        unreachable!()
+                    };
+                let index = {
+                    let rtxn = self.env.read_txn()?;
+                    self.index_mapper.index(&rtxn, index_uid)?
+                };
+                let deleted_documents = delete_document_by_filter(filter, index);
+                let original_filter = if let Some(Details::DocumentDeletionByFilter {
+                    original_filter,
+                    deleted_documents: _,
+                }) = task.details
+                {
+                    original_filter
+                } else {
+                    // In the case of a `documentDeleteByFilter` the details MUST be set
+                    unreachable!();
+                };
+
+                match deleted_documents {
+                    Ok(deleted_documents) => {
+                        task.status = Status::Succeeded;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(deleted_documents),
+                        });
+                    }
+                    Err(e) => {
+                        task.status = Status::Failed;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(0),
+                        });
+                        task.error = Some(e.into());
+                    }
+                }
+
+                Ok(vec![task])
+            }
            Batch::IndexCreation { index_uid, primary_key, task } => {
                let wtxn = self.env.write_txn()?;
                if self.index_mapper.exists(&wtxn, &index_uid)? {
@@ -1252,47 +1292,6 @@ impl IndexScheduler {

                Ok(tasks)
            }
-            IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
-                let filter =
-                    if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
-                        &task.kind
-                    {
-                        filter_expr
-                    } else {
-                        unreachable!()
-                    };
-                let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
-                let original_filter = if let Some(Details::DocumentDeletionByFilter {
-                    original_filter,
-                    deleted_documents: _,
-                }) = task.details
-                {
-                    original_filter
-                } else {
-                    // In the case of a `documentDeleteByFilter` the details MUST be set
-                    unreachable!();
-                };
-
-                match deleted_documents {
-                    Ok(deleted_documents) => {
-                        task.status = Status::Succeeded;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(deleted_documents),
-                        });
-                    }
-                    Err(e) => {
-                        task.status = Status::Failed;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(0),
-                        });
-                        task.error = Some(e.into());
-                    }
-                }
-
-                Ok(vec![task])
-            }
            IndexOperation::Settings { index_uid: _, settings, mut tasks } => {
                let indexer_config = self.index_mapper.indexer_config();
                let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config);
@@ -1492,22 +1491,23 @@ impl IndexScheduler {
    }
 }

-fn delete_document_by_filter<'a>(
-    wtxn: &mut RwTxn<'a, '_>,
-    filter: &serde_json::Value,
-    index: &'a Index,
-) -> Result<u64> {
+fn delete_document_by_filter(filter: &serde_json::Value, index: Index) -> Result<u64> {
    let filter = Filter::from_json(filter)?;
    Ok(if let Some(filter) = filter {
-        let candidates = filter.evaluate(wtxn, index).map_err(|err| match err {
+        let mut wtxn = index.write_txn()?;
+
+        let candidates = filter.evaluate(&wtxn, &index).map_err(|err| match err {
            milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
                Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
            }
            e => e.into(),
        })?;
-        let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
+        let mut delete_operation = DeleteDocuments::new(&mut wtxn, &index)?;
        delete_operation.delete_documents(&candidates);
-        delete_operation.execute().map(|result| result.deleted_documents)?
+        let deleted_documents =
+            delete_operation.execute().map(|result| result.deleted_documents)?;
+        wtxn.commit()?;
+        deleted_documents
    } else {
        0
    })
--- a/index-scheduler/src/insta_snapshot.rs
+++ b/index-scheduler/src/insta_snapshot.rs
@@ -15,7 +15,6 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {

    let IndexScheduler {
        autobatching_enabled,
-        maximum_number_of_batched_tasks: _,
        must_stop_processing: _,
        processing_tasks,
        file_store,
--- a/index-scheduler/src/lib.rs
+++ b/index-scheduler/src/lib.rs
@@ -253,9 +253,6 @@ pub struct IndexSchedulerOptions {
    /// Set to `true` iff the index scheduler is allowed to automatically
    /// batch tasks together, to process multiple tasks at once.
    pub autobatching_enabled: bool,
-    /// If the autobatcher is allowed to automatically batch tasks
-    /// it will only batch this defined number of tasks at once.
-    pub maximum_number_of_batched_tasks: usize,
    /// The maximum number of tasks stored in the task queue before starting
    /// to auto schedule task deletions.
    pub max_number_of_tasks: usize,
@@ -313,9 +310,6 @@ pub struct IndexScheduler {
    /// Whether auto-batching is enabled or not.
    pub(crate) autobatching_enabled: bool,

-    /// The maximum number of tasks that will be batched together.
-    pub(crate) maximum_number_of_batched_tasks: usize,
-
    /// The max number of tasks allowed before the scheduler starts to delete
    /// the finished tasks automatically.
    pub(crate) max_number_of_tasks: usize,
@@ -369,7 +363,6 @@ impl IndexScheduler {
            index_mapper: self.index_mapper.clone(),
            wake_up: self.wake_up.clone(),
            autobatching_enabled: self.autobatching_enabled,
-            maximum_number_of_batched_tasks: self.maximum_number_of_batched_tasks,
            max_number_of_tasks: self.max_number_of_tasks,
            snapshots_path: self.snapshots_path.clone(),
            dumps_path: self.dumps_path.clone(),
@@ -465,7 +458,6 @@ impl IndexScheduler {
            // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things
            wake_up: Arc::new(SignalEvent::auto(true)),
            autobatching_enabled: options.autobatching_enabled,
-            maximum_number_of_batched_tasks: options.maximum_number_of_batched_tasks,
            max_number_of_tasks: options.max_number_of_tasks,
            dumps_path: options.dumps_path,
            snapshots_path: options.snapshots_path,
@@ -798,19 +790,10 @@ impl IndexScheduler {

        let mut res = BTreeMap::new();

-        let processing_tasks = { self.processing_tasks.read().unwrap().processing.len() };
-
        res.insert(
            "statuses".to_string(),
            enum_iterator::all::<Status>()
-                .map(|s| {
-                    let tasks = self.get_status(&rtxn, s)?.len();
-                    match s {
-                        Status::Enqueued => Ok((s.to_string(), tasks - processing_tasks)),
-                        Status::Processing => Ok((s.to_string(), processing_tasks)),
-                        s => Ok((s.to_string(), tasks)),
-                    }
-                })
+                .map(|s| Ok((s.to_string(), self.get_status(&rtxn, s)?.len())))
                .collect::<Result<BTreeMap<String, u64>>>()?,
        );
        res.insert(
@@ -1595,7 +1578,6 @@ mod tests {
                index_count: 5,
                indexer_config,
                autobatching_enabled: true,
-                maximum_number_of_batched_tasks: usize::MAX,
                max_number_of_tasks: 1_000_000,
                instance_features: Default::default(),
            };
@@ -4147,154 +4129,4 @@ mod tests {
        snapshot!(json_string!(tasks, { "[].enqueuedAt" => "[date]", "[].startedAt" => "[date]", "[].finishedAt" => "[date]", ".**.original_filter" => "[filter]", ".**.query" => "[query]" }), name: "everything_has_been_processed");
        drop(rtxn);
    }
-
-    #[test]
-    fn basic_get_stats() {
-        let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
-
-        let kind = index_creation_task("catto", "mouse");
-        let _task = index_scheduler.register(kind).unwrap();
-        let kind = index_creation_task("doggo", "sheep");
-        let _task = index_scheduler.register(kind).unwrap();
-        let kind = index_creation_task("whalo", "fish");
-        let _task = index_scheduler.register(kind).unwrap();
-
-        snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
-        {
-          "indexes": {
-            "catto": 1,
-            "doggo": 1,
-            "whalo": 1
-          },
-          "statuses": {
-            "canceled": 0,
-            "enqueued": 3,
-            "failed": 0,
-            "processing": 0,
-            "succeeded": 0
-          },
-          "types": {
-            "documentAdditionOrUpdate": 0,
-            "documentDeletion": 0,
-            "dumpCreation": 0,
-            "indexCreation": 3,
-            "indexDeletion": 0,
-            "indexSwap": 0,
-            "indexUpdate": 0,
-            "settingsUpdate": 0,
-            "snapshotCreation": 0,
-            "taskCancelation": 0,
-            "taskDeletion": 0
-          }
-        }
-        "###);
-
-        handle.advance_till([Start, BatchCreated]);
-        snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
-        {
-          "indexes": {
-            "catto": 1,
-            "doggo": 1,
-            "whalo": 1
-          },
-          "statuses": {
-            "canceled": 0,
-            "enqueued": 2,
-            "failed": 0,
-            "processing": 1,
-            "succeeded": 0
-          },
-          "types": {
-            "documentAdditionOrUpdate": 0,
-            "documentDeletion": 0,
-            "dumpCreation": 0,
-            "indexCreation": 3,
-            "indexDeletion": 0,
-            "indexSwap": 0,
-            "indexUpdate": 0,
-            "settingsUpdate": 0,
-            "snapshotCreation": 0,
-            "taskCancelation": 0,
-            "taskDeletion": 0
-          }
-        }
-        "###);
-
-        handle.advance_till([
-            InsideProcessBatch,
-            InsideProcessBatch,
-            ProcessBatchSucceeded,
-            AfterProcessing,
-            Start,
-            BatchCreated,
-        ]);
-        snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
-        {
-          "indexes": {
-            "catto": 1,
-            "doggo": 1,
-            "whalo": 1
-          },
-          "statuses": {
-            "canceled": 0,
-            "enqueued": 1,
-            "failed": 0,
-            "processing": 1,
-            "succeeded": 1
-          },
-          "types": {
-            "documentAdditionOrUpdate": 0,
-            "documentDeletion": 0,
-            "dumpCreation": 0,
-            "indexCreation": 3,
-            "indexDeletion": 0,
-            "indexSwap": 0,
-            "indexUpdate": 0,
-            "settingsUpdate": 0,
-            "snapshotCreation": 0,
-            "taskCancelation": 0,
-            "taskDeletion": 0
-          }
-        }
-        "###);
-
-        // now we make one more batch, the started_at field of the new tasks will be past `second_start_time`
-        handle.advance_till([
-            InsideProcessBatch,
-            InsideProcessBatch,
-            ProcessBatchSucceeded,
-            AfterProcessing,
-            Start,
-            BatchCreated,
-        ]);
-        snapshot!(json_string!(index_scheduler.get_stats().unwrap()), @r###"
-        {
-          "indexes": {
-            "catto": 1,
-            "doggo": 1,
-            "whalo": 1
-          },
-          "statuses": {
-            "canceled": 0,
-            "enqueued": 0,
-            "failed": 0,
-            "processing": 1,
-            "succeeded": 2
-          },
-          "types": {
-            "documentAdditionOrUpdate": 0,
-            "documentDeletion": 0,
-            "dumpCreation": 0,
-            "indexCreation": 3,
-            "indexDeletion": 0,
-            "indexSwap": 0,
-            "indexUpdate": 0,
-            "settingsUpdate": 0,
-            "snapshotCreation": 0,
-            "taskCancelation": 0,
-            "taskDeletion": 0
-          }
-        }
-        "###);
-    }
 }
--- a/meili-snap/src/lib.rs
+++ b/meili-snap/src/lib.rs
@@ -167,9 +167,7 @@ macro_rules! snapshot {
        let (settings, snap_name, _) = $crate::default_snapshot_settings_for_test(test_name, Some(&snap_name));
        settings.bind(|| {
            let snap = format!("{}", $value);
-            insta::allow_duplicates! {
-                meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
-            }
+            meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
        });
    };
    ($value:expr, @$inline:literal) => {
@@ -178,9 +176,7 @@ macro_rules! snapshot {
        let (settings, _, _) = $crate::default_snapshot_settings_for_test("", Some("_dummy_argument"));
        settings.bind(|| {
            let snap = format!("{}", $value);
-            insta::allow_duplicates! {
-                meili_snap::insta::assert_snapshot!(snap, @$inline);
-            }
+            meili_snap::insta::assert_snapshot!(snap, @$inline);
        });
    };
    ($value:expr) => {
@@ -198,9 +194,7 @@ macro_rules! snapshot {
        let (settings, snap_name, _) = $crate::default_snapshot_settings_for_test(test_name, None);
        settings.bind(|| {
            let snap = format!("{}", $value);
-            insta::allow_duplicates! {
-                meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
-            }
+            meili_snap::insta::assert_snapshot!(format!("{}", snap_name), snap);
        });
    };
 }
--- a/meilisearch/src/analytics/segment_analytics.rs
+++ b/meilisearch/src/analytics/segment_analytics.rs
@@ -285,7 +285,6 @@ impl From<Opt> for Infos {
            db_path,
            experimental_enable_metrics,
            experimental_reduce_indexing_memory_usage,
-            experimental_limit_batched_tasks: _,
            http_addr,
            master_key: _,
            env,
--- a/meilisearch/src/lib.rs
+++ b/meilisearch/src/lib.rs
@@ -236,7 +236,6 @@ fn open_or_create_database_unchecked(
            enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
            indexer_config: (&opt.indexer_options).try_into()?,
            autobatching_enabled: true,
-            maximum_number_of_batched_tasks: opt.experimental_limit_batched_tasks,
            max_number_of_tasks: 1_000_000,
            index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
            index_count: DEFAULT_INDEX_COUNT,
--- a/meilisearch/src/option.rs
+++ b/meilisearch/src/option.rs
@@ -51,7 +51,6 @@ const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL";
 const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
 const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
    "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
-const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS: &str = "MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS";

 const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
 const DEFAULT_DB_PATH: &str = "./data.ms";
@@ -302,11 +301,6 @@ pub struct Opt {
    #[serde(default)]
    pub experimental_reduce_indexing_memory_usage: bool,

-    /// Experimental limit to the number of tasks per batch
-    #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS, default_value_t = default_limit_batched_tasks())]
-    #[serde(default = "default_limit_batched_tasks")]
-    pub experimental_limit_batched_tasks: usize,
-
    #[serde(flatten)]
    #[clap(flatten)]
    pub indexer_options: IndexerOpts,
@@ -399,8 +393,7 @@ impl Opt {
            #[cfg(all(not(debug_assertions), feature = "analytics"))]
            no_analytics,
            experimental_enable_metrics: enable_metrics_route,
-            experimental_reduce_indexing_memory_usage,
-            experimental_limit_batched_tasks,
+            experimental_reduce_indexing_memory_usage: reduce_indexing_memory_usage,
        } = self;
        export_to_env_if_not_present(MEILI_DB_PATH, db_path);
        export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr);
@@ -444,11 +437,7 @@ impl Opt {
        );
        export_to_env_if_not_present(
            MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE,
-            experimental_reduce_indexing_memory_usage.to_string(),
-        );
-        export_to_env_if_not_present(
-            MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS,
-            experimental_limit_batched_tasks.to_string(),
+            reduce_indexing_memory_usage.to_string(),
        );
        indexer_options.export_to_env();
    }
@@ -750,10 +739,6 @@ fn default_dump_dir() -> PathBuf {
    PathBuf::from(DEFAULT_DUMP_DIR)
 }

-fn default_limit_batched_tasks() -> usize {
-    usize::MAX
-}
-
 /// Indicates if a snapshot was scheduled, and if yes with which interval.
 #[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)]
 pub enum ScheduleSnapshot {
--- a/meilisearch/src/search.rs
+++ b/meilisearch/src/search.rs
@@ -666,7 +666,6 @@ fn compute_semantic_score(query: &[f32], vectors: Value) -> milli::Result<Option
        .map_err(InternalError::SerdeJson)?;
    Ok(vectors
        .into_iter()
-        .flatten()
        .map(|v| OrderedFloat(dot_product_similarity(query, &v)))
        .max()
        .map(OrderedFloat::into_inner))
--- a/meilisearch/tests/documents/delete_documents.rs
+++ b/meilisearch/tests/documents/delete_documents.rs
@@ -154,19 +154,6 @@ async fn delete_document_by_filter() {
        )
        .await;
    index.wait_task(1).await;
-
-    let (stats, _) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
-    {
-      "numberOfDocuments": 4,
-      "isIndexing": false,
-      "fieldDistribution": {
-        "color": 3,
-        "id": 4
-      }
-    }
-    "###);
-
    let (response, code) =
        index.delete_document_by_filter(json!({ "filter": "color = blue"})).await;
    snapshot!(code, @"202 Accepted");
@@ -201,18 +188,6 @@ async fn delete_document_by_filter() {
    }
    "###);

-    let (stats, _) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
-    {
-      "numberOfDocuments": 2,
-      "isIndexing": false,
-      "fieldDistribution": {
-        "color": 1,
-        "id": 2
-      }
-    }
-    "###);
-
    let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
    snapshot!(code, @"200 OK");
    snapshot!(json_string!(documents), @r###"
@@ -266,18 +241,6 @@ async fn delete_document_by_filter() {
    }
    "###);

-    let (stats, _) = index.stats().await;
-    snapshot!(json_string!(stats), @r###"
-    {
-      "numberOfDocuments": 1,
-      "isIndexing": false,
-      "fieldDistribution": {
-        "color": 1,
-        "id": 1
-      }
-    }
-    "###);
-
    let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
    snapshot!(code, @"200 OK");
    snapshot!(json_string!(documents), @r###"
--- a/meilisearch/tests/search/geo.rs
+++ b/meilisearch/tests/search/geo.rs
@@ -1,4 +1,3 @@
-use meili_snap::{json_string, snapshot};
 use once_cell::sync::Lazy;
 use serde_json::{json, Value};

@@ -61,59 +60,3 @@ async fn geo_sort_with_geo_strings() {
        )
        .await;
 }
-
-#[actix_rt::test]
-async fn geo_bounding_box_with_string_and_number() {
-    let server = Server::new().await;
-    let index = server.index("test");
-
-    let documents = DOCUMENTS.clone();
-    index.update_settings_filterable_attributes(json!(["_geo"])).await;
-    index.update_settings_sortable_attributes(json!(["_geo"])).await;
-    index.add_documents(documents, None).await;
-    index.wait_task(2).await;
-
-    index
-        .search(
-            json!({
-                "filter": "_geoBoundingBox([89, 179], [-89, -179])",
-            }),
-            |response, code| {
-                assert_eq!(code, 200, "{}", response);
-                snapshot!(json_string!(response, { ".processingTimeMs" => "[time]" }), @r###"
-                {
-                  "hits": [
-                    {
-                      "id": 1,
-                      "name": "Taco Truck",
-                      "address": "444 Salsa Street, Burritoville",
-                      "type": "Mexican",
-                      "rating": 9,
-                      "_geo": {
-                        "lat": 34.0522,
-                        "lng": -118.2437
-                      }
-                    },
-                    {
-                      "id": 2,
-                      "name": "La Bella Italia",
-                      "address": "456 Elm Street, Townsville",
-                      "type": "Italian",
-                      "rating": 9,
-                      "_geo": {
-                        "lat": "45.4777599",
-                        "lng": "9.1967508"
-                      }
-                    }
-                  ],
-                  "query": "",
-                  "processingTimeMs": "[time]",
-                  "limit": 20,
-                  "offset": 0,
-                  "estimatedTotalHits": 2
-                }
-                "###);
-            },
-        )
-        .await;
-}
--- a/meilisearch/tests/search/mod.rs
+++ b/meilisearch/tests/search/mod.rs
@@ -1104,3 +1104,59 @@ async fn camelcased_words() {
        })
        .await;
 }
+
+#[actix_rt::test]
+async fn simple_search_with_strange_synonyms() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.update_settings(json!({ "synonyms": {"&": ["to"], "to": ["&"]} })).await;
+    let r = index.wait_task(0).await;
+    meili_snap::snapshot!(r["status"], @r###""succeeded""###);
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, None).await;
+    index.wait_task(1).await;
+
+    index
+        .search(json!({"q": "How to train"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "How & train"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "to"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+}
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -17,7 +17,7 @@ bincode = "1.3.3"
 bstr = "1.4.0"
 bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
 byteorder = "1.4.3"
-charabia = { version = "0.8.3", default-features = false }
+charabia = { version = "0.8.2", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.8"
 deserr = "0.5.0"
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1718,11 +1718,11 @@ pub(crate) mod tests {
            .unwrap();
        index
            .add_documents(documents!([
-                { "id": 0, "_geo": { "lat": "0", "lng": "0" } },
-                { "id": 1, "_geo": { "lat": 0, "lng": "-175" } },
-                { "id": 2, "_geo": { "lat": "0", "lng": 175 } },
+                { "id": 0, "_geo": { "lat": 0, "lng": 0 } },
+                { "id": 1, "_geo": { "lat": 0, "lng": -175 } },
+                { "id": 2, "_geo": { "lat": 0, "lng": 175 } },
                { "id": 3, "_geo": { "lat": 85, "lng": 0 } },
-                { "id": 4, "_geo": { "lat": "-85", "lng": "0" } },
+                { "id": 4, "_geo": { "lat": -85, "lng": 0 } },
            ]))
            .unwrap();

--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -97,7 +97,7 @@ const MAX_LMDB_KEY_LENGTH: usize = 500;
 ///
 /// This number is determined by the keys of the different facet databases
 /// and adding a margin of safety.
-pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 32;
+pub const MAX_FACET_VALUE_LENGTH: usize = MAX_LMDB_KEY_LENGTH - 20;

 /// The maximum length a word can be
 pub const MAX_WORD_LENGTH: usize = MAX_LMDB_KEY_LENGTH / 2;
@@ -293,15 +293,15 @@ pub fn normalize_facet(original: &str) -> String {
 #[derive(serde::Serialize, serde::Deserialize, Debug)]
 #[serde(transparent)]
 pub struct VectorOrArrayOfVectors {
-    #[serde(with = "either::serde_untagged_optional")]
-    inner: Option<either::Either<Vec<f32>, Vec<Vec<f32>>>>,
+    #[serde(with = "either::serde_untagged")]
+    inner: either::Either<Vec<f32>, Vec<Vec<f32>>>,
 }

 impl VectorOrArrayOfVectors {
-    pub fn into_array_of_vectors(self) -> Option<Vec<Vec<f32>>> {
-        match self.inner? {
-            either::Either::Left(vector) => Some(vec![vector]),
-            either::Either::Right(vectors) => Some(vectors),
+    pub fn into_array_of_vectors(self) -> Vec<Vec<f32>> {
+        match self.inner {
+            either::Either::Left(vector) => vec![vector],
+            either::Either::Right(vectors) => vectors,
        }
    }
 }
--- a/milli/src/search/new/bucket_sort.rs
+++ b/milli/src/search/new/bucket_sort.rs
@@ -91,12 +91,11 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
    /// Update the universes accordingly and inform the logger.
    macro_rules! back {
        () => {
-            // FIXME: temporarily disabled assert: see <https://github.com/meilisearch/meilisearch/pull/4013>
-            // assert!(
-            //     ranking_rule_universes[cur_ranking_rule_index].is_empty(),
-            //     "The ranking rule {} did not sort its bucket exhaustively",
-            //     ranking_rules[cur_ranking_rule_index].id()
-            // );
+            assert!(
+                ranking_rule_universes[cur_ranking_rule_index].is_empty(),
+                "The ranking rule {} did not sort its bucket exhaustively",
+                ranking_rules[cur_ranking_rule_index].id()
+            );
            logger.end_iteration_ranking_rule(
                cur_ranking_rule_index,
                ranking_rules[cur_ranking_rule_index].as_ref(),
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -94,7 +94,7 @@ use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValu
 use crate::heed_codec::ByteSliceRefCodec;
 use crate::update::index_documents::create_sorter;
 use crate::update::merge_btreeset_string;
-use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};
+use crate::{BEU16StrCodec, Index, Result, BEU16};

 pub mod bulk;
 pub mod delete;
@@ -191,16 +191,7 @@ impl<'i> FacetsUpdate<'i> {
        for result in database.iter(wtxn)? {
            let (facet_group_key, ()) = result?;
            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
-                let mut normalized_facet = left_bound.normalize(&options);
-                let normalized_truncated_facet: String;
-                if normalized_facet.len() > MAX_FACET_VALUE_LENGTH {
-                    normalized_truncated_facet = normalized_facet
-                        .char_indices()
-                        .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
-                        .map(|(_, c)| c)
-                        .collect();
-                    normalized_facet = normalized_truncated_facet.into();
-                }
+                let normalized_facet = left_bound.normalize(&options);
                let set = BTreeSet::from_iter(std::iter::once(left_bound));
                let key = (field_id, normalized_facet.as_ref());
                let key = BEU16StrCodec::bytes_encode(&key).ok_or(heed::Error::Encoding)?;
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -44,7 +44,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
        if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
            normalised_truncated_value = normalised_value
                .char_indices()
-                .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
+                .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
                .map(|(_, c)| c)
                .collect();
            normalised_value = normalised_truncated_value.as_str();
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -28,13 +28,11 @@ pub struct ExtractedFacetValues {
 ///
 /// Returns the generated grenad reader containing the docid the fid and the orginal value as key
 /// and the normalized value as value extracted from the given chunk of documents.
-/// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
 #[logging_timer::time]
 pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
    obkv_documents: grenad::Reader<R>,
    indexer: GrenadParameters,
    faceted_fields: &HashSet<FieldId>,
-    geo_fields_ids: Option<(FieldId, FieldId)>,
 ) -> Result<ExtractedFacetValues> {
    let max_memory = indexer.max_memory_by_thread();

@@ -84,10 +82,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(

                let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;

-                match extract_facet_values(
-                    &value,
-                    geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
-                ) {
+                match extract_facet_values(&value) {
                    FilterableValues::Null => {
                        facet_is_null_docids.entry(field_id).or_default().insert(document);
                    }
@@ -180,13 +175,12 @@ enum FilterableValues {
    Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
 }

-fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
+fn extract_facet_values(value: &Value) -> FilterableValues {
    fn inner_extract_facet_values(
        value: &Value,
        can_recurse: bool,
        output_numbers: &mut Vec<f64>,
        output_strings: &mut Vec<(String, String)>,
-        geo_field: bool,
    ) {
        match value {
            Value::Null => (),
@@ -197,30 +191,13 @@ fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
                }
            }
            Value::String(original) => {
-                // if we're working on a geofield it MUST be something we can parse or else there was an internal error
-                // in the enrich pipeline. But since the enrich pipeline worked, we want to avoid crashing at all costs.
-                if geo_field {
-                    if let Ok(float) = original.parse() {
-                        output_numbers.push(float);
-                    } else {
-                        log::warn!(
-                            "Internal error, could not parse a geofield that has been validated. Please open an issue."
-                        )
-                    }
-                }
                let normalized = crate::normalize_facet(original);
                output_strings.push((normalized, original.clone()));
            }
            Value::Array(values) => {
                if can_recurse {
                    for value in values {
-                        inner_extract_facet_values(
-                            value,
-                            false,
-                            output_numbers,
-                            output_strings,
-                            geo_field,
-                        );
+                        inner_extract_facet_values(value, false, output_numbers, output_strings);
                    }
                }
            }
@@ -236,7 +213,7 @@ fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
        otherwise => {
            let mut numbers = Vec::new();
            let mut strings = Vec::new();
-            inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings, geo_field);
+            inner_extract_facet_values(otherwise, true, &mut numbers, &mut strings);
            FilterableValues::Values { numbers, strings }
        }
    }
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -33,7 +33,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
        // lazily get it when needed
        let document_id = || -> Value {
            let document_id = obkv.get(primary_key_id).unwrap();
-            from_slice(document_id).unwrap()
+            serde_json::from_slice(document_id).unwrap()
        };

        // first we retrieve the _vectors field
@@ -50,14 +50,12 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
                }
            };

-            if let Some(vectors) = vectors {
-                for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
-                    let index = u16::try_from(i).unwrap();
-                    let mut key = docid_bytes.to_vec();
-                    key.extend_from_slice(&index.to_be_bytes());
-                    let bytes = cast_slice(&vector);
-                    writer.insert(key, bytes)?;
-                }
+            for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
+                let index = u16::try_from(i).unwrap();
+                let mut key = docid_bytes.to_vec();
+                key.extend_from_slice(&index.to_be_bytes());
+                let bytes = cast_slice(&vector);
+                writer.insert(key, bytes)?;
            }
        }
        // else => the `_vectors` object was `null`, there is nothing to do
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -55,13 +55,7 @@ pub(crate) fn data_from_obkv_documents(
    original_obkv_chunks
        .par_bridge()
        .map(|original_documents_chunk| {
-            send_original_documents_data(
-                original_documents_chunk,
-                indexer,
-                lmdb_writer_sx.clone(),
-                vectors_field_id,
-                primary_key_id,
-            )
+            send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
        })
        .collect::<Result<()>>()?;

@@ -78,6 +72,7 @@ pub(crate) fn data_from_obkv_documents(
                    &faceted_fields,
                    primary_key_id,
                    geo_fields_ids,
+                    vectors_field_id,
                    &stop_words,
                    max_positions_per_attributes,
                )
@@ -262,33 +257,11 @@ fn spawn_extraction_task<FE, FS, M>(
 /// - documents
 fn send_original_documents_data(
    original_documents_chunk: Result<grenad::Reader<File>>,
-    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
-    vectors_field_id: Option<FieldId>,
-    primary_key_id: FieldId,
 ) -> Result<()> {
    let original_documents_chunk =
        original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;

-    if let Some(vectors_field_id) = vectors_field_id {
-        let documents_chunk_cloned = original_documents_chunk.clone();
-        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
-        rayon::spawn(move || {
-            let result = extract_vector_points(
-                documents_chunk_cloned,
-                indexer,
-                primary_key_id,
-                vectors_field_id,
-            );
-            let _ = match result {
-                Ok(vector_points) => {
-                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
-                }
-                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
-            };
-        });
-    }
-
    // TODO: create a custom internal error
    lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
    Ok(())
@@ -310,6 +283,7 @@ fn send_and_extract_flattened_documents_data(
    faceted_fields: &HashSet<FieldId>,
    primary_key_id: FieldId,
    geo_fields_ids: Option<(FieldId, FieldId)>,
+    vectors_field_id: Option<FieldId>,
    stop_words: &Option<fst::Set<&[u8]>>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<(
@@ -338,6 +312,25 @@ fn send_and_extract_flattened_documents_data(
        });
    }

+    if let Some(vectors_field_id) = vectors_field_id {
+        let documents_chunk_cloned = flattened_documents_chunk.clone();
+        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
+        rayon::spawn(move || {
+            let result = extract_vector_points(
+                documents_chunk_cloned,
+                indexer,
+                primary_key_id,
+                vectors_field_id,
+            );
+            let _ = match result {
+                Ok(vector_points) => {
+                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
+                }
+                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
+            };
+        });
+    }
+
    let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
        rayon::join(
            || {
@@ -373,7 +366,6 @@ fn send_and_extract_flattened_documents_data(
                    flattened_documents_chunk.clone(),
                    indexer,
                    faceted_fields,
-                    geo_fields_ids,
                )?;

                // send docid_fid_facet_numbers_chunk to DB writer
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2519,25 +2519,6 @@ mod tests {
        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
    }

-    /// Index multiple different number of vectors in documents.
-    /// Vectors must be of the same length.
-    #[test]
-    fn test_multiple_vectors() {
-        let index = TempIndex::new();
-
-        index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap();
-        index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap();
-        index
-            .add_documents(
-                documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]),
-            )
-            .unwrap();
-
-        let rtxn = index.read_txn().unwrap();
-        let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap();
-        assert_eq!(res.documents_ids.len(), 3);
-    }
-
    #[test]
    fn reproduce_the_bug() {
        /*
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -477,13 +477,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                for (word, synonyms) in synonyms {
                    // Normalize both the word and associated synonyms.
                    let normalized_word = normalize(&tokenizer, word);
-                    let normalized_synonyms =
-                        synonyms.iter().map(|synonym| normalize(&tokenizer, synonym));
+                    let normalized_synonyms: Vec<_> = synonyms
+                        .iter()
+                        .map(|synonym| normalize(&tokenizer, synonym))
+                        .filter(|synonym| !synonym.is_empty())
+                        .collect();

                    // Store the normalized synonyms under the normalized word,
                    // merging the possible duplicate words.
-                    let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
-                    entry.extend(normalized_synonyms);
+                    if !normalized_word.is_empty() && !normalized_synonyms.is_empty() {
+                        let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
+                        entry.extend(normalized_synonyms.into_iter());
+                    }
                }

                // Make sure that we don't have duplicate synonyms.
Author	SHA1	Message	Date
ManyTheFish	8dc5acf998	Try fix	2023-08-08 16:52:36 +02:00
ManyTheFish	fc2590fc9d	Add a test	2023-08-08 16:43:08 +02:00