Fix the tests

update the description of the cli argument
Expose a new flag to limit the number of batched tasks
2025-12-02 02:35:36 +00:00 · 2023-09-15 12:01:29 +02:00 · 2023-09-15 12:01:29 +02:00 · 2023-09-15 12:01:28 +02:00 · 2023-09-12 13:34:39 +00:00 · 2023-09-12 11:26:48 +02:00
20 changed files with 281 additions and 138 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -469,7 +469,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"

 [[package]]
 name = "benchmarks"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "anyhow",
 "bytes",
@@ -700,9 +700,9 @@ dependencies = [

 [[package]]
 name = "charabia"
-version = "0.8.2"
+version = "0.8.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "57aa1b4a8dda126c03ebf2f7e31d16cfc8781c2fe80dedd1a33459efc3e07578"
+checksum = "098219a776307414866165a03a9cc68c1578764fe3616fe979e1c280790ddd73"
 dependencies = [
 "aho-corasick",
 "cow-utils",
@@ -1199,7 +1199,7 @@ dependencies = [

 [[package]]
 name = "dump"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "anyhow",
 "big_s",
@@ -1413,7 +1413,7 @@ dependencies = [

 [[package]]
 name = "file-store"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "faux",
 "tempfile",
@@ -1435,11 +1435,12 @@ dependencies = [

 [[package]]
 name = "filter-parser"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "insta",
 "nom",
 "nom_locate",
+ "unescaper",
 ]

 [[package]]
@@ -1454,7 +1455,7 @@ dependencies = [

 [[package]]
 name = "flatten-serde-json"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "criterion",
 "serde_json",
@@ -1572,7 +1573,7 @@ dependencies = [

 [[package]]
 name = "fuzzers"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "arbitrary",
 "clap",
@@ -1894,7 +1895,7 @@ dependencies = [

 [[package]]
 name = "index-scheduler"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "anyhow",
 "big_s",
@@ -2081,7 +2082,7 @@ dependencies = [

 [[package]]
 name = "json-depth-checker"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "criterion",
 "serde_json",
@@ -2493,7 +2494,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "meili-snap"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "insta",
 "md5",
@@ -2502,7 +2503,7 @@ dependencies = [

 [[package]]
 name = "meilisearch"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "actix-cors",
 "actix-http",
@@ -2591,7 +2592,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-auth"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "base64 0.21.2",
 "enum-iterator",
@@ -2610,7 +2611,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-types"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "actix-web",
 "anyhow",
@@ -2664,7 +2665,7 @@ dependencies = [

 [[package]]
 name = "milli"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "big_s",
 "bimap",
@@ -2994,7 +2995,7 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"

 [[package]]
 name = "permissive-json-pointer"
-version = "1.3.1"
+version = "1.3.4"
 dependencies = [
 "big_s",
 "serde_json",
@@ -4147,6 +4148,15 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9e79c4d996edb816c91e4308506774452e55e95c3c9de07b6729e17e15a5ef81"

+[[package]]
+name = "unescaper"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a96a44ae11e25afb520af4534fd7b0bd8cd613e35a78def813b8cf41631fa3c8"
+dependencies = [
+ "thiserror",
+]
+
 [[package]]
 name = "unicase"
 version = "2.6.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ members = [
 ]

 [workspace.package]
-version = "1.3.1"
+version = "1.3.4"
 authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
 description = "Meilisearch HTTP server"
 homepage = "https://meilisearch.com"
--- a/filter-parser/Cargo.toml
+++ b/filter-parser/Cargo.toml
@@ -14,6 +14,7 @@ license.workspace = true
 [dependencies]
 nom = "7.1.3"
 nom_locate = "4.1.0"
+unescaper = "0.1.2"

 [dev-dependencies]
 insta = "1.29.0"
--- a/filter-parser/src/error.rs
+++ b/filter-parser/src/error.rs
@@ -62,6 +62,7 @@ pub enum ErrorKind<'a> {
    MisusedGeoRadius,
    MisusedGeoBoundingBox,
    InvalidPrimary,
+    InvalidEscapedNumber,
    ExpectedEof,
    ExpectedValue(ExpectedValueKind),
    MalformedValue,
@@ -147,6 +148,9 @@ impl<'a> Display for Error<'a> {
                let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
            }
+            ErrorKind::InvalidEscapedNumber => {
+                writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)?
+            }
            ErrorKind::ExpectedEof => {
                writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
            }
--- a/filter-parser/src/lib.rs
+++ b/filter-parser/src/lib.rs
@@ -545,6 +545,8 @@ impl<'a> std::fmt::Display for Token<'a> {

 #[cfg(test)]
 pub mod tests {
+    use FilterCondition as Fc;
+
    use super::*;

    /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element
@@ -556,14 +558,22 @@ pub mod tests {
        unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into()
    }

+    fn p(s: &str) -> impl std::fmt::Display + '_ {
+        Fc::parse(s).unwrap().unwrap()
+    }
+
+    #[test]
+    fn parse_escaped() {
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
+        // but it also works with other sequencies
+        insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
+    }
+
    #[test]
    fn parse() {
-        use FilterCondition as Fc;
-
-        fn p(s: &str) -> impl std::fmt::Display + '_ {
-            Fc::parse(s).unwrap().unwrap()
-        }
-
        // Test equal
        insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}");
        insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}");
--- a/filter-parser/src/value.rs
+++ b/filter-parser/src/value.rs
@@ -171,7 +171,24 @@ pub fn parse_value(input: Span) -> IResult<Token> {
        })
    })?;

-    Ok((input, value))
+    match unescaper::unescape(value.value()) {
+        Ok(content) => {
+            if content.len() != value.value().len() {
+                Ok((input, Token::new(value.original_span(), Some(content))))
+            } else {
+                Ok((input, value))
+            }
+        }
+        Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
+        Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::InvalidEscapedNumber,
+        ))),
+        Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::MalformedValue,
+        ))),
+    }
 }

 fn is_value_component(c: char) -> bool {
@@ -318,17 +335,17 @@ pub mod test {
            ("\"cha'nnel\"", "cha'nnel", false),
            ("I'm tamo", "I", false),
            // escaped thing but not quote
-            (r#""\\""#, r#"\\"#, false),
-            (r#""\\\\\\""#, r#"\\\\\\"#, false),
-            (r#""aa\\aa""#, r#"aa\\aa"#, false),
+            (r#""\\""#, r#"\"#, true),
+            (r#""\\\\\\""#, r#"\\\"#, true),
+            (r#""aa\\aa""#, r#"aa\aa"#, true),
            // with double quote
            (r#""Hello \"world\"""#, r#"Hello "world""#, true),
-            (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true),
+            (r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
            (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
            (r#""\"\"""#, r#""""#, true),
            // with simple quote
            (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
-            (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true),
+            (r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
            (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
            (r#"'\'\''"#, r#"''"#, true),
        ];
@@ -350,7 +367,14 @@ pub mod test {
                "Filter `{}` was not supposed to be escaped",
                input
            );
-            assert_eq!(token.value(), expected, "Filter `{}` failed.", input);
+            assert_eq!(
+                token.value(),
+                expected,
+                "Filter `{}` failed by giving `{}` instead of `{}`.",
+                input,
+                token.value(),
+                expected
+            );
        }
    }

--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -67,10 +67,6 @@ pub(crate) enum Batch {
        op: IndexOperation,
        must_create_index: bool,
    },
-    IndexDocumentDeletionByFilter {
-        index_uid: String,
-        task: Task,
-    },
    IndexCreation {
        index_uid: String,
        primary_key: Option<String>,
@@ -114,6 +110,10 @@ pub(crate) enum IndexOperation {
        documents: Vec<Vec<String>>,
        tasks: Vec<Task>,
    },
+    IndexDocumentDeletionByFilter {
+        index_uid: String,
+        task: Task,
+    },
    DocumentClear {
        index_uid: String,
        tasks: Vec<Task>,
@@ -155,7 +155,6 @@ impl Batch {
            | Batch::TaskDeletion(task)
            | Batch::Dump(task)
            | Batch::IndexCreation { task, .. }
-            | Batch::IndexDocumentDeletionByFilter { task, .. }
            | Batch::IndexUpdate { task, .. } => vec![task.uid],
            Batch::SnapshotCreation(tasks) | Batch::IndexDeletion { tasks, .. } => {
                tasks.iter().map(|task| task.uid).collect()
@@ -167,6 +166,7 @@ impl Batch {
                | IndexOperation::DocumentClear { tasks, .. } => {
                    tasks.iter().map(|task| task.uid).collect()
                }
+                IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid],
                IndexOperation::SettingsAndDocumentOperation {
                    document_import_tasks: tasks,
                    settings_tasks: other,
@@ -194,8 +194,7 @@ impl Batch {
            IndexOperation { op, .. } => Some(op.index_uid()),
            IndexCreation { index_uid, .. }
            | IndexUpdate { index_uid, .. }
-            | IndexDeletion { index_uid, .. }
-            | IndexDocumentDeletionByFilter { index_uid, .. } => Some(index_uid),
+            | IndexDeletion { index_uid, .. } => Some(index_uid),
        }
    }
 }
@@ -205,6 +204,7 @@ impl IndexOperation {
        match self {
            IndexOperation::DocumentOperation { index_uid, .. }
            | IndexOperation::DocumentDeletion { index_uid, .. }
+            | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
            | IndexOperation::DocumentClear { index_uid, .. }
            | IndexOperation::Settings { index_uid, .. }
            | IndexOperation::DocumentClearAndSetting { index_uid, .. }
@@ -239,9 +239,12 @@ impl IndexScheduler {
                let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
                match &task.kind {
                    KindWithContent::DocumentDeletionByFilter { index_uid, .. } => {
-                        Ok(Some(Batch::IndexDocumentDeletionByFilter {
-                            index_uid: index_uid.clone(),
-                            task,
+                        Ok(Some(Batch::IndexOperation {
+                            op: IndexOperation::IndexDocumentDeletionByFilter {
+                                index_uid: index_uid.clone(),
+                                task,
+                            },
+                            must_create_index: false,
                        }))
                    }
                    _ => unreachable!(),
@@ -534,7 +537,9 @@ impl IndexScheduler {
        let index_tasks = self.index_tasks(rtxn, index_name)? & enqueued;

        // If autobatching is disabled we only take one task at a time.
-        let tasks_limit = if self.autobatching_enabled { usize::MAX } else { 1 };
+        // Otherwise, we take only a maximum of tasks to create batches.
+        let tasks_limit =
+            if self.autobatching_enabled { self.maximum_number_of_batched_tasks } else { 1 };

        let enqueued = index_tasks
            .into_iter()
@@ -891,51 +896,6 @@ impl IndexScheduler {

                Ok(tasks)
            }
-            Batch::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
-                let (index_uid, filter) =
-                    if let KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } =
-                        &task.kind
-                    {
-                        (index_uid, filter_expr)
-                    } else {
-                        unreachable!()
-                    };
-                let index = {
-                    let rtxn = self.env.read_txn()?;
-                    self.index_mapper.index(&rtxn, index_uid)?
-                };
-                let deleted_documents = delete_document_by_filter(filter, index);
-                let original_filter = if let Some(Details::DocumentDeletionByFilter {
-                    original_filter,
-                    deleted_documents: _,
-                }) = task.details
-                {
-                    original_filter
-                } else {
-                    // In the case of a `documentDeleteByFilter` the details MUST be set
-                    unreachable!();
-                };
-
-                match deleted_documents {
-                    Ok(deleted_documents) => {
-                        task.status = Status::Succeeded;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(deleted_documents),
-                        });
-                    }
-                    Err(e) => {
-                        task.status = Status::Failed;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(0),
-                        });
-                        task.error = Some(e.into());
-                    }
-                }
-
-                Ok(vec![task])
-            }
            Batch::IndexCreation { index_uid, primary_key, task } => {
                let wtxn = self.env.write_txn()?;
                if self.index_mapper.exists(&wtxn, &index_uid)? {
@@ -1292,6 +1252,47 @@ impl IndexScheduler {

                Ok(tasks)
            }
+            IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
+                let filter =
+                    if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
+                        &task.kind
+                    {
+                        filter_expr
+                    } else {
+                        unreachable!()
+                    };
+                let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
+                let original_filter = if let Some(Details::DocumentDeletionByFilter {
+                    original_filter,
+                    deleted_documents: _,
+                }) = task.details
+                {
+                    original_filter
+                } else {
+                    // In the case of a `documentDeleteByFilter` the details MUST be set
+                    unreachable!();
+                };
+
+                match deleted_documents {
+                    Ok(deleted_documents) => {
+                        task.status = Status::Succeeded;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(deleted_documents),
+                        });
+                    }
+                    Err(e) => {
+                        task.status = Status::Failed;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(0),
+                        });
+                        task.error = Some(e.into());
+                    }
+                }
+
+                Ok(vec![task])
+            }
            IndexOperation::Settings { index_uid: _, settings, mut tasks } => {
                let indexer_config = self.index_mapper.indexer_config();
                let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config);
@@ -1491,23 +1492,22 @@ impl IndexScheduler {
    }
 }

-fn delete_document_by_filter(filter: &serde_json::Value, index: Index) -> Result<u64> {
+fn delete_document_by_filter<'a>(
+    wtxn: &mut RwTxn<'a, '_>,
+    filter: &serde_json::Value,
+    index: &'a Index,
+) -> Result<u64> {
    let filter = Filter::from_json(filter)?;
    Ok(if let Some(filter) = filter {
-        let mut wtxn = index.write_txn()?;
-
-        let candidates = filter.evaluate(&wtxn, &index).map_err(|err| match err {
+        let candidates = filter.evaluate(wtxn, index).map_err(|err| match err {
            milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
                Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
            }
            e => e.into(),
        })?;
-        let mut delete_operation = DeleteDocuments::new(&mut wtxn, &index)?;
+        let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
        delete_operation.delete_documents(&candidates);
-        let deleted_documents =
-            delete_operation.execute().map(|result| result.deleted_documents)?;
-        wtxn.commit()?;
-        deleted_documents
+        delete_operation.execute().map(|result| result.deleted_documents)?
    } else {
        0
    })
--- a/index-scheduler/src/insta_snapshot.rs
+++ b/index-scheduler/src/insta_snapshot.rs
@@ -15,6 +15,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {

    let IndexScheduler {
        autobatching_enabled,
+        maximum_number_of_batched_tasks: _,
        must_stop_processing: _,
        processing_tasks,
        file_store,
--- a/index-scheduler/src/lib.rs
+++ b/index-scheduler/src/lib.rs
@@ -253,6 +253,9 @@ pub struct IndexSchedulerOptions {
    /// Set to `true` iff the index scheduler is allowed to automatically
    /// batch tasks together, to process multiple tasks at once.
    pub autobatching_enabled: bool,
+    /// If the autobatcher is allowed to automatically batch tasks
+    /// it will only batch this defined number of tasks at once.
+    pub maximum_number_of_batched_tasks: usize,
    /// The maximum number of tasks stored in the task queue before starting
    /// to auto schedule task deletions.
    pub max_number_of_tasks: usize,
@@ -310,6 +313,9 @@ pub struct IndexScheduler {
    /// Whether auto-batching is enabled or not.
    pub(crate) autobatching_enabled: bool,

+    /// The maximum number of tasks that will be batched together.
+    pub(crate) maximum_number_of_batched_tasks: usize,
+
    /// The max number of tasks allowed before the scheduler starts to delete
    /// the finished tasks automatically.
    pub(crate) max_number_of_tasks: usize,
@@ -363,6 +369,7 @@ impl IndexScheduler {
            index_mapper: self.index_mapper.clone(),
            wake_up: self.wake_up.clone(),
            autobatching_enabled: self.autobatching_enabled,
+            maximum_number_of_batched_tasks: self.maximum_number_of_batched_tasks,
            max_number_of_tasks: self.max_number_of_tasks,
            snapshots_path: self.snapshots_path.clone(),
            dumps_path: self.dumps_path.clone(),
@@ -458,6 +465,7 @@ impl IndexScheduler {
            // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things
            wake_up: Arc::new(SignalEvent::auto(true)),
            autobatching_enabled: options.autobatching_enabled,
+            maximum_number_of_batched_tasks: options.maximum_number_of_batched_tasks,
            max_number_of_tasks: options.max_number_of_tasks,
            dumps_path: options.dumps_path,
            snapshots_path: options.snapshots_path,
@@ -1587,6 +1595,7 @@ mod tests {
                index_count: 5,
                indexer_config,
                autobatching_enabled: true,
+                maximum_number_of_batched_tasks: usize::MAX,
                max_number_of_tasks: 1_000_000,
                instance_features: Default::default(),
            };
--- a/meilisearch/src/analytics/segment_analytics.rs
+++ b/meilisearch/src/analytics/segment_analytics.rs
@@ -285,6 +285,7 @@ impl From<Opt> for Infos {
            db_path,
            experimental_enable_metrics,
            experimental_reduce_indexing_memory_usage,
+            experimental_limit_batched_tasks: _,
            http_addr,
            master_key: _,
            env,
--- a/meilisearch/src/lib.rs
+++ b/meilisearch/src/lib.rs
@@ -236,6 +236,7 @@ fn open_or_create_database_unchecked(
            enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
            indexer_config: (&opt.indexer_options).try_into()?,
            autobatching_enabled: true,
+            maximum_number_of_batched_tasks: opt.experimental_limit_batched_tasks,
            max_number_of_tasks: 1_000_000,
            index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
            index_count: DEFAULT_INDEX_COUNT,
--- a/meilisearch/src/option.rs
+++ b/meilisearch/src/option.rs
@@ -51,6 +51,7 @@ const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL";
 const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
 const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
    "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
+const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS: &str = "MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS";

 const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
 const DEFAULT_DB_PATH: &str = "./data.ms";
@@ -301,6 +302,11 @@ pub struct Opt {
    #[serde(default)]
    pub experimental_reduce_indexing_memory_usage: bool,

+    /// Experimental limit to the number of tasks per batch
+    #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS, default_value_t = default_limit_batched_tasks())]
+    #[serde(default = "default_limit_batched_tasks")]
+    pub experimental_limit_batched_tasks: usize,
+
    #[serde(flatten)]
    #[clap(flatten)]
    pub indexer_options: IndexerOpts,
@@ -393,7 +399,8 @@ impl Opt {
            #[cfg(all(not(debug_assertions), feature = "analytics"))]
            no_analytics,
            experimental_enable_metrics: enable_metrics_route,
-            experimental_reduce_indexing_memory_usage: reduce_indexing_memory_usage,
+            experimental_reduce_indexing_memory_usage,
+            experimental_limit_batched_tasks,
        } = self;
        export_to_env_if_not_present(MEILI_DB_PATH, db_path);
        export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr);
@@ -437,7 +444,11 @@ impl Opt {
        );
        export_to_env_if_not_present(
            MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE,
-            reduce_indexing_memory_usage.to_string(),
+            experimental_reduce_indexing_memory_usage.to_string(),
+        );
+        export_to_env_if_not_present(
+            MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS,
+            experimental_limit_batched_tasks.to_string(),
        );
        indexer_options.export_to_env();
    }
@@ -739,6 +750,10 @@ fn default_dump_dir() -> PathBuf {
    PathBuf::from(DEFAULT_DUMP_DIR)
 }

+fn default_limit_batched_tasks() -> usize {
+    usize::MAX
+}
+
 /// Indicates if a snapshot was scheduled, and if yes with which interval.
 #[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)]
 pub enum ScheduleSnapshot {
--- a/meilisearch/src/search.rs
+++ b/meilisearch/src/search.rs
@@ -666,6 +666,7 @@ fn compute_semantic_score(query: &[f32], vectors: Value) -> milli::Result<Option
        .map_err(InternalError::SerdeJson)?;
    Ok(vectors
        .into_iter()
+        .flatten()
        .map(|v| OrderedFloat(dot_product_similarity(query, &v)))
        .max()
        .map(OrderedFloat::into_inner))
--- a/meilisearch/tests/documents/delete_documents.rs
+++ b/meilisearch/tests/documents/delete_documents.rs
@@ -154,6 +154,19 @@ async fn delete_document_by_filter() {
        )
        .await;
    index.wait_task(1).await;
+
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 4,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 3,
+        "id": 4
+      }
+    }
+    "###);
+
    let (response, code) =
        index.delete_document_by_filter(json!({ "filter": "color = blue"})).await;
    snapshot!(code, @"202 Accepted");
@@ -188,6 +201,18 @@ async fn delete_document_by_filter() {
    }
    "###);

+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 2,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 1,
+        "id": 2
+      }
+    }
+    "###);
+
    let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
    snapshot!(code, @"200 OK");
    snapshot!(json_string!(documents), @r###"
@@ -241,6 +266,18 @@ async fn delete_document_by_filter() {
    }
    "###);

+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 1,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 1,
+        "id": 1
+      }
+    }
+    "###);
+
    let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
    snapshot!(code, @"200 OK");
    snapshot!(json_string!(documents), @r###"
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -17,7 +17,7 @@ bincode = "1.3.3"
 bstr = "1.4.0"
 bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
 byteorder = "1.4.3"
-charabia = { version = "0.8.2", default-features = false }
+charabia = { version = "0.8.3", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.8"
 deserr = "0.5.0"
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -293,15 +293,15 @@ pub fn normalize_facet(original: &str) -> String {
 #[derive(serde::Serialize, serde::Deserialize, Debug)]
 #[serde(transparent)]
 pub struct VectorOrArrayOfVectors {
-    #[serde(with = "either::serde_untagged")]
-    inner: either::Either<Vec<f32>, Vec<Vec<f32>>>,
+    #[serde(with = "either::serde_untagged_optional")]
+    inner: Option<either::Either<Vec<f32>, Vec<Vec<f32>>>>,
 }

 impl VectorOrArrayOfVectors {
-    pub fn into_array_of_vectors(self) -> Vec<Vec<f32>> {
-        match self.inner {
-            either::Either::Left(vector) => vec![vector],
-            either::Either::Right(vectors) => vectors,
+    pub fn into_array_of_vectors(self) -> Option<Vec<Vec<f32>>> {
+        match self.inner? {
+            either::Either::Left(vector) => Some(vec![vector]),
+            either::Either::Right(vectors) => Some(vectors),
        }
    }
 }
--- a/milli/src/search/new/bucket_sort.rs
+++ b/milli/src/search/new/bucket_sort.rs
@@ -91,11 +91,12 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
    /// Update the universes accordingly and inform the logger.
    macro_rules! back {
        () => {
-            assert!(
-                ranking_rule_universes[cur_ranking_rule_index].is_empty(),
-                "The ranking rule {} did not sort its bucket exhaustively",
-                ranking_rules[cur_ranking_rule_index].id()
-            );
+            // FIXME: temporarily disabled assert: see <https://github.com/meilisearch/meilisearch/pull/4013>
+            // assert!(
+            //     ranking_rule_universes[cur_ranking_rule_index].is_empty(),
+            //     "The ranking rule {} did not sort its bucket exhaustively",
+            //     ranking_rules[cur_ranking_rule_index].id()
+            // );
            logger.end_iteration_ranking_rule(
                cur_ranking_rule_index,
                ranking_rules[cur_ranking_rule_index].as_ref(),
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -33,7 +33,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
        // lazily get it when needed
        let document_id = || -> Value {
            let document_id = obkv.get(primary_key_id).unwrap();
-            serde_json::from_slice(document_id).unwrap()
+            from_slice(document_id).unwrap()
        };

        // first we retrieve the _vectors field
@@ -50,12 +50,14 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
                }
            };

-            for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
-                let index = u16::try_from(i).unwrap();
-                let mut key = docid_bytes.to_vec();
-                key.extend_from_slice(&index.to_be_bytes());
-                let bytes = cast_slice(&vector);
-                writer.insert(key, bytes)?;
+            if let Some(vectors) = vectors {
+                for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
+                    let index = u16::try_from(i).unwrap();
+                    let mut key = docid_bytes.to_vec();
+                    key.extend_from_slice(&index.to_be_bytes());
+                    let bytes = cast_slice(&vector);
+                    writer.insert(key, bytes)?;
+                }
            }
        }
        // else => the `_vectors` object was `null`, there is nothing to do
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -55,7 +55,13 @@ pub(crate) fn data_from_obkv_documents(
    original_obkv_chunks
        .par_bridge()
        .map(|original_documents_chunk| {
-            send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
+            send_original_documents_data(
+                original_documents_chunk,
+                indexer,
+                lmdb_writer_sx.clone(),
+                vectors_field_id,
+                primary_key_id,
+            )
        })
        .collect::<Result<()>>()?;

@@ -72,7 +78,6 @@ pub(crate) fn data_from_obkv_documents(
                    &faceted_fields,
                    primary_key_id,
                    geo_fields_ids,
-                    vectors_field_id,
                    &stop_words,
                    max_positions_per_attributes,
                )
@@ -257,11 +262,33 @@ fn spawn_extraction_task<FE, FS, M>(
 /// - documents
 fn send_original_documents_data(
    original_documents_chunk: Result<grenad::Reader<File>>,
+    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
+    vectors_field_id: Option<FieldId>,
+    primary_key_id: FieldId,
 ) -> Result<()> {
    let original_documents_chunk =
        original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;

+    if let Some(vectors_field_id) = vectors_field_id {
+        let documents_chunk_cloned = original_documents_chunk.clone();
+        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
+        rayon::spawn(move || {
+            let result = extract_vector_points(
+                documents_chunk_cloned,
+                indexer,
+                primary_key_id,
+                vectors_field_id,
+            );
+            let _ = match result {
+                Ok(vector_points) => {
+                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
+                }
+                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
+            };
+        });
+    }
+
    // TODO: create a custom internal error
    lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
    Ok(())
@@ -283,7 +310,6 @@ fn send_and_extract_flattened_documents_data(
    faceted_fields: &HashSet<FieldId>,
    primary_key_id: FieldId,
    geo_fields_ids: Option<(FieldId, FieldId)>,
-    vectors_field_id: Option<FieldId>,
    stop_words: &Option<fst::Set<&[u8]>>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<(
@@ -312,25 +338,6 @@ fn send_and_extract_flattened_documents_data(
        });
    }

-    if let Some(vectors_field_id) = vectors_field_id {
-        let documents_chunk_cloned = flattened_documents_chunk.clone();
-        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
-        rayon::spawn(move || {
-            let result = extract_vector_points(
-                documents_chunk_cloned,
-                indexer,
-                primary_key_id,
-                vectors_field_id,
-            );
-            let _ = match result {
-                Ok(vector_points) => {
-                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
-                }
-                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
-            };
-        });
-    }
-
    let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
        rayon::join(
            || {
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2519,6 +2519,25 @@ mod tests {
        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
    }

+    /// Index multiple different number of vectors in documents.
+    /// Vectors must be of the same length.
+    #[test]
+    fn test_multiple_vectors() {
+        let index = TempIndex::new();
+
+        index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap();
+        index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap();
+        index
+            .add_documents(
+                documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]),
+            )
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap();
+        assert_eq!(res.documents_ids.len(), 3);
+    }
+
    #[test]
    fn reproduce_the_bug() {
        /*
Author	SHA1	Message	Date
Clément Renault	f46c2de607	Fix the tests	2023-09-15 12:01:29 +02:00
Tamo	388d78f70e	update the description of the cli argument	2023-09-15 12:01:29 +02:00
Clément Renault	b252900470	Expose a new flag to limit the number of batched tasks	2023-09-15 12:01:28 +02:00
meili-bors[bot]	8822ca234e	Merge #4057 4057: Fix stats delete by filter for v1.3.4 r=irevoire a=curquiza Fixes https://github.com/meilisearch/meilisearch/issues/4018 for v1.3.4 Co-authored-by: Tamo <tamo@meilisearch.com>	2023-09-12 13:34:39 +00:00
Tamo	d23abc8771	fix clippy	2023-09-12 11:26:48 +02:00
Tamo	036b846e4d	Fix the stats of the documents deletion by filter The issue was that the operation « DocumentDeletionByFilter » was not declared as an index operation. That means the indexes stats were not reprocessed after the application of the operation.	2023-09-12 11:26:41 +02:00
meili-bors[bot]	9889390d13	Merge #4055 4055: Update version for the next release (v1.3.4) in Cargo.toml r=curquiza a=meili-bot ⚠️ This PR is automatically generated. Check the new version is the expected one and Cargo.lock has been updated before merging. Co-authored-by: curquiza <curquiza@users.noreply.github.com>	2023-09-11 17:04:31 +00:00
curquiza	8e2bb29cf1	Update version for the next release (v1.3.4) in Cargo.toml	2023-09-11 16:20:03 +00:00
meili-bors[bot]	256cf33bca	Merge #4039 4039: Fix multiple vectors dimensions r=ManyTheFish a=Kerollmops This PR fixes #4035, making providing multiple vectors in documents possible. This is fixed by extracting the vectors from the non-flattened version of the documents. Co-authored-by: Kerollmops <clement@meilisearch.com>	2023-09-07 09:25:58 +00:00
meili-bors[bot]	9945cbf9db	Merge #4038 4038: Fix filter escaping issues r=ManyTheFish a=Kerollmops This PR fixes #4034 by always escaping the sequences. Users must always put quotes (simple or double) to escape the filter values. Co-authored-by: Kerollmops <clement@meilisearch.com>	2023-09-06 12:29:29 +00:00
Kerollmops	03d0f628bd	Use the unescaper crate to unescape any char sequence	2023-09-06 13:59:45 +02:00
Kerollmops	ea78060916	Fix tests that were supposed to escape characters	2023-09-06 13:59:45 +02:00
Kerollmops	b42d48187a	Add a test case scenario	2023-09-06 13:59:44 +02:00
Kerollmops	679c0b0f97	Extract the vectors from the non-flattened version of the documents	2023-09-06 12:26:00 +02:00
Kerollmops	e02d0064bd	Add a test case scenario	2023-09-06 12:26:00 +02:00
meili-bors[bot]	7ef3572f11	Merge #4037 4037: Update version for the next release (v1.3.3) in Cargo.toml r=curquiza a=meili-bot ⚠️ This PR is automatically generated. Check the new version is the expected one and Cargo.lock has been updated before merging. Co-authored-by: curquiza <curquiza@users.noreply.github.com>	2023-09-06 09:50:58 +00:00
curquiza	93285041a9	Update version for the next release (v1.3.3) in Cargo.toml	2023-09-06 09:23:20 +00:00
meili-bors[bot]	cdb4b3e024	Merge #4013 4013: Fix the ranking rule by temporarily disabling an assert in the bucket sort algorithm r=Kerollmops a=Kerollmops This PR temporarily disables an assertion, making the search crash. [I created a tracking issue](https://github.com/meilisearch/meilisearch/issues/4012) to find a better way to fix this. It no longer reverts `a20e4d447c`, which seemed to generate unreachable graphs and make the bucket sort ranking algorithm panic because of entering an unreachable state. We discussed that below in the comments. Temporary fixes #4002, fixes #4006, and fixes #3995. --- It took me approximately 2 days to find the first bad commit just because I'm bad in `git bisect` x `bash`, i.e. [I misused `%1` with `$!` to kill the most recently backgrounded job](https://unix.stackexchange.com/a/340084/212574)... <details> <summary>Here is the script I used to find the invalid commit</summary> ```bash #!/usr/bin/env bash set -x # remove the data rm -rf data.ms # build meilisearch cargo build --release # ignore this commit if it doesn't compile if [[ $? != 0 ]]; then exit 125 fi # index the dump and start from it ./target/release/meilisearch \ --http-addr 'localhost:7705' \ --import-dump $HOME/Downloads/modified-20230822-083016113.dump & # wait 10 sec while it indexes the docs sleep 5 # check if the server crashes on requests echo '{ "q": "rtx 305", "attributesToHighlight": [ "*" ], "highlightPreTag": "<ais-highlight-0000000000>", "highlightPostTag": "</ais-highlight-0000000000>", "limit": 21, "offset": 0 }' \| xh 'localhost:7705/indexes/arvutitark_local_orderables/search' last_exit_code=$? # Now kill Meilisearch kill $! # Clean the potential Cargo.lock git checkout . exit $last_exit_code ``` </details> Co-authored-by: Kerollmops <clement@meilisearch.com> Co-authored-by: Clément Renault <clement@meilisearch.com>	2023-08-23 15:30:56 +00:00
Clément Renault	8c0ebd1331	Update milli/src/search/new/bucket_sort.rs Co-authored-by: Louis Dureuil <louis@meilisearch.com>	2023-08-23 16:40:39 +02:00
Kerollmops	5130e06b41	Temporarily disable an assert in the ranking rules	2023-08-23 16:11:54 +02:00
Clément Renault	08e27ef73f	Merge pull request #4008 from meilisearch/fix-highlighting-panic Bump charabia to 0.8.3	2023-08-23 11:56:45 +02:00
Kerollmops	717b069907	Bump charabia to 0.8.3	2023-08-22 16:25:00 +02:00
meili-bors[bot]	7ea154673a	Merge #4000 4000: Update version for the next release (v1.3.2) in Cargo.toml r=irevoire a=meili-bot ⚠️ This PR is automatically generated. Check the new version is the expected one and Cargo.lock has been updated before merging. Co-authored-by: irevoire <irevoire@users.noreply.github.com>	2023-08-16 10:41:33 +00:00
irevoire	b947f3bb9d	Update version for the next release (v1.3.2) in Cargo.toml	2023-08-16 08:20:36 +00:00
meili-bors[bot]	4c35817c5f	Merge #3998 3998: Accept the `null` JSON value as a value of the `_vectors` field r=irevoire a=Kerollmops This PR fixes #3979 by accepting `null` JSON values in the `_vectors` fields provided by the user. Can the reviewer please verify that I am merging in the right branch? I think we must create a new _release-v1.3.2_. Co-authored-by: Kerollmops <clement@meilisearch.com>	2023-08-16 08:12:24 +00:00
Kerollmops	c53841e166	Accept the null JSON value as the value of _vectors	2023-08-14 16:03:55 +02:00