Mirror of https://github.com/meilisearch/meilisearch.git
Synced 2025-12-08 21:55:42 +00:00

Compare commits: 36 commits, setup-sent… to prototype-…
| SHA1 |
|---|
| b126bf3aec |
| e82ff56416 |
| 1b26dde438 |
| 3bb644b54d |
| 34b9145db2 |
| 76c05d1b20 |
| 34fac115d5 |
| a09686fcbd |
| 393be40179 |
| 487d493f49 |
| 9258e5b5bf |
| 462b4654c4 |
| abfa7ded25 |
| f2837aaec2 |
| 11df155598 |
| 651657c03e |
| b9ad59c969 |
| 66aa682e23 |
| 256cf33bca |
| 9945cbf9db |
| 03d0f628bd |
| ea78060916 |
| b42d48187a |
| 679c0b0f97 |
| e02d0064bd |
| 7ef3572f11 |
| 93285041a9 |
| dc3d9c90d9 |
| 287cf25d39 |
| 66aa6d5871 |
| 8ac5b765bc |
| cea93e9a37 |
| 085aad0a94 |
| 6db80b0836 |
| 8dc5acf998 |
| fc2590fc9d |
1 change: .github/workflows/publish-apt-brew-pkg.yml (vendored)

@@ -53,5 +53,6 @@ jobs:
       uses: mislav/bump-homebrew-formula-action@v2
       with:
         formula-name: meilisearch
+        formula-path: Formula/m/meilisearch.rb
       env:
         COMMITTER_TOKEN: ${{ secrets.HOMEBREW_COMMITTER_TOKEN }}

255 changes: Cargo.lock (generated)

@@ -700,8 +700,7 @@ dependencies = [
 [[package]]
 name = "charabia"
 version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "098219a776307414866165a03a9cc68c1578764fe3616fe979e1c280790ddd73"
+source = "git+https://github.com/meilisearch/charabia?branch=main#5c3d09a7127dcf5e0e5d94d991c4d3d5ef4768cc"
 dependencies = [
  "aho-corasick",
  "cow-utils",

@@ -1073,16 +1072,6 @@ dependencies = [
  "syn 1.0.109",
 ]
 
-[[package]]
-name = "debugid"
-version = "0.8.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d"
-dependencies = [
- "serde",
- "uuid 1.4.1",
-]
-
 [[package]]
 name = "deranged"
 version = "0.3.7"

@@ -1454,18 +1443,7 @@ dependencies = [
  "insta",
  "nom",
  "nom_locate",
+ "unescaper",
 ]
 
-[[package]]
-name = "findshlibs"
-version = "0.10.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64"
-dependencies = [
- "cc",
- "lazy_static",
- "libc",
- "winapi",
-]
-
 [[package]]

@@ -1816,17 +1794,6 @@ dependencies = [
  "digest",
 ]
 
-[[package]]
-name = "hostname"
-version = "0.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3c731c3e10504cc8ed35cfe2f1db4c9274c3d35fa486e3b31df46f068ef3e867"
-dependencies = [
- "libc",
- "match_cfg",
- "winapi",
-]
-
 [[package]]
 name = "http"
 version = "0.2.9"

@@ -2209,9 +2176,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-cc-cedict-builder"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2d2e8f2ca97ddf952fe340642511b9c14b373cb2eef711d526bb8ef2ca0969b8"
+checksum = "6f567a47e47b5420908424de2c6c5e424e3cafe588d0146bd128c0f3755758a3"
 dependencies = [
  "anyhow",
  "bincode",

@@ -2228,9 +2195,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-compress"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f72b460559bcbe8a9cee85ea4a5056133ed3abf373031191589236e656d65b59"
+checksum = "49f3e553d55ebe9881fa5e5de588b0a153456e93564d17dfbef498912caf63a2"
 dependencies = [
  "anyhow",
  "flate2",

@@ -2239,9 +2206,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-core"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f586eb8a9393c32d5525e0e9336a3727bd1329674740097126f3b0bff8a1a1ea"
+checksum = "a9a2440cc156a4a911a174ec68203543d1efb10df3a700a59b6bf581e453c726"
 dependencies = [
  "anyhow",
  "bincode",

@@ -2256,9 +2223,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-decompress"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1fb1facd8da698072fcc7338bd757730db53d59f313f44dd583fa03681dcc0e1"
+checksum = "e077a410e61c962cb526f71b7effd62ffc607488a8f61869c937582d2ccb529b"
 dependencies = [
  "anyhow",
  "flate2",

@@ -2267,9 +2234,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-dictionary"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ec7be7410b1da7017a8948986b87af67082f605e9a716f0989790d795d677f0c"
+checksum = "d9f57491adf7b311a3ee87f5e4a36454df16a2ec73de4ef28b2106fac80bd782"
 dependencies = [
  "anyhow",
  "bincode",

@@ -2287,9 +2254,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-ipadic-builder"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "705d07f8a45d04fd95149f7ad41a26d1f9e56c9c00402be6f9dd05e3d88b99c6"
+checksum = "a3476ec7748aebd2eb23d496ddfce5e7e0a5c031cffcd214451043e02d029f11"
 dependencies = [
  "anyhow",
  "bincode",

@@ -2308,9 +2275,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-ipadic-neologd-builder"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "633a93983ba13fba42328311a501091bd4a7aff0c94ae9eaa9d4733dd2b0468a"
+checksum = "7b1c7576a02d5e4af2bf62de51790a01bc4b8bc0d0b6a6b86a46b157f5cb306d"
 dependencies = [
  "anyhow",
  "bincode",

@@ -2329,9 +2296,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-ko-dic"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a428e0d316b6c86f51bd919479692bc41ad840dba266ebc044663970f431ea18"
+checksum = "b713ecd5b827d7d448c3c5eb3c6d5899ecaf22cd17087599996349a02c76828d"
 dependencies = [
  "bincode",
  "byteorder",

@@ -2346,9 +2313,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-ko-dic-builder"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2a5288704c6b8a069c0a1705c38758e836497698b50453373ab3d56c6f9a7ef8"
+checksum = "3e545752f6487be87b572529ad594cb3b48d2ef20821516f598b2d152d23277b"
 dependencies = [
  "anyhow",
  "bincode",

@@ -2366,9 +2333,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-tokenizer"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "106ba439b2e87529d9bbedbb88d69f635baba1195c26502b308f55a85885fc81"
+checksum = "24a2d4606a5a4da62ac4a3680ee884a75da7f0c892dc967fc9cb983ceba39a8f"
 dependencies = [
  "bincode",
  "byteorder",

@@ -2381,9 +2348,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-unidic"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3399b6dcfe1701333451d184ff3c677f433b320153427b146360c9e4bd8cb816"
+checksum = "388b1bdf81794b5d5b8057ce0321c58ff4b90d676b637948ccc7863ae2f43d28"
 dependencies = [
  "bincode",
  "byteorder",

@@ -2398,9 +2365,9 @@ dependencies = [
 
 [[package]]
 name = "lindera-unidic-builder"
-version = "0.27.0"
+version = "0.27.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b698227fdaeac32289173ab389b990d4eb00a40cbc9912020f69a0c491dabf55"
+checksum = "cdfa3e29a22c047da57fadd960ff674b720de15a1e2fb35b5ed67f3408afb469"
 dependencies = [
  "anyhow",
  "bincode",

@@ -2524,12 +2491,6 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d"
 
-[[package]]
-name = "match_cfg"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ffbee8634e0d45d258acb448e7eaab3fce7a0a467395d4d9f228e3c1f01fb2e4"
-
 [[package]]
 name = "md5"
 version = "0.7.0"

@@ -2610,8 +2571,6 @@ dependencies = [
  "rustls 0.20.8",
  "rustls-pemfile",
  "segment",
- "sentry",
- "sentry-actix",
  "serde",
  "serde_json",
  "serde_urlencoded",

@@ -2931,17 +2890,6 @@ dependencies = [
  "num-traits",
 ]
 
-[[package]]
-name = "os_info"
-version = "3.7.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "006e42d5b888366f1880eda20371fedde764ed2213dc8496f49622fa0c99cd5e"
-dependencies = [
- "log",
- "serde",
- "winapi",
-]
-
 [[package]]
 name = "page_size"
 version = "0.4.2"

@@ -3664,126 +3612,6 @@ version = "1.0.18"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918"
 
-[[package]]
-name = "sentry"
-version = "0.31.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2e95efd0cefa32028cdb9766c96de71d96671072f9fb494dc9fb84c0ef93e52b"
-dependencies = [
- "httpdate",
- "reqwest",
- "rustls 0.21.6",
- "sentry-backtrace",
- "sentry-contexts",
- "sentry-core",
- "sentry-debug-images",
- "sentry-panic",
- "sentry-tracing",
- "tokio",
- "ureq",
- "webpki-roots 0.25.2",
-]
-
-[[package]]
-name = "sentry-actix"
-version = "0.31.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "795851be3047d9be16ea8a808383f89e75d2aebc9b53d25fe708c27bc56b4488"
-dependencies = [
- "actix-web",
- "futures-util",
- "sentry-core",
-]
-
-[[package]]
-name = "sentry-backtrace"
-version = "0.31.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6ac2bac6f310c4c4c4bb094d1541d32ae497f8c5c23405e85492cefdfe0971a9"
-dependencies = [
- "backtrace",
- "once_cell",
- "regex",
- "sentry-core",
-]
-
-[[package]]
-name = "sentry-contexts"
-version = "0.31.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6c3e17295cecdbacf66c5bd38d6e1147e09e1e9d824d2d5341f76638eda02a3a"
-dependencies = [
- "hostname",
- "libc",
- "os_info",
- "rustc_version",
- "sentry-core",
- "uname",
-]
-
-[[package]]
-name = "sentry-core"
-version = "0.31.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "8339474f587f36cb110fa1ed1b64229eea6d47b0b886375579297b7e47aeb055"
-dependencies = [
- "once_cell",
- "rand",
- "sentry-types",
- "serde",
- "serde_json",
-]
-
-[[package]]
-name = "sentry-debug-images"
-version = "0.31.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1c11e7d2b809b06497a18a2e60f513206462ae2db27081dfb7be9ade1f329cc8"
-dependencies = [
- "findshlibs",
- "once_cell",
- "sentry-core",
-]
-
-[[package]]
-name = "sentry-panic"
-version = "0.31.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "875b69f506da75bd664029eafb05f8934297d2990192896d17325f066bd665b7"
-dependencies = [
- "sentry-backtrace",
- "sentry-core",
-]
-
-[[package]]
-name = "sentry-tracing"
-version = "0.31.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "89feead9bdd116f8035e89567651340fc382db29240b6c55ef412078b08d1aa3"
-dependencies = [
- "sentry-backtrace",
- "sentry-core",
- "tracing-core",
- "tracing-subscriber",
-]
-
-[[package]]
-name = "sentry-types"
-version = "0.31.6"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "99dc599bd6646884fc403d593cdcb9816dd67c50cff3271c01ff123617908dcd"
-dependencies = [
- "debugid",
- "getrandom",
- "hex",
- "serde",
- "serde_json",
- "thiserror",
- "time",
- "url",
- "uuid 1.4.1",
-]
-
 [[package]]
 name = "serde"
 version = "1.0.183"

@@ -4332,16 +4160,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a"
 dependencies = [
  "once_cell",
- "valuable",
 ]
 
-[[package]]
-name = "tracing-subscriber"
-version = "0.3.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77"
-dependencies = [
- "tracing-core",
-]
-
 [[package]]

@@ -4363,12 +4181,12 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9"
 
 [[package]]
-name = "uname"
-version = "0.1.1"
+name = "unescaper"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b72f89f0ca32e4db1c04e2a72f5345d59796d4866a1ee0609084569f73683dc8"
+checksum = "a96a44ae11e25afb520af4534fd7b0bd8cd613e35a78def813b8cf41631fa3c8"
 dependencies = [
- "libc",
+ "thiserror",
 ]
 
 [[package]]

@@ -4443,7 +4261,6 @@ dependencies = [
  "form_urlencoded",
  "idna",
  "percent-encoding",
- "serde",
 ]
 
 [[package]]

@@ -4483,12 +4300,6 @@ dependencies = [
  "serde",
 ]
 
-[[package]]
-name = "valuable"
-version = "0.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
-
 [[package]]
 name = "vcpkg"
 version = "0.2.15"

@@ -4657,12 +4468,6 @@ dependencies = [
  "rustls-webpki 0.100.1",
 ]
 
-[[package]]
-name = "webpki-roots"
-version = "0.25.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
-
 [[package]]
 name = "whatlang"
 version = "0.16.2"

@@ -14,6 +14,7 @@ license.workspace = true
 [dependencies]
 nom = "7.1.3"
 nom_locate = "4.1.0"
+unescaper = "0.1.2"
 
 [dev-dependencies]
 insta = "1.29.0"

@@ -62,6 +62,7 @@ pub enum ErrorKind<'a> {
     MisusedGeoRadius,
     MisusedGeoBoundingBox,
     InvalidPrimary,
+    InvalidEscapedNumber,
     ExpectedEof,
     ExpectedValue(ExpectedValueKind),
     MalformedValue,

@@ -147,6 +148,9 @@ impl<'a> Display for Error<'a> {
             let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
             writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
         }
+        ErrorKind::InvalidEscapedNumber => {
+            writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)?
+        }
         ErrorKind::ExpectedEof => {
             writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
         }

@@ -545,6 +545,8 @@ impl<'a> std::fmt::Display for Token<'a> {
 
 #[cfg(test)]
 pub mod tests {
+    use FilterCondition as Fc;
+
     use super::*;
 
     /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element

@@ -556,14 +558,22 @@ pub mod tests {
         unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into()
     }
 
+    fn p(s: &str) -> impl std::fmt::Display + '_ {
+        Fc::parse(s).unwrap().unwrap()
+    }
+
+    #[test]
+    fn parse_escaped() {
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
+        // but it also works with other sequencies
+        insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
+    }
+
     #[test]
     fn parse() {
-        use FilterCondition as Fc;
-
-        fn p(s: &str) -> impl std::fmt::Display + '_ {
-            Fc::parse(s).unwrap().unwrap()
-        }
-
         // Test equal
         insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}");
         insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}");

@@ -171,7 +171,24 @@ pub fn parse_value(input: Span) -> IResult<Token> {
         })
     })?;
 
-    Ok((input, value))
+    match unescaper::unescape(value.value()) {
+        Ok(content) => {
+            if content.len() != value.value().len() {
+                Ok((input, Token::new(value.original_span(), Some(content))))
+            } else {
+                Ok((input, value))
+            }
+        }
+        Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
+        Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::InvalidEscapedNumber,
+        ))),
+        Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::MalformedValue,
+        ))),
+    }
 }
 
 fn is_value_component(c: char) -> bool {

@@ -318,17 +335,17 @@ pub mod test {
            ("\"cha'nnel\"", "cha'nnel", false),
            ("I'm tamo", "I", false),
            // escaped thing but not quote
-           (r#""\\""#, r#"\\"#, false),
-           (r#""\\\\\\""#, r#"\\\\\\"#, false),
-           (r#""aa\\aa""#, r#"aa\\aa"#, false),
+           (r#""\\""#, r#"\"#, true),
+           (r#""\\\\\\""#, r#"\\\"#, true),
+           (r#""aa\\aa""#, r#"aa\aa"#, true),
            // with double quote
            (r#""Hello \"world\"""#, r#"Hello "world""#, true),
-           (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true),
+           (r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
            (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
            (r#""\"\"""#, r#""""#, true),
            // with simple quote
            (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
-           (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true),
+           (r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
            (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
            (r#"'\'\''"#, r#"''"#, true),
        ];

@@ -350,7 +367,14 @@ pub mod test {
                 "Filter `{}` was not supposed to be escaped",
                 input
             );
-            assert_eq!(token.value(), expected, "Filter `{}` failed.", input);
+            assert_eq!(
+                token.value(),
+                expected,
+                "Filter `{}` failed by giving `{}` instead of `{}`.",
+                input,
+                token.value(),
+                expected
+            );
         }
     }

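The hunk above routes every parsed filter value through `unescaper::unescape` and maps its three error variants onto parser errors. A minimal sketch (not part of the diff) of the behavior this relies on, assuming the `unescaper` crate at version 0.1.2 as pinned above; the exact cases come from the new `parse_escaped` test:

```rust
fn main() {
    // A doubled backslash collapses to a single one.
    assert_eq!(unescaper::unescape(r"foo\\").unwrap(), r"foo\");
    // `\xNN` hex escapes decode to the corresponding character.
    assert_eq!(unescaper::unescape(r"foo\x20bar").unwrap(), "foo bar");
    // When nothing was escaped, output length equals input length, which is
    // exactly the cheap "did anything change?" check used in the hunk before
    // allocating a new Token.
    assert_eq!(unescaper::unescape("foo").unwrap(), "foo");
}
```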
@@ -67,10 +67,6 @@ pub(crate) enum Batch {
         op: IndexOperation,
         must_create_index: bool,
     },
-    IndexDocumentDeletionByFilter {
-        index_uid: String,
-        task: Task,
-    },
     IndexCreation {
         index_uid: String,
         primary_key: Option<String>,

@@ -114,6 +110,10 @@ pub(crate) enum IndexOperation {
         documents: Vec<Vec<String>>,
         tasks: Vec<Task>,
     },
+    IndexDocumentDeletionByFilter {
+        index_uid: String,
+        task: Task,
+    },
     DocumentClear {
         index_uid: String,
         tasks: Vec<Task>,

@@ -155,7 +155,6 @@ impl Batch {
             | Batch::TaskDeletion(task)
             | Batch::Dump(task)
             | Batch::IndexCreation { task, .. }
-            | Batch::IndexDocumentDeletionByFilter { task, .. }
             | Batch::IndexUpdate { task, .. } => vec![task.uid],
             Batch::SnapshotCreation(tasks) | Batch::IndexDeletion { tasks, .. } => {
                 tasks.iter().map(|task| task.uid).collect()

@@ -167,6 +166,7 @@ impl Batch {
                 | IndexOperation::DocumentClear { tasks, .. } => {
                     tasks.iter().map(|task| task.uid).collect()
                 }
+                IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid],
                 IndexOperation::SettingsAndDocumentOperation {
                     document_import_tasks: tasks,
                     settings_tasks: other,

@@ -194,8 +194,7 @@ impl Batch {
             IndexOperation { op, .. } => Some(op.index_uid()),
             IndexCreation { index_uid, .. }
             | IndexUpdate { index_uid, .. }
-            | IndexDeletion { index_uid, .. }
-            | IndexDocumentDeletionByFilter { index_uid, .. } => Some(index_uid),
+            | IndexDeletion { index_uid, .. } => Some(index_uid),
         }
     }
 }

@@ -205,6 +204,7 @@ impl IndexOperation {
         match self {
             IndexOperation::DocumentOperation { index_uid, .. }
             | IndexOperation::DocumentDeletion { index_uid, .. }
+            | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
             | IndexOperation::DocumentClear { index_uid, .. }
             | IndexOperation::Settings { index_uid, .. }
             | IndexOperation::DocumentClearAndSetting { index_uid, .. }

@@ -239,9 +239,12 @@ impl IndexScheduler {
                 let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
                 match &task.kind {
                     KindWithContent::DocumentDeletionByFilter { index_uid, .. } => {
-                        Ok(Some(Batch::IndexDocumentDeletionByFilter {
-                            index_uid: index_uid.clone(),
-                            task,
+                        Ok(Some(Batch::IndexOperation {
+                            op: IndexOperation::IndexDocumentDeletionByFilter {
+                                index_uid: index_uid.clone(),
+                                task,
+                            },
+                            must_create_index: false,
                         }))
                     }
                     _ => unreachable!(),

@@ -536,7 +539,9 @@ impl IndexScheduler {
         let index_tasks = self.index_tasks(rtxn, index_name)? & enqueued;
 
         // If autobatching is disabled we only take one task at a time.
-        let tasks_limit = if self.autobatching_enabled { usize::MAX } else { 1 };
+        // Otherwise, we take only a maximum of tasks to create batches.
+        let tasks_limit =
+            if self.autobatching_enabled { self.maximum_number_of_batched_tasks } else { 1 };
 
         let enqueued = index_tasks
             .into_iter()

@@ -896,51 +901,6 @@ impl IndexScheduler {
 
                 Ok(tasks)
             }
-            Batch::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
-                let (index_uid, filter) =
-                    if let KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } =
-                        &task.kind
-                    {
-                        (index_uid, filter_expr)
-                    } else {
-                        unreachable!()
-                    };
-                let index = {
-                    let rtxn = self.env.read_txn()?;
-                    self.index_mapper.index(&rtxn, index_uid)?
-                };
-                let deleted_documents = delete_document_by_filter(filter, index);
-                let original_filter = if let Some(Details::DocumentDeletionByFilter {
-                    original_filter,
-                    deleted_documents: _,
-                }) = task.details
-                {
-                    original_filter
-                } else {
-                    // In the case of a `documentDeleteByFilter` the details MUST be set
-                    unreachable!();
-                };
-
-                match deleted_documents {
-                    Ok(deleted_documents) => {
-                        task.status = Status::Succeeded;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(deleted_documents),
-                        });
-                    }
-                    Err(e) => {
-                        task.status = Status::Failed;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(0),
-                        });
-                        task.error = Some(e.into());
-                    }
-                }
-
-                Ok(vec![task])
-            }
             Batch::IndexCreation { index_uid, primary_key, task } => {
                 let wtxn = self.env.write_txn()?;
                 if self.index_mapper.exists(&wtxn, &index_uid)? {

@@ -1299,6 +1259,47 @@ impl IndexScheduler {
 
                 Ok(tasks)
             }
+            IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
+                let filter =
+                    if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
+                        &task.kind
+                    {
+                        filter_expr
+                    } else {
+                        unreachable!()
+                    };
+                let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
+                let original_filter = if let Some(Details::DocumentDeletionByFilter {
+                    original_filter,
+                    deleted_documents: _,
+                }) = task.details
+                {
+                    original_filter
+                } else {
+                    // In the case of a `documentDeleteByFilter` the details MUST be set
+                    unreachable!();
+                };
+
+                match deleted_documents {
+                    Ok(deleted_documents) => {
+                        task.status = Status::Succeeded;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(deleted_documents),
+                        });
+                    }
+                    Err(e) => {
+                        task.status = Status::Failed;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(0),
+                        });
+                        task.error = Some(e.into());
+                    }
+                }
+
+                Ok(vec![task])
+            }
             IndexOperation::Settings { index_uid: _, settings, mut tasks } => {
                 let indexer_config = self.index_mapper.indexer_config();
                 let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config);

@@ -1498,23 +1499,22 @@ impl IndexScheduler {
     }
 }
 
-fn delete_document_by_filter(filter: &serde_json::Value, index: Index) -> Result<u64> {
+fn delete_document_by_filter<'a>(
+    wtxn: &mut RwTxn<'a, '_>,
+    filter: &serde_json::Value,
+    index: &'a Index,
+) -> Result<u64> {
     let filter = Filter::from_json(filter)?;
     Ok(if let Some(filter) = filter {
-        let mut wtxn = index.write_txn()?;
-
-        let candidates = filter.evaluate(&wtxn, &index).map_err(|err| match err {
+        let candidates = filter.evaluate(wtxn, index).map_err(|err| match err {
             milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
                 Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
             }
             e => e.into(),
         })?;
-        let mut delete_operation = DeleteDocuments::new(&mut wtxn, &index)?;
+        let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
         delete_operation.delete_documents(&candidates);
-        let deleted_documents =
-            delete_operation.execute().map(|result| result.deleted_documents)?;
-        wtxn.commit()?;
-        deleted_documents
+        delete_operation.execute().map(|result| result.deleted_documents)?
    } else {
        0
    })

@@ -15,6 +15,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
 
     let IndexScheduler {
         autobatching_enabled,
+        maximum_number_of_batched_tasks: _,
         must_stop_processing: _,
         processing_tasks,
         file_store,

@@ -253,6 +253,9 @@ pub struct IndexSchedulerOptions {
     /// Set to `true` iff the index scheduler is allowed to automatically
     /// batch tasks together, to process multiple tasks at once.
     pub autobatching_enabled: bool,
+    /// If the autobatcher is allowed to automatically batch tasks
+    /// it will only batch this defined number of tasks at once.
+    pub maximum_number_of_batched_tasks: usize,
     /// The maximum number of tasks stored in the task queue before starting
     /// to auto schedule task deletions.
     pub max_number_of_tasks: usize,

@@ -310,6 +313,9 @@ pub struct IndexScheduler {
     /// Whether auto-batching is enabled or not.
     pub(crate) autobatching_enabled: bool,
 
+    /// The maximum number of tasks that will be batched together.
+    pub(crate) maximum_number_of_batched_tasks: usize,
+
     /// The max number of tasks allowed before the scheduler starts to delete
     /// the finished tasks automatically.
     pub(crate) max_number_of_tasks: usize,

@@ -363,6 +369,7 @@ impl IndexScheduler {
             index_mapper: self.index_mapper.clone(),
             wake_up: self.wake_up.clone(),
             autobatching_enabled: self.autobatching_enabled,
+            maximum_number_of_batched_tasks: self.maximum_number_of_batched_tasks,
             max_number_of_tasks: self.max_number_of_tasks,
             snapshots_path: self.snapshots_path.clone(),
             dumps_path: self.dumps_path.clone(),

@@ -458,6 +465,7 @@ impl IndexScheduler {
             // we want to start the loop right away in case meilisearch was ctrl+Ced while processing things
             wake_up: Arc::new(SignalEvent::auto(true)),
             autobatching_enabled: options.autobatching_enabled,
+            maximum_number_of_batched_tasks: options.maximum_number_of_batched_tasks,
             max_number_of_tasks: options.max_number_of_tasks,
             dumps_path: options.dumps_path,
             snapshots_path: options.snapshots_path,

@@ -1589,6 +1597,7 @@ mod tests {
             index_count: 5,
             indexer_config,
             autobatching_enabled: true,
+            maximum_number_of_batched_tasks: usize::MAX,
             max_number_of_tasks: 1_000_000,
             instance_features: Default::default(),
         };

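Taken together, these hunks thread a new `maximum_number_of_batched_tasks` knob from the scheduler options down to batch creation. A minimal sketch of the selection rule (it mirrors the `tasks_limit` change in the scheduler hunk earlier; the free function here is illustrative, not a public API):

```rust
// With autobatching enabled, drain at most `maximum_number_of_batched_tasks`
// enqueued tasks into one batch; with it disabled, exactly one.
fn tasks_limit(autobatching_enabled: bool, maximum_number_of_batched_tasks: usize) -> usize {
    if autobatching_enabled {
        maximum_number_of_batched_tasks
    } else {
        1
    }
}

fn main() {
    // The default (`usize::MAX`) keeps the previous unlimited behavior.
    assert_eq!(tasks_limit(true, usize::MAX), usize::MAX);
    // With autobatching disabled, one task per batch, as before.
    assert_eq!(tasks_limit(false, usize::MAX), 1);
    // The new experimental option caps the batch size.
    assert_eq!(tasks_limit(true, 100), 100);
}
```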
@@ -83,16 +83,6 @@ rustls-pemfile = "1.0.2"
 segment = { version = "0.2.2", optional = true }
 serde = { version = "1.0.160", features = ["derive"] }
 serde_json = { version = "1.0.95", features = ["preserve_order"] }
-sentry = { version = "0.31.6", default-features = false, features = [
-    "backtrace",
-    "contexts",
-    "debug-images",
-    "panic",
-    "reqwest",
-    "rustls",
-] }
-sentry-actix = "0.31.6"
-serde_urlencoded = "0.7.1"
 sha2 = "0.10.6"
 siphasher = "0.3.10"
 slice-group-by = "0.3.0"

@@ -100,7 +90,6 @@ static-files = { version = "0.2.3", optional = true }
 sysinfo = "0.29.7"
 tar = "0.4.38"
 tempfile = "3.5.0"
-termcolor = "1.2.0"
 thiserror = "1.0.40"
 time = { version = "0.3.20", features = [
     "serde-well-known",

@@ -114,6 +103,8 @@ toml = "0.7.3"
 uuid = { version = "1.3.1", features = ["serde", "v4"] }
 walkdir = "2.3.3"
 yaup = "0.2.1"
+serde_urlencoded = "0.7.1"
+termcolor = "1.2.0"
 
 [dev-dependencies]
 actix-rt = "2.8.0"

@@ -142,10 +133,20 @@ vergen = { version = "7.5.1", default-features = false, features = ["git"] }
 zip = { version = "0.6.4", optional = true }
 
 [features]
-default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
+default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard", "profile-with-puffin"]
 analytics = ["segment"]
-mini-dashboard = ["actix-web-static-files", "static-files", "anyhow", "cargo_toml", "hex", "reqwest", "sha-1", "tempfile", "zip"]
+profile-with-puffin = ["dep:puffin_http"]
+mini-dashboard = [
+    "actix-web-static-files",
+    "static-files",
+    "anyhow",
+    "cargo_toml",
+    "hex",
+    "reqwest",
+    "sha-1",
+    "tempfile",
+    "zip",
+]
 chinese = ["meilisearch-types/chinese"]
 hebrew = ["meilisearch-types/hebrew"]
 japanese = ["meilisearch-types/japanese"]

@@ -285,6 +285,7 @@ impl From<Opt> for Infos {
             db_path,
             experimental_enable_metrics,
             experimental_reduce_indexing_memory_usage,
+            experimental_limit_batched_tasks: _,
             http_addr,
             master_key: _,
             env,

@@ -101,7 +101,7 @@ pub fn create_app(
         InitError = (),
     >,
 > {
-    actix_web::App::new()
+    let app = actix_web::App::new()
         .configure(|s| {
             configure_data(
                 s,

@@ -112,23 +112,23 @@ pub fn create_app(
             )
         })
         .configure(routes::configure)
-        .configure(|s| dashboard(s, enable_dashboard))
-        .wrap(sentry_actix::Sentry::new())
-        .wrap(actix_web::middleware::Condition::new(
-            opt.experimental_enable_metrics,
-            middleware::RouteMetrics,
-        ))
-        .wrap(
-            Cors::default()
-                .send_wildcard()
-                .allow_any_header()
-                .allow_any_origin()
-                .allow_any_method()
-                .max_age(86_400), // 24h
-        )
-        .wrap(actix_web::middleware::Logger::default())
-        .wrap(actix_web::middleware::Compress::default())
-        .wrap(actix_web::middleware::NormalizePath::new(actix_web::middleware::TrailingSlash::Trim))
+        .configure(|s| dashboard(s, enable_dashboard));
+
+    let app = app.wrap(actix_web::middleware::Condition::new(
+        opt.experimental_enable_metrics,
+        middleware::RouteMetrics,
+    ));
+    app.wrap(
+        Cors::default()
+            .send_wildcard()
+            .allow_any_header()
+            .allow_any_origin()
+            .allow_any_method()
+            .max_age(86_400), // 24h
+    )
+    .wrap(actix_web::middleware::Logger::default())
+    .wrap(actix_web::middleware::Compress::default())
+    .wrap(actix_web::middleware::NormalizePath::new(actix_web::middleware::TrailingSlash::Trim))
 }
 
 enum OnFailure {

@@ -236,6 +236,7 @@ fn open_or_create_database_unchecked(
         enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
         indexer_config: (&opt.indexer_options).try_into()?,
         autobatching_enabled: true,
+        maximum_number_of_batched_tasks: opt.experimental_limit_batched_tasks,
         max_number_of_tasks: 1_000_000,
         index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
         index_count: DEFAULT_INDEX_COUNT,

@@ -30,13 +30,6 @@ fn setup(opt: &Opt) -> anyhow::Result<()> {
 async fn main() -> anyhow::Result<()> {
     let (opt, config_read_from) = Opt::try_build()?;
 
-    let _sentry = sentry::init(sentry::ClientOptions {
-        release: sentry::release_name!(),
-        session_mode: sentry::SessionMode::Request,
-        auto_session_tracking: true,
-        ..Default::default()
-    });
-
     #[cfg(feature = "profile-with-puffin")]
     let _server = puffin_http::Server::new(&format!("0.0.0.0:{}", puffin_http::DEFAULT_PORT))?;
     puffin::set_scopes_on(cfg!(feature = "profile-with-puffin"));

@@ -51,6 +51,7 @@ const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL";
 const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
 const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
     "MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
+const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS: &str = "MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS";
 
 const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
 const DEFAULT_DB_PATH: &str = "./data.ms";

@@ -301,6 +302,11 @@ pub struct Opt {
     #[serde(default)]
     pub experimental_reduce_indexing_memory_usage: bool,
 
+    /// Experimental limit to the number of tasks per batch
+    #[clap(long, env = MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS, default_value_t = default_limit_batched_tasks())]
+    #[serde(default = "default_limit_batched_tasks")]
+    pub experimental_limit_batched_tasks: usize,
+
     #[serde(flatten)]
     #[clap(flatten)]
     pub indexer_options: IndexerOpts,

@@ -393,7 +399,8 @@ impl Opt {
             #[cfg(all(not(debug_assertions), feature = "analytics"))]
             no_analytics,
             experimental_enable_metrics: enable_metrics_route,
-            experimental_reduce_indexing_memory_usage: reduce_indexing_memory_usage,
+            experimental_reduce_indexing_memory_usage,
+            experimental_limit_batched_tasks,
         } = self;
         export_to_env_if_not_present(MEILI_DB_PATH, db_path);
         export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr);

@@ -437,7 +444,11 @@ impl Opt {
         );
         export_to_env_if_not_present(
             MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE,
-            reduce_indexing_memory_usage.to_string(),
+            experimental_reduce_indexing_memory_usage.to_string(),
+        );
+        export_to_env_if_not_present(
+            MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS,
+            experimental_limit_batched_tasks.to_string(),
         );
         indexer_options.export_to_env();
     }

@@ -739,6 +750,10 @@ fn default_dump_dir() -> PathBuf {
     PathBuf::from(DEFAULT_DUMP_DIR)
 }
 
+fn default_limit_batched_tasks() -> usize {
+    usize::MAX
+}
+
 /// Indicates if a snapshot was scheduled, and if yes with which interval.
 #[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)]
 pub enum ScheduleSnapshot {

@@ -60,8 +60,7 @@ pub async fn swap_indexes(
     }
 
     let task = KindWithContent::IndexSwap { swaps };
-
-    let task = index_scheduler.register(task)?;
-    let task: SummarizedTaskView = task.into();
+    let task: SummarizedTaskView =
+        tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
     Ok(HttpResponse::Accepted().json(task))
 }

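The change above moves `index_scheduler.register(task)` into `tokio::task::spawn_blocking`, so that a blocking write does not stall the async executor. A reduced sketch of the pattern under that assumption; the closure body here is a stand-in, not Meilisearch's API:

```rust
// Run a blocking operation off the async runtime, then await its result.
// spawn_blocking returns a JoinHandle, hence the double `?` seen in the
// diff: the first unwraps the join result, the second the operation's own.
async fn register_off_executor() -> Result<u64, Box<dyn std::error::Error>> {
    let task_uid = tokio::task::spawn_blocking(|| {
        // stand-in for `index_scheduler.register(task)`, which performs a
        // blocking LMDB write
        Ok::<u64, std::io::Error>(42)
    })
    .await??;
    Ok(task_uid)
}
```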
@@ -154,6 +154,19 @@ async fn delete_document_by_filter() {
     )
     .await;
     index.wait_task(1).await;
 
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 4,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 3,
+        "id": 4
+      }
+    }
+    "###);
+
     let (response, code) =
         index.delete_document_by_filter(json!({ "filter": "color = blue"})).await;
     snapshot!(code, @"202 Accepted");

@@ -188,6 +201,18 @@ async fn delete_document_by_filter() {
     }
     "###);
 
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 2,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 1,
+        "id": 2
+      }
+    }
+    "###);
+
     let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(documents), @r###"

@@ -241,6 +266,18 @@ async fn delete_document_by_filter() {
     }
     "###);
 
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 1,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 1,
+        "id": 1
+      }
+    }
+    "###);
+
     let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
     snapshot!(code, @"200 OK");
     snapshot!(json_string!(documents), @r###"

@@ -1104,3 +1104,59 @@ async fn camelcased_words() {
         })
         .await;
 }
+
+#[actix_rt::test]
+async fn simple_search_with_strange_synonyms() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.update_settings(json!({ "synonyms": {"&": ["to"], "to": ["&"]} })).await;
+    let r = index.wait_task(0).await;
+    meili_snap::snapshot!(r["status"], @r###""succeeded""###);
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, None).await;
+    index.wait_task(1).await;
+
+    index
+        .search(json!({"q": "How to train"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "How & train"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "to"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+}

@@ -17,7 +17,8 @@ bincode = "1.3.3"
 bstr = "1.4.0"
 bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
 byteorder = "1.4.3"
-charabia = { version = "0.8.3", default-features = false }
+# charabia = { version = "0.8.3", default-features = false }
+charabia = { git = "https://github.com/meilisearch/charabia", branch = "main", default-features = false }
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.8"
 deserr = { version = "0.6.0", features = ["actix-web"]}

@@ -418,19 +418,11 @@ impl<'t> Matcher<'t, '_> {
         } else {
             match &self.matches {
                 Some((tokens, matches)) => {
-                    // If the text has to be cropped,
-                    // compute the best interval to crop around.
-                    let matches = match format_options.crop {
-                        Some(crop_size) if crop_size > 0 => {
-                            self.find_best_match_interval(matches, crop_size)
-                        }
-                        _ => matches,
-                    };
-
                     // If the text has to be cropped,
                     // crop around the best interval.
                     let (byte_start, byte_end) = match format_options.crop {
                         Some(crop_size) if crop_size > 0 => {
+                            let matches = self.find_best_match_interval(matches, crop_size);
                             self.crop_bounds(tokens, matches, crop_size)
                         }
                         _ => (0, self.text.len()),

@@ -450,6 +442,11 @@ impl<'t> Matcher<'t, '_> {
                     for m in matches {
                         let token = &tokens[m.token_position];
 
+                        // skip matches out of the crop window.
+                        if token.byte_start < byte_start || token.byte_end > byte_end {
+                            continue;
+                        }
+
                         if byte_index < token.byte_start {
                             formatted.push(&self.text[byte_index..token.byte_start]);
                         }

@@ -800,6 +797,37 @@ mod tests {
         );
     }
 
+    #[test]
+    fn format_highlight_crop_phrase_query() {
+        //! testing: https://github.com/meilisearch/meilisearch/issues/3975
+        let temp_index = TempIndex::new();
+        temp_index
+            .add_documents(documents!([
+                { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
+            ]))
+            .unwrap();
+        let rtxn = temp_index.read_txn().unwrap();
+
+        let format_options = FormatOptions { highlight: true, crop: Some(10) };
+        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
+
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
+        let mut matcher = builder.build(text);
+        // should return 10 words with a marker at the start as well the end, and the highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…had the power to split <em>the</em> <em>world</em> between those who…"
+        );
+
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
+        let mut matcher = builder.build(text);
+        // should highlight "those" and the phrase "and those".
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
+        );
+    }
+
     #[test]
     fn smaller_crop_size() {
         //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295

@@ -226,9 +226,9 @@ fn process_tokens<'a>(
 ) -> impl Iterator<Item = (usize, Token<'a>)> {
     tokens
         .skip_while(|token| token.is_separator())
-        .scan((0, None), |(offset, prev_kind), token| {
+        .scan((0, None), |(offset, prev_kind), mut token| {
             match token.kind {
-                TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
+                TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
                     *offset += match *prev_kind {
                         Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                         Some(_) => 1,

@@ -244,7 +244,7 @@ fn process_tokens<'a>(
                 {
                     *prev_kind = Some(token.kind);
                 }
-                _ => (),
+                _ => token.kind = TokenKind::Unknown,
             }
             Some((*offset, token))
         })

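The rewrite above stops treating empty-lemma tokens as words: the match arm now guards on `!token.lemma().is_empty()`, and the fallback arm downgrades everything else to `TokenKind::Unknown`. A stripped-down sketch of that guard, over plain strings rather than charabia tokens (the types here are illustrative):

```rust
// Only a non-empty lemma still counts as an indexable word; anything else
// is treated as unknown and never advances the word-position counters.
#[derive(Debug, PartialEq)]
enum Kind {
    Word,
    Unknown,
}

fn classify(lemma: &str) -> Kind {
    if lemma.is_empty() {
        Kind::Unknown
    } else {
        Kind::Word
    }
}

fn main() {
    assert_eq!(classify("dragon"), Kind::Word);
    // A token normalized away to nothing is skipped instead of indexed.
    assert_eq!(classify(""), Kind::Unknown);
}
```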
@@ -59,7 +59,13 @@ pub(crate) fn data_from_obkv_documents(
         original_obkv_chunks
             .par_bridge()
             .map(|original_documents_chunk| {
-                send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
+                send_original_documents_data(
+                    original_documents_chunk,
+                    indexer,
+                    lmdb_writer_sx.clone(),
+                    vectors_field_id,
+                    primary_key_id,
+                )
             })
             .collect::<Result<()>>()?;

@@ -76,7 +82,6 @@ pub(crate) fn data_from_obkv_documents(
                 &faceted_fields,
                 primary_key_id,
                 geo_fields_ids,
-                vectors_field_id,
                 &stop_words,
                 &allowed_separators,
                 &dictionary,

@@ -265,11 +270,33 @@ fn spawn_extraction_task<FE, FS, M>(
 /// - documents
 fn send_original_documents_data(
     original_documents_chunk: Result<grenad::Reader<File>>,
+    indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
+    vectors_field_id: Option<FieldId>,
+    primary_key_id: FieldId,
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
 
+    if let Some(vectors_field_id) = vectors_field_id {
+        let documents_chunk_cloned = original_documents_chunk.clone();
+        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
+        rayon::spawn(move || {
+            let result = extract_vector_points(
+                documents_chunk_cloned,
+                indexer,
+                primary_key_id,
+                vectors_field_id,
+            );
+            let _ = match result {
+                Ok(vector_points) => {
+                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
+                }
+                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
+            };
+        });
+    }
+
     // TODO: create a custom internal error
     lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
     Ok(())

@@ -291,7 +318,6 @@ fn send_and_extract_flattened_documents_data(
     faceted_fields: &HashSet<FieldId>,
     primary_key_id: FieldId,
     geo_fields_ids: Option<(FieldId, FieldId)>,
-    vectors_field_id: Option<FieldId>,
     stop_words: &Option<fst::Set<&[u8]>>,
     allowed_separators: &Option<&[&str]>,
     dictionary: &Option<&[&str]>,

@@ -322,25 +348,6 @@ fn send_and_extract_flattened_documents_data(
         });
     }
 
-    if let Some(vectors_field_id) = vectors_field_id {
-        let documents_chunk_cloned = flattened_documents_chunk.clone();
-        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
-        rayon::spawn(move || {
-            let result = extract_vector_points(
-                documents_chunk_cloned,
-                indexer,
-                primary_key_id,
-                vectors_field_id,
-            );
-            let _ = match result {
-                Ok(vector_points) => {
-                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
-                }
-                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
-            };
-        });
-    }
-
     let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {

@@ -2550,6 +2550,25 @@
         db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
     }
 
+    /// Index multiple different number of vectors in documents.
+    /// Vectors must be of the same length.
+    #[test]
+    fn test_multiple_vectors() {
+        let index = TempIndex::new();
+
+        index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap();
+        index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap();
+        index
+            .add_documents(
+                documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]),
+            )
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap();
+        assert_eq!(res.documents_ids.len(), 3);
+    }
+
     #[test]
     fn reproduce_the_bug() {
         /*

@@ -573,7 +573,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             tokenizer
                 .tokenize(text)
                 .filter_map(|token| {
-                    if token.is_word() {
+                    if token.is_word() && !token.lemma().is_empty() {
                         Some(token.lemma().to_string())
                     } else {
                         None

@@ -608,13 +608,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
             for (word, synonyms) in user_synonyms {
                 // Normalize both the word and associated synonyms.
                 let normalized_word = normalize(&tokenizer, word);
-                let normalized_synonyms =
-                    synonyms.iter().map(|synonym| normalize(&tokenizer, synonym));
+                let normalized_synonyms: Vec<_> = synonyms
+                    .iter()
+                    .map(|synonym| normalize(&tokenizer, synonym))
+                    .filter(|synonym| !synonym.is_empty())
+                    .collect();
 
                 // Store the normalized synonyms under the normalized word,
                 // merging the possible duplicate words.
-                let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
-                entry.extend(normalized_synonyms);
+                if !normalized_word.is_empty() && !normalized_synonyms.is_empty() {
+                    let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
+                    entry.extend(normalized_synonyms.into_iter());
+                }
             }
 
             // Make sure that we don't have duplicate synonyms.

@@ -1422,6 +1427,43 @@ mod tests {
         assert!(result.documents_ids.is_empty());
     }
 
+    #[test]
+    fn thai_synonyms() {
+        let mut index = TempIndex::new();
+        index.index_documents_config.autogenerate_docids = true;
+
+        let mut wtxn = index.write_txn().unwrap();
+        // Send 3 documents with ids from 1 to 3.
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "name": "ยี่ปุ่น" },
+                    { "name": "ญี่ปุ่น" },
+                ]),
+            )
+            .unwrap();
+
+        // In the same transaction provide some synonyms
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_synonyms(btreemap! {
+                    "japanese".to_string() => vec![S("ญี่ปุ่น"), S("ยี่ปุ่น")],
+                });
+            })
+            .unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure synonyms are effectively stored
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(!synonyms.is_empty()); // at this point the index should return something
+
+        // Check that we can use synonyms
+        let result = index.search(&rtxn).query("japanese").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+    }
+
     #[test]
     fn setting_searchable_recomputes_other_settings() {
         let index = TempIndex::new();

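Both settings hunks above guard against empty normalizations: a synonym, or the word itself, that normalizes to the empty string is now dropped instead of stored. A small self-contained sketch of the filtering rule (the map and function names here are stand-ins for the diff's `new_synonyms` logic):

```rust
use std::collections::BTreeMap;

// Keep an entry only when the normalized word and at least one normalized
// synonym survive as non-empty strings, merging duplicates as in the diff.
fn insert_synonyms(
    map: &mut BTreeMap<String, Vec<String>>,
    normalized_word: String,
    normalized_synonyms: Vec<String>,
) {
    let synonyms: Vec<String> =
        normalized_synonyms.into_iter().filter(|s| !s.is_empty()).collect();
    if !normalized_word.is_empty() && !synonyms.is_empty() {
        map.entry(normalized_word).or_insert_with(Vec::new).extend(synonyms);
    }
}

fn main() {
    let mut map = BTreeMap::new();
    // The empty synonym is filtered out; the entry survives.
    insert_synonyms(&mut map, "japanese".into(), vec!["ญี่ปุ่น".into(), String::new()]);
    // A word that normalizes to nothing produces no entry at all.
    insert_synonyms(&mut map, String::new(), vec!["dropped".into()]);
    assert_eq!(map.len(), 1);
    assert_eq!(map["japanese"], vec!["ญี่ปุ่น".to_string()]);
}
```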
@@ -186,12 +186,16 @@ fn create_value(value: &Document, mut selectors: HashSet<&str>) -> Document {
                 let array = create_array(array, &sub_selectors);
                 if !array.is_empty() {
                     new_value.insert(key.to_string(), array.into());
+                } else {
+                    new_value.insert(key.to_string(), Value::Array(vec![]));
                 }
             }
             Value::Object(object) => {
                 let object = create_value(object, sub_selectors);
                 if !object.is_empty() {
                     new_value.insert(key.to_string(), object.into());
+                } else {
+                    new_value.insert(key.to_string(), Value::Object(Map::new()));
                 }
             }
             _ => (),

@@ -211,6 +215,8 @@ fn create_array(array: &[Value], selectors: &HashSet<&str>) -> Vec<Value> {
                 let array = create_array(array, selectors);
                 if !array.is_empty() {
                     res.push(array.into());
+                } else {
+                    res.push(Value::Array(vec![]));
                 }
             }
             Value::Object(object) => {

@@ -637,6 +643,24 @@ mod tests {
         );
     }
 
+    #[test]
+    fn empty_array_object_return_empty() {
+        let value: Value = json!({
+            "array": [],
+            "object": {},
+        });
+        let value: &Document = value.as_object().unwrap();
+
+        let res: Value = select_values(value, vec!["array.name", "object.name"]).into();
+        assert_eq!(
+            res,
+            json!({
+                "array": [],
+                "object": {},
+            })
+        );
+    }
+
     #[test]
     fn all_conflict_variation() {
         let value: Value = json!({