Activate only the necessary features for Japanese

Merge #4108
4108: Fix bug where search with distinct attribute and no ranking, returns offset+limit hits r=curquiza a=vivek-26 # Pull Request ## Related issue Fixes #4078 ## What does this PR do? This PR - - Fixes bug where search with distinct attribute and no ranking, returns offset+limit hits. - Adds unit and integration tests. ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: Vivek Kumar <vivek.26@outlook.com>
2025-12-28 07:26:56 +00:00 · 2023-10-16 11:31:59 +02:00 · 2023-10-12 07:51:29 +00:00 · 2023-10-11 14:27:00 +00:00 · 2023-10-11 19:12:56 +05:30 · 2023-10-11 19:02:27 +05:30
40 changed files with 636 additions and 222 deletions
--- a/.github/workflows/publish-apt-brew-pkg.yml
+++ b/.github/workflows/publish-apt-brew-pkg.yml
@@ -53,5 +53,6 @@ jobs:
        uses: mislav/bump-homebrew-formula-action@v2
        with:
          formula-name: meilisearch
+          formula-path: Formula/m/meilisearch.rb
        env:
          COMMITTER_TOKEN: ${{ secrets.HOMEBREW_COMMITTER_TOKEN }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -468,7 +468,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"

 [[package]]
 name = "benchmarks"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "anyhow",
 "bytes",
@@ -1206,7 +1206,7 @@ dependencies = [

 [[package]]
 name = "dump"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "anyhow",
 "big_s",
@@ -1417,7 +1417,7 @@ dependencies = [

 [[package]]
 name = "file-store"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "faux",
 "tempfile",
@@ -1439,11 +1439,12 @@ dependencies = [

 [[package]]
 name = "filter-parser"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "insta",
 "nom",
 "nom_locate",
+ "unescaper",
 ]

 [[package]]
@@ -1458,7 +1459,7 @@ dependencies = [

 [[package]]
 name = "flatten-serde-json"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "criterion",
 "serde_json",
@@ -1576,7 +1577,7 @@ dependencies = [

 [[package]]
 name = "fuzzers"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "arbitrary",
 "clap",
@@ -1890,7 +1891,7 @@ dependencies = [

 [[package]]
 name = "index-scheduler"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "anyhow",
 "big_s",
@@ -2087,7 +2088,7 @@ dependencies = [

 [[package]]
 name = "json-depth-checker"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "criterion",
 "serde_json",
@@ -2499,7 +2500,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "meili-snap"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "insta",
 "md5",
@@ -2508,7 +2509,7 @@ dependencies = [

 [[package]]
 name = "meilisearch"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "actix-cors",
 "actix-http",
@@ -2599,7 +2600,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-auth"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "base64 0.21.2",
 "enum-iterator",
@@ -2618,7 +2619,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-types"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "actix-web",
 "anyhow",
@@ -2672,7 +2673,7 @@ dependencies = [

 [[package]]
 name = "milli"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "big_s",
 "bimap",
@@ -2994,7 +2995,7 @@ checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94"

 [[package]]
 name = "permissive-json-pointer"
-version = "1.4.0"
+version = "1.4.1"
 dependencies = [
 "big_s",
 "serde_json",
@@ -4180,6 +4181,15 @@ version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9"

+[[package]]
+name = "unescaper"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a96a44ae11e25afb520af4534fd7b0bd8cd613e35a78def813b8cf41631fa3c8"
+dependencies = [
+ "thiserror",
+]
+
 [[package]]
 name = "unicase"
 version = "2.6.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -18,7 +18,7 @@ members = [
 ]

 [workspace.package]
-version = "1.4.0"
+version = "1.4.1"
 authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
 description = "Meilisearch HTTP server"
 homepage = "https://meilisearch.com"
--- a/2
+++ b/2
@@ -17,7 +17,7 @@ RUN     set -eux; \
        if [ "$apkArch" = "aarch64" ]; then \
            export JEMALLOC_SYS_WITH_LG_PAGE=16; \
        fi && \
-        cargo build --release
+        cargo build --release --no-default-features --features "analytics mini-dashboard japanese"

 # Run
 FROM    alpine:3.16
--- a/filter-parser/Cargo.toml
+++ b/filter-parser/Cargo.toml
@@ -14,6 +14,7 @@ license.workspace = true
 [dependencies]
 nom = "7.1.3"
 nom_locate = "4.1.0"
+unescaper = "0.1.2"

 [dev-dependencies]
 insta = "1.29.0"
--- a/filter-parser/src/error.rs
+++ b/filter-parser/src/error.rs
@@ -62,6 +62,7 @@ pub enum ErrorKind<'a> {
    MisusedGeoRadius,
    MisusedGeoBoundingBox,
    InvalidPrimary,
+    InvalidEscapedNumber,
    ExpectedEof,
    ExpectedValue(ExpectedValueKind),
    MalformedValue,
@@ -147,6 +148,9 @@ impl<'a> Display for Error<'a> {
                let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
            }
+            ErrorKind::InvalidEscapedNumber => {
+                writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)?
+            }
            ErrorKind::ExpectedEof => {
                writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
            }
--- a/filter-parser/src/lib.rs
+++ b/filter-parser/src/lib.rs
@@ -545,6 +545,8 @@ impl<'a> std::fmt::Display for Token<'a> {

 #[cfg(test)]
 pub mod tests {
+    use FilterCondition as Fc;
+
    use super::*;

    /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element
@@ -556,14 +558,22 @@ pub mod tests {
        unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into()
    }

+    fn p(s: &str) -> impl std::fmt::Display + '_ {
+        Fc::parse(s).unwrap().unwrap()
+    }
+
+    #[test]
+    fn parse_escaped() {
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
+        // but it also works with other sequencies
+        insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
+    }
+
    #[test]
    fn parse() {
-        use FilterCondition as Fc;
-
-        fn p(s: &str) -> impl std::fmt::Display + '_ {
-            Fc::parse(s).unwrap().unwrap()
-        }
-
        // Test equal
        insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}");
        insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}");
--- a/filter-parser/src/value.rs
+++ b/filter-parser/src/value.rs
@@ -171,7 +171,24 @@ pub fn parse_value(input: Span) -> IResult<Token> {
        })
    })?;

-    Ok((input, value))
+    match unescaper::unescape(value.value()) {
+        Ok(content) => {
+            if content.len() != value.value().len() {
+                Ok((input, Token::new(value.original_span(), Some(content))))
+            } else {
+                Ok((input, value))
+            }
+        }
+        Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
+        Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::InvalidEscapedNumber,
+        ))),
+        Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::MalformedValue,
+        ))),
+    }
 }

 fn is_value_component(c: char) -> bool {
@@ -318,17 +335,17 @@ pub mod test {
            ("\"cha'nnel\"", "cha'nnel", false),
            ("I'm tamo", "I", false),
            // escaped thing but not quote
-            (r#""\\""#, r#"\\"#, false),
-            (r#""\\\\\\""#, r#"\\\\\\"#, false),
-            (r#""aa\\aa""#, r#"aa\\aa"#, false),
+            (r#""\\""#, r#"\"#, true),
+            (r#""\\\\\\""#, r#"\\\"#, true),
+            (r#""aa\\aa""#, r#"aa\aa"#, true),
            // with double quote
            (r#""Hello \"world\"""#, r#"Hello "world""#, true),
-            (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true),
+            (r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
            (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
            (r#""\"\"""#, r#""""#, true),
            // with simple quote
            (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
-            (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true),
+            (r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
            (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
            (r#"'\'\''"#, r#"''"#, true),
        ];
@@ -350,7 +367,14 @@ pub mod test {
                "Filter `{}` was not supposed to be escaped",
                input
            );
-            assert_eq!(token.value(), expected, "Filter `{}` failed.", input);
+            assert_eq!(
+                token.value(),
+                expected,
+                "Filter `{}` failed by giving `{}` instead of `{}`.",
+                input,
+                token.value(),
+                expected
+            );
        }
    }

--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -67,10 +67,6 @@ pub(crate) enum Batch {
        op: IndexOperation,
        must_create_index: bool,
    },
-    IndexDocumentDeletionByFilter {
-        index_uid: String,
-        task: Task,
-    },
    IndexCreation {
        index_uid: String,
        primary_key: Option<String>,
@@ -114,6 +110,10 @@ pub(crate) enum IndexOperation {
        documents: Vec<Vec<String>>,
        tasks: Vec<Task>,
    },
+    IndexDocumentDeletionByFilter {
+        index_uid: String,
+        task: Task,
+    },
    DocumentClear {
        index_uid: String,
        tasks: Vec<Task>,
@@ -155,7 +155,6 @@ impl Batch {
            | Batch::TaskDeletion(task)
            | Batch::Dump(task)
            | Batch::IndexCreation { task, .. }
-            | Batch::IndexDocumentDeletionByFilter { task, .. }
            | Batch::IndexUpdate { task, .. } => vec![task.uid],
            Batch::SnapshotCreation(tasks) | Batch::IndexDeletion { tasks, .. } => {
                tasks.iter().map(|task| task.uid).collect()
@@ -167,6 +166,7 @@ impl Batch {
                | IndexOperation::DocumentClear { tasks, .. } => {
                    tasks.iter().map(|task| task.uid).collect()
                }
+                IndexOperation::IndexDocumentDeletionByFilter { task, .. } => vec![task.uid],
                IndexOperation::SettingsAndDocumentOperation {
                    document_import_tasks: tasks,
                    settings_tasks: other,
@@ -194,8 +194,7 @@ impl Batch {
            IndexOperation { op, .. } => Some(op.index_uid()),
            IndexCreation { index_uid, .. }
            | IndexUpdate { index_uid, .. }
-            | IndexDeletion { index_uid, .. }
-            | IndexDocumentDeletionByFilter { index_uid, .. } => Some(index_uid),
+            | IndexDeletion { index_uid, .. } => Some(index_uid),
        }
    }
 }
@@ -205,6 +204,7 @@ impl IndexOperation {
        match self {
            IndexOperation::DocumentOperation { index_uid, .. }
            | IndexOperation::DocumentDeletion { index_uid, .. }
+            | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
            | IndexOperation::DocumentClear { index_uid, .. }
            | IndexOperation::Settings { index_uid, .. }
            | IndexOperation::DocumentClearAndSetting { index_uid, .. }
@@ -239,9 +239,12 @@ impl IndexScheduler {
                let task = self.get_task(rtxn, id)?.ok_or(Error::CorruptedTaskQueue)?;
                match &task.kind {
                    KindWithContent::DocumentDeletionByFilter { index_uid, .. } => {
-                        Ok(Some(Batch::IndexDocumentDeletionByFilter {
-                            index_uid: index_uid.clone(),
-                            task,
+                        Ok(Some(Batch::IndexOperation {
+                            op: IndexOperation::IndexDocumentDeletionByFilter {
+                                index_uid: index_uid.clone(),
+                                task,
+                            },
+                            must_create_index: false,
                        }))
                    }
                    _ => unreachable!(),
@@ -896,51 +899,6 @@ impl IndexScheduler {

                Ok(tasks)
            }
-            Batch::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
-                let (index_uid, filter) =
-                    if let KindWithContent::DocumentDeletionByFilter { index_uid, filter_expr } =
-                        &task.kind
-                    {
-                        (index_uid, filter_expr)
-                    } else {
-                        unreachable!()
-                    };
-                let index = {
-                    let rtxn = self.env.read_txn()?;
-                    self.index_mapper.index(&rtxn, index_uid)?
-                };
-                let deleted_documents = delete_document_by_filter(filter, index);
-                let original_filter = if let Some(Details::DocumentDeletionByFilter {
-                    original_filter,
-                    deleted_documents: _,
-                }) = task.details
-                {
-                    original_filter
-                } else {
-                    // In the case of a `documentDeleteByFilter` the details MUST be set
-                    unreachable!();
-                };
-
-                match deleted_documents {
-                    Ok(deleted_documents) => {
-                        task.status = Status::Succeeded;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(deleted_documents),
-                        });
-                    }
-                    Err(e) => {
-                        task.status = Status::Failed;
-                        task.details = Some(Details::DocumentDeletionByFilter {
-                            original_filter,
-                            deleted_documents: Some(0),
-                        });
-                        task.error = Some(e.into());
-                    }
-                }
-
-                Ok(vec![task])
-            }
            Batch::IndexCreation { index_uid, primary_key, task } => {
                let wtxn = self.env.write_txn()?;
                if self.index_mapper.exists(&wtxn, &index_uid)? {
@@ -1299,6 +1257,47 @@ impl IndexScheduler {

                Ok(tasks)
            }
+            IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
+                let filter =
+                    if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
+                        &task.kind
+                    {
+                        filter_expr
+                    } else {
+                        unreachable!()
+                    };
+                let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
+                let original_filter = if let Some(Details::DocumentDeletionByFilter {
+                    original_filter,
+                    deleted_documents: _,
+                }) = task.details
+                {
+                    original_filter
+                } else {
+                    // In the case of a `documentDeleteByFilter` the details MUST be set
+                    unreachable!();
+                };
+
+                match deleted_documents {
+                    Ok(deleted_documents) => {
+                        task.status = Status::Succeeded;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(deleted_documents),
+                        });
+                    }
+                    Err(e) => {
+                        task.status = Status::Failed;
+                        task.details = Some(Details::DocumentDeletionByFilter {
+                            original_filter,
+                            deleted_documents: Some(0),
+                        });
+                        task.error = Some(e.into());
+                    }
+                }
+
+                Ok(vec![task])
+            }
            IndexOperation::Settings { index_uid: _, settings, mut tasks } => {
                let indexer_config = self.index_mapper.indexer_config();
                let mut builder = milli::update::Settings::new(index_wtxn, index, indexer_config);
@@ -1498,23 +1497,22 @@ impl IndexScheduler {
    }
 }

-fn delete_document_by_filter(filter: &serde_json::Value, index: Index) -> Result<u64> {
+fn delete_document_by_filter<'a>(
+    wtxn: &mut RwTxn<'a, '_>,
+    filter: &serde_json::Value,
+    index: &'a Index,
+) -> Result<u64> {
    let filter = Filter::from_json(filter)?;
    Ok(if let Some(filter) = filter {
-        let mut wtxn = index.write_txn()?;
-
-        let candidates = filter.evaluate(&wtxn, &index).map_err(|err| match err {
+        let candidates = filter.evaluate(wtxn, index).map_err(|err| match err {
            milli::Error::UserError(milli::UserError::InvalidFilter(_)) => {
                Error::from(err).with_custom_error_code(Code::InvalidDocumentFilter)
            }
            e => e.into(),
        })?;
-        let mut delete_operation = DeleteDocuments::new(&mut wtxn, &index)?;
+        let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
        delete_operation.delete_documents(&candidates);
-        let deleted_documents =
-            delete_operation.execute().map(|result| result.deleted_documents)?;
-        wtxn.commit()?;
-        deleted_documents
+        delete_operation.execute().map(|result| result.deleted_documents)?
    } else {
        0
    })
--- a/meilisearch/src/routes/swap_indexes.rs
+++ b/meilisearch/src/routes/swap_indexes.rs
@@ -60,8 +60,7 @@ pub async fn swap_indexes(
    }

    let task = KindWithContent::IndexSwap { swaps };
-
-    let task = index_scheduler.register(task)?;
-    let task: SummarizedTaskView = task.into();
+    let task: SummarizedTaskView =
+        tokio::task::spawn_blocking(move || index_scheduler.register(task)).await??.into();
    Ok(HttpResponse::Accepted().json(task))
 }
--- a/meilisearch/tests/documents/delete_documents.rs
+++ b/meilisearch/tests/documents/delete_documents.rs
@@ -154,6 +154,19 @@ async fn delete_document_by_filter() {
        )
        .await;
    index.wait_task(1).await;
+
+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 4,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 3,
+        "id": 4
+      }
+    }
+    "###);
+
    let (response, code) =
        index.delete_document_by_filter(json!({ "filter": "color = blue"})).await;
    snapshot!(code, @"202 Accepted");
@@ -188,6 +201,18 @@ async fn delete_document_by_filter() {
    }
    "###);

+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 2,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 1,
+        "id": 2
+      }
+    }
+    "###);
+
    let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
    snapshot!(code, @"200 OK");
    snapshot!(json_string!(documents), @r###"
@@ -241,6 +266,18 @@ async fn delete_document_by_filter() {
    }
    "###);

+    let (stats, _) = index.stats().await;
+    snapshot!(json_string!(stats), @r###"
+    {
+      "numberOfDocuments": 1,
+      "isIndexing": false,
+      "fieldDistribution": {
+        "color": 1,
+        "id": 1
+      }
+    }
+    "###);
+
    let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await;
    snapshot!(code, @"200 OK");
    snapshot!(json_string!(documents), @r###"
--- a/meilisearch/tests/search/distinct.rs
+++ b/meilisearch/tests/search/distinct.rs
@@ -0,0 +1,63 @@
+use meili_snap::snapshot;
+use once_cell::sync::Lazy;
+use serde_json::{json, Value};
+
+use crate::common::Server;
+
+pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
+    json!([
+        {"productId": 1, "shopId": 1},
+        {"productId": 2, "shopId": 1},
+        {"productId": 3, "shopId": 2},
+        {"productId": 4, "shopId": 2},
+        {"productId": 5, "shopId": 3},
+        {"productId": 6, "shopId": 3},
+        {"productId": 7, "shopId": 4},
+        {"productId": 8, "shopId": 4},
+        {"productId": 9, "shopId": 5},
+        {"productId": 10, "shopId": 5}
+    ])
+});
+
+pub(self) static DOCUMENT_PRIMARY_KEY: &str = "productId";
+pub(self) static DOCUMENT_DISTINCT_KEY: &str = "shopId";
+
+/// testing: https://github.com/meilisearch/meilisearch/issues/4078
+#[actix_rt::test]
+async fn distinct_search_with_offset_no_ranking() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
+    index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
+    index.wait_task(1).await;
+
+    fn get_hits(response: Value) -> Vec<i64> {
+        let hits_array = response["hits"].as_array().unwrap();
+        hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_i64().unwrap()).collect::<Vec<_>>()
+    }
+
+    let (response, code) = index.search_post(json!({"limit": 2, "offset": 0})).await;
+    let hits = get_hits(response);
+    snapshot!(code, @"200 OK");
+    snapshot!(hits.len(), @"2");
+    snapshot!(format!("{:?}", hits), @"[1, 2]");
+
+    let (response, code) = index.search_post(json!({"limit": 2, "offset": 2})).await;
+    let hits = get_hits(response);
+    snapshot!(code, @"200 OK");
+    snapshot!(hits.len(), @"2");
+    snapshot!(format!("{:?}", hits), @"[3, 4]");
+
+    let (response, code) = index.search_post(json!({"limit": 10, "offset": 4})).await;
+    let hits = get_hits(response);
+    snapshot!(code, @"200 OK");
+    snapshot!(hits.len(), @"1");
+    snapshot!(format!("{:?}", hits), @"[5]");
+
+    let (response, code) = index.search_post(json!({"limit": 10, "offset": 5})).await;
+    let hits = get_hits(response);
+    snapshot!(code, @"200 OK");
+    snapshot!(hits.len(), @"0");
+}
--- a/meilisearch/tests/search/mod.rs
+++ b/meilisearch/tests/search/mod.rs
@@ -1,6 +1,7 @@
 // This modules contains all the test concerning search. Each particular feature of the search
 // should be tested in its own module to isolate tests and keep the tests readable.

+mod distinct;
 mod errors;
 mod facet_search;
 mod formatted;
@@ -1104,3 +1105,59 @@ async fn camelcased_words() {
        })
        .await;
 }
+
+#[actix_rt::test]
+async fn simple_search_with_strange_synonyms() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.update_settings(json!({ "synonyms": {"&": ["to"], "to": ["&"]} })).await;
+    let r = index.wait_task(0).await;
+    meili_snap::snapshot!(r["status"], @r###""succeeded""###);
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, None).await;
+    index.wait_task(1).await;
+
+    index
+        .search(json!({"q": "How to train"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "How & train"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "to"}), |response, code| {
+            meili_snap::snapshot!(code, @"200 OK");
+            meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
+            [
+              {
+                "title": "How to Train Your Dragon: The Hidden World",
+                "id": "166428"
+              }
+            ]
+            "###);
+        })
+        .await;
+}
--- a/milli/src/documents/enriched.rs
+++ b/milli/src/documents/enriched.rs
@@ -1,4 +1,5 @@
 use std::fs::File;
+use std::io::BufReader;
 use std::{io, str};

 use obkv::KvReader;
@@ -19,14 +20,14 @@ use crate::FieldId;
 pub struct EnrichedDocumentsBatchReader<R> {
    documents: DocumentsBatchReader<R>,
    primary_key: String,
-    external_ids: grenad::ReaderCursor<File>,
+    external_ids: grenad::ReaderCursor<BufReader<File>>,
 }

 impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
    pub fn new(
        documents: DocumentsBatchReader<R>,
        primary_key: String,
-        external_ids: grenad::Reader<File>,
+        external_ids: grenad::Reader<BufReader<File>>,
    ) -> Result<Self, Error> {
        if documents.documents_count() as u64 == external_ids.len() {
            Ok(EnrichedDocumentsBatchReader {
@@ -75,7 +76,7 @@ pub struct EnrichedDocument<'a> {
 pub struct EnrichedDocumentsBatchCursor<R> {
    documents: DocumentsBatchCursor<R>,
    primary_key: String,
-    external_ids: grenad::ReaderCursor<File>,
+    external_ids: grenad::ReaderCursor<BufReader<File>>,
 }

 impl<R> EnrichedDocumentsBatchCursor<R> {
--- a/milli/src/search/new/bucket_sort.rs
+++ b/milli/src/search/new/bucket_sort.rs
@@ -46,18 +46,27 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
        if let Some(distinct_fid) = distinct_fid {
            let mut excluded = RoaringBitmap::new();
            let mut results = vec![];
+            let mut skip = 0;
            for docid in universe.iter() {
-                if results.len() >= from + length {
+                if results.len() >= length {
                    break;
                }
                if excluded.contains(docid) {
                    continue;
                }
+
                distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
+                skip += 1;
+                if skip <= from {
+                    continue;
+                }
+
                results.push(docid);
            }
+
            let mut all_candidates = universe - excluded;
            all_candidates.extend(results.iter().copied());
+
            return Ok(BucketSortOutput {
                scores: vec![Default::default(); results.len()],
                docids: results,
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -418,19 +418,11 @@ impl<'t> Matcher<'t, '_> {
        } else {
            match &self.matches {
                Some((tokens, matches)) => {
-                    // If the text has to be cropped,
-                    // compute the best interval to crop around.
-                    let matches = match format_options.crop {
-                        Some(crop_size) if crop_size > 0 => {
-                            self.find_best_match_interval(matches, crop_size)
-                        }
-                        _ => matches,
-                    };
-
                    // If the text has to be cropped,
                    // crop around the best interval.
                    let (byte_start, byte_end) = match format_options.crop {
                        Some(crop_size) if crop_size > 0 => {
+                            let matches = self.find_best_match_interval(matches, crop_size);
                            self.crop_bounds(tokens, matches, crop_size)
                        }
                        _ => (0, self.text.len()),
@@ -450,6 +442,11 @@ impl<'t> Matcher<'t, '_> {
                        for m in matches {
                            let token = &tokens[m.token_position];

+                            // skip matches out of the crop window.
+                            if token.byte_start < byte_start || token.byte_end > byte_end {
+                                continue;
+                            }
+
                            if byte_index < token.byte_start {
                                formatted.push(&self.text[byte_index..token.byte_start]);
                            }
@@ -800,6 +797,37 @@ mod tests {
        );
    }

+    #[test]
+    fn format_highlight_crop_phrase_query() {
+        //! testing: https://github.com/meilisearch/meilisearch/issues/3975
+        let temp_index = TempIndex::new();
+        temp_index
+            .add_documents(documents!([
+                { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
+            ]))
+            .unwrap();
+        let rtxn = temp_index.read_txn().unwrap();
+
+        let format_options = FormatOptions { highlight: true, crop: Some(10) };
+        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
+
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
+        let mut matcher = builder.build(text);
+        // should return 10 words with a marker at the start as well the end, and the highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…had the power to split <em>the</em> <em>world</em> between those who…"
+        );
+
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
+        let mut matcher = builder.build(text);
+        // should highlight "those" and the phrase "and those".
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
+        );
+    }
+
    #[test]
    fn smaller_crop_size() {
        //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@@ -1,5 +1,6 @@
 use std::borrow::Cow;
 use std::fs::File;
+use std::io::BufReader;

 use grenad::CompressionType;
 use heed::types::ByteSlice;
@@ -30,7 +31,7 @@ pub struct FacetsUpdateBulk<'i> {
    facet_type: FacetType,
    field_ids: Vec<FieldId>,
    // None if level 0 does not need to be updated
-    new_data: Option<grenad::Reader<File>>,
+    new_data: Option<grenad::Reader<BufReader<File>>>,
 }

 impl<'i> FacetsUpdateBulk<'i> {
@@ -38,7 +39,7 @@ impl<'i> FacetsUpdateBulk<'i> {
        index: &'i Index,
        field_ids: Vec<FieldId>,
        facet_type: FacetType,
-        new_data: grenad::Reader<File>,
+        new_data: grenad::Reader<BufReader<File>>,
        group_size: u8,
        min_level_size: u8,
    ) -> FacetsUpdateBulk<'i> {
@@ -187,7 +188,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
        &self,
        field_id: FieldId,
        txn: &RoTxn,
-    ) -> Result<(Vec<grenad::Reader<File>>, RoaringBitmap)> {
+    ) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> {
        let mut all_docids = RoaringBitmap::new();
        let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
            for bitmap in bitmaps {
@@ -259,7 +260,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
        field_id: u16,
        level: u8,
        handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
-    ) -> Result<Vec<grenad::Reader<File>>> {
+    ) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
        if level == 0 {
            self.read_level_0(rtxn, field_id, handle_group)?;
            // Level 0 is already in the database
--- a/milli/src/update/facet/incremental.rs
+++ b/milli/src/update/facet/incremental.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::fs::File;
+use std::io::BufReader;

 use heed::types::{ByteSlice, DecodeIgnore};
 use heed::{BytesDecode, Error, RoTxn, RwTxn};
@@ -34,14 +35,14 @@ pub struct FacetsUpdateIncremental<'i> {
    index: &'i Index,
    inner: FacetsUpdateIncrementalInner,
    facet_type: FacetType,
-    new_data: grenad::Reader<File>,
+    new_data: grenad::Reader<BufReader<File>>,
 }

 impl<'i> FacetsUpdateIncremental<'i> {
    pub fn new(
        index: &'i Index,
        facet_type: FacetType,
-        new_data: grenad::Reader<File>,
+        new_data: grenad::Reader<BufReader<File>>,
        group_size: u8,
        min_level_size: u8,
        max_group_size: u8,
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -78,6 +78,7 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;

 use std::collections::BTreeSet;
 use std::fs::File;
+use std::io::BufReader;
 use std::iter::FromIterator;

 use charabia::normalizer::{Normalize, NormalizerOption};
@@ -108,13 +109,17 @@ pub struct FacetsUpdate<'i> {
    index: &'i Index,
    database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
    facet_type: FacetType,
-    new_data: grenad::Reader<File>,
+    new_data: grenad::Reader<BufReader<File>>,
    group_size: u8,
    max_group_size: u8,
    min_level_size: u8,
 }
 impl<'i> FacetsUpdate<'i> {
-    pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self {
+    pub fn new(
+        index: &'i Index,
+        facet_type: FacetType,
+        new_data: grenad::Reader<BufReader<File>>,
+    ) -> Self {
        let database = match facet_type {
            FacetType::String => index
                .facet_id_string_docids
--- a/milli/src/update/index_documents/enrich.rs
+++ b/milli/src/update/index_documents/enrich.rs
@@ -1,4 +1,4 @@
-use std::io::{Read, Seek};
+use std::io::{BufWriter, Read, Seek};
 use std::result::Result as StdResult;
 use std::{fmt, iter};

@@ -35,7 +35,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(

    let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index();

-    let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
+    let mut external_ids = tempfile::tempfile().map(BufWriter::new).map(grenad::Writer::new)?;
    let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH];

    // The primary key *field id* that has already been set for this index or the one
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -1,6 +1,7 @@
 use std::collections::{HashMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
+use std::io::BufReader;
 use std::{io, mem, str};

 use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
@@ -31,7 +32,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    allowed_separators: Option<&[&str]>,
    dictionary: Option<&[&str]>,
    max_positions_per_attributes: Option<u32>,
-) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
+) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
    puffin::profile_function!();

    let max_positions_per_attributes = max_positions_per_attributes
@@ -226,9 +227,9 @@ fn process_tokens<'a>(
 ) -> impl Iterator<Item = (usize, Token<'a>)> {
    tokens
        .skip_while(|token| token.is_separator())
-        .scan((0, None), |(offset, prev_kind), token| {
+        .scan((0, None), |(offset, prev_kind), mut token| {
            match token.kind {
-                TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
+                TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
                    *offset += match *prev_kind {
                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
                        Some(_) => 1,
@@ -244,7 +245,7 @@ fn process_tokens<'a>(
                {
                    *prev_kind = Some(token.kind);
                }
-                _ => (),
+                _ => token.kind = TokenKind::Unknown,
            }
            Some((*offset, token))
        })
--- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

 use heed::{BytesDecode, BytesEncode};

@@ -19,7 +19,7 @@ use crate::Result;
 pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
    docid_fid_facet_number: grenad::Reader<R>,
    indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

 use heed::BytesEncode;

@@ -17,7 +17,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
    docid_fid_facet_string: grenad::Reader<R>,
    indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -1,7 +1,7 @@
 use std::collections::{BTreeMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};
 use std::mem::size_of;

 use heed::zerocopy::AsBytes;
@@ -17,11 +17,11 @@ use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET

 /// The extracted facet values stored in grenad files by type.
 pub struct ExtractedFacetValues {
-    pub docid_fid_facet_numbers_chunk: grenad::Reader<File>,
-    pub docid_fid_facet_strings_chunk: grenad::Reader<File>,
-    pub fid_facet_is_null_docids_chunk: grenad::Reader<File>,
-    pub fid_facet_is_empty_docids_chunk: grenad::Reader<File>,
-    pub fid_facet_exists_docids_chunk: grenad::Reader<File>,
+    pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
+    pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
+    pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
+    pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
+    pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
 }

 /// Extracts the facet values of each faceted field of each document.
--- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@@ -1,6 +1,6 @@
 use std::collections::HashMap;
 use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

 use grenad::Sorter;

@@ -21,7 +21,7 @@ use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
 pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

 use concat_arrays::concat_arrays;
 use serde_json::Value;
@@ -18,7 +18,7 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
    indexer: GrenadParameters,
    primary_key_id: FieldId,
    (lat_fid, lng_fid): (FieldId, FieldId),
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
    puffin::profile_function!();

    let mut writer = create_writer(
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -1,6 +1,6 @@
 use std::convert::TryFrom;
 use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

 use bytemuck::cast_slice;
 use serde_json::{from_slice, Value};
@@ -18,7 +18,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
    indexer: GrenadParameters,
    primary_key_id: FieldId,
    vectors_fid: FieldId,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
    puffin::profile_function!();

    let mut writer = create_writer(
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -1,6 +1,6 @@
 use std::collections::HashSet;
 use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};
 use std::iter::FromIterator;

 use roaring::RoaringBitmap;
@@ -26,7 +26,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
    exact_attributes: &HashSet<FieldId>,
-) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
+) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();
--- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

 use super::helpers::{
    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
@@ -14,7 +14,7 @@ use crate::{relative_from_absolute_position, DocumentId, Result};
 pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -1,6 +1,7 @@
 use std::cmp::Ordering;
 use std::collections::{BinaryHeap, HashMap};
 use std::fs::File;
+use std::io::BufReader;
 use std::{cmp, io, mem, str, vec};

 use super::helpers::{
@@ -20,7 +21,7 @@ use crate::{DocumentId, Result};
 pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

 use super::helpers::{
    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
@@ -17,7 +17,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu
 pub fn extract_word_position_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
    puffin::profile_function!();

    let max_memory = indexer.max_memory_by_thread();
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -12,6 +12,7 @@ mod extract_word_position_docids;

 use std::collections::HashSet;
 use std::fs::File;
+use std::io::BufReader;

 use crossbeam_channel::Sender;
 use log::debug;
@@ -39,8 +40,8 @@ use crate::{FieldId, Result};
 /// Send data in grenad file over provided Sender.
 #[allow(clippy::too_many_arguments)]
 pub(crate) fn data_from_obkv_documents(
-    original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
-    flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<File>>> + Send,
+    original_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
+    flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send,
    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
    searchable_fields: Option<HashSet<FieldId>>,
@@ -59,7 +60,13 @@ pub(crate) fn data_from_obkv_documents(
    original_obkv_chunks
        .par_bridge()
        .map(|original_documents_chunk| {
-            send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
+            send_original_documents_data(
+                original_documents_chunk,
+                indexer,
+                lmdb_writer_sx.clone(),
+                vectors_field_id,
+                primary_key_id,
+            )
        })
        .collect::<Result<()>>()?;

@@ -76,7 +83,6 @@ pub(crate) fn data_from_obkv_documents(
                    &faceted_fields,
                    primary_key_id,
                    geo_fields_ids,
-                    vectors_field_id,
                    &stop_words,
                    &allowed_separators,
                    &dictionary,
@@ -147,7 +153,7 @@ pub(crate) fn data_from_obkv_documents(
        });
    }

-    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
        docid_word_positions_chunks.clone(),
        indexer,
        lmdb_writer_sx.clone(),
@@ -157,7 +163,7 @@ pub(crate) fn data_from_obkv_documents(
        "word-pair-proximity-docids",
    );

-    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
        docid_word_positions_chunks.clone(),
        indexer,
        lmdb_writer_sx.clone(),
@@ -167,7 +173,11 @@ pub(crate) fn data_from_obkv_documents(
        "field-id-wordcount-docids",
    );

-    spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
+    spawn_extraction_task::<
+        _,
+        _,
+        Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)>,
+    >(
        docid_word_positions_chunks.clone(),
        indexer,
        lmdb_writer_sx.clone(),
@@ -180,7 +190,7 @@ pub(crate) fn data_from_obkv_documents(
        "word-docids",
    );

-    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
        docid_word_positions_chunks.clone(),
        indexer,
        lmdb_writer_sx.clone(),
@@ -189,7 +199,7 @@ pub(crate) fn data_from_obkv_documents(
        TypedChunk::WordPositionDocids,
        "word-position-docids",
    );
-    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
        docid_word_positions_chunks,
        indexer,
        lmdb_writer_sx.clone(),
@@ -199,7 +209,7 @@ pub(crate) fn data_from_obkv_documents(
        "word-fid-docids",
    );

-    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
        docid_fid_facet_strings_chunks,
        indexer,
        lmdb_writer_sx.clone(),
@@ -209,7 +219,7 @@ pub(crate) fn data_from_obkv_documents(
        "field-id-facet-string-docids",
    );

-    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
        docid_fid_facet_numbers_chunks,
        indexer,
        lmdb_writer_sx,
@@ -264,12 +274,34 @@ fn spawn_extraction_task<FE, FS, M>(
 /// Extract chunked data and send it into lmdb_writer_sx sender:
 /// - documents
 fn send_original_documents_data(
-    original_documents_chunk: Result<grenad::Reader<File>>,
+    original_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
+    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
+    vectors_field_id: Option<FieldId>,
+    primary_key_id: FieldId,
 ) -> Result<()> {
    let original_documents_chunk =
        original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;

+    if let Some(vectors_field_id) = vectors_field_id {
+        let documents_chunk_cloned = original_documents_chunk.clone();
+        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
+        rayon::spawn(move || {
+            let result = extract_vector_points(
+                documents_chunk_cloned,
+                indexer,
+                primary_key_id,
+                vectors_field_id,
+            );
+            let _ = match result {
+                Ok(vector_points) => {
+                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
+                }
+                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
+            };
+        });
+    }
+
    // TODO: create a custom internal error
    lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
    Ok(())
@@ -284,14 +316,13 @@ fn send_original_documents_data(
 #[allow(clippy::too_many_arguments)]
 #[allow(clippy::type_complexity)]
 fn send_and_extract_flattened_documents_data(
-    flattened_documents_chunk: Result<grenad::Reader<File>>,
+    flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>,
    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
    searchable_fields: &Option<HashSet<FieldId>>,
    faceted_fields: &HashSet<FieldId>,
    primary_key_id: FieldId,
    geo_fields_ids: Option<(FieldId, FieldId)>,
-    vectors_field_id: Option<FieldId>,
    stop_words: &Option<fst::Set<&[u8]>>,
    allowed_separators: &Option<&[&str]>,
    dictionary: &Option<&[&str]>,
@@ -302,7 +333,10 @@ fn send_and_extract_flattened_documents_data(
        grenad::Reader<CursorClonableMmap>,
        (
            grenad::Reader<CursorClonableMmap>,
-            (grenad::Reader<File>, (grenad::Reader<File>, grenad::Reader<File>)),
+            (
+                grenad::Reader<BufReader<File>>,
+                (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>),
+            ),
        ),
    ),
 )> {
@@ -322,25 +356,6 @@ fn send_and_extract_flattened_documents_data(
        });
    }

-    if let Some(vectors_field_id) = vectors_field_id {
-        let documents_chunk_cloned = flattened_documents_chunk.clone();
-        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
-        rayon::spawn(move || {
-            let result = extract_vector_points(
-                documents_chunk_cloned,
-                indexer,
-                primary_key_id,
-                vectors_field_id,
-            );
-            let _ = match result {
-                Ok(vector_points) => {
-                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
-                }
-                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
-            };
-        });
-    }
-
    let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
        rayon::join(
            || {
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -1,6 +1,6 @@
 use std::borrow::Cow;
 use std::fs::File;
-use std::io::{self, Seek};
+use std::io::{self, BufReader, BufWriter, Seek};
 use std::time::Instant;

 use grenad::{CompressionType, Sorter};
@@ -17,13 +17,13 @@ pub fn create_writer<R: io::Write>(
    typ: grenad::CompressionType,
    level: Option<u32>,
    file: R,
-) -> grenad::Writer<R> {
+) -> grenad::Writer<BufWriter<R>> {
    let mut builder = grenad::Writer::builder();
    builder.compression_type(typ);
    if let Some(level) = level {
        builder.compression_level(level);
    }
-    builder.build(file)
+    builder.build(BufWriter::new(file))
 }

 pub fn create_sorter(
@@ -53,7 +53,7 @@ pub fn create_sorter(
 pub fn sorter_into_reader(
    sorter: grenad::Sorter<MergeFn>,
    indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<grenad::Reader<BufReader<File>>> {
    let mut writer = create_writer(
        indexer.chunk_compression_type,
        indexer.chunk_compression_level,
@@ -64,16 +64,18 @@ pub fn sorter_into_reader(
    writer_into_reader(writer)
 }

-pub fn writer_into_reader(writer: grenad::Writer<File>) -> Result<grenad::Reader<File>> {
-    let mut file = writer.into_inner()?;
+pub fn writer_into_reader(
+    writer: grenad::Writer<BufWriter<File>>,
+) -> Result<grenad::Reader<BufReader<File>>> {
+    let mut file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
    file.rewind()?;
-    grenad::Reader::new(file).map_err(Into::into)
+    grenad::Reader::new(BufReader::new(file)).map_err(Into::into)
 }

 pub unsafe fn as_cloneable_grenad(
-    reader: &grenad::Reader<File>,
+    reader: &grenad::Reader<BufReader<File>>,
 ) -> Result<grenad::Reader<CursorClonableMmap>> {
-    let file = reader.get_ref();
+    let file = reader.get_ref().get_ref();
    let mmap = memmap2::Mmap::map(file)?;
    let cursor = io::Cursor::new(ClonableMmap::from(mmap));
    let reader = grenad::Reader::new(cursor)?;
@@ -89,8 +91,8 @@ where
    fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
 }

-impl MergeableReader for Vec<grenad::Reader<File>> {
-    type Output = grenad::Reader<File>;
+impl MergeableReader for Vec<grenad::Reader<BufReader<File>>> {
+    type Output = grenad::Reader<BufReader<File>>;

    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
        let mut merger = MergerBuilder::new(merge_fn);
@@ -99,8 +101,8 @@ impl MergeableReader for Vec<grenad::Reader<File>> {
    }
 }

-impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
-    type Output = (grenad::Reader<File>, grenad::Reader<File>);
+impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
+    type Output = (grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>);

    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
        let mut m1 = MergerBuilder::new(merge_fn);
@@ -125,7 +127,7 @@ impl<R: io::Read + io::Seek> MergerBuilder<R> {
        Ok(())
    }

-    fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
+    fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<BufReader<File>>> {
        let merger = self.0.build();
        let mut writer = create_writer(
            params.chunk_compression_type,
@@ -176,7 +178,7 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
    reader: grenad::Reader<R>,
    indexer: GrenadParameters,
    documents_chunk_size: usize,
-) -> Result<impl Iterator<Item = Result<grenad::Reader<File>>>> {
+) -> Result<impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>>> {
    let mut continue_reading = true;
    let mut cursor = reader.into_cursor()?;

--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2550,6 +2550,25 @@ mod tests {
        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
    }

+    /// Index multiple different number of vectors in documents.
+    /// Vectors must be of the same length.
+    #[test]
+    fn test_multiple_vectors() {
+        let index = TempIndex::new();
+
+        index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap();
+        index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap();
+        index
+            .add_documents(
+                documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]),
+            )
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap();
+        assert_eq!(res.documents_ids.len(), 3);
+    }
+
    #[test]
    fn reproduce_the_bug() {
        /*
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -659,8 +659,10 @@ impl<'a, 'i> Transform<'a, 'i> {
            new_documents_ids: self.new_documents_ids,
            replaced_documents_ids: self.replaced_documents_ids,
            documents_count: self.documents_count,
-            original_documents,
-            flattened_documents,
+            original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
+            flattened_documents: flattened_documents
+                .into_inner()
+                .map_err(|err| err.into_error())?,
        })
    }

@@ -779,8 +781,10 @@ impl<'a, 'i> Transform<'a, 'i> {
            new_documents_ids: documents_ids,
            replaced_documents_ids: RoaringBitmap::default(),
            documents_count,
-            original_documents,
-            flattened_documents,
+            original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
+            flattened_documents: flattened_documents
+                .into_inner()
+                .map_err(|err| err.into_error())?,
        };

        let new_facets = output.compute_real_facets(wtxn, self.index)?;
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -2,7 +2,7 @@ use std::borrow::Cow;
 use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
-use std::io;
+use std::io::{self, BufReader};

 use bytemuck::allocation::pod_collect_to_vec;
 use charabia::{Language, Script};
@@ -27,22 +27,22 @@ pub(crate) enum TypedChunk {
    FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
    FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
    Documents(grenad::Reader<CursorClonableMmap>),
-    FieldIdWordcountDocids(grenad::Reader<File>),
+    FieldIdWordcountDocids(grenad::Reader<BufReader<File>>),
    NewDocumentsIds(RoaringBitmap),
    WordDocids {
-        word_docids_reader: grenad::Reader<File>,
-        exact_word_docids_reader: grenad::Reader<File>,
+        word_docids_reader: grenad::Reader<BufReader<File>>,
+        exact_word_docids_reader: grenad::Reader<BufReader<File>>,
    },
-    WordPositionDocids(grenad::Reader<File>),
-    WordFidDocids(grenad::Reader<File>),
-    WordPairProximityDocids(grenad::Reader<File>),
-    FieldIdFacetStringDocids(grenad::Reader<File>),
-    FieldIdFacetNumberDocids(grenad::Reader<File>),
-    FieldIdFacetExistsDocids(grenad::Reader<File>),
-    FieldIdFacetIsNullDocids(grenad::Reader<File>),
-    FieldIdFacetIsEmptyDocids(grenad::Reader<File>),
-    GeoPoints(grenad::Reader<File>),
-    VectorPoints(grenad::Reader<File>),
+    WordPositionDocids(grenad::Reader<BufReader<File>>),
+    WordFidDocids(grenad::Reader<BufReader<File>>),
+    WordPairProximityDocids(grenad::Reader<BufReader<File>>),
+    FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
+    FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
+    FieldIdFacetExistsDocids(grenad::Reader<BufReader<File>>),
+    FieldIdFacetIsNullDocids(grenad::Reader<BufReader<File>>),
+    FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
+    GeoPoints(grenad::Reader<BufReader<File>>),
+    VectorPoints(grenad::Reader<BufReader<File>>),
    ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
 }

--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -1,6 +1,6 @@
 use std::borrow::Cow;
 use std::collections::HashSet;
-use std::io::BufReader;
+use std::io::{BufReader, BufWriter};

 use grenad::CompressionType;
 use heed::types::ByteSlice;
@@ -119,9 +119,9 @@ pub fn insert_into_database(
 pub fn write_into_lmdb_database_without_merging(
    wtxn: &mut heed::RwTxn,
    database: heed::PolyDatabase,
-    writer: grenad::Writer<std::fs::File>,
+    writer: grenad::Writer<BufWriter<std::fs::File>>,
 ) -> Result<()> {
-    let file = writer.into_inner()?;
+    let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
    let reader = grenad::Reader::new(BufReader::new(file))?;
    if database.is_empty(wtxn)? {
        let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -573,7 +573,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                    tokenizer
                        .tokenize(text)
                        .filter_map(|token| {
-                            if token.is_word() {
+                            if token.is_word() && !token.lemma().is_empty() {
                                Some(token.lemma().to_string())
                            } else {
                                None
@@ -608,13 +608,18 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                for (word, synonyms) in user_synonyms {
                    // Normalize both the word and associated synonyms.
                    let normalized_word = normalize(&tokenizer, word);
-                    let normalized_synonyms =
-                        synonyms.iter().map(|synonym| normalize(&tokenizer, synonym));
+                    let normalized_synonyms: Vec<_> = synonyms
+                        .iter()
+                        .map(|synonym| normalize(&tokenizer, synonym))
+                        .filter(|synonym| !synonym.is_empty())
+                        .collect();

                    // Store the normalized synonyms under the normalized word,
                    // merging the possible duplicate words.
-                    let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
-                    entry.extend(normalized_synonyms);
+                    if !normalized_word.is_empty() && !normalized_synonyms.is_empty() {
+                        let entry = new_synonyms.entry(normalized_word).or_insert_with(Vec::new);
+                        entry.extend(normalized_synonyms.into_iter());
+                    }
                }

                // Make sure that we don't have duplicate synonyms.
@@ -1422,6 +1427,43 @@ mod tests {
        assert!(result.documents_ids.is_empty());
    }

+    #[test]
+    fn thai_synonyms() {
+        let mut index = TempIndex::new();
+        index.index_documents_config.autogenerate_docids = true;
+
+        let mut wtxn = index.write_txn().unwrap();
+        // Send 3 documents with ids from 1 to 3.
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "name": "ยี่ปุ่น" },
+                    { "name": "ญี่ปุ่น" },
+                ]),
+            )
+            .unwrap();
+
+        // In the same transaction provide some synonyms
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_synonyms(btreemap! {
+                    "japanese".to_string() => vec![S("ญี่ปุ่น"), S("ยี่ปุ่น")],
+                });
+            })
+            .unwrap();
+        wtxn.commit().unwrap();
+
+        // Ensure synonyms are effectively stored
+        let rtxn = index.read_txn().unwrap();
+        let synonyms = index.synonyms(&rtxn).unwrap();
+        assert!(!synonyms.is_empty()); // at this point the index should return something
+
+        // Check that we can use synonyms
+        let result = index.search(&rtxn).query("japanese").execute().unwrap();
+        assert_eq!(result.documents_ids.len(), 2);
+    }
+
    #[test]
    fn setting_searchable_recomputes_other_settings() {
        let index = TempIndex::new();
--- a/milli/tests/search/distinct.rs
+++ b/milli/tests/search/distinct.rs
@@ -8,7 +8,7 @@ use Criterion::*;
 use crate::search::{self, EXTERNAL_DOCUMENTS_IDS};

 macro_rules! test_distinct {
-    ($func:ident, $distinct:ident, $exhaustive:ident, $limit:expr, $criteria:expr, $n_res:expr) => {
+    ($func:ident, $distinct:ident, $exhaustive:ident, $limit:expr, $offset:expr, $criteria:expr, $n_res:expr) => {
        #[test]
        fn $func() {
            let criteria = $criteria;
@@ -27,6 +27,7 @@ macro_rules! test_distinct {
            let mut search = Search::new(&rtxn, &index);
            search.query(search::TEST_QUERY);
            search.limit($limit);
+            search.offset($offset);
            search.exhaustive_number_hits($exhaustive);

            search.terms_matching_strategy(TermsMatchingStrategy::default());
@@ -47,6 +48,7 @@ macro_rules! test_distinct {
                            Some(d.id)
                        }
                    })
+                    .skip($offset)
                    .take($limit)
                    .collect();

@@ -61,6 +63,7 @@ test_distinct!(
    tag,
    true,
    1,
+    0,
    vec![Words, Typo, Proximity, Attribute, Exactness],
    3
 );
@@ -69,6 +72,7 @@ test_distinct!(
    asc_desc_rank,
    true,
    1,
+    0,
    vec![Words, Typo, Proximity, Attribute, Exactness],
    7
 );
@@ -77,6 +81,7 @@ test_distinct!(
    asc_desc_rank,
    true,
    0,
+    0,
    vec![Desc(S("attribute_rank")), Desc(S("exactness_rank")), Exactness, Typo],
    7
 );
@@ -86,6 +91,7 @@ test_distinct!(
    tag,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words, Typo, Proximity, Attribute, Exactness],
    3
 );
@@ -94,6 +100,7 @@ test_distinct!(
    asc_desc_rank,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words, Typo, Proximity, Attribute, Exactness],
    7
 );
@@ -102,6 +109,7 @@ test_distinct!(
    tag,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words],
    3
 );
@@ -110,6 +118,7 @@ test_distinct!(
    asc_desc_rank,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words],
    7
 );
@@ -118,6 +127,7 @@ test_distinct!(
    tag,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words, Typo],
    3
 );
@@ -126,6 +136,7 @@ test_distinct!(
    asc_desc_rank,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words, Typo],
    7
 );
@@ -134,6 +145,7 @@ test_distinct!(
    tag,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words, Proximity],
    3
 );
@@ -142,6 +154,7 @@ test_distinct!(
    asc_desc_rank,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words, Proximity],
    7
 );
@@ -150,6 +163,7 @@ test_distinct!(
    tag,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words, Attribute],
    3
 );
@@ -158,6 +172,7 @@ test_distinct!(
    asc_desc_rank,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words, Attribute],
    7
 );
@@ -166,6 +181,7 @@ test_distinct!(
    tag,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words, Exactness],
    3
 );
@@ -174,6 +190,47 @@ test_distinct!(
    asc_desc_rank,
    false,
    EXTERNAL_DOCUMENTS_IDS.len(),
+    0,
    vec![Words, Exactness],
    7
 );
+test_distinct!(
+    // testing: https://github.com/meilisearch/meilisearch/issues/4078
+    distinct_string_limit_and_offset,
+    tag,
+    false,
+    EXTERNAL_DOCUMENTS_IDS.len(),
+    1,
+    vec![],
+    2
+);
+test_distinct!(
+    // testing: https://github.com/meilisearch/meilisearch/issues/4078
+    exhaustive_distinct_string_limit_and_offset,
+    tag,
+    true,
+    1,
+    2,
+    vec![],
+    1
+);
+test_distinct!(
+    // testing: https://github.com/meilisearch/meilisearch/issues/4078
+    distinct_number_limit_and_offset,
+    asc_desc_rank,
+    false,
+    EXTERNAL_DOCUMENTS_IDS.len(),
+    2,
+    vec![],
+    5
+);
+test_distinct!(
+    // testing: https://github.com/meilisearch/meilisearch/issues/4078
+    exhaustive_distinct_number_limit_and_offset,
+    asc_desc_rank,
+    true,
+    2,
+    4,
+    vec![],
+    3
+);
--- a/permissive-json-pointer/src/lib.rs
+++ b/permissive-json-pointer/src/lib.rs
@@ -186,12 +186,16 @@ fn create_value(value: &Document, mut selectors: HashSet<&str>) -> Document {
                    let array = create_array(array, &sub_selectors);
                    if !array.is_empty() {
                        new_value.insert(key.to_string(), array.into());
+                    } else {
+                        new_value.insert(key.to_string(), Value::Array(vec![]));
                    }
                }
                Value::Object(object) => {
                    let object = create_value(object, sub_selectors);
                    if !object.is_empty() {
                        new_value.insert(key.to_string(), object.into());
+                    } else {
+                        new_value.insert(key.to_string(), Value::Object(Map::new()));
                    }
                }
                _ => (),
@@ -211,6 +215,8 @@ fn create_array(array: &[Value], selectors: &HashSet<&str>) -> Vec<Value> {
                let array = create_array(array, selectors);
                if !array.is_empty() {
                    res.push(array.into());
+                } else {
+                    res.push(Value::Array(vec![]));
                }
            }
            Value::Object(object) => {
@@ -637,6 +643,24 @@ mod tests {
        );
    }

+    #[test]
+    fn empty_array_object_return_empty() {
+        let value: Value = json!({
+            "array": [],
+            "object": {},
+        });
+        let value: &Document = value.as_object().unwrap();
+
+        let res: Value = select_values(value, vec!["array.name", "object.name"]).into();
+        assert_eq!(
+            res,
+            json!({
+                "array": [],
+                "object": {},
+            })
+        );
+    }
+
    #[test]
    fn all_conflict_variation() {
        let value: Value = json!({
Author	SHA1	Message	Date
ManyTheFish	7ed9ad024e	Activate only the necessary features for Japanese	2023-10-16 11:31:59 +02:00
meili-bors[bot]	f343ef5f2f	Merge #4108 4108: Fix bug where search with distinct attribute and no ranking, returns offset+limit hits r=curquiza a=vivek-26 # Pull Request ## Related issue Fixes #4078 ## What does this PR do? This PR - - Fixes bug where search with distinct attribute and no ranking, returns offset+limit hits. - Adds unit and integration tests. ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: Vivek Kumar <vivek.26@outlook.com>	2023-10-12 07:51:29 +00:00
meili-bors[bot]	67a678cfb6	Merge #4089 4089: Use a bufreader and bufwriter everytime there is a grenad<file> r=curquiza a=irevoire # Pull Request Wrap all the files we give to a grenad in a `BufReader` or `BufWriter`. The dump import I tried in the issue went from 2h to 10 minutes on my machine. I also ran a bunch of benchmarks on my machine, and we're faster by a few seconds everywhere but nothing huge. ----- The one thing I’m afraid about is if we used to get the inner file in a grenad and then do a read right after without a seek at the beginning of the file or a reopen. Since we now use a bufreader our read would return the bytes one buffer later and probably completely corrupt what we were supposed to read. From what I see, it looks like it works, but I may have missed something, I don't know much about this part of the codebase. This issue should not arise on the bufwriter, though, because if we're not able to write the content of the buffer I ensured that the `into_inner` of the bufwriter should return an internal error. ## Related issue Fixes #4087 Co-authored-by: Tamo <tamo@meilisearch.com>	2023-10-11 14:27:00 +00:00
Vivek Kumar	d1331d8abf	add integration test for distinct search with no ranking	2023-10-11 19:12:56 +05:30
Vivek Kumar	19ba129165	add unit test for distinct search with no ranking	2023-10-11 19:02:27 +05:30
Vivek Kumar	d4da06ff47	fix bug where distinct search with no ranking returns offset+limit hits	2023-10-11 19:02:16 +05:30
Tamo	c0f2724c2d	get rids of the new introduced error code in favor of an io::Error	2023-10-10 15:12:23 +02:00
Tamo	d772073dfa	use a bufreader everytime there is a grenad<file>	2023-10-10 15:00:30 +02:00
meili-bors[bot]	8fe8ddea79	Merge #4112 4112: Update version for the next release (v1.4.1) in Cargo.toml r=curquiza a=meili-bot ⚠️ This PR is automatically generated. Check the new version is the expected one and Cargo.lock has been updated before merging. Co-authored-by: curquiza <curquiza@users.noreply.github.com>	2023-10-10 09:05:10 +00:00
curquiza	8a95bf28e5	Update version for the next release (v1.4.1) in Cargo.toml	2023-10-10 09:01:45 +00:00
meili-bors[bot]	76c05d1b20	Merge #4053 4053: Fix the stats of the documents deletion by filter r=Kerollmops a=irevoire # Pull Request The issue was that the operation « DocumentDeletionByFilter » was not declared as an index operation. That means the index stats were not reprocessed after the application of the operation. ## Related issue Fixes #4018 ## What does this PR do? - Move the `DocumentDeletionByFilter` internal operation into the category of the `IndexOperation`. This means that the stats will automatically be re-processed after a batch is processed. - Update a test to ensure that the stats are valid after each operation ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: Tamo <tamo@meilisearch.com>	2023-09-11 15:53:26 +00:00
Tamo	34fac115d5	fix clippy	2023-09-11 17:15:57 +02:00
meili-bors[bot]	a09686fcbd	Merge #3997 3997: Refactor empty arrays/objects should return empty instead of null r=Kerollmops a=dogukanakkaya # Pull Request ## What does this PR do? At the moment if we select empty objects and array of object properties with dot notations like: ```json { "array": [], "object": {} } ``` ```rs GetDocumentOptions { fields: Some(vec!["array.name", "object.name"]) } ``` returns null if the array/object has no property yet. I am not sure if this is expected or it's the correct behaviour but I add my document with a property that is assigned to an empty array/object, later on when I select it, returns null which is kinda weird and unexpected in my opinion. This PR fixes that issue by returning an empty vector if the array is empty or an empty map if object is empty. This is not added for `permissive-json-pointer/src/lib.rs:224` because `create_array` loops over each item. Selecting a single property that is an object, in an array of objects would result other objects to be empty maps instead of none. ```json "doggos": [ { "jean": { "race": { "name": "bernese mountain", } } }, { "marc": { "age": 4, "race": { "name": "golden retriever", } } } ] ``` ```rs GetDocumentOptions { fields: Some(vec!["doggos.jean"]) } ``` Would result in `jean` object and an extra empty object for `marc`. ## PR checklist Please check if your PR fulfills the following requirements: - [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: dogukanakkaya <doguakkaya27@hotmail.com>	2023-09-11 13:46:02 +00:00
dogukanakkaya	393be40179	Refactor empty arrays/objects should return empty instead of null	2023-09-11 15:56:15 +03:00
meili-bors[bot]	487d493f49	Merge #4043 4043: Bring back hotfixes from v1.3.3 into v1.4.0 r=Kerollmops a=curquiza Co-authored-by: curquiza <curquiza@users.noreply.github.com> Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com> Co-authored-by: Kerollmops <clement@meilisearch.com> Co-authored-by: curquiza <clementine@meilisearch.com>	2023-09-11 12:27:34 +00:00
Tamo	9258e5b5bf	Fix the stats of the documents deletion by filter The issue was that the operation « DocumentDeletionByFilter » was not declared as an index operation. That means the indexes stats were not reprocessed after the application of the operation.	2023-09-11 14:04:10 +02:00
meili-bors[bot]	462b4654c4	Merge #4028 4028: Fix highlighting bug when searching for a phrase with cropping r=ManyTheFish a=vivek-26 # Pull Request ## Related issue Fixes #3975 ## What does this PR do? This PR - - Fixes the bug where searching only for a phrase (containing multiple words) along with cropping, highlighted only the first word of the phrase. - Adds unit test case for the above mentioned scenario. ## PR checklist Please check if your PR fulfills the following requirements: - [x] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [x] Have you read the contributing guidelines? - [x] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: Vivek Kumar <vivek.26@outlook.com>	2023-09-11 07:58:41 +00:00
Vivek Kumar	abfa7ded25	use a new temp index in the test	2023-09-08 12:32:47 +05:30
Vivek Kumar	f2837aaec2	add another test case	2023-09-08 11:39:54 +05:30
Vivek Kumar	11df155598	fix highlighting bug when searching for a phrase with cropping	2023-09-08 11:39:52 +05:30
curquiza	651657c03e	Fix git conflicts	2023-09-07 16:48:13 +02:00
meili-bors[bot]	b9ad59c969	Merge #4041 4041: Register the swap indexe task in a spawn blocking to be sure to never… r=ManyTheFish a=irevoire # Pull Request ## Related issue Fixes https://github.com/meilisearch/meilisearch/issues/4040 ## What does this PR do? - Register the swap indexes task in a spawn blocking task Co-authored-by: Tamo <tamo@meilisearch.com>	2023-09-07 10:22:01 +00:00
Tamo	66aa682e23	Register the swap indexe task in a spawn blocking to be sure to never block the main thread	2023-09-07 11:37:02 +02:00
meili-bors[bot]	256cf33bca	Merge #4039 4039: Fix multiple vectors dimensions r=ManyTheFish a=Kerollmops This PR fixes #4035, making providing multiple vectors in documents possible. This is fixed by extracting the vectors from the non-flattened version of the documents. Co-authored-by: Kerollmops <clement@meilisearch.com>	2023-09-07 09:25:58 +00:00
meili-bors[bot]	9945cbf9db	Merge #4038 4038: Fix filter escaping issues r=ManyTheFish a=Kerollmops This PR fixes #4034 by always escaping the sequences. Users must always put quotes (simple or double) to escape the filter values. Co-authored-by: Kerollmops <clement@meilisearch.com>	2023-09-06 12:29:29 +00:00
Kerollmops	03d0f628bd	Use the unescaper crate to unescape any char sequence	2023-09-06 13:59:45 +02:00
Kerollmops	ea78060916	Fix tests that were supposed to escape characters	2023-09-06 13:59:45 +02:00
Kerollmops	b42d48187a	Add a test case scenario	2023-09-06 13:59:44 +02:00
Kerollmops	679c0b0f97	Extract the vectors from the non-flattened version of the documents	2023-09-06 12:26:00 +02:00
Kerollmops	e02d0064bd	Add a test case scenario	2023-09-06 12:26:00 +02:00
meili-bors[bot]	7ef3572f11	Merge #4037 4037: Update version for the next release (v1.3.3) in Cargo.toml r=curquiza a=meili-bot ⚠️ This PR is automatically generated. Check the new version is the expected one and Cargo.lock has been updated before merging. Co-authored-by: curquiza <curquiza@users.noreply.github.com>	2023-09-06 09:50:58 +00:00
curquiza	93285041a9	Update version for the next release (v1.3.3) in Cargo.toml	2023-09-06 09:23:20 +00:00
meili-bors[bot]	dc3d9c90d9	Merge #3994 3994: Fix synonyms with separators r=Kerollmops a=ManyTheFish # Pull Request ## Related issue Fixes #3977 ## Available prototype ``` $ docker pull getmeili/meilisearch:prototype-fix-synonyms-with-separators-0 ``` ## What does this PR do? - add a new test - filter the empty synonyms after normalization Co-authored-by: ManyTheFish <many@meilisearch.com>	2023-09-05 14:42:46 +00:00
meili-bors[bot]	287cf25d39	Merge #4033 4033: Fix thai synonyms r=Kerollmops a=Kerollmops Fixes #4031 Co-authored-by: Kerollmops <clement@meilisearch.com> Co-authored-by: ManyTheFish <many@meilisearch.com>	2023-09-05 13:54:33 +00:00
ManyTheFish	66aa6d5871	Ignore tokens with empty normalized value during indexing process	2023-09-05 15:44:14 +02:00
Kerollmops	8ac5b765bc	Fix synonyms normalization	2023-09-04 16:12:48 +02:00
meili-bors[bot]	cea93e9a37	Merge #4016 4016: Define the full Homebrew formula path r=curquiza a=Kerollmops This PR fixes #4015 by defining the full Homebrew formula path. Co-authored-by: Clément Renault <clement@meilisearch.com>	2023-09-04 13:10:28 +00:00
Kerollmops	085aad0a94	Add a test	2023-09-04 14:39:33 +02:00
Clément Renault	6db80b0836	Define the full Homebrew formula path	2023-08-24 11:24:47 +02:00
ManyTheFish	8dc5acf998	Try fix	2023-08-08 16:52:36 +02:00
ManyTheFish	fc2590fc9d	Add a test	2023-08-08 16:43:08 +02:00