Merge #4039

4039: Fix multiple vectors dimensions r=ManyTheFish a=Kerollmops This PR fixes #4035, making providing multiple vectors in documents possible. This is fixed by extracting the vectors from the non-flattened version of the documents. Co-authored-by: Kerollmops <clement@meilisearch.com>
Merge #4038
2025-07-21 05:41:01 +00:00 · 2023-09-07 09:25:58 +00:00 · 2023-09-06 12:29:29 +00:00 · 2023-09-06 13:59:45 +02:00 · 2023-09-06 13:59:45 +02:00 · 2023-09-06 13:59:44 +02:00
8 changed files with 125 additions and 50 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -469,7 +469,7 @@ checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b"

 [[package]]
 name = "benchmarks"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "anyhow",
 "bytes",
@ -1199,7 +1199,7 @@ dependencies = [

 [[package]]
 name = "dump"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "anyhow",
 "big_s",
@ -1413,7 +1413,7 @@ dependencies = [

 [[package]]
 name = "file-store"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "faux",
 "tempfile",
@ -1435,11 +1435,12 @@ dependencies = [

 [[package]]
 name = "filter-parser"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "insta",
 "nom",
 "nom_locate",
+ "unescaper",
 ]

 [[package]]
@ -1454,7 +1455,7 @@ dependencies = [

 [[package]]
 name = "flatten-serde-json"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "criterion",
 "serde_json",
@ -1572,7 +1573,7 @@ dependencies = [

 [[package]]
 name = "fuzzers"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "arbitrary",
 "clap",
@ -1894,7 +1895,7 @@ dependencies = [

 [[package]]
 name = "index-scheduler"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "anyhow",
 "big_s",
@ -2081,7 +2082,7 @@ dependencies = [

 [[package]]
 name = "json-depth-checker"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "criterion",
 "serde_json",
@ -2493,7 +2494,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"

 [[package]]
 name = "meili-snap"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "insta",
 "md5",
@ -2502,7 +2503,7 @@ dependencies = [

 [[package]]
 name = "meilisearch"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "actix-cors",
 "actix-http",
@ -2591,7 +2592,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-auth"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "base64 0.21.2",
 "enum-iterator",
@ -2610,7 +2611,7 @@ dependencies = [

 [[package]]
 name = "meilisearch-types"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "actix-web",
 "anyhow",
@ -2664,7 +2665,7 @@ dependencies = [

 [[package]]
 name = "milli"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "big_s",
 "bimap",
@ -2994,7 +2995,7 @@ checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"

 [[package]]
 name = "permissive-json-pointer"
-version = "1.3.2"
+version = "1.3.3"
 dependencies = [
 "big_s",
 "serde_json",
@ -4147,6 +4148,15 @@ version = "0.1.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9e79c4d996edb816c91e4308506774452e55e95c3c9de07b6729e17e15a5ef81"

+[[package]]
+name = "unescaper"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a96a44ae11e25afb520af4534fd7b0bd8cd613e35a78def813b8cf41631fa3c8"
+dependencies = [
+ "thiserror",
+]
+
 [[package]]
 name = "unicase"
 version = "2.6.0"
--- a/Cargo.toml
+++ b/Cargo.toml
@ -18,7 +18,7 @@ members = [
 ]

 [workspace.package]
-version = "1.3.2"
+version = "1.3.3"
 authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
 description = "Meilisearch HTTP server"
 homepage = "https://meilisearch.com"
--- a/filter-parser/Cargo.toml
+++ b/filter-parser/Cargo.toml
@ -14,6 +14,7 @@ license.workspace = true
 [dependencies]
 nom = "7.1.3"
 nom_locate = "4.1.0"
+unescaper = "0.1.2"

 [dev-dependencies]
 insta = "1.29.0"
--- a/filter-parser/src/error.rs
+++ b/filter-parser/src/error.rs
@ -62,6 +62,7 @@ pub enum ErrorKind<'a> {
    MisusedGeoRadius,
    MisusedGeoBoundingBox,
    InvalidPrimary,
+    InvalidEscapedNumber,
    ExpectedEof,
    ExpectedValue(ExpectedValueKind),
    MalformedValue,
@ -147,6 +148,9 @@ impl<'a> Display for Error<'a> {
                let text = if input.trim().is_empty() { "but instead got nothing.".to_string() } else { format!("at `{}`.", escaped_input) };
                writeln!(f, "Was expecting an operation `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox` {}", text)?
            }
+            ErrorKind::InvalidEscapedNumber => {
+                writeln!(f, "Found an invalid escaped sequence number: `{}`.", escaped_input)?
+            }
            ErrorKind::ExpectedEof => {
                writeln!(f, "Found unexpected characters at the end of the filter: `{}`. You probably forgot an `OR` or an `AND` rule.", escaped_input)?
            }
--- a/filter-parser/src/lib.rs
+++ b/filter-parser/src/lib.rs
@ -545,6 +545,8 @@ impl<'a> std::fmt::Display for Token<'a> {

 #[cfg(test)]
 pub mod tests {
+    use FilterCondition as Fc;
+
    use super::*;

    /// Create a raw [Token]. You must specify the string that appear BEFORE your element followed by your element
@ -556,14 +558,22 @@ pub mod tests {
        unsafe { Span::new_from_raw_offset(offset, lines as u32, value, "") }.into()
    }

+    fn p(s: &str) -> impl std::fmt::Display + '_ {
+        Fc::parse(s).unwrap().unwrap()
+    }
+
+    #[test]
+    fn parse_escaped() {
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
+        insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
+        // but it also works with other sequencies
+        insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
+    }
+
    #[test]
    fn parse() {
-        use FilterCondition as Fc;
-
-        fn p(s: &str) -> impl std::fmt::Display + '_ {
-            Fc::parse(s).unwrap().unwrap()
-        }
-
        // Test equal
        insta::assert_display_snapshot!(p("channel = Ponce"), @"{channel} = {Ponce}");
        insta::assert_display_snapshot!(p("subscribers = 12"), @"{subscribers} = {12}");
--- a/filter-parser/src/value.rs
+++ b/filter-parser/src/value.rs
@ -171,7 +171,24 @@ pub fn parse_value(input: Span) -> IResult<Token> {
        })
    })?;

-    Ok((input, value))
+    match unescaper::unescape(value.value()) {
+        Ok(content) => {
+            if content.len() != value.value().len() {
+                Ok((input, Token::new(value.original_span(), Some(content))))
+            } else {
+                Ok((input, value))
+            }
+        }
+        Err(unescaper::Error::IncompleteStr(_)) => Err(nom::Err::Incomplete(nom::Needed::Unknown)),
+        Err(unescaper::Error::ParseIntError { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::InvalidEscapedNumber,
+        ))),
+        Err(unescaper::Error::InvalidChar { .. }) => Err(nom::Err::Error(Error::new_from_kind(
+            value.original_span(),
+            ErrorKind::MalformedValue,
+        ))),
+    }
 }

 fn is_value_component(c: char) -> bool {
@ -318,17 +335,17 @@ pub mod test {
            ("\"cha'nnel\"", "cha'nnel", false),
            ("I'm tamo", "I", false),
            // escaped thing but not quote
-            (r#""\\""#, r#"\\"#, false),
-            (r#""\\\\\\""#, r#"\\\\\\"#, false),
-            (r#""aa\\aa""#, r#"aa\\aa"#, false),
+            (r#""\\""#, r#"\"#, true),
+            (r#""\\\\\\""#, r#"\\\"#, true),
+            (r#""aa\\aa""#, r#"aa\aa"#, true),
            // with double quote
            (r#""Hello \"world\"""#, r#"Hello "world""#, true),
-            (r#""Hello \\\"world\\\"""#, r#"Hello \\"world\\""#, true),
+            (r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
            (r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
            (r#""\"\"""#, r#""""#, true),
            // with simple quote
            (r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
-            (r#"'Hello \\\'world\\\''"#, r#"Hello \\'world\\'"#, true),
+            (r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
            (r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
            (r#"'\'\''"#, r#"''"#, true),
        ];
@ -350,7 +367,14 @@ pub mod test {
                "Filter `{}` was not supposed to be escaped",
                input
            );
-            assert_eq!(token.value(), expected, "Filter `{}` failed.", input);
+            assert_eq!(
+                token.value(),
+                expected,
+                "Filter `{}` failed by giving `{}` instead of `{}`.",
+                input,
+                token.value(),
+                expected
+            );
        }
    }

--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@ -55,7 +55,13 @@ pub(crate) fn data_from_obkv_documents(
    original_obkv_chunks
        .par_bridge()
        .map(|original_documents_chunk| {
-            send_original_documents_data(original_documents_chunk, lmdb_writer_sx.clone())
+            send_original_documents_data(
+                original_documents_chunk,
+                indexer,
+                lmdb_writer_sx.clone(),
+                vectors_field_id,
+                primary_key_id,
+            )
        })
        .collect::<Result<()>>()?;

@ -72,7 +78,6 @@ pub(crate) fn data_from_obkv_documents(
                    &faceted_fields,
                    primary_key_id,
                    geo_fields_ids,
-                    vectors_field_id,
                    &stop_words,
                    max_positions_per_attributes,
                )
@ -257,11 +262,33 @@ fn spawn_extraction_task<FE, FS, M>(
 /// - documents
 fn send_original_documents_data(
    original_documents_chunk: Result<grenad::Reader<File>>,
+    indexer: GrenadParameters,
    lmdb_writer_sx: Sender<Result<TypedChunk>>,
+    vectors_field_id: Option<FieldId>,
+    primary_key_id: FieldId,
 ) -> Result<()> {
    let original_documents_chunk =
        original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;

+    if let Some(vectors_field_id) = vectors_field_id {
+        let documents_chunk_cloned = original_documents_chunk.clone();
+        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
+        rayon::spawn(move || {
+            let result = extract_vector_points(
+                documents_chunk_cloned,
+                indexer,
+                primary_key_id,
+                vectors_field_id,
+            );
+            let _ = match result {
+                Ok(vector_points) => {
+                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
+                }
+                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
+            };
+        });
+    }
+
    // TODO: create a custom internal error
    lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))).unwrap();
    Ok(())
@ -283,7 +310,6 @@ fn send_and_extract_flattened_documents_data(
    faceted_fields: &HashSet<FieldId>,
    primary_key_id: FieldId,
    geo_fields_ids: Option<(FieldId, FieldId)>,
-    vectors_field_id: Option<FieldId>,
    stop_words: &Option<fst::Set<&[u8]>>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<(
@ -312,25 +338,6 @@ fn send_and_extract_flattened_documents_data(
        });
    }

-    if let Some(vectors_field_id) = vectors_field_id {
-        let documents_chunk_cloned = flattened_documents_chunk.clone();
-        let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
-        rayon::spawn(move || {
-            let result = extract_vector_points(
-                documents_chunk_cloned,
-                indexer,
-                primary_key_id,
-                vectors_field_id,
-            );
-            let _ = match result {
-                Ok(vector_points) => {
-                    lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
-                }
-                Err(error) => lmdb_writer_sx_cloned.send(Err(error)),
-            };
-        });
-    }
-
    let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
        rayon::join(
            || {
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -2519,6 +2519,25 @@ mod tests {
        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
    }

+    /// Index multiple different number of vectors in documents.
+    /// Vectors must be of the same length.
+    #[test]
+    fn test_multiple_vectors() {
+        let index = TempIndex::new();
+
+        index.add_documents(documents!([{"id": 0, "_vectors": [[0, 1, 2], [3, 4, 5]] }])).unwrap();
+        index.add_documents(documents!([{"id": 1, "_vectors": [6, 7, 8] }])).unwrap();
+        index
+            .add_documents(
+                documents!([{"id": 2, "_vectors": [[9, 10, 11], [12, 13, 14], [15, 16, 17]] }]),
+            )
+            .unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let res = index.search(&rtxn).vector([0.0, 1.0, 2.0]).execute().unwrap();
+        assert_eq!(res.documents_ids.len(), 3);
+    }
+
    #[test]
    fn reproduce_the_bug() {
        /*
Author	SHA1	Message	Date
meili-bors[bot]	256cf33bca	Merge #4039 4039: Fix multiple vectors dimensions r=ManyTheFish a=Kerollmops This PR fixes #4035, making providing multiple vectors in documents possible. This is fixed by extracting the vectors from the non-flattened version of the documents. Co-authored-by: Kerollmops <clement@meilisearch.com>	2023-09-07 09:25:58 +00:00
meili-bors[bot]	9945cbf9db	Merge #4038 4038: Fix filter escaping issues r=ManyTheFish a=Kerollmops This PR fixes #4034 by always escaping the sequences. Users must always put quotes (simple or double) to escape the filter values. Co-authored-by: Kerollmops <clement@meilisearch.com>	2023-09-06 12:29:29 +00:00
Kerollmops	03d0f628bd	Use the unescaper crate to unescape any char sequence	2023-09-06 13:59:45 +02:00
Kerollmops	ea78060916	Fix tests that were supposed to escape characters	2023-09-06 13:59:45 +02:00
Kerollmops	b42d48187a	Add a test case scenario	2023-09-06 13:59:44 +02:00
Kerollmops	679c0b0f97	Extract the vectors from the non-flattened version of the documents	2023-09-06 12:26:00 +02:00
Kerollmops	e02d0064bd	Add a test case scenario	2023-09-06 12:26:00 +02:00
meili-bors[bot]	7ef3572f11	Merge #4037 4037: Update version for the next release (v1.3.3) in Cargo.toml r=curquiza a=meili-bot ⚠️ This PR is automatically generated. Check the new version is the expected one and Cargo.lock has been updated before merging. Co-authored-by: curquiza <curquiza@users.noreply.github.com>	2023-09-06 09:50:58 +00:00
curquiza	93285041a9	Update version for the next release (v1.3.3) in Cargo.toml	2023-09-06 09:23:20 +00:00