Fix test

Add test
Add disableOnNumber setting
2025-11-23 13:16:33 +00:00 · 2025-04-03 17:31:06 +02:00 · 2025-04-03 17:31:06 +02:00 · 2025-04-03 17:31:05 +02:00 · 2025-03-31 15:31:29 +00:00 · 2025-03-31 15:27:49 +02:00
19 changed files with 359 additions and 36 deletions
--- a/crates/dump/src/reader/compat/v5_to_v6.rs
+++ b/crates/dump/src/reader/compat/v5_to_v6.rs
@@ -373,6 +373,7 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
                    },
                    disable_on_words: typo.disable_on_words.into(),
                    disable_on_attributes: typo.disable_on_attributes.into(),
+                    disable_on_numbers: v6::Setting::NotSet,
                }),
                v5::Setting::Reset => v6::Setting::Reset,
                v5::Setting::NotSet => v6::Setting::NotSet,
--- a/crates/meilisearch-types/src/error.rs
+++ b/crates/meilisearch-types/src/error.rs
@@ -454,7 +454,10 @@ impl ErrorCode for milli::Error {
                    }
                    UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
                    UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
-                    UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
+                    UserError::InvalidVectorDimensions { .. }
+                    | UserError::InvalidIndexingVectorDimensions { .. } => {
+                        Code::InvalidVectorDimensions
+                    }
                    UserError::InvalidVectorsMapType { .. }
                    | UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
                    UserError::TooManyVectors(_, _) => Code::TooManyVectors,
--- a/crates/meilisearch-types/src/settings.rs
+++ b/crates/meilisearch-types/src/settings.rs
@@ -8,6 +8,7 @@ use std::str::FromStr;

 use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
 use fst::IntoStreamer;
+use milli::disabled_typos_terms::DisabledTyposTerms;
 use milli::index::{IndexEmbeddingConfig, PrefixSearch};
 use milli::proximity::ProximityPrecision;
 use milli::update::Setting;
@@ -104,6 +105,10 @@ pub struct TypoSettings {
    #[deserr(default)]
    #[schema(value_type = Option<BTreeSet<String>>, example = json!(["uuid", "url"]))]
    pub disable_on_attributes: Setting<BTreeSet<String>>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[deserr(default)]
+    #[schema(value_type = Option<bool>, example = json!(true))]
+    pub disable_on_numbers: Setting<bool>,
 }

 #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)]
@@ -701,6 +706,12 @@ pub fn apply_settings_to_builder(
                Setting::Reset => builder.reset_exact_attributes(),
                Setting::NotSet => (),
            }
+
+            match value.disable_on_numbers {
+                Setting::Set(val) => builder.set_disable_on_numbers(val),
+                Setting::Reset => builder.reset_disable_on_numbers(),
+                Setting::NotSet => (),
+            }
        }
        Setting::Reset => {
            // all typo settings need to be reset here.
@@ -826,12 +837,14 @@ pub fn settings(
    };

    let disabled_attributes = index.exact_attributes(rtxn)?.into_iter().map(String::from).collect();
+    let DisabledTyposTerms { disable_on_numbers } = index.disabled_typos_terms(rtxn)?;

    let typo_tolerance = TypoSettings {
        enabled: Setting::Set(index.authorize_typos(rtxn)?),
        min_word_size_for_typos: Setting::Set(min_typo_word_len),
        disable_on_words: Setting::Set(disabled_words),
        disable_on_attributes: Setting::Set(disabled_attributes),
+        disable_on_numbers: Setting::Set(disable_on_numbers),
    };

    let faceting = FacetingSettings {
--- a/crates/meilisearch/tests/dumps/mod.rs
+++ b/crates/meilisearch/tests/dumps/mod.rs
@@ -87,7 +87,8 @@ async fn import_dump_v1_movie_raw() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -260,7 +261,8 @@ async fn import_dump_v1_movie_with_settings() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -432,7 +434,8 @@ async fn import_dump_v1_rubygems_with_settings() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -590,7 +593,8 @@ async fn import_dump_v2_movie_raw() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -760,7 +764,8 @@ async fn import_dump_v2_movie_with_settings() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -929,7 +934,8 @@ async fn import_dump_v2_rubygems_with_settings() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -1087,7 +1093,8 @@ async fn import_dump_v3_movie_raw() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -1257,7 +1264,8 @@ async fn import_dump_v3_movie_with_settings() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -1426,7 +1434,8 @@ async fn import_dump_v3_rubygems_with_settings() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -1584,7 +1593,8 @@ async fn import_dump_v4_movie_raw() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -1754,7 +1764,8 @@ async fn import_dump_v4_movie_with_settings() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -1923,7 +1934,8 @@ async fn import_dump_v4_rubygems_with_settings() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -2212,7 +2224,8 @@ async fn import_dump_v6_containing_experimental_features() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -2444,7 +2457,8 @@ async fn generate_and_import_dump_containing_vectors() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
--- a/crates/meilisearch/tests/search/mod.rs
+++ b/crates/meilisearch/tests/search/mod.rs
@@ -1976,3 +1976,93 @@ async fn change_facet_casing() {
        })
        .await;
 }
+
+#[actix_rt::test]
+async fn test_exact_typos_terms() {
+    let documents = json!([
+        {
+            "id": 0,
+            "title": "The zeroth document 1298484",
+        },
+        {
+            "id": 1,
+            "title": "The first document 234342",
+            "nested": {
+                "object": "field 22231",
+                "machin": "bidule 23443.32111",
+            },
+        },
+        {
+            "id": 2,
+            "title": "The second document 3398499",
+            "nested": [
+                "array",
+                {
+                    "object": "field 23245121,23223",
+                },
+                {
+                    "prout": "truc 123980612321",
+                    "machin": "lol 12345645333447879",
+                },
+            ],
+        },
+        {
+            "id": 3,
+            "title": "The third document 12333",
+            "nested": "I lied 98878",
+        },
+    ]);
+
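+    // Test prefix search: `12345` is a prefix of the indexed number `12345645333447879` (document 2), so that document must still be returned.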
+    test_settings_documents_indexing_swapping_and_search(
+        &documents,
+        &json!({
+            "searchableAttributes": ["title", "nested.object", "nested.machin"],
+            "typoTolerance": {
+              "enabled": true,
+              "disableOnNumbers": true
+            }
+        }),
+        &json!({"q": "12345"}),
+        |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 2,
+                "title": "The second document 3398499",
+                "nested": [
+                  "array",
+                  {
+                    "object": "field 23245121,23223"
+                  },
+                  {
+                    "prout": "truc 123980612321",
+                    "machin": "lol 12345645333447879"
+                  }
+                ]
+              }
+            ]
+            "###);
+        },
+    )
+    .await;
+
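+    // Test typo search: `123457` is not an exact prefix of any indexed number and typos are disabled on numbers, so no hit is expected.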
+    test_settings_documents_indexing_swapping_and_search(
+        &documents,
+        &json!({
+            "searchableAttributes": ["title", "nested.object", "nested.machin"],
+            "typoTolerance": {
+              "enabled": true,
+              "disableOnNumbers": true
+            }
+        }),
+        &json!({"q": "123457"}),
+        |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            snapshot!(json_string!(response["hits"]), @r###"[]"###);
+        },
+    )
+    .await;
+}
--- a/crates/meilisearch/tests/settings/errors.rs
+++ b/crates/meilisearch/tests/settings/errors.rs
@@ -274,7 +274,7 @@ async fn settings_bad_typo_tolerance() {
    snapshot!(code, @"400 Bad Request");
    snapshot!(json_string!(response), @r###"
    {
-      "message": "Unknown field `typoTolerance`: expected one of `enabled`, `minWordSizeForTypos`, `disableOnWords`, `disableOnAttributes`",
+      "message": "Unknown field `typoTolerance`: expected one of `enabled`, `minWordSizeForTypos`, `disableOnWords`, `disableOnAttributes`, `disableOnNumbers`",
      "code": "invalid_settings_typo_tolerance",
      "type": "invalid_request",
      "link": "https://docs.meilisearch.com/errors#invalid_settings_typo_tolerance"
--- a/crates/meilisearch/tests/settings/get_settings.rs
+++ b/crates/meilisearch/tests/settings/get_settings.rs
@@ -276,7 +276,7 @@ async fn secrets_are_hidden_in_settings() {

    let (response, code) = index.settings().await;
    meili_snap::snapshot!(code, @"200 OK");
-    meili_snap::snapshot!(meili_snap::json_string!(response), @r#"
+    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
    {
      "displayedAttributes": [
        "*"
@@ -308,7 +308,8 @@ async fn secrets_are_hidden_in_settings() {
          "twoTypos": 9
        },
        "disableOnWords": [],
-        "disableOnAttributes": []
+        "disableOnAttributes": [],
+        "disableOnNumbers": false
      },
      "faceting": {
        "maxValuesPerFacet": 100,
@@ -337,7 +338,7 @@ async fn secrets_are_hidden_in_settings() {
      "facetSearch": true,
      "prefixSearch": "indexingTime"
    }
-    "#);
+    "###);

    let (response, code) = server.get_task(settings_update_uid).await;
    meili_snap::snapshot!(code, @"200 OK");
--- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/kefir_settings.snap
+++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/kefir_settings.snap
@@ -1,6 +1,5 @@
 ---
 source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs
-snapshot_kind: text
 ---
 {
  "displayedAttributes": [
@@ -49,7 +48,8 @@ snapshot_kind: text
    ],
    "disableOnAttributes": [
      "surname"
-    ]
+    ],
+    "disableOnNumbers": false
  },
  "faceting": {
    "maxValuesPerFacet": 99,
--- a/crates/meilisearch/tests/vector/mod.rs
+++ b/crates/meilisearch/tests/vector/mod.rs
@@ -164,6 +164,87 @@ async fn add_remove_user_provided() {
    "###);
 }

+#[actix_rt::test]
+async fn user_provide_mismatched_embedding_dimension() {
+    let server = Server::new().await;
+    let index = server.index("doggo");
+
+    let (response, code) = index
+        .update_settings(json!({
+          "embedders": {
+              "manual": {
+                  "source": "userProvided",
+                  "dimensions": 3,
+              }
+          },
+        }))
+        .await;
+    snapshot!(code, @"202 Accepted");
+    server.wait_task(response.uid()).await.succeeded();
+
+    let documents = json!([
+      {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }},
+    ]);
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": "[uid]",
+      "batchUid": "[batch_uid]",
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n  - note: embedding #0 has dimensions 2\n  - note: embedder `manual` requires 3",
+        "code": "invalid_vector_dimensions",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    let new_document = json!([
+      {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }},
+    ]);
+    let (response, code) = index.add_documents(new_document, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(response.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": "[uid]",
+      "batchUid": "[batch_uid]",
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n  - note: embedding #0 has dimensions 2\n  - note: embedder `manual` requires 3",
+        "code": "invalid_vector_dimensions",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+}
+
 async fn generate_default_user_provided_documents(server: &Server) -> Index {
    let index = server.index("doggo");

--- a/crates/milli/src/disabled_typos_terms.rs
+++ b/crates/milli/src/disabled_typos_terms.rs
@@ -0,0 +1,50 @@
+use heed::{
+    types::{SerdeJson, Str},
+    RoTxn, RwTxn,
+};
+use serde::{Deserialize, Serialize};
+
+use crate::{index::main_key, Index};
+
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "camelCase")]
+pub struct DisabledTyposTerms {
+    pub disable_on_numbers: bool,
+}
+
+impl Index {
+    pub fn disabled_typos_terms(&self, txn: &RoTxn<'_>) -> heed::Result<DisabledTyposTerms> {
+        self.main
+            .remap_types::<Str, SerdeJson<DisabledTyposTerms>>()
+            .get(txn, main_key::DISABLED_TYPOS_TERMS)
+            .map(|option| option.unwrap_or_default())
+    }
+
+    pub(crate) fn put_disabled_typos_terms(
+        &self,
+        txn: &mut RwTxn<'_>,
+        disabled_typos_terms: &DisabledTyposTerms,
+    ) -> heed::Result<()> {
+        self.main.remap_types::<Str, SerdeJson<DisabledTyposTerms>>().put(
+            txn,
+            main_key::DISABLED_TYPOS_TERMS,
+            &disabled_typos_terms,
+        )?;
+
+        Ok(())
+    }
+
+    pub(crate) fn delete_disabled_typos_terms(&self, txn: &mut RwTxn<'_>) -> heed::Result<()> {
+        self.main
+            .remap_types::<Str, SerdeJson<DisabledTyposTerms>>()
+            .delete(txn, main_key::DISABLED_TYPOS_TERMS)?;
+        Ok(())
+    }
+}
+
+impl DisabledTyposTerms {
+    pub fn is_exact(&self, word: &str) -> bool {
+        // When `disable_on_numbers` is set, a word made only of numeric characters and ASCII punctuation is exact, i.e. typo tolerance is disabled for it.
+        self.disable_on_numbers && word.chars().all(|c| c.is_numeric() || c.is_ascii_punctuation())
+    }
+}
--- a/crates/milli/src/error.rs
+++ b/crates/milli/src/error.rs
@@ -129,6 +129,14 @@ and can not be more than 511 bytes.", .document_id.to_string()
    InvalidGeoField(#[from] GeoError),
    #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
    InvalidVectorDimensions { expected: usize, found: usize },
+    #[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n  - note: embedding #{embedding_index} has dimensions {found}\n  - note: embedder `{embedder_name}` requires {expected}")]
+    InvalidIndexingVectorDimensions {
+        embedder_name: String,
+        document_id: String,
+        embedding_index: usize,
+        expected: usize,
+        found: usize,
+    },
    #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
    InvalidVectorsMapType { document_id: String, value: Value },
    #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
--- a/crates/milli/src/index.rs
+++ b/crates/milli/src/index.rs
@@ -78,6 +78,7 @@ pub mod main_key {
    pub const FACET_SEARCH: &str = "facet_search";
    pub const PREFIX_SEARCH: &str = "prefix_search";
    pub const DOCUMENTS_STATS: &str = "documents_stats";
+    pub const DISABLED_TYPOS_TERMS: &str = "disabled_typos_terms";
 }

 pub mod db_name {
--- a/crates/milli/src/lib.rs
+++ b/crates/milli/src/lib.rs
@@ -12,6 +12,7 @@ mod asc_desc;
 mod attribute_patterns;
 mod criterion;
 pub mod database_stats;
+pub mod disabled_typos_terms;
 mod error;
 mod external_documents_ids;
 pub mod facet;
--- a/crates/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/crates/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -127,7 +127,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
        // merge all deletions
        let obkv = KvReaderDelAdd::from_slice(value);
        if let Some(value) = obkv.get(DelAdd::Deletion) {
-            let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
+            let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid)
+                || settings_diff.old.disabled_typos_terms.is_exact(&w);
            buffer.clear();
            let mut obkv = KvWriterDelAdd::new(&mut buffer);
            obkv.insert(DelAdd::Deletion, value)?;
@@ -139,7 +140,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
        }
        // merge all additions
        if let Some(value) = obkv.get(DelAdd::Addition) {
-            let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
+            let add_in_exact = settings_diff.new.exact_attributes.contains(&fid)
+                || settings_diff.new.disabled_typos_terms.is_exact(&w);
            buffer.clear();
            let mut obkv = KvWriterDelAdd::new(&mut buffer);
            obkv.insert(DelAdd::Addition, value)?;
--- a/crates/milli/src/update/index_documents/typed_chunk.rs
+++ b/crates/milli/src/update/index_documents/typed_chunk.rs
@@ -273,14 +273,11 @@ pub(crate) fn write_typed_chunk_into_index(
                    unreachable!();
                };
                let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
-                let clonable_exact_word_docids =
-                    unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;

                word_docids_builder.push(word_docids_reader.into_cursor()?);
                exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?);
                word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?);
                fst_merger_builder.push(clonable_word_docids.into_cursor()?);
-                fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?);
            }

            let word_docids_merger = word_docids_builder.build();
--- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
+++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs
@@ -319,8 +319,11 @@ impl WordDocidsExtractors {
        let doc_alloc = &context.doc_alloc;

        let exact_attributes = index.exact_attributes(rtxn)?;
-        let is_exact_attribute =
-            |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
+        let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
+        let is_exact = |fname: &str, word: &str| {
+            exact_attributes.iter().any(|attr| contained_in(fname, attr))
+                || disabled_typos_terms.is_exact(word)
+        };
        match document_change {
            DocumentChange::Deletion(inner) => {
                let mut token_fn = |fname: &str, fid, pos, word: &str| {
@@ -328,7 +331,7 @@ impl WordDocidsExtractors {
                        fid,
                        pos,
                        word,
-                        is_exact_attribute(fname),
+                        is_exact(fname, word),
                        inner.docid(),
                        doc_alloc,
                    )
@@ -356,7 +359,7 @@ impl WordDocidsExtractors {
                        fid,
                        pos,
                        word,
-                        is_exact_attribute(fname),
+                        is_exact(fname, word),
                        inner.docid(),
                        doc_alloc,
                    )
@@ -372,7 +375,7 @@ impl WordDocidsExtractors {
                        fid,
                        pos,
                        word,
-                        is_exact_attribute(fname),
+                        is_exact(fname, word),
                        inner.docid(),
                        doc_alloc,
                    )
@@ -389,7 +392,7 @@ impl WordDocidsExtractors {
                        fid,
                        pos,
                        word,
-                        is_exact_attribute(fname),
+                        is_exact(fname, word),
                        inner.docid(),
                        doc_alloc,
                    )
--- a/crates/milli/src/update/new/extract/vectors/mod.rs
+++ b/crates/milli/src/update/new/extract/vectors/mod.rs
@@ -121,6 +121,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
                            // do we have set embeddings?
                            if let Some(embeddings) = new_vectors.embeddings {
                                chunks.set_vectors(
+                                    update.external_document_id(),
                                    update.docid(),
                                    embeddings
                                        .into_vec(&context.doc_alloc, embedder_name)
@@ -128,7 +129,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
                                            document_id: update.external_document_id().to_string(),
                                            error: error.to_string(),
                                        })?,
-                                );
+                                )?;
                            } else if new_vectors.regenerate {
                                let new_rendered = prompt.render_document(
                                    update.external_document_id(),
@@ -209,6 +210,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
                            chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
                            if let Some(embeddings) = new_vectors.embeddings {
                                chunks.set_vectors(
+                                    insertion.external_document_id(),
                                    insertion.docid(),
                                    embeddings
                                        .into_vec(&context.doc_alloc, embedder_name)
@@ -218,7 +220,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
                                                .to_string(),
                                            error: error.to_string(),
                                        })?,
-                                );
+                                )?;
                            } else if new_vectors.regenerate {
                                let rendered = prompt.render_document(
                                    insertion.external_document_id(),
@@ -273,6 +275,7 @@ struct Chunks<'a, 'b, 'extractor> {
    embedder: &'a Embedder,
    embedder_id: u8,
    embedder_name: &'a str,
+    dimensions: usize,
    prompt: &'a Prompt,
    possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
    user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
@@ -297,6 +300,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
        let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
        let texts = BVec::with_capacity_in(capacity, doc_alloc);
        let ids = BVec::with_capacity_in(capacity, doc_alloc);
+        let dimensions = embedder.dimensions();
        Self {
            texts,
            ids,
@@ -309,6 +313,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
            embedder_name,
            user_provided,
            has_manual_generation: None,
+            dimensions,
        }
    }

@@ -490,7 +495,25 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
        }
    }

-    fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
+    fn set_vectors(
+        &self,
+        external_docid: &'a str,
+        docid: DocumentId,
+        embeddings: Vec<Embedding>,
+    ) -> Result<()> {
+        for (embedding_index, embedding) in embeddings.iter().enumerate() {
+            if embedding.len() != self.dimensions {
+                return Err(UserError::InvalidIndexingVectorDimensions {
+                    expected: self.dimensions,
+                    found: embedding.len(),
+                    embedder_name: self.embedder_name.to_string(),
+                    document_id: external_docid.to_string(),
+                    embedding_index,
+                }
+                .into());
+            }
+        }
        self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
+        Ok(())
    }
 }
--- a/crates/milli/src/update/settings.rs
+++ b/crates/milli/src/update/settings.rs
@@ -17,6 +17,7 @@ use super::IndexerConfig;
 use crate::attribute_patterns::PatternMatch;
 use crate::constants::RESERVED_GEO_FIELD_NAME;
 use crate::criterion::Criterion;
+use crate::disabled_typos_terms::DisabledTyposTerms;
 use crate::error::UserError;
 use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
 use crate::filterable_attributes_rules::match_faceted_field;
@@ -169,6 +170,7 @@ pub struct Settings<'a, 't, 'i> {
    synonyms: Setting<BTreeMap<String, Vec<String>>>,
    primary_key: Setting<String>,
    authorize_typos: Setting<bool>,
+    disable_on_numbers: Setting<bool>,
    min_word_len_two_typos: Setting<u8>,
    min_word_len_one_typo: Setting<u8>,
    exact_words: Setting<BTreeSet<String>>,
@@ -207,6 +209,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
            synonyms: Setting::NotSet,
            primary_key: Setting::NotSet,
            authorize_typos: Setting::NotSet,
+            disable_on_numbers: Setting::NotSet,
            exact_words: Setting::NotSet,
            min_word_len_two_typos: Setting::NotSet,
            min_word_len_one_typo: Setting::NotSet,
@@ -354,6 +357,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
        self.min_word_len_one_typo = Setting::Reset;
    }

+    pub fn set_disable_on_numbers(&mut self, disable_on_numbers: bool) {
+        self.disable_on_numbers = Setting::Set(disable_on_numbers);
+    }
+
+    pub fn reset_disable_on_numbers(&mut self) {
+        self.disable_on_numbers = Setting::Reset;
+    }
+
    pub fn set_exact_words(&mut self, words: BTreeSet<String>) {
        self.exact_words = Setting::Set(words);
    }
@@ -866,6 +877,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
        Ok(())
    }

+    fn update_disabled_typos_terms(&mut self) -> Result<()> {
+        let mut disabled_typos_terms = self.index.disabled_typos_terms(self.wtxn)?;
+        match self.disable_on_numbers {
+            Setting::Set(disable_on_numbers) => {
+                disabled_typos_terms.disable_on_numbers = disable_on_numbers;
+            }
+            Setting::Reset => {
+                self.index.delete_disabled_typos_terms(self.wtxn)?;
+                disabled_typos_terms.disable_on_numbers =
+                    DisabledTyposTerms::default().disable_on_numbers;
+            }
+            Setting::NotSet => (),
+        }
+
+        self.index.put_disabled_typos_terms(self.wtxn, &disabled_typos_terms)?;
+        Ok(())
+    }
+
    fn update_exact_words(&mut self) -> Result<()> {
        match self.exact_words {
            Setting::Set(ref mut words) => {
@@ -1246,6 +1275,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
        self.update_prefix_search()?;
        self.update_facet_search()?;
        self.update_localized_attributes_rules()?;
+        self.update_disabled_typos_terms()?;

        let embedding_config_updates = self.update_embedding_configs()?;

@@ -1327,6 +1357,7 @@ impl InnerIndexSettingsDiff {
                || old_settings.prefix_search != new_settings.prefix_search
                || old_settings.localized_attributes_rules
                    != new_settings.localized_attributes_rules
+                || old_settings.disabled_typos_terms != new_settings.disabled_typos_terms
        };

        let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes;
@@ -1526,6 +1557,7 @@ pub(crate) struct InnerIndexSettings {
    pub user_defined_searchable_attributes: Option<Vec<String>>,
    pub sortable_fields: HashSet<String>,
    pub exact_attributes: HashSet<FieldId>,
+    pub disabled_typos_terms: DisabledTyposTerms,
    pub proximity_precision: ProximityPrecision,
    pub embedding_configs: EmbeddingConfigs,
    pub geo_fields_ids: Option<(FieldId, FieldId)>,
@@ -1574,7 +1606,7 @@ impl InnerIndexSettings {
            .map(|fields| fields.into_iter().map(|f| f.to_string()).collect());
        let builder = MetadataBuilder::from_index(index, rtxn)?;
        let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder);
-
+        let disabled_typos_terms = index.disabled_typos_terms(rtxn)?;
        Ok(Self {
            stop_words,
            allowed_separators,
@@ -1592,6 +1624,7 @@ impl InnerIndexSettings {
            geo_fields_ids,
            prefix_search,
            facet_search,
+            disabled_typos_terms,
        })
    }

--- a/crates/milli/src/update/test_settings.rs
+++ b/crates/milli/src/update/test_settings.rs
@@ -896,6 +896,7 @@ fn test_correct_settings_init() {
                localized_attributes_rules,
                prefix_search,
                facet_search,
+                disable_on_numbers,
            } = settings;
            assert!(matches!(searchable_fields, Setting::NotSet));
            assert!(matches!(displayed_fields, Setting::NotSet));
@@ -923,6 +924,7 @@ fn test_correct_settings_init() {
            assert!(matches!(localized_attributes_rules, Setting::NotSet));
            assert!(matches!(prefix_search, Setting::NotSet));
            assert!(matches!(facet_search, Setting::NotSet));
+            assert!(matches!(disable_on_numbers, Setting::NotSet));
        })
        .unwrap();
 }
Author	SHA1	Message	Date
ManyTheFish	ed826a8c8b	Fix test	2025-04-03 17:31:06 +02:00
ManyTheFish	d2ef1cb425	Add test	2025-04-03 17:31:06 +02:00
ManyTheFish	bb389276aa	Add disableOnNumber setting	2025-04-03 17:31:05 +02:00
Tamo	e36a8c50b9	Merge pull request #5478 from meilisearch/enforce-embedding-dimensions Enforce embedding dimensions	2025-03-31 15:31:29 +00:00
Louis Dureuil	08ff135ad6	Fix test	2025-03-31 15:27:49 +02:00
Louis Dureuil	f729864466	Check dimension mismatch at insertion time	2025-03-31 15:27:49 +02:00
Louis Dureuil	94ea263bef	Add new error for dimensions mismatch during indexing	2025-03-31 15:27:49 +02:00
Tamo	0e475cb5e6	fix warn and show what meilisearch understood of the vectors in the cursed test	2025-03-31 13:49:22 +02:00
vuthanhtung2412	62de70b73c	Document problematic case in test and acknowledge PR comment	2025-03-31 13:49:22 +02:00
vuthanhtung2412	7707fb18dd	add embedding with dimension mismatch test case	2025-03-31 13:49:22 +02:00