Fix clippy

Make the search and the indexing work
Be able to set and reset settings
2025-12-03 03:05:34 +00:00 · 2023-07-24 18:42:26 +02:00 · 2023-07-24 18:35:20 +02:00 · 2023-07-24 17:00:18 +02:00 · 2023-07-20 11:15:10 +02:00
16 changed files with 1316 additions and 130 deletions
--- a/dump/src/lib.rs
+++ b/dump/src/lib.rs
@@ -261,6 +261,9 @@ pub(crate) mod test {
            sortable_attributes: Setting::Set(btreeset! { S("age") }),
            ranking_rules: Setting::NotSet,
            stop_words: Setting::NotSet,
+            non_separator_tokens: Setting::NotSet,
+            separator_tokens: Setting::NotSet,
+            dictionary: Setting::NotSet,
            synonyms: Setting::NotSet,
            distinct_attribute: Setting::NotSet,
            typo_tolerance: Setting::NotSet,
--- a/dump/src/reader/compat/v5_to_v6.rs
+++ b/dump/src/reader/compat/v5_to_v6.rs
@@ -340,6 +340,9 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
                }
            },
            stop_words: settings.stop_words.into(),
+            non_separator_tokens: v6::Setting::NotSet,
+            separator_tokens: v6::Setting::NotSet,
+            dictionary: v6::Setting::NotSet,
            synonyms: settings.synonyms.into(),
            distinct_attribute: settings.distinct_attribute.into(),
            typo_tolerance: match settings.typo_tolerance {
--- a/meilisearch-types/src/error.rs
+++ b/meilisearch-types/src/error.rs
@@ -259,6 +259,9 @@ InvalidSettingsRankingRules           , InvalidRequest       , BAD_REQUEST ;
 InvalidSettingsSearchableAttributes   , InvalidRequest       , BAD_REQUEST ;
 InvalidSettingsSortableAttributes     , InvalidRequest       , BAD_REQUEST ;
 InvalidSettingsStopWords              , InvalidRequest       , BAD_REQUEST ;
+InvalidSettingsNonSeparatorTokens     , InvalidRequest       , BAD_REQUEST ;
+InvalidSettingsSeparatorTokens        , InvalidRequest       , BAD_REQUEST ;
+InvalidSettingsDictionary             , InvalidRequest       , BAD_REQUEST ;
 InvalidSettingsSynonyms               , InvalidRequest       , BAD_REQUEST ;
 InvalidSettingsTypoTolerance          , InvalidRequest       , BAD_REQUEST ;
 InvalidState                          , Internal             , INTERNAL_SERVER_ERROR ;
--- a/meilisearch-types/src/settings.rs
+++ b/meilisearch-types/src/settings.rs
@@ -171,6 +171,15 @@ pub struct Settings<T> {
    #[deserr(default, error = DeserrJsonError<InvalidSettingsStopWords>)]
    pub stop_words: Setting<BTreeSet<String>>,
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[deserr(default, error = DeserrJsonError<InvalidSettingsNonSeparatorTokens>)]
+    pub non_separator_tokens: Setting<BTreeSet<String>>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[deserr(default, error = DeserrJsonError<InvalidSettingsSeparatorTokens>)]
+    pub separator_tokens: Setting<BTreeSet<String>>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
+    #[deserr(default, error = DeserrJsonError<InvalidSettingsDictionary>)]
+    pub dictionary: Setting<BTreeSet<String>>,
+    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
    #[deserr(default, error = DeserrJsonError<InvalidSettingsSynonyms>)]
    pub synonyms: Setting<BTreeMap<String, Vec<String>>>,
    #[serde(default, skip_serializing_if = "Setting::is_not_set")]
@@ -201,6 +210,9 @@ impl Settings<Checked> {
            ranking_rules: Setting::Reset,
            stop_words: Setting::Reset,
            synonyms: Setting::Reset,
+            non_separator_tokens: Setting::Reset,
+            separator_tokens: Setting::Reset,
+            dictionary: Setting::Reset,
            distinct_attribute: Setting::Reset,
            typo_tolerance: Setting::Reset,
            faceting: Setting::Reset,
@@ -217,6 +229,9 @@ impl Settings<Checked> {
            sortable_attributes,
            ranking_rules,
            stop_words,
+            non_separator_tokens,
+            separator_tokens,
+            dictionary,
            synonyms,
            distinct_attribute,
            typo_tolerance,
@@ -232,6 +247,9 @@ impl Settings<Checked> {
            sortable_attributes,
            ranking_rules,
            stop_words,
+            non_separator_tokens,
+            separator_tokens,
+            dictionary,
            synonyms,
            distinct_attribute,
            typo_tolerance,
@@ -274,6 +292,9 @@ impl Settings<Unchecked> {
            ranking_rules: self.ranking_rules,
            stop_words: self.stop_words,
            synonyms: self.synonyms,
+            non_separator_tokens: self.non_separator_tokens,
+            separator_tokens: self.separator_tokens,
+            dictionary: self.dictionary,
            distinct_attribute: self.distinct_attribute,
            typo_tolerance: self.typo_tolerance,
            faceting: self.faceting,
@@ -335,6 +356,28 @@ pub fn apply_settings_to_builder(
        Setting::NotSet => (),
    }

+    match settings.non_separator_tokens {
+        Setting::Set(ref non_separator_tokens) => {
+            builder.set_non_separator_tokens(non_separator_tokens.clone())
+        }
+        Setting::Reset => builder.reset_non_separator_tokens(),
+        Setting::NotSet => (),
+    }
+
+    match settings.separator_tokens {
+        Setting::Set(ref separator_tokens) => {
+            builder.set_separator_tokens(separator_tokens.clone())
+        }
+        Setting::Reset => builder.reset_separator_tokens(),
+        Setting::NotSet => (),
+    }
+
+    match settings.dictionary {
+        Setting::Set(ref dictionary) => builder.set_dictionary(dictionary.clone()),
+        Setting::Reset => builder.reset_dictionary(),
+        Setting::NotSet => (),
+    }
+
    match settings.synonyms {
        Setting::Set(ref synonyms) => builder.set_synonyms(synonyms.clone().into_iter().collect()),
        Setting::Reset => builder.reset_synonyms(),
@@ -459,6 +502,11 @@ pub fn settings(
        })
        .transpose()?
        .unwrap_or_default();
+
+    let non_separator_tokens = index.non_separator_tokens(rtxn)?.unwrap_or_default();
+    let separator_tokens = index.separator_tokens(rtxn)?.unwrap_or_default();
+    let dictionary = index.dictionary(rtxn)?.unwrap_or_default();
+
    let distinct_field = index.distinct_field(rtxn)?.map(String::from);

    // in milli each word in the synonyms map were split on their separator. Since we lost
@@ -520,6 +568,9 @@ pub fn settings(
        sortable_attributes: Setting::Set(sortable_attributes),
        ranking_rules: Setting::Set(criteria.iter().map(|c| c.clone().into()).collect()),
        stop_words: Setting::Set(stop_words),
+        non_separator_tokens: Setting::Set(non_separator_tokens),
+        separator_tokens: Setting::Set(separator_tokens),
+        dictionary: Setting::Set(dictionary),
        distinct_attribute: match distinct_field {
            Some(field) => Setting::Set(field),
            None => Setting::Reset,
@@ -642,6 +693,9 @@ pub(crate) mod test {
            sortable_attributes: Setting::NotSet,
            ranking_rules: Setting::NotSet,
            stop_words: Setting::NotSet,
+            non_separator_tokens: Setting::NotSet,
+            separator_tokens: Setting::NotSet,
+            dictionary: Setting::NotSet,
            synonyms: Setting::NotSet,
            distinct_attribute: Setting::NotSet,
            typo_tolerance: Setting::NotSet,
@@ -663,6 +717,9 @@ pub(crate) mod test {
            sortable_attributes: Setting::NotSet,
            ranking_rules: Setting::NotSet,
            stop_words: Setting::NotSet,
+            non_separator_tokens: Setting::NotSet,
+            separator_tokens: Setting::NotSet,
+            dictionary: Setting::NotSet,
            synonyms: Setting::NotSet,
            distinct_attribute: Setting::NotSet,
            typo_tolerance: Setting::NotSet,
--- a/meilisearch/src/routes/indexes/settings.rs
+++ b/meilisearch/src/routes/indexes/settings.rs
@@ -309,6 +309,81 @@ make_setting_route!(
    }
 );

+make_setting_route!(
+    "/non-separator-tokens",
+    put,
+    std::collections::BTreeSet<String>,
+    meilisearch_types::deserr::DeserrJsonError<
+        meilisearch_types::error::deserr_codes::InvalidSettingsNonSeparatorTokens,
+    >,
+    non_separator_tokens,
+    "nonSeparatorTokens",
+    analytics,
+    |non_separator_tokens: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
+        use serde_json::json;
+
+        analytics.publish(
+            "nonSeparatorTokens Updated".to_string(),
+            json!({
+                "non_separator_tokens": {
+                    "total": non_separator_tokens.as_ref().map(|non_separator_tokens| non_separator_tokens.len()),
+                },
+            }),
+            Some(req),
+        );
+    }
+);
+
+make_setting_route!(
+    "/separator-tokens",
+    put,
+    std::collections::BTreeSet<String>,
+    meilisearch_types::deserr::DeserrJsonError<
+        meilisearch_types::error::deserr_codes::InvalidSettingsSeparatorTokens,
+    >,
+    separator_tokens,
+    "separatorTokens",
+    analytics,
+    |separator_tokens: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
+        use serde_json::json;
+
+        analytics.publish(
+            "separatorTokens Updated".to_string(),
+            json!({
+                "separator_tokens": {
+                    "total": separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()),
+                },
+            }),
+            Some(req),
+        );
+    }
+);
+
+make_setting_route!(
+    "/dictionary",
+    put,
+    std::collections::BTreeSet<String>,
+    meilisearch_types::deserr::DeserrJsonError<
+        meilisearch_types::error::deserr_codes::InvalidSettingsDictionary,
+    >,
+    dictionary,
+    "dictionary",
+    analytics,
+    |dictionary: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
+        use serde_json::json;
+
+        analytics.publish(
+            "dictionary Updated".to_string(),
+            json!({
+                "dictionary": {
+                    "total": dictionary.as_ref().map(|dictionary| dictionary.len()),
+                },
+            }),
+            Some(req),
+        );
+    }
+);
+
 make_setting_route!(
    "/synonyms",
    put,
--- a/meilisearch/src/search.rs
+++ b/meilisearch/src/search.rs
@@ -491,6 +491,20 @@ pub fn perform_search(
        tokenizer_builder.allow_list(&script_lang_map);
    }

+    let separators = index.allowed_separators(&rtxn)?;
+    let separators: Option<Vec<_>> =
+        separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
+    if let Some(ref separators) = separators {
+        tokenizer_builder.separators(separators);
+    }
+
+    let dictionary = index.dictionary(&rtxn)?;
+    let dictionary: Option<Vec<_>> =
+        dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
+    if let Some(ref dictionary) = dictionary {
+        tokenizer_builder.words_dict(dictionary);
+    }
+
    let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build());
    formatter_builder.crop_marker(query.crop_marker);
    formatter_builder.highlight_prefix(query.highlight_pre_tag);
--- a/meilisearch/tests/dumps/mod.rs
+++ b/meilisearch/tests/dumps/mod.rs
--- a/meilisearch/tests/settings/get_settings.rs
+++ b/meilisearch/tests/settings/get_settings.rs
@@ -16,6 +16,9 @@ static DEFAULT_SETTINGS_VALUES: Lazy<HashMap<&'static str, Value>> = Lazy::new(|
        json!(["words", "typo", "proximity", "attribute", "sort", "exactness"]),
    );
    map.insert("stop_words", json!([]));
+    map.insert("non_separator_tokens", json!([]));
+    map.insert("separator_tokens", json!([]));
+    map.insert("dictionary", json!([]));
    map.insert("synonyms", json!({}));
    map.insert(
        "faceting",
@@ -51,7 +54,7 @@ async fn get_settings() {
    let (response, code) = index.settings().await;
    assert_eq!(code, 200);
    let settings = response.as_object().unwrap();
-    assert_eq!(settings.keys().len(), 11);
+    assert_eq!(settings.keys().len(), 14);
    assert_eq!(settings["displayedAttributes"], json!(["*"]));
    assert_eq!(settings["searchableAttributes"], json!(["*"]));
    assert_eq!(settings["filterableAttributes"], json!([]));
@@ -62,6 +65,9 @@ async fn get_settings() {
        json!(["words", "typo", "proximity", "attribute", "sort", "exactness"])
    );
    assert_eq!(settings["stopWords"], json!([]));
+    assert_eq!(settings["nonSeparatorTokens"], json!([])); // response keys are camelCase
+    assert_eq!(settings["separatorTokens"], json!([]));
+    assert_eq!(settings["dictionary"], json!([]));
    assert_eq!(
        settings["faceting"],
        json!({
--- a/meilisearch/tests/settings/mod.rs
+++ b/meilisearch/tests/settings/mod.rs
@@ -1,3 +1,4 @@
 mod distinct;
 mod errors;
 mod get_settings;
+mod tokenizer_customization;
--- a/meilisearch/tests/settings/tokenizer_customization.rs
+++ b/meilisearch/tests/settings/tokenizer_customization.rs
@@ -0,0 +1,196 @@
+use meili_snap::{json_string, snapshot};
+use serde_json::json;
+
+use crate::common::Server;
+
+#[actix_rt::test]
+async fn set_and_reset() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let (_response, _code) = index
+        .update_settings(json!({
+            "nonSeparatorTokens": ["#", "&"],
+            "separatorTokens": ["&sep", "<br/>"],
+            "dictionary": ["J.R.R.", "J. R. R."],
+        }))
+        .await;
+    index.wait_task(0).await;
+
+    let (response, _) = index.settings().await;
+    snapshot!(json_string!(response["nonSeparatorTokens"]), @r###"
+    [
+      "#",
+      "&"
+    ]
+    "###);
+    snapshot!(json_string!(response["separatorTokens"]), @r###"
+    [
+      "&sep",
+      "<br/>"
+    ]
+    "###);
+    snapshot!(json_string!(response["dictionary"]), @r###"
+    [
+      "J. R. R.",
+      "J.R.R."
+    ]
+    "###);
+
+    index
+        .update_settings(json!({
+            "nonSeparatorTokens": null,
+            "separatorTokens": null,
+            "dictionary": null,
+        }))
+        .await;
+
+    index.wait_task(1).await;
+
+    let (response, _) = index.settings().await;
+    snapshot!(json_string!(response["nonSeparatorTokens"]), @"[]");
+    snapshot!(json_string!(response["separatorTokens"]), @"[]");
+    snapshot!(json_string!(response["dictionary"]), @"[]");
+}
+
+#[actix_rt::test]
+async fn set_and_search() {
+    let documents = json!([
+        {
+            "id": 1,
+            "content": "Mac & cheese",
+        },
+        {
+            "id": 2,
+            "content": "G#D#G#D#G#C#D#G#C#",
+        },
+        {
+            "id": 3,
+            "content": "Mac&sep&&sepcheese",
+        },
+    ]);
+
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.add_documents(documents, None).await;
+    index.wait_task(0).await;
+
+    let (_response, _code) = index
+        .update_settings(json!({
+            "nonSeparatorTokens": ["#", "&"],
+            "separatorTokens": ["<br/>", "&sep"],
+            "dictionary": ["#", "A#", "B#", "C#", "D#", "E#", "F#", "G#"],
+        }))
+        .await;
+    index.wait_task(1).await;
+
+    index
+        .search(json!({"q": "&", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 1,
+                "content": "Mac & cheese",
+                "_formatted": {
+                  "id": "1",
+                  "content": "Mac <em>&</em> cheese"
+                }
+              },
+              {
+                "id": 3,
+                "content": "Mac&sep&&sepcheese",
+                "_formatted": {
+                  "id": "3",
+                  "content": "Mac&sep<em>&</em>&sepcheese"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(
+            json!({"q": "Mac & cheese", "attributesToHighlight": ["content"]}),
+            |response, code| {
+                snapshot!(code, @"200 OK");
+                snapshot!(json_string!(response["hits"]), @r###"
+                [
+                  {
+                    "id": 1,
+                    "content": "Mac & cheese",
+                    "_formatted": {
+                      "id": "1",
+                      "content": "<em>Mac</em> <em>&</em> <em>cheese</em>"
+                    }
+                  },
+                  {
+                    "id": 3,
+                    "content": "Mac&sep&&sepcheese",
+                    "_formatted": {
+                      "id": "3",
+                      "content": "<em>Mac</em>&sep<em>&</em>&sep<em>cheese</em>"
+                    }
+                  }
+                ]
+                "###);
+            },
+        )
+        .await;
+
+    index
+        .search(
+            json!({"q": "Mac&sep&&sepcheese", "attributesToHighlight": ["content"]}),
+            |response, code| {
+                snapshot!(code, @"200 OK");
+                snapshot!(json_string!(response["hits"]), @r###"
+                [
+                  {
+                    "id": 1,
+                    "content": "Mac & cheese",
+                    "_formatted": {
+                      "id": "1",
+                      "content": "<em>Mac</em> <em>&</em> <em>cheese</em>"
+                    }
+                  },
+                  {
+                    "id": 3,
+                    "content": "Mac&sep&&sepcheese",
+                    "_formatted": {
+                      "id": "3",
+                      "content": "<em>Mac</em>&sep<em>&</em>&sep<em>cheese</em>"
+                    }
+                  }
+                ]
+                "###);
+            },
+        )
+        .await;
+
+    index
+        .search(json!({"q": "C#D#G", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 2,
+                "content": "G#D#G#D#G#C#D#G#C#",
+                "_formatted": {
+                  "id": "2",
+                  "content": "<em>G</em>#<em>D#</em><em>G</em>#<em>D#</em><em>G</em>#<em>C#</em><em>D#</em><em>G</em>#<em>C#</em>"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "#", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @"[]");
+        })
+        .await;
+}
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::{HashMap, HashSet};
+use std::collections::{BTreeSet, HashMap, HashSet};
 use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
@@ -60,6 +60,9 @@ pub mod main_key {
    pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
    pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
    pub const STOP_WORDS_KEY: &str = "stop-words";
+    pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens";
+    pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens";
+    pub const DICTIONARY_KEY: &str = "dictionary";
    pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
    pub const SYNONYMS_KEY: &str = "synonyms";
    pub const WORDS_FST_KEY: &str = "words-fst";
@@ -1048,6 +1051,87 @@ impl Index {
        }
    }

+    /* non separator tokens */
+
+    pub(crate) fn put_non_separator_tokens(
+        &self,
+        wtxn: &mut RwTxn,
+        set: &BTreeSet<String>,
+    ) -> heed::Result<()> {
+        self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::NON_SEPARATOR_TOKENS_KEY, set)
+    }
+
+    pub(crate) fn delete_non_separator_tokens(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, main_key::NON_SEPARATOR_TOKENS_KEY)
+    }
+
+    pub fn non_separator_tokens(&self, rtxn: &RoTxn) -> Result<Option<BTreeSet<String>>> {
+        Ok(self.main.get::<_, Str, SerdeBincode<BTreeSet<String>>>(
+            rtxn,
+            main_key::NON_SEPARATOR_TOKENS_KEY,
+        )?)
+    }
+
+    /* separator tokens */
+
+    pub(crate) fn put_separator_tokens(
+        &self,
+        wtxn: &mut RwTxn,
+        set: &BTreeSet<String>,
+    ) -> heed::Result<()> {
+        self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SEPARATOR_TOKENS_KEY, set)
+    }
+
+    pub(crate) fn delete_separator_tokens(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, main_key::SEPARATOR_TOKENS_KEY)
+    }
+
+    pub fn separator_tokens(&self, rtxn: &RoTxn) -> Result<Option<BTreeSet<String>>> {
+        Ok(self
+            .main
+            .get::<_, Str, SerdeBincode<BTreeSet<String>>>(rtxn, main_key::SEPARATOR_TOKENS_KEY)?)
+    }
+
+    /* allowed separators: (defaults + separator tokens) - non separator tokens */
+
+    pub fn allowed_separators(&self, rtxn: &RoTxn) -> Result<Option<BTreeSet<String>>> {
+        let default_separators =
+            charabia::separators::DEFAULT_SEPARATORS.iter().map(|s| s.to_string());
+        let mut separators: Option<BTreeSet<_>> = None;
+        if let Some(mut separator_tokens) = self.separator_tokens(rtxn)? {
+            separator_tokens.extend(default_separators.clone());
+            separators = Some(separator_tokens);
+        }
+
+        if let Some(non_separator_tokens) = self.non_separator_tokens(rtxn)? {
+            separators = separators
+                .or_else(|| Some(default_separators.collect()))
+                .map(|separators| &separators - &non_separator_tokens);
+        }
+
+        Ok(separators)
+    }
+
+    /* dictionary */
+
+    pub(crate) fn put_dictionary(
+        &self,
+        wtxn: &mut RwTxn,
+        set: &BTreeSet<String>,
+    ) -> heed::Result<()> {
+        self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::DICTIONARY_KEY, set)
+    }
+
+    pub(crate) fn delete_dictionary(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.delete::<_, Str>(wtxn, main_key::DICTIONARY_KEY)
+    }
+
+    pub fn dictionary(&self, rtxn: &RoTxn) -> Result<Option<BTreeSet<String>>> {
+        Ok(self
+            .main
+            .get::<_, Str, SerdeBincode<BTreeSet<String>>>(rtxn, main_key::DICTIONARY_KEY)?)
+    }
+
    /* synonyms */

    pub(crate) fn put_synonyms(
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -479,6 +479,20 @@ pub fn execute_search(
            tokbuilder.stop_words(stop_words);
        }

+        let separators = ctx.index.allowed_separators(ctx.txn)?;
+        let separators: Option<Vec<_>> =
+            separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
+        if let Some(ref separators) = separators {
+            tokbuilder.separators(separators);
+        }
+
+        let dictionary = ctx.index.dictionary(ctx.txn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
+        if let Some(ref dictionary) = dictionary {
+            tokbuilder.words_dict(dictionary);
+        }
+
        let script_lang_map = ctx.index.script_language(ctx.txn)?;
        if !script_lang_map.is_empty() {
            tokbuilder.allow_list(&script_lang_map);
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -28,6 +28,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    indexer: GrenadParameters,
    searchable_fields: &Option<HashSet<FieldId>>,
    stop_words: Option<&fst::Set<&[u8]>>,
+    allowed_separators: Option<&Vec<&str>>,
+    dictionary: Option<&Vec<&str>>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<(RoaringBitmap, grenad::Reader<File>, ScriptLanguageDocidsMap)> {
    puffin::profile_function!();
@@ -52,6 +54,14 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
    if let Some(stop_words) = stop_words {
        tokenizer_builder.stop_words(stop_words);
    }
+    if let Some(dictionary) = dictionary {
+        // The words are already `&str`s, so the slice is handed to the builder as is.
+        tokenizer_builder.words_dict(dictionary.as_slice());
+    }
+    if let Some(separators) = allowed_separators {
+        // The separators are already `&str`s, so the slice is handed to the builder as is.
+        tokenizer_builder.separators(separators.as_slice());
+    }
    let tokenizer = tokenizer_builder.build();

    let mut cursor = obkv_documents.into_cursor()?;
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -49,6 +49,8 @@ pub(crate) fn data_from_obkv_documents(
    geo_fields_ids: Option<(FieldId, FieldId)>,
    vectors_field_id: Option<FieldId>,
    stop_words: Option<fst::Set<&[u8]>>,
+    allowed_separators: Option<Vec<&str>>,
+    dictionary: Option<Vec<&str>>,
    max_positions_per_attributes: Option<u32>,
    exact_attributes: HashSet<FieldId>,
 ) -> Result<()> {
@@ -76,6 +78,8 @@ pub(crate) fn data_from_obkv_documents(
                    geo_fields_ids,
                    vectors_field_id,
                    &stop_words,
+                    &allowed_separators,
+                    &dictionary,
                    max_positions_per_attributes,
                )
            })
@@ -289,6 +293,8 @@ fn send_and_extract_flattened_documents_data(
    geo_fields_ids: Option<(FieldId, FieldId)>,
    vectors_field_id: Option<FieldId>,
    stop_words: &Option<fst::Set<&[u8]>>,
+    allowed_separators: &Option<Vec<&str>>,
+    dictionary: &Option<Vec<&str>>,
    max_positions_per_attributes: Option<u32>,
 ) -> Result<(
    grenad::Reader<CursorClonableMmap>,
@@ -344,6 +350,8 @@ fn send_and_extract_flattened_documents_data(
                        indexer,
                        searchable_fields,
                        stop_words.as_ref(),
+                        allowed_separators.as_ref(),
+                        dictionary.as_ref(),
                        max_positions_per_attributes,
                    )?;

--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -316,6 +316,12 @@ where
        let vectors_field_id = self.index.fields_ids_map(self.wtxn)?.id("_vectors");

        let stop_words = self.index.stop_words(self.wtxn)?;
+        let separators = self.index.allowed_separators(self.wtxn)?;
+        let separators: Option<Vec<_>> =
+            separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
+        let dictionary = self.index.dictionary(self.wtxn)?;
+        let dictionary: Option<Vec<_>> =
+            dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
        let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?;

        let pool_params = GrenadParameters {
@@ -353,6 +359,8 @@ where
                    geo_fields_ids,
                    vectors_field_id,
                    stop_words,
+                    separators,
+                    dictionary,
                    max_positions_per_attributes,
                    exact_attributes,
                )
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -112,6 +112,9 @@ pub struct Settings<'a, 't, 'u, 'i> {
    sortable_fields: Setting<HashSet<String>>,
    criteria: Setting<Vec<Criterion>>,
    stop_words: Setting<BTreeSet<String>>,
+    non_separator_tokens: Setting<BTreeSet<String>>,
+    separator_tokens: Setting<BTreeSet<String>>,
+    dictionary: Setting<BTreeSet<String>>,
    distinct_field: Setting<String>,
    synonyms: Setting<HashMap<String, Vec<String>>>,
    primary_key: Setting<String>,
@@ -141,6 +144,9 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
            sortable_fields: Setting::NotSet,
            criteria: Setting::NotSet,
            stop_words: Setting::NotSet,
+            non_separator_tokens: Setting::NotSet,
+            separator_tokens: Setting::NotSet,
+            dictionary: Setting::NotSet,
            distinct_field: Setting::NotSet,
            synonyms: Setting::NotSet,
            primary_key: Setting::NotSet,
@@ -205,6 +211,39 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
            if stop_words.is_empty() { Setting::Reset } else { Setting::Set(stop_words) }
    }

+    pub fn reset_non_separator_tokens(&mut self) {
+        self.non_separator_tokens = Setting::Reset;
+    }
+
+    pub fn set_non_separator_tokens(&mut self, non_separator_tokens: BTreeSet<String>) {
+        self.non_separator_tokens = if non_separator_tokens.is_empty() {
+            Setting::Reset
+        } else {
+            Setting::Set(non_separator_tokens)
+        }
+    }
+
+    pub fn reset_separator_tokens(&mut self) {
+        self.separator_tokens = Setting::Reset;
+    }
+
+    pub fn set_separator_tokens(&mut self, separator_tokens: BTreeSet<String>) {
+        self.separator_tokens = if separator_tokens.is_empty() {
+            Setting::Reset
+        } else {
+            Setting::Set(separator_tokens)
+        }
+    }
+
+    pub fn reset_dictionary(&mut self) {
+        self.dictionary = Setting::Reset;
+    }
+
+    pub fn set_dictionary(&mut self, dictionary: BTreeSet<String>) {
+        self.dictionary =
+            if dictionary.is_empty() { Setting::Reset } else { Setting::Set(dictionary) }
+    }
+
    pub fn reset_distinct_field(&mut self) {
        self.distinct_field = Setting::Reset;
    }
@@ -451,6 +490,60 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        }
    }

+    fn update_non_separator_tokens(&mut self) -> Result<bool> {
+        match self.non_separator_tokens {
+            Setting::Set(ref non_separator_tokens) => {
+                let current = self.index.non_separator_tokens(self.wtxn)?;
+
+                // Does the new list differ from the previous one?
+                if current.map_or(true, |current| &current != non_separator_tokens) {
+                    self.index.put_non_separator_tokens(self.wtxn, non_separator_tokens)?;
+                    Ok(true)
+                } else {
+                    Ok(false)
+                }
+            }
+            Setting::Reset => Ok(self.index.delete_non_separator_tokens(self.wtxn)?),
+            Setting::NotSet => Ok(false),
+        }
+    }
+
+    fn update_separator_tokens(&mut self) -> Result<bool> {
+        match self.separator_tokens {
+            Setting::Set(ref separator_tokens) => {
+                let current = self.index.separator_tokens(self.wtxn)?;
+
+                // Does the new list differ from the previous one?
+                if current.map_or(true, |current| &current != separator_tokens) {
+                    self.index.put_separator_tokens(self.wtxn, separator_tokens)?;
+                    Ok(true)
+                } else {
+                    Ok(false)
+                }
+            }
+            Setting::Reset => Ok(self.index.delete_separator_tokens(self.wtxn)?),
+            Setting::NotSet => Ok(false),
+        }
+    }
+
+    fn update_dictionary(&mut self) -> Result<bool> {
+        match self.dictionary {
+            Setting::Set(ref dictionary) => {
+                let current = self.index.dictionary(self.wtxn)?;
+
+                // Does the new list differ from the previous one?
+                if current.map_or(true, |current| &current != dictionary) {
+                    self.index.put_dictionary(self.wtxn, dictionary)?;
+                    Ok(true)
+                } else {
+                    Ok(false)
+                }
+            }
+            Setting::Reset => Ok(self.index.delete_dictionary(self.wtxn)?),
+            Setting::NotSet => Ok(false),
+        }
+    }
+
    fn update_synonyms(&mut self) -> Result<bool> {
        match self.synonyms {
            Setting::Set(ref synonyms) => {
@@ -756,11 +849,17 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        let faceted_updated = old_faceted_fields != new_faceted_fields;

        let stop_words_updated = self.update_stop_words()?;
+        let non_separator_tokens_updated = self.update_non_separator_tokens()?;
+        let separator_tokens_updated = self.update_separator_tokens()?;
+        let dictionary_updated = self.update_dictionary()?;
        let synonyms_updated = self.update_synonyms()?;
        let searchable_updated = self.update_searchable()?;
        let exact_attributes_updated = self.update_exact_attributes()?;

        if stop_words_updated
+            || non_separator_tokens_updated
+            || separator_tokens_updated
+            || dictionary_updated
            || faceted_updated
            || synonyms_updated
            || searchable_updated
@@ -1539,6 +1638,9 @@ mod tests {
                    sortable_fields,
                    criteria,
                    stop_words,
+                    non_separator_tokens,
+                    separator_tokens,
+                    dictionary,
                    distinct_field,
                    synonyms,
                    primary_key,
@@ -1557,6 +1659,9 @@ mod tests {
                assert!(matches!(sortable_fields, Setting::NotSet));
                assert!(matches!(criteria, Setting::NotSet));
                assert!(matches!(stop_words, Setting::NotSet));
+                assert!(matches!(non_separator_tokens, Setting::NotSet));
+                assert!(matches!(separator_tokens, Setting::NotSet));
+                assert!(matches!(dictionary, Setting::NotSet));
                assert!(matches!(distinct_field, Setting::NotSet));
                assert!(matches!(synonyms, Setting::NotSet));
                assert!(matches!(primary_key, Setting::NotSet));
Author	SHA1	Message	Date
ManyTheFish	d4ff59fcf5	Fix clippy	2023-07-24 18:42:26 +02:00
ManyTheFish	9c485f8563	Make the search and the indexing work	2023-07-24 18:35:20 +02:00
ManyTheFish	d8d12d5979	Be able to set and reset settings	2023-07-24 17:00:18 +02:00
ManyTheFish	0597a97c84	Update tests	2023-07-20 11:15:10 +02:00