Fix the synonyms settings display

ensure the synonyms are updated when the tokenizer settings are changed
Support synonyms synergies
2025-07-18 12:20:48 +00:00 · 2023-07-27 14:12:23 +02:00 · 2023-07-26 09:33:42 +02:00 · 2023-07-25 15:01:42 +02:00 · 2023-07-25 10:55:37 +02:00
9 changed files with 362 additions and 43 deletions
--- a/meilisearch-types/src/settings.rs
+++ b/meilisearch-types/src/settings.rs
@ -509,13 +509,7 @@ pub fn settings(

    let distinct_field = index.distinct_field(rtxn)?.map(String::from);

-    // in milli each word in the synonyms map were split on their separator. Since we lost
-    // this information we are going to put space between words.
-    let synonyms = index
-        .synonyms(rtxn)?
-        .iter()
-        .map(|(key, values)| (key.join(" "), values.iter().map(|value| value.join(" ")).collect()))
-        .collect();
+    let synonyms = index.user_defined_synonyms(rtxn)?;

    let min_typo_word_len = MinWordSizeTyposSetting {
        one_typo: Setting::Set(index.min_word_len_one_typo(rtxn)?),
--- a/meilisearch/tests/settings/get_settings.rs
+++ b/meilisearch/tests/settings/get_settings.rs
@ -65,8 +65,8 @@ async fn get_settings() {
        json!(["words", "typo", "proximity", "attribute", "sort", "exactness"])
    );
    assert_eq!(settings["stopWords"], json!([]));
-    assert_eq!(settings["non_separator_tokens"], json!([]));
-    assert_eq!(settings["separator_tokens"], json!([]));
+    assert_eq!(settings["nonSeparatorTokens"], json!([]));
+    assert_eq!(settings["separatorTokens"], json!([]));
    assert_eq!(settings["dictionary"], json!([]));
    assert_eq!(
        settings["faceting"],
--- a/meilisearch/tests/settings/tokenizer_customization.rs
+++ b/meilisearch/tests/settings/tokenizer_customization.rs
@ -194,3 +194,274 @@ async fn set_and_search() {
        })
        .await;
 }
+
+#[actix_rt::test]
+async fn advanced_synergies() {
+    let documents = json!([
+        {
+            "id": 1,
+            "content": "J.R.R. Tolkien",
+        },
+        {
+            "id": 2,
+            "content": "J. R. R. Tolkien",
+        },
+        {
+            "id": 3,
+            "content": "jrr Tolkien",
+        },
+        {
+            "id": 4,
+            "content": "J.K. Rowlings",
+        },
+        {
+            "id": 5,
+            "content": "J. K. Rowlings",
+        },
+        {
+            "id": 6,
+            "content": "jk Rowlings",
+        },
+    ]);
+
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    index.add_documents(documents, None).await;
+    index.wait_task(0).await;
+
+    let (_response, _code) = index
+        .update_settings(json!({
+            "dictionary": ["J.R.R.", "J. R. R."],
+            "synonyms": {
+                "J.R.R.": ["jrr", "J. R. R."],
+                "J. R. R.": ["jrr", "J.R.R."],
+                "jrr": ["J.R.R.", "J. R. R."],
+                "J.K.": ["jk", "J. K."],
+                "J. K.": ["jk", "J.K."],
+                "jk": ["J.K.", "J. K."],
+            }
+        }))
+        .await;
+    index.wait_task(1).await;
+
+    index
+        .search(json!({"q": "J.R.R.", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 1,
+                "content": "J.R.R. Tolkien",
+                "_formatted": {
+                  "id": "1",
+                  "content": "<em>J.R.R.</em> Tolkien"
+                }
+              },
+              {
+                "id": 2,
+                "content": "J. R. R. Tolkien",
+                "_formatted": {
+                  "id": "2",
+                  "content": "<em>J. R. R.</em> Tolkien"
+                }
+              },
+              {
+                "id": 3,
+                "content": "jrr Tolkien",
+                "_formatted": {
+                  "id": "3",
+                  "content": "<em>jrr</em> Tolkien"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "jrr", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 3,
+                "content": "jrr Tolkien",
+                "_formatted": {
+                  "id": "3",
+                  "content": "<em>jrr</em> Tolkien"
+                }
+              },
+              {
+                "id": 1,
+                "content": "J.R.R. Tolkien",
+                "_formatted": {
+                  "id": "1",
+                  "content": "<em>J.R.R.</em> Tolkien"
+                }
+              },
+              {
+                "id": 2,
+                "content": "J. R. R. Tolkien",
+                "_formatted": {
+                  "id": "2",
+                  "content": "<em>J. R. R.</em> Tolkien"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "J. R. R.", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 2,
+                "content": "J. R. R. Tolkien",
+                "_formatted": {
+                  "id": "2",
+                  "content": "<em>J. R. R.</em> Tolkien"
+                }
+              },
+              {
+                "id": 1,
+                "content": "J.R.R. Tolkien",
+                "_formatted": {
+                  "id": "1",
+                  "content": "<em>J.R.R.</em> Tolkien"
+                }
+              },
+              {
+                "id": 3,
+                "content": "jrr Tolkien",
+                "_formatted": {
+                  "id": "3",
+                  "content": "<em>jrr</em> Tolkien"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    // Only update dictionary, the synonyms should be recomputed.
+    let (_response, _code) = index
+        .update_settings(json!({
+            "dictionary": ["J.R.R.", "J. R. R.", "J.K.", "J. K."],
+        }))
+        .await;
+    index.wait_task(2).await;
+
+    index
+        .search(json!({"q": "jk", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 6,
+                "content": "jk Rowlings",
+                "_formatted": {
+                  "id": "6",
+                  "content": "<em>jk</em> Rowlings"
+                }
+              },
+              {
+                "id": 4,
+                "content": "J.K. Rowlings",
+                "_formatted": {
+                  "id": "4",
+                  "content": "<em>J.K.</em> Rowlings"
+                }
+              },
+              {
+                "id": 5,
+                "content": "J. K. Rowlings",
+                "_formatted": {
+                  "id": "5",
+                  "content": "<em>J. K.</em> Rowlings"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "J.K.", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 4,
+                "content": "J.K. Rowlings",
+                "_formatted": {
+                  "id": "4",
+                  "content": "<em>J.K.</em> Rowlings"
+                }
+              },
+              {
+                "id": 5,
+                "content": "J. K. Rowlings",
+                "_formatted": {
+                  "id": "5",
+                  "content": "<em>J. K.</em> Rowlings"
+                }
+              },
+              {
+                "id": 6,
+                "content": "jk Rowlings",
+                "_formatted": {
+                  "id": "6",
+                  "content": "<em>jk</em> Rowlings"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+
+    index
+        .search(json!({"q": "J. K.", "attributesToHighlight": ["content"]}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"]), @r###"
+            [
+              {
+                "id": 5,
+                "content": "J. K. Rowlings",
+                "_formatted": {
+                  "id": "5",
+                  "content": "<em>J. K.</em> Rowlings"
+                }
+              },
+              {
+                "id": 4,
+                "content": "J.K. Rowlings",
+                "_formatted": {
+                  "id": "4",
+                  "content": "<em>J.K.</em> Rowlings"
+                }
+              },
+              {
+                "id": 6,
+                "content": "jk Rowlings",
+                "_formatted": {
+                  "id": "6",
+                  "content": "<em>jk</em> Rowlings"
+                }
+              },
+              {
+                "id": 2,
+                "content": "J. R. R. Tolkien",
+                "_formatted": {
+                  "id": "2",
+                  "content": "<em>J. R.</em> R. Tolkien"
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+}
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::{BTreeSet, HashMap, HashSet};
+use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fs::File;
 use std::mem::size_of;
 use std::path::Path;
@ -65,6 +65,7 @@ pub mod main_key {
    pub const DICTIONARY_KEY: &str = "dictionary";
    pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
    pub const SYNONYMS_KEY: &str = "synonyms";
+    pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms";
    pub const WORDS_FST_KEY: &str = "words-fst";
    pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
    pub const CREATED_AT_KEY: &str = "created-at";
@ -1138,12 +1139,29 @@ impl Index {
        &self,
        wtxn: &mut RwTxn,
        synonyms: &HashMap<Vec<String>, Vec<Vec<String>>>,
+        user_defined_synonyms: &BTreeMap<String, Vec<String>>,
    ) -> heed::Result<()> {
-        self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)
+        self.main.put::<_, Str, SerdeBincode<_>>(wtxn, main_key::SYNONYMS_KEY, synonyms)?;
+        self.main.put::<_, Str, SerdeBincode<_>>(
+            wtxn,
+            main_key::USER_DEFINED_SYNONYMS_KEY,
+            user_defined_synonyms,
+        )
    }

    pub(crate) fn delete_synonyms(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
-        self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY)
+        self.main.delete::<_, Str>(wtxn, main_key::SYNONYMS_KEY)?;
+        self.main.delete::<_, Str>(wtxn, main_key::USER_DEFINED_SYNONYMS_KEY)
+    }
+
+    pub fn user_defined_synonyms(
+        &self,
+        rtxn: &RoTxn,
+    ) -> heed::Result<BTreeMap<String, Vec<String>>> {
+        Ok(self
+            .main
+            .get::<_, Str, SerdeBincode<_>>(rtxn, main_key::USER_DEFINED_SYNONYMS_KEY)?
+            .unwrap_or_default())
    }

    pub fn synonyms(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Vec<String>, Vec<Vec<String>>>> {
--- a/milli/src/search/new/tests/integration.rs
+++ b/milli/src/search/new/tests/integration.rs
@ -2,7 +2,7 @@ use std::io::Cursor;

 use big_s::S;
 use heed::EnvOpenOptions;
-use maplit::{hashmap, hashset};
+use maplit::{btreemap, hashset};

 use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
@ -33,7 +33,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
        S("tag"),
        S("asc_desc_rank"),
    });
-    builder.set_synonyms(hashmap! {
+    builder.set_synonyms(btreemap! {
        S("hello") => vec![S("good morning")],
        S("world") => vec![S("earth")],
        S("america") => vec![S("the united states")],
--- a/milli/src/search/new/tests/proximity.rs
+++ b/milli/src/search/new/tests/proximity.rs
@ -15,7 +15,7 @@ they store fewer proximities than the regular word proximity DB.

 */

-use std::collections::HashMap;
+use std::collections::BTreeMap;

 use crate::index::tests::TempIndex;
 use crate::search::new::tests::collect_field_values;
@ -336,7 +336,7 @@ fn test_proximity_split_word() {

    index
        .update_settings(|s| {
-            let mut syns = HashMap::new();
+            let mut syns = BTreeMap::new();
            syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]);
            s.set_synonyms(syns);
        })
--- a/milli/src/search/new/tests/typo.rs
+++ b/milli/src/search/new/tests/typo.rs
@ -18,7 +18,7 @@ if `words` doesn't exist before it.
 14. Synonyms cost nothing according to the typo ranking rule
 */

-use std::collections::HashMap;
+use std::collections::BTreeMap;

 use crate::index::tests::TempIndex;
 use crate::search::new::tests::collect_field_values;
@ -591,7 +591,7 @@ fn test_typo_synonyms() {
        .update_settings(|s| {
            s.set_criteria(vec![Criterion::Typo]);

-            let mut synonyms = HashMap::new();
+            let mut synonyms = BTreeMap::new();
            synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]);
            synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]);

--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@ -1,4 +1,4 @@
-use std::collections::{BTreeSet, HashMap, HashSet};
+use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;

 use charabia::{Normalize, Tokenizer, TokenizerBuilder};
@ -116,7 +116,7 @@ pub struct Settings<'a, 't, 'u, 'i> {
    separator_tokens: Setting<BTreeSet<String>>,
    dictionary: Setting<BTreeSet<String>>,
    distinct_field: Setting<String>,
-    synonyms: Setting<HashMap<String, Vec<String>>>,
+    synonyms: Setting<BTreeMap<String, Vec<String>>>,
    primary_key: Setting<String>,
    authorize_typos: Setting<bool>,
    min_word_len_two_typos: Setting<u8>,
@ -256,7 +256,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
        self.synonyms = Setting::Reset;
    }

-    pub fn set_synonyms(&mut self, synonyms: HashMap<String, Vec<String>>) {
+    pub fn set_synonyms(&mut self, synonyms: BTreeMap<String, Vec<String>>) {
        self.synonyms = if synonyms.is_empty() { Setting::Reset } else { Setting::Set(synonyms) }
    }

@ -491,62 +491,83 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
    }

    fn update_non_separator_tokens(&mut self) -> Result<bool> {
-        match self.non_separator_tokens {
+        let changes = match self.non_separator_tokens {
            Setting::Set(ref non_separator_tokens) => {
                let current = self.index.non_separator_tokens(self.wtxn)?;

                // Does the new list differ from the previous one?
                if current.map_or(true, |current| &current != non_separator_tokens) {
                    self.index.put_non_separator_tokens(self.wtxn, non_separator_tokens)?;
-                    Ok(true)
+                    true
                } else {
-                    Ok(false)
+                    false
                }
            }
-            Setting::Reset => Ok(self.index.delete_non_separator_tokens(self.wtxn)?),
-            Setting::NotSet => Ok(false),
+            Setting::Reset => self.index.delete_non_separator_tokens(self.wtxn)?,
+            Setting::NotSet => false,
+        };
+
+        // the synonyms must be updated if non separator tokens have been updated.
+        if changes && self.synonyms == Setting::NotSet {
+            self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
        }
+
+        Ok(changes)
    }

    fn update_separator_tokens(&mut self) -> Result<bool> {
-        match self.separator_tokens {
+        let changes = match self.separator_tokens {
            Setting::Set(ref separator_tokens) => {
                let current = self.index.separator_tokens(self.wtxn)?;

                // Does the new list differ from the previous one?
                if current.map_or(true, |current| &current != separator_tokens) {
                    self.index.put_separator_tokens(self.wtxn, separator_tokens)?;
-                    Ok(true)
+                    true
                } else {
-                    Ok(false)
+                    false
                }
            }
-            Setting::Reset => Ok(self.index.delete_separator_tokens(self.wtxn)?),
-            Setting::NotSet => Ok(false),
+            Setting::Reset => self.index.delete_separator_tokens(self.wtxn)?,
+            Setting::NotSet => false,
+        };
+
+        // the synonyms must be updated if separator tokens have been updated.
+        if changes && self.synonyms == Setting::NotSet {
+            self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
        }
+
+        Ok(changes)
    }

    fn update_dictionary(&mut self) -> Result<bool> {
-        match self.dictionary {
+        let changes = match self.dictionary {
            Setting::Set(ref dictionary) => {
                let current = self.index.dictionary(self.wtxn)?;

                // Does the new list differ from the previous one?
                if current.map_or(true, |current| &current != dictionary) {
                    self.index.put_dictionary(self.wtxn, dictionary)?;
-                    Ok(true)
+                    true
                } else {
-                    Ok(false)
+                    false
                }
            }
-            Setting::Reset => Ok(self.index.delete_dictionary(self.wtxn)?),
-            Setting::NotSet => Ok(false),
+            Setting::Reset => self.index.delete_dictionary(self.wtxn)?,
+            Setting::NotSet => false,
+        };
+
+        // the synonyms must be updated if dictionary has been updated.
+        if changes && self.synonyms == Setting::NotSet {
+            self.synonyms = Setting::Set(self.index.user_defined_synonyms(self.wtxn)?);
        }
+
+        Ok(changes)
    }

    fn update_synonyms(&mut self) -> Result<bool> {
        match self.synonyms {
-            Setting::Set(ref synonyms) => {
+            Setting::Set(ref user_synonyms) => {
                fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
                    tokenizer
                        .tokenize(text)
@ -565,10 +586,25 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                if let Some(ref stop_words) = stop_words {
                    builder.stop_words(stop_words);
                }
+
+                let separators = self.index.allowed_separators(self.wtxn)?;
+                let separators: Option<Vec<_>> =
+                    separators.as_ref().map(|x| x.iter().map(String::as_str).collect());
+                if let Some(ref separators) = separators {
+                    builder.separators(separators);
+                }
+
+                let dictionary = self.index.dictionary(self.wtxn)?;
+                let dictionary: Option<Vec<_>> =
+                    dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect());
+                if let Some(ref dictionary) = dictionary {
+                    builder.words_dict(dictionary);
+                }
+
                let tokenizer = builder.build();

                let mut new_synonyms = HashMap::new();
-                for (word, synonyms) in synonyms {
+                for (word, synonyms) in user_synonyms {
                    // Normalize both the word and associated synonyms.
                    let normalized_word = normalize(&tokenizer, word);
                    let normalized_synonyms =
@ -589,7 +625,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                let old_synonyms = self.index.synonyms(self.wtxn)?;

                if new_synonyms != old_synonyms {
-                    self.index.put_synonyms(self.wtxn, &new_synonyms)?;
+                    self.index.put_synonyms(self.wtxn, &new_synonyms, &user_synonyms)?;
                    Ok(true)
                } else {
                    Ok(false)
@ -876,7 +912,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
 mod tests {
    use big_s::S;
    use heed::types::ByteSlice;
-    use maplit::{btreeset, hashmap, hashset};
+    use maplit::{btreemap, btreeset, hashset};

    use super::*;
    use crate::error::Error;
@ -1342,7 +1378,7 @@ mod tests {
        // In the same transaction provide some synonyms
        index
            .update_settings_using_wtxn(&mut wtxn, |settings| {
-                settings.set_synonyms(hashmap! {
+                settings.set_synonyms(btreemap! {
                    "blini".to_string() => vec!["crepes".to_string()],
                    "super like".to_string() => vec!["love".to_string()],
                    "puppies".to_string() => vec!["dogs".to_string(), "doggos".to_string()]
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@ -5,7 +5,7 @@ use std::io::Cursor;
 use big_s::S;
 use either::{Either, Left, Right};
 use heed::EnvOpenOptions;
-use maplit::{hashmap, hashset};
+use maplit::{btreemap, hashset};
 use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object, TermsMatchingStrategy};
@ -51,7 +51,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
        S("tag"),
        S("asc_desc_rank"),
    });
-    builder.set_synonyms(hashmap! {
+    builder.set_synonyms(btreemap! {
        S("hello") => vec![S("good morning")],
        S("world") => vec![S("earth")],
        S("america") => vec![S("the united states")],
Author	SHA1	Message	Date
ManyTheFish	04694071fe	Fix the synonyms settings display	2023-07-27 14:12:23 +02:00
ManyTheFish	b0c1a9504a	ensure the synonyms are updated when the tokenizer settings are changed	2023-07-26 09:33:42 +02:00
ManyTheFish	d57026cd96	Support synonyms synergies	2023-07-25 15:01:42 +02:00
ManyTheFish	41c9e8856a	Fix test	2023-07-25 10:55:37 +02:00