mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-25 04:56:28 +00:00 
			
		
		
		
	Merge #4819
4819: Language settings r=dureuill a=ManyTheFish # Pull Request ## Related issue Fixes #4749 ## What does this PR do? - [Implement localized search](c0c6955c0d) - [Implement localized attributes settings](bde827b055) ## Related PRD - [PRD](https://www.notion.so/meilisearch/Define-language-settings-to-impact-relevancy-bee62e18b7584c4f87d18a7654855329) - [Public usage](https://www.notion.so/meilisearch/v1-10-Language-settings-usage-26c5d98b553349d9abacbe7aff698e4e) Co-authored-by: ManyTheFish <many@meilisearch.com> Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
		
							
								
								
									
										37
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										37
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -934,19 +934,15 @@ dependencies = [ | |||||||
| [[package]] | [[package]] | ||||||
| name = "charabia" | name = "charabia" | ||||||
| version = "0.8.12" | version = "0.8.12" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#2992583137458afcebff5d44cae93fa46d9cf664" | ||||||
| checksum = "9868a22f10dee80498a8a2b6c641d80bf28ea4495fcf71c2dc4836c2dd23958c" |  | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "aho-corasick", |  "aho-corasick", | ||||||
|  "cow-utils", |  | ||||||
|  "csv", |  "csv", | ||||||
|  "deunicode", |  | ||||||
|  "either", |  "either", | ||||||
|  "fst", |  "fst", | ||||||
|  "irg-kvariants", |  "irg-kvariants", | ||||||
|  "jieba-rs", |  "jieba-rs", | ||||||
|  "lindera", |  "lindera", | ||||||
|  "litemap", |  | ||||||
|  "once_cell", |  "once_cell", | ||||||
|  "pinyin", |  "pinyin", | ||||||
|  "serde", |  "serde", | ||||||
| @@ -954,7 +950,6 @@ dependencies = [ | |||||||
|  "unicode-normalization", |  "unicode-normalization", | ||||||
|  "wana_kana", |  "wana_kana", | ||||||
|  "whatlang", |  "whatlang", | ||||||
|  "zerovec", |  | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| @@ -1145,12 +1140,6 @@ version = "0.8.4" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" | checksum = "e496a50fda8aacccc86d7529e2c1e0892dbd0f898a6b5645b5561b89c3210efa" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "cow-utils" |  | ||||||
| version = "0.1.2" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "79bb3adfaf5f75d24b01aee375f7555907840fa2800e5ec8fa3b9e2031830173" |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "cpufeatures" | name = "cpufeatures" | ||||||
| version = "0.2.12" | version = "0.2.12" | ||||||
| @@ -1551,12 +1540,6 @@ dependencies = [ | |||||||
|  "syn 2.0.60", |  "syn 2.0.60", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "deunicode" |  | ||||||
| version = "1.6.0" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00" |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "digest" | name = "digest" | ||||||
| version = "0.10.7" | version = "0.10.7" | ||||||
| @@ -2666,8 +2649,7 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6" | |||||||
| [[package]] | [[package]] | ||||||
| name = "irg-kvariants" | name = "irg-kvariants" | ||||||
| version = "0.1.1" | version = "0.1.1" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "git+https://github.com/meilisearch/charabia.git?branch=simplify-lang-detection#2992583137458afcebff5d44cae93fa46d9cf664" | ||||||
| checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26" |  | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "csv", |  "csv", | ||||||
|  "once_cell", |  "once_cell", | ||||||
| @@ -3278,12 +3260,6 @@ dependencies = [ | |||||||
|  "unicode-segmentation", |  "unicode-segmentation", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "litemap" |  | ||||||
| version = "0.7.3" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "643cb0b8d4fcc284004d5fd0d67ccf61dfffadb7f75e1e71bc420f4688a3a704" |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lmdb-master-sys" | name = "lmdb-master-sys" | ||||||
| version = "0.2.2" | version = "0.2.2" | ||||||
| @@ -6506,15 +6482,6 @@ dependencies = [ | |||||||
|  "syn 2.0.60", |  "syn 2.0.60", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "zerovec" |  | ||||||
| version = "0.10.4" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" |  | ||||||
| dependencies = [ |  | ||||||
|  "zerofrom", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "zip" | name = "zip" | ||||||
| version = "1.1.4" | version = "1.1.4" | ||||||
|   | |||||||
| @@ -286,6 +286,7 @@ pub(crate) mod test { | |||||||
|             pagination: Setting::NotSet, |             pagination: Setting::NotSet, | ||||||
|             embedders: Setting::NotSet, |             embedders: Setting::NotSet, | ||||||
|             search_cutoff_ms: Setting::NotSet, |             search_cutoff_ms: Setting::NotSet, | ||||||
|  |             localized_attributes: Setting::NotSet, | ||||||
|             _kind: std::marker::PhantomData, |             _kind: std::marker::PhantomData, | ||||||
|         }; |         }; | ||||||
|         settings.check() |         settings.check() | ||||||
|   | |||||||
| @@ -379,6 +379,7 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> { | |||||||
|                 v5::Setting::NotSet => v6::Setting::NotSet, |                 v5::Setting::NotSet => v6::Setting::NotSet, | ||||||
|             }, |             }, | ||||||
|             embedders: v6::Setting::NotSet, |             embedders: v6::Setting::NotSet, | ||||||
|  |             localized_attributes: v6::Setting::NotSet, | ||||||
|             search_cutoff_ms: v6::Setting::NotSet, |             search_cutoff_ms: v6::Setting::NotSet, | ||||||
|             _kind: std::marker::PhantomData, |             _kind: std::marker::PhantomData, | ||||||
|         } |         } | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs | |||||||
| [] | [] | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### All Tasks: | ### All Tasks: | ||||||
| 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: 
Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | ||||||
| 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} | 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} | ||||||
| 2 {uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} | 2 {uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs | |||||||
| [] | [] | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### All Tasks: | ### All Tasks: | ||||||
| 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: 
Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | ||||||
| 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} | 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} | ||||||
| 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} | 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs | |||||||
| [] | [] | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### All Tasks: | ### All Tasks: | ||||||
| 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: 
Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | ||||||
| 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} | 1 {uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### Status: | ### Status: | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs | |||||||
| [] | [] | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### All Tasks: | ### All Tasks: | ||||||
| 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: 
Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | ||||||
| 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} | 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### Status: | ### Status: | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs | |||||||
| [] | [] | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### All Tasks: | ### All Tasks: | ||||||
| 0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: 
Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | 0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### Status: | ### Status: | ||||||
| enqueued [0,] | enqueued [0,] | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs | |||||||
| [] | [] | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### All Tasks: | ### All Tasks: | ||||||
| 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: 
Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(384), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), api_key: NotSet, dimensions: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), url: NotSet, request: NotSet, response: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### Status: | ### Status: | ||||||
| enqueued [] | enqueued [] | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs | |||||||
| [] | [] | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### All Tasks: | ### All Tasks: | ||||||
| 0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | 0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### Status: | ### Status: | ||||||
| enqueued [0,] | enqueued [0,] | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs | |||||||
| [] | [] | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### All Tasks: | ### All Tasks: | ||||||
| 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), 
filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### Status: | ### Status: | ||||||
| enqueued [] | enqueued [] | ||||||
|   | |||||||
| @@ -256,6 +256,7 @@ InvalidSearchCropLength               , InvalidRequest       , BAD_REQUEST ; | |||||||
| InvalidSearchCropMarker               , InvalidRequest       , BAD_REQUEST ; | InvalidSearchCropMarker               , InvalidRequest       , BAD_REQUEST ; | ||||||
| InvalidSearchFacets                   , InvalidRequest       , BAD_REQUEST ; | InvalidSearchFacets                   , InvalidRequest       , BAD_REQUEST ; | ||||||
| InvalidSearchSemanticRatio            , InvalidRequest       , BAD_REQUEST ; | InvalidSearchSemanticRatio            , InvalidRequest       , BAD_REQUEST ; | ||||||
|  | InvalidSearchLocales                  , InvalidRequest       , BAD_REQUEST ; | ||||||
| InvalidFacetSearchFacetName           , InvalidRequest       , BAD_REQUEST ; | InvalidFacetSearchFacetName           , InvalidRequest       , BAD_REQUEST ; | ||||||
| InvalidSimilarId                      , InvalidRequest       , BAD_REQUEST ; | InvalidSimilarId                      , InvalidRequest       , BAD_REQUEST ; | ||||||
| InvalidSearchFilter                   , InvalidRequest       , BAD_REQUEST ; | InvalidSearchFilter                   , InvalidRequest       , BAD_REQUEST ; | ||||||
| @@ -297,6 +298,7 @@ InvalidSettingsSeparatorTokens        , InvalidRequest       , BAD_REQUEST ; | |||||||
| InvalidSettingsDictionary             , InvalidRequest       , BAD_REQUEST ; | InvalidSettingsDictionary             , InvalidRequest       , BAD_REQUEST ; | ||||||
| InvalidSettingsSynonyms               , InvalidRequest       , BAD_REQUEST ; | InvalidSettingsSynonyms               , InvalidRequest       , BAD_REQUEST ; | ||||||
| InvalidSettingsTypoTolerance          , InvalidRequest       , BAD_REQUEST ; | InvalidSettingsTypoTolerance          , InvalidRequest       , BAD_REQUEST ; | ||||||
|  | InvalidSettingsLocalizedAttributes    , InvalidRequest       , BAD_REQUEST ; | ||||||
| InvalidState                          , Internal             , INTERNAL_SERVER_ERROR ; | InvalidState                          , Internal             , INTERNAL_SERVER_ERROR ; | ||||||
| InvalidStoreFile                      , Internal             , INTERNAL_SERVER_ERROR ; | InvalidStoreFile                      , Internal             , INTERNAL_SERVER_ERROR ; | ||||||
| InvalidSwapDuplicateIndexFound        , InvalidRequest       , BAD_REQUEST ; | InvalidSwapDuplicateIndexFound        , InvalidRequest       , BAD_REQUEST ; | ||||||
|   | |||||||
| @@ -7,6 +7,7 @@ pub mod features; | |||||||
| pub mod index_uid; | pub mod index_uid; | ||||||
| pub mod index_uid_pattern; | pub mod index_uid_pattern; | ||||||
| pub mod keys; | pub mod keys; | ||||||
|  | pub mod locales; | ||||||
| pub mod settings; | pub mod settings; | ||||||
| pub mod star_or; | pub mod star_or; | ||||||
| pub mod task_view; | pub mod task_view; | ||||||
|   | |||||||
							
								
								
									
										157
									
								
								meilisearch-types/src/locales.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										157
									
								
								meilisearch-types/src/locales.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,157 @@ | |||||||
|  | use deserr::Deserr; | ||||||
|  | use milli::LocalizedAttributesRule; | ||||||
|  | use serde::{Deserialize, Serialize}; | ||||||
|  | use serde_json::json; | ||||||
|  |  | ||||||
|  | /// Generates the `Locale` enum, its conversions from and into | ||||||
|  | /// `milli::tokenizer::Language`, and the `LocaleFormatError` type. | ||||||
|  | /// | ||||||
|  | /// The enum derives `Deserr` so that it can be used in the API. | ||||||
|  | /// | ||||||
|  | /// Neither generated `match` has a catch-all arm, so every name passed to the | ||||||
|  | /// macro must be a variant of `milli::tokenizer::Language`, and the list must | ||||||
|  | /// cover that enum for the `Language` -> `Locale` conversion to be exhaustive. | ||||||
|  | macro_rules! make_locale { | ||||||
|  |     ($($language:tt), +) => { | ||||||
|  |         #[derive(Debug, Copy, Clone, PartialEq, Eq, Deserr, Serialize, Deserialize, Ord, PartialOrd)] | ||||||
|  |         #[deserr(rename_all = camelCase)] | ||||||
|  |         #[serde(rename_all = "camelCase")] | ||||||
|  |         pub enum Locale { | ||||||
|  |             $($language,)+ | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         impl From<milli::tokenizer::Language> for Locale { | ||||||
|  |             fn from(language: milli::tokenizer::Language) -> Locale { | ||||||
|  |                 match language { | ||||||
|  |                     $(milli::tokenizer::Language::$language => Self::$language,)+ | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         impl From<Locale> for milli::tokenizer::Language { | ||||||
|  |             fn from(locale: Locale) -> milli::tokenizer::Language { | ||||||
|  |                 match locale { | ||||||
|  |                     $(Locale::$language => Self::$language,)+ | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         /// Error describing a string that does not name a supported locale. | ||||||
|  |         #[derive(Debug)] | ||||||
|  |         pub struct LocaleFormatError { | ||||||
|  |             pub invalid_locale: String, | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         impl std::fmt::Display for LocaleFormatError { | ||||||
|  |             fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { | ||||||
|  |                 // Each locale is rendered through its serde representation so the | ||||||
|  |                 // message shows the same spelling that serialization produces. | ||||||
|  |                 let mut quoted = Vec::new(); | ||||||
|  |                 for locale in [$(Locale::$language),+] { | ||||||
|  |                     quoted.push(format!("`{}`", json!(locale).as_str().unwrap())); | ||||||
|  |                 } | ||||||
|  |                 let valid_locales = quoted.join(", "); | ||||||
|  |                 write!(f, "Unsupported locale `{}`, expected one of {}", self.invalid_locale, valid_locales) | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |     }; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Locales accepted by the API. Each name is mapped one-to-one onto the variant | ||||||
|  | // of the same name in `milli::tokenizer::Language` by the macro. | ||||||
|  | // The declaration order is significant: it drives the derived `Ord` on `Locale` | ||||||
|  | // and the order in which `LocaleFormatError` lists the valid locales. | ||||||
|  | // NOTE(review): the names look like ISO 639-3 codes; confirm against the tokenizer. | ||||||
|  | make_locale! { | ||||||
|  |     Epo, | ||||||
|  |     Eng, | ||||||
|  |     Rus, | ||||||
|  |     Cmn, | ||||||
|  |     Spa, | ||||||
|  |     Por, | ||||||
|  |     Ita, | ||||||
|  |     Ben, | ||||||
|  |     Fra, | ||||||
|  |     Deu, | ||||||
|  |     Ukr, | ||||||
|  |     Kat, | ||||||
|  |     Ara, | ||||||
|  |     Hin, | ||||||
|  |     Jpn, | ||||||
|  |     Heb, | ||||||
|  |     Yid, | ||||||
|  |     Pol, | ||||||
|  |     Amh, | ||||||
|  |     Jav, | ||||||
|  |     Kor, | ||||||
|  |     Nob, | ||||||
|  |     Dan, | ||||||
|  |     Swe, | ||||||
|  |     Fin, | ||||||
|  |     Tur, | ||||||
|  |     Nld, | ||||||
|  |     Hun, | ||||||
|  |     Ces, | ||||||
|  |     Ell, | ||||||
|  |     Bul, | ||||||
|  |     Bel, | ||||||
|  |     Mar, | ||||||
|  |     Kan, | ||||||
|  |     Ron, | ||||||
|  |     Slv, | ||||||
|  |     Hrv, | ||||||
|  |     Srp, | ||||||
|  |     Mkd, | ||||||
|  |     Lit, | ||||||
|  |     Lav, | ||||||
|  |     Est, | ||||||
|  |     Tam, | ||||||
|  |     Vie, | ||||||
|  |     Urd, | ||||||
|  |     Tha, | ||||||
|  |     Guj, | ||||||
|  |     Uzb, | ||||||
|  |     Pan, | ||||||
|  |     Aze, | ||||||
|  |     Ind, | ||||||
|  |     Tel, | ||||||
|  |     Pes, | ||||||
|  |     Mal, | ||||||
|  |     Ori, | ||||||
|  |     Mya, | ||||||
|  |     Nep, | ||||||
|  |     Sin, | ||||||
|  |     Khm, | ||||||
|  |     Tuk, | ||||||
|  |     Aka, | ||||||
|  |     Zul, | ||||||
|  |     Sna, | ||||||
|  |     Afr, | ||||||
|  |     Lat, | ||||||
|  |     Slk, | ||||||
|  |     Cat, | ||||||
|  |     Tgl, | ||||||
|  |     Hye | ||||||
|  | } | ||||||
|  |  | ||||||
|  | // Empty impl: the `Debug` and `Display` impls generated by `make_locale!` are | ||||||
|  | // all that is needed for `LocaleFormatError` to be used as a `std::error::Error`. | ||||||
|  | impl std::error::Error for LocaleFormatError {} | ||||||
|  |  | ||||||
|  | /// Parses a `Locale` from a language code. | ||||||
|  | /// | ||||||
|  | /// The string is handed to `milli::tokenizer::Language::from_code`. When that | ||||||
|  | /// yields no language, a `LocaleFormatError` carrying the rejected input is | ||||||
|  | /// returned. | ||||||
|  | impl std::str::FromStr for Locale { | ||||||
|  |     type Err = LocaleFormatError; | ||||||
|  |  | ||||||
|  |     fn from_str(s: &str) -> Result<Self, Self::Err> { | ||||||
|  |         milli::tokenizer::Language::from_code(s) | ||||||
|  |             .map(Self::from) | ||||||
|  |             // Build the error lazily so the input is only copied on failure. | ||||||
|  |             .ok_or_else(|| LocaleFormatError { invalid_locale: s.to_string() }) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// API-facing counterpart of `milli::LocalizedAttributesRule`. | ||||||
|  | /// | ||||||
|  | /// Both fields are converted to and from the milli rule by the `From` impls in | ||||||
|  | /// this module. Field names are (de)serialized in camelCase. | ||||||
|  | #[derive(Debug, Clone, PartialEq, Eq, Deserr, Serialize, Deserialize)] | ||||||
|  | #[deserr(rename_all = camelCase)] | ||||||
|  | #[serde(rename_all = "camelCase")] | ||||||
|  | pub struct LocalizedAttributesRuleView { | ||||||
|  |     /// Patterns passed through unchanged to the milli rule. | ||||||
|  |     /// NOTE(review): presumably matched against attribute names; the pattern | ||||||
|  |     /// syntax is not defined in this file. | ||||||
|  |     pub attribute_patterns: Vec<String>, | ||||||
|  |     /// Locales attached to the rule, each convertible to a tokenizer `Language`. | ||||||
|  |     pub locales: Vec<Locale>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Builds the API view of a milli rule, converting every locale of the rule. | ||||||
|  | impl From<LocalizedAttributesRule> for LocalizedAttributesRuleView { | ||||||
|  |     fn from(rule: LocalizedAttributesRule) -> Self { | ||||||
|  |         let LocalizedAttributesRule { attribute_patterns, locales } = rule; | ||||||
|  |         let locales = locales.into_iter().map(Into::into).collect(); | ||||||
|  |  | ||||||
|  |         Self { attribute_patterns, locales } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Turns the API view back into a milli rule, converting every locale of the view. | ||||||
|  | impl From<LocalizedAttributesRuleView> for LocalizedAttributesRule { | ||||||
|  |     fn from(view: LocalizedAttributesRuleView) -> Self { | ||||||
|  |         let LocalizedAttributesRuleView { attribute_patterns, locales } = view; | ||||||
|  |         let locales = locales.into_iter().map(Into::into).collect(); | ||||||
|  |  | ||||||
|  |         Self { attribute_patterns, locales } | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -17,6 +17,7 @@ use serde::{Deserialize, Serialize, Serializer}; | |||||||
| use crate::deserr::DeserrJsonError; | use crate::deserr::DeserrJsonError; | ||||||
| use crate::error::deserr_codes::*; | use crate::error::deserr_codes::*; | ||||||
| use crate::facet_values_sort::FacetValuesSort; | use crate::facet_values_sort::FacetValuesSort; | ||||||
|  | use crate::locales::LocalizedAttributesRuleView; | ||||||
|  |  | ||||||
| /// The maximum number of results that the engine | /// The maximum number of results that the engine | ||||||
| /// will be able to return in one search call. | /// will be able to return in one search call. | ||||||
| @@ -198,6 +199,9 @@ pub struct Settings<T> { | |||||||
|     #[serde(default, skip_serializing_if = "Setting::is_not_set")] |     #[serde(default, skip_serializing_if = "Setting::is_not_set")] | ||||||
|     #[deserr(default, error = DeserrJsonError<InvalidSettingsSearchCutoffMs>)] |     #[deserr(default, error = DeserrJsonError<InvalidSettingsSearchCutoffMs>)] | ||||||
|     pub search_cutoff_ms: Setting<u64>, |     pub search_cutoff_ms: Setting<u64>, | ||||||
|  |     #[serde(default, skip_serializing_if = "Setting::is_not_set")] | ||||||
|  |     #[deserr(default, error = DeserrJsonError<InvalidSettingsLocalizedAttributes>)] | ||||||
|  |     pub localized_attributes: Setting<Vec<LocalizedAttributesRuleView>>, | ||||||
|  |  | ||||||
|     #[serde(skip)] |     #[serde(skip)] | ||||||
|     #[deserr(skip)] |     #[deserr(skip)] | ||||||
| @@ -261,6 +265,7 @@ impl Settings<Checked> { | |||||||
|             pagination: Setting::Reset, |             pagination: Setting::Reset, | ||||||
|             embedders: Setting::Reset, |             embedders: Setting::Reset, | ||||||
|             search_cutoff_ms: Setting::Reset, |             search_cutoff_ms: Setting::Reset, | ||||||
|  |             localized_attributes: Setting::Reset, | ||||||
|             _kind: PhantomData, |             _kind: PhantomData, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -284,7 +289,8 @@ impl Settings<Checked> { | |||||||
|             pagination, |             pagination, | ||||||
|             embedders, |             embedders, | ||||||
|             search_cutoff_ms, |             search_cutoff_ms, | ||||||
|             .. |             localized_attributes: localized_attributes_rules, | ||||||
|  |             _kind, | ||||||
|         } = self; |         } = self; | ||||||
|  |  | ||||||
|         Settings { |         Settings { | ||||||
| @@ -305,6 +311,7 @@ impl Settings<Checked> { | |||||||
|             pagination, |             pagination, | ||||||
|             embedders, |             embedders, | ||||||
|             search_cutoff_ms, |             search_cutoff_ms, | ||||||
|  |             localized_attributes: localized_attributes_rules, | ||||||
|             _kind: PhantomData, |             _kind: PhantomData, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -352,6 +359,7 @@ impl Settings<Unchecked> { | |||||||
|             pagination: self.pagination, |             pagination: self.pagination, | ||||||
|             embedders: self.embedders, |             embedders: self.embedders, | ||||||
|             search_cutoff_ms: self.search_cutoff_ms, |             search_cutoff_ms: self.search_cutoff_ms, | ||||||
|  |             localized_attributes: self.localized_attributes, | ||||||
|             _kind: PhantomData, |             _kind: PhantomData, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -402,6 +410,7 @@ pub fn apply_settings_to_builder( | |||||||
|         pagination, |         pagination, | ||||||
|         embedders, |         embedders, | ||||||
|         search_cutoff_ms, |         search_cutoff_ms, | ||||||
|  |         localized_attributes: localized_attributes_rules, | ||||||
|         _kind, |         _kind, | ||||||
|     } = settings; |     } = settings; | ||||||
|  |  | ||||||
| @@ -485,6 +494,13 @@ pub fn apply_settings_to_builder( | |||||||
|         Setting::NotSet => (), |         Setting::NotSet => (), | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     match localized_attributes_rules { | ||||||
|  |         Setting::Set(ref rules) => builder | ||||||
|  |             .set_localized_attributes_rules(rules.iter().cloned().map(|r| r.into()).collect()), | ||||||
|  |         Setting::Reset => builder.reset_localized_attributes_rules(), | ||||||
|  |         Setting::NotSet => (), | ||||||
|  |     } | ||||||
|  |  | ||||||
|     match typo_tolerance { |     match typo_tolerance { | ||||||
|         Setting::Set(ref value) => { |         Setting::Set(ref value) => { | ||||||
|             match value.enabled { |             match value.enabled { | ||||||
| @@ -679,6 +695,8 @@ pub fn settings( | |||||||
|  |  | ||||||
|     let search_cutoff_ms = index.search_cutoff(rtxn)?; |     let search_cutoff_ms = index.search_cutoff(rtxn)?; | ||||||
|  |  | ||||||
|  |     let localized_attributes_rules = index.localized_attributes_rules(rtxn)?; | ||||||
|  |  | ||||||
|     let mut settings = Settings { |     let mut settings = Settings { | ||||||
|         displayed_attributes: match displayed_attributes { |         displayed_attributes: match displayed_attributes { | ||||||
|             Some(attrs) => Setting::Set(attrs), |             Some(attrs) => Setting::Set(attrs), | ||||||
| @@ -711,6 +729,10 @@ pub fn settings( | |||||||
|             Some(cutoff) => Setting::Set(cutoff), |             Some(cutoff) => Setting::Set(cutoff), | ||||||
|             None => Setting::Reset, |             None => Setting::Reset, | ||||||
|         }, |         }, | ||||||
|  |         localized_attributes: match localized_attributes_rules { | ||||||
|  |             Some(rules) => Setting::Set(rules.into_iter().map(|r| r.into()).collect()), | ||||||
|  |             None => Setting::Reset, | ||||||
|  |         }, | ||||||
|         _kind: PhantomData, |         _kind: PhantomData, | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
| @@ -902,6 +924,7 @@ pub(crate) mod test { | |||||||
|             faceting: Setting::NotSet, |             faceting: Setting::NotSet, | ||||||
|             pagination: Setting::NotSet, |             pagination: Setting::NotSet, | ||||||
|             embedders: Setting::NotSet, |             embedders: Setting::NotSet, | ||||||
|  |             localized_attributes: Setting::NotSet, | ||||||
|             search_cutoff_ms: Setting::NotSet, |             search_cutoff_ms: Setting::NotSet, | ||||||
|             _kind: PhantomData::<Unchecked>, |             _kind: PhantomData::<Unchecked>, | ||||||
|         }; |         }; | ||||||
| @@ -930,6 +953,7 @@ pub(crate) mod test { | |||||||
|             faceting: Setting::NotSet, |             faceting: Setting::NotSet, | ||||||
|             pagination: Setting::NotSet, |             pagination: Setting::NotSet, | ||||||
|             embedders: Setting::NotSet, |             embedders: Setting::NotSet, | ||||||
|  |             localized_attributes: Setting::NotSet, | ||||||
|             search_cutoff_ms: Setting::NotSet, |             search_cutoff_ms: Setting::NotSet, | ||||||
|             _kind: PhantomData::<Unchecked>, |             _kind: PhantomData::<Unchecked>, | ||||||
|         }; |         }; | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| use std::collections::{BinaryHeap, HashMap, HashSet}; | use std::collections::{BTreeSet, BinaryHeap, HashMap, HashSet}; | ||||||
| use std::fs; | use std::fs; | ||||||
| use std::mem::take; | use std::mem::take; | ||||||
| use std::path::{Path, PathBuf}; | use std::path::{Path, PathBuf}; | ||||||
| @@ -10,6 +10,7 @@ use actix_web::HttpRequest; | |||||||
| use byte_unit::Byte; | use byte_unit::Byte; | ||||||
| use index_scheduler::IndexScheduler; | use index_scheduler::IndexScheduler; | ||||||
| use meilisearch_auth::{AuthController, AuthFilter}; | use meilisearch_auth::{AuthController, AuthFilter}; | ||||||
|  | use meilisearch_types::locales::Locale; | ||||||
| use meilisearch_types::InstanceUid; | use meilisearch_types::InstanceUid; | ||||||
| use once_cell::sync::Lazy; | use once_cell::sync::Lazy; | ||||||
| use regex::Regex; | use regex::Regex; | ||||||
| @@ -653,6 +654,9 @@ pub struct SearchAggregator { | |||||||
|     // every time a search is done, we increment the counter linked to the used settings |     // every time a search is done, we increment the counter linked to the used settings | ||||||
|     matching_strategy: HashMap<String, usize>, |     matching_strategy: HashMap<String, usize>, | ||||||
|  |  | ||||||
|  |     // List of the unique Locales passed as parameter | ||||||
|  |     locales: BTreeSet<Locale>, | ||||||
|  |  | ||||||
|     // pagination |     // pagination | ||||||
|     max_limit: usize, |     max_limit: usize, | ||||||
|     max_offset: usize, |     max_offset: usize, | ||||||
| @@ -707,6 +711,7 @@ impl SearchAggregator { | |||||||
|             attributes_to_search_on, |             attributes_to_search_on, | ||||||
|             hybrid, |             hybrid, | ||||||
|             ranking_score_threshold, |             ranking_score_threshold, | ||||||
|  |             locales, | ||||||
|         } = query; |         } = query; | ||||||
|  |  | ||||||
|         let mut ret = Self::default(); |         let mut ret = Self::default(); | ||||||
| @@ -774,6 +779,10 @@ impl SearchAggregator { | |||||||
|  |  | ||||||
|         ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); |         ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1); | ||||||
|  |  | ||||||
|  |         if let Some(locales) = locales { | ||||||
|  |             ret.locales = locales.iter().copied().collect(); | ||||||
|  |         } | ||||||
|  |  | ||||||
|         ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); |         ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG(); | ||||||
|         ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); |         ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG(); | ||||||
|         ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER(); |         ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER(); | ||||||
| @@ -859,6 +868,7 @@ impl SearchAggregator { | |||||||
|             total_degraded, |             total_degraded, | ||||||
|             total_used_negative_operator, |             total_used_negative_operator, | ||||||
|             ranking_score_threshold, |             ranking_score_threshold, | ||||||
|  |             ref mut locales, | ||||||
|         } = other; |         } = other; | ||||||
|  |  | ||||||
|         if self.timestamp.is_none() { |         if self.timestamp.is_none() { | ||||||
| @@ -947,6 +957,9 @@ impl SearchAggregator { | |||||||
|         self.show_ranking_score |= show_ranking_score; |         self.show_ranking_score |= show_ranking_score; | ||||||
|         self.show_ranking_score_details |= show_ranking_score_details; |         self.show_ranking_score_details |= show_ranking_score_details; | ||||||
|         self.ranking_score_threshold |= ranking_score_threshold; |         self.ranking_score_threshold |= ranking_score_threshold; | ||||||
|  |  | ||||||
|  |         // locales | ||||||
|  |         self.locales.append(locales); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> { |     pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> { | ||||||
| @@ -991,6 +1004,7 @@ impl SearchAggregator { | |||||||
|             total_degraded, |             total_degraded, | ||||||
|             total_used_negative_operator, |             total_used_negative_operator, | ||||||
|             ranking_score_threshold, |             ranking_score_threshold, | ||||||
|  |             locales, | ||||||
|         } = self; |         } = self; | ||||||
|  |  | ||||||
|         if total_received == 0 { |         if total_received == 0 { | ||||||
| @@ -1060,6 +1074,7 @@ impl SearchAggregator { | |||||||
|                 "matching_strategy": { |                 "matching_strategy": { | ||||||
|                     "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), |                     "most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), | ||||||
|                 }, |                 }, | ||||||
|  |                 "locales": locales, | ||||||
|                 "scoring": { |                 "scoring": { | ||||||
|                     "show_ranking_score": show_ranking_score, |                     "show_ranking_score": show_ranking_score, | ||||||
|                     "show_ranking_score_details": show_ranking_score_details, |                     "show_ranking_score_details": show_ranking_score_details, | ||||||
| @@ -1150,6 +1165,7 @@ impl MultiSearchAggregator { | |||||||
|                     attributes_to_search_on: _, |                     attributes_to_search_on: _, | ||||||
|                     hybrid: _, |                     hybrid: _, | ||||||
|                     ranking_score_threshold: _, |                     ranking_score_threshold: _, | ||||||
|  |                     locales: _, | ||||||
|                 } = query; |                 } = query; | ||||||
|  |  | ||||||
|                 index_uid.as_str() |                 index_uid.as_str() | ||||||
| @@ -1307,6 +1323,7 @@ impl FacetSearchAggregator { | |||||||
|             attributes_to_search_on, |             attributes_to_search_on, | ||||||
|             hybrid, |             hybrid, | ||||||
|             ranking_score_threshold, |             ranking_score_threshold, | ||||||
|  |             locales, | ||||||
|         } = query; |         } = query; | ||||||
|  |  | ||||||
|         let mut ret = Self::default(); |         let mut ret = Self::default(); | ||||||
| @@ -1322,7 +1339,8 @@ impl FacetSearchAggregator { | |||||||
|             || *matching_strategy != MatchingStrategy::default() |             || *matching_strategy != MatchingStrategy::default() | ||||||
|             || attributes_to_search_on.is_some() |             || attributes_to_search_on.is_some() | ||||||
|             || hybrid.is_some() |             || hybrid.is_some() | ||||||
|             || ranking_score_threshold.is_some(); |             || ranking_score_threshold.is_some() | ||||||
|  |             || locales.is_some(); | ||||||
|  |  | ||||||
|         ret |         ret | ||||||
|     } |     } | ||||||
|   | |||||||
| @@ -6,6 +6,7 @@ use meilisearch_types::deserr::DeserrJsonError; | |||||||
| use meilisearch_types::error::deserr_codes::*; | use meilisearch_types::error::deserr_codes::*; | ||||||
| use meilisearch_types::error::ResponseError; | use meilisearch_types::error::ResponseError; | ||||||
| use meilisearch_types::index_uid::IndexUid; | use meilisearch_types::index_uid::IndexUid; | ||||||
|  | use meilisearch_types::locales::Locale; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
| use tracing::debug; | use tracing::debug; | ||||||
|  |  | ||||||
| @@ -48,6 +49,8 @@ pub struct FacetSearchQuery { | |||||||
|     pub attributes_to_search_on: Option<Vec<String>>, |     pub attributes_to_search_on: Option<Vec<String>>, | ||||||
|     #[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)] |     #[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)] | ||||||
|     pub ranking_score_threshold: Option<RankingScoreThreshold>, |     pub ranking_score_threshold: Option<RankingScoreThreshold>, | ||||||
|  |     #[deserr(default, error = DeserrJsonError<InvalidSearchLocales>, default)] | ||||||
|  |     pub locales: Option<Vec<Locale>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| pub async fn search( | pub async fn search( | ||||||
| @@ -67,6 +70,7 @@ pub async fn search( | |||||||
|  |  | ||||||
|     let facet_query = query.facet_query.clone(); |     let facet_query = query.facet_query.clone(); | ||||||
|     let facet_name = query.facet_name.clone(); |     let facet_name = query.facet_name.clone(); | ||||||
|  |     let locales = query.locales.clone().map(|l| l.into_iter().map(Into::into).collect()); | ||||||
|     let mut search_query = SearchQuery::from(query); |     let mut search_query = SearchQuery::from(query); | ||||||
|  |  | ||||||
|     // Tenant token search_rules. |     // Tenant token search_rules. | ||||||
| @@ -86,6 +90,7 @@ pub async fn search( | |||||||
|             facet_name, |             facet_name, | ||||||
|             search_kind, |             search_kind, | ||||||
|             index_scheduler.features(), |             index_scheduler.features(), | ||||||
|  |             locales, | ||||||
|         ) |         ) | ||||||
|     }) |     }) | ||||||
|     .await?; |     .await?; | ||||||
| @@ -113,6 +118,7 @@ impl From<FacetSearchQuery> for SearchQuery { | |||||||
|             attributes_to_search_on, |             attributes_to_search_on, | ||||||
|             hybrid, |             hybrid, | ||||||
|             ranking_score_threshold, |             ranking_score_threshold, | ||||||
|  |             locales, | ||||||
|         } = value; |         } = value; | ||||||
|  |  | ||||||
|         SearchQuery { |         SearchQuery { | ||||||
| @@ -141,6 +147,7 @@ impl From<FacetSearchQuery> for SearchQuery { | |||||||
|             attributes_to_search_on, |             attributes_to_search_on, | ||||||
|             hybrid, |             hybrid, | ||||||
|             ranking_score_threshold, |             ranking_score_threshold, | ||||||
|  |             locales, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -7,6 +7,7 @@ use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; | |||||||
| use meilisearch_types::error::deserr_codes::*; | use meilisearch_types::error::deserr_codes::*; | ||||||
| use meilisearch_types::error::ResponseError; | use meilisearch_types::error::ResponseError; | ||||||
| use meilisearch_types::index_uid::IndexUid; | use meilisearch_types::index_uid::IndexUid; | ||||||
|  | use meilisearch_types::locales::Locale; | ||||||
| use meilisearch_types::milli; | use meilisearch_types::milli; | ||||||
| use meilisearch_types::serde_cs::vec::CS; | use meilisearch_types::serde_cs::vec::CS; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
| @@ -89,6 +90,8 @@ pub struct SearchQueryGet { | |||||||
|     pub hybrid_semantic_ratio: Option<SemanticRatioGet>, |     pub hybrid_semantic_ratio: Option<SemanticRatioGet>, | ||||||
|     #[deserr(default, error = DeserrQueryParamError<InvalidSearchRankingScoreThreshold>)] |     #[deserr(default, error = DeserrQueryParamError<InvalidSearchRankingScoreThreshold>)] | ||||||
|     pub ranking_score_threshold: Option<RankingScoreThresholdGet>, |     pub ranking_score_threshold: Option<RankingScoreThresholdGet>, | ||||||
|  |     #[deserr(default, error = DeserrQueryParamError<InvalidSearchLocales>)] | ||||||
|  |     pub locales: Option<CS<Locale>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)] | #[derive(Debug, Clone, Copy, PartialEq, deserr::Deserr)] | ||||||
| @@ -175,6 +178,7 @@ impl From<SearchQueryGet> for SearchQuery { | |||||||
|             attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()), |             attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()), | ||||||
|             hybrid, |             hybrid, | ||||||
|             ranking_score_threshold: other.ranking_score_threshold.map(|o| o.0), |             ranking_score_threshold: other.ranking_score_threshold.map(|o| o.0), | ||||||
|  |             locales: other.locales.map(|o| o.into_iter().collect()), | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -474,6 +474,28 @@ make_setting_route!( | |||||||
|     } |     } | ||||||
| ); | ); | ||||||
|  |  | ||||||
|  | make_setting_route!( | ||||||
|  |     "/localized-attributes", | ||||||
|  |     put, | ||||||
|  |     Vec<meilisearch_types::locales::LocalizedAttributesRuleView>, | ||||||
|  |     meilisearch_types::deserr::DeserrJsonError< | ||||||
|  |         meilisearch_types::error::deserr_codes::InvalidSettingsLocalizedAttributes, | ||||||
|  |     >, | ||||||
|  |     localized_attributes, | ||||||
|  |     "localizedAttributes", | ||||||
|  |     analytics, | ||||||
|  |     |rules: &Option<Vec<meilisearch_types::locales::LocalizedAttributesRuleView>>, req: &HttpRequest| { | ||||||
|  |         use serde_json::json; | ||||||
|  |         analytics.publish( | ||||||
|  |             "LocalizedAttributesRules Updated".to_string(), | ||||||
|  |             json!({ | ||||||
|  |                 "locales": rules.as_ref().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::<std::collections::BTreeSet<_>>()) | ||||||
|  |             }), | ||||||
|  |             Some(req), | ||||||
|  |         ); | ||||||
|  |     } | ||||||
|  | ); | ||||||
|  |  | ||||||
| make_setting_route!( | make_setting_route!( | ||||||
|     "/ranking-rules", |     "/ranking-rules", | ||||||
|     put, |     put, | ||||||
| @@ -786,6 +808,7 @@ pub async fn update_all( | |||||||
|             }, |             }, | ||||||
|             "embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set()), |             "embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set()), | ||||||
|             "search_cutoff_ms": new_settings.search_cutoff_ms.as_ref().set(), |             "search_cutoff_ms": new_settings.search_cutoff_ms.as_ref().set(), | ||||||
|  |             "locales": new_settings.localized_attributes.as_ref().set().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::<std::collections::BTreeSet<_>>()), | ||||||
|         }), |         }), | ||||||
|         Some(&req), |         Some(&req), | ||||||
|     ); |     ); | ||||||
|   | |||||||
| @@ -380,9 +380,6 @@ pub fn perform_federated_search( | |||||||
|  |  | ||||||
|         let criteria = index.criteria(&rtxn)?; |         let criteria = index.criteria(&rtxn)?; | ||||||
|  |  | ||||||
|         // stuff we need for the hitmaker |  | ||||||
|         let script_lang_map = index.script_language(&rtxn)?; |  | ||||||
|  |  | ||||||
|         let dictionary = index.dictionary(&rtxn)?; |         let dictionary = index.dictionary(&rtxn)?; | ||||||
|         let dictionary: Option<Vec<_>> = |         let dictionary: Option<Vec<_>> = | ||||||
|             dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); |             dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); | ||||||
| @@ -494,6 +491,7 @@ pub fn perform_federated_search( | |||||||
|                     sort: query.sort, |                     sort: query.sort, | ||||||
|                     show_ranking_score: query.show_ranking_score, |                     show_ranking_score: query.show_ranking_score, | ||||||
|                     show_ranking_score_details: query.show_ranking_score_details, |                     show_ranking_score_details: query.show_ranking_score_details, | ||||||
|  |                     locales: query.locales.map(|l| l.iter().copied().map(Into::into).collect()), | ||||||
|                 }; |                 }; | ||||||
|  |  | ||||||
|                 let milli::SearchResult { |                 let milli::SearchResult { | ||||||
| @@ -509,11 +507,7 @@ pub fn perform_federated_search( | |||||||
|                 degraded |= query_degraded; |                 degraded |= query_degraded; | ||||||
|                 used_negative_operator |= query_used_negative_operator; |                 used_negative_operator |= query_used_negative_operator; | ||||||
|  |  | ||||||
|                 let tokenizer = HitMaker::tokenizer( |                 let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref()); | ||||||
|                     &script_lang_map, |  | ||||||
|                     dictionary.as_deref(), |  | ||||||
|                     separators.as_deref(), |  | ||||||
|                 ); |  | ||||||
|  |  | ||||||
|                 let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); |                 let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| use core::fmt; | use core::fmt; | ||||||
| use std::cmp::min; | use std::cmp::min; | ||||||
| use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; | use std::collections::{BTreeMap, BTreeSet, HashSet}; | ||||||
| use std::str::FromStr; | use std::str::FromStr; | ||||||
| use std::sync::Arc; | use std::sync::Arc; | ||||||
| use std::time::{Duration, Instant}; | use std::time::{Duration, Instant}; | ||||||
| @@ -15,16 +15,17 @@ use meilisearch_types::error::deserr_codes::*; | |||||||
| use meilisearch_types::error::{Code, ResponseError}; | use meilisearch_types::error::{Code, ResponseError}; | ||||||
| use meilisearch_types::heed::RoTxn; | use meilisearch_types::heed::RoTxn; | ||||||
| use meilisearch_types::index_uid::IndexUid; | use meilisearch_types::index_uid::IndexUid; | ||||||
|  | use meilisearch_types::locales::Locale; | ||||||
| use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; | use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; | ||||||
| use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; | use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; | ||||||
| use meilisearch_types::milli::vector::Embedder; | use meilisearch_types::milli::vector::Embedder; | ||||||
| use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; | use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; | ||||||
| use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; | use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; | ||||||
| use meilisearch_types::{milli, Document}; | use meilisearch_types::{milli, Document}; | ||||||
| use milli::tokenizer::TokenizerBuilder; | use milli::tokenizer::{Language, TokenizerBuilder}; | ||||||
| use milli::{ | use milli::{ | ||||||
|     AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds, MatcherBuilder, |     AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, LocalizedAttributesRule, | ||||||
|     SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, |     MatchBounds, MatcherBuilder, SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, | ||||||
| }; | }; | ||||||
| use regex::Regex; | use regex::Regex; | ||||||
| use serde::Serialize; | use serde::Serialize; | ||||||
| @@ -100,6 +101,8 @@ pub struct SearchQuery { | |||||||
|     pub attributes_to_search_on: Option<Vec<String>>, |     pub attributes_to_search_on: Option<Vec<String>>, | ||||||
|     #[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)] |     #[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)] | ||||||
|     pub ranking_score_threshold: Option<RankingScoreThreshold>, |     pub ranking_score_threshold: Option<RankingScoreThreshold>, | ||||||
|  |     #[deserr(default, error = DeserrJsonError<InvalidSearchLocales>, default)] | ||||||
|  |     pub locales: Option<Vec<Locale>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy, PartialEq, Deserr)] | #[derive(Debug, Clone, Copy, PartialEq, Deserr)] | ||||||
| @@ -169,6 +172,7 @@ impl fmt::Debug for SearchQuery { | |||||||
|             matching_strategy, |             matching_strategy, | ||||||
|             attributes_to_search_on, |             attributes_to_search_on, | ||||||
|             ranking_score_threshold, |             ranking_score_threshold, | ||||||
|  |             locales, | ||||||
|         } = self; |         } = self; | ||||||
|  |  | ||||||
|         let mut debug = f.debug_struct("SearchQuery"); |         let mut debug = f.debug_struct("SearchQuery"); | ||||||
| @@ -250,6 +254,10 @@ impl fmt::Debug for SearchQuery { | |||||||
|             debug.field("ranking_score_threshold", &ranking_score_threshold); |             debug.field("ranking_score_threshold", &ranking_score_threshold); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |         if let Some(locales) = locales { | ||||||
|  |             debug.field("locales", &locales); | ||||||
|  |         } | ||||||
|  |  | ||||||
|         debug.finish() |         debug.finish() | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -425,6 +433,8 @@ pub struct SearchQueryWithIndex { | |||||||
|     pub attributes_to_search_on: Option<Vec<String>>, |     pub attributes_to_search_on: Option<Vec<String>>, | ||||||
|     #[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)] |     #[deserr(default, error = DeserrJsonError<InvalidSearchRankingScoreThreshold>, default)] | ||||||
|     pub ranking_score_threshold: Option<RankingScoreThreshold>, |     pub ranking_score_threshold: Option<RankingScoreThreshold>, | ||||||
|  |     #[deserr(default, error = DeserrJsonError<InvalidSearchLocales>, default)] | ||||||
|  |     pub locales: Option<Vec<Locale>>, | ||||||
|  |  | ||||||
|     #[deserr(default)] |     #[deserr(default)] | ||||||
|     pub federation_options: Option<FederationOptions>, |     pub federation_options: Option<FederationOptions>, | ||||||
| @@ -477,6 +487,7 @@ impl SearchQueryWithIndex { | |||||||
|             attributes_to_search_on, |             attributes_to_search_on, | ||||||
|             hybrid, |             hybrid, | ||||||
|             ranking_score_threshold, |             ranking_score_threshold, | ||||||
|  |             locales, | ||||||
|         } = self; |         } = self; | ||||||
|         ( |         ( | ||||||
|             index_uid, |             index_uid, | ||||||
| @@ -506,6 +517,7 @@ impl SearchQueryWithIndex { | |||||||
|                 attributes_to_search_on, |                 attributes_to_search_on, | ||||||
|                 hybrid, |                 hybrid, | ||||||
|                 ranking_score_threshold, |                 ranking_score_threshold, | ||||||
|  |                 locales, | ||||||
|                 // do not use ..Default::default() here, |                 // do not use ..Default::default() here, | ||||||
|                 // rather add any missing field from `SearchQuery` to `SearchQueryWithIndex` |                 // rather add any missing field from `SearchQuery` to `SearchQueryWithIndex` | ||||||
|             }, |             }, | ||||||
| @@ -866,6 +878,10 @@ fn prepare_search<'t>( | |||||||
|         search.sort_criteria(sort); |         search.sort_criteria(sort); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     if let Some(ref locales) = query.locales { | ||||||
|  |         search.locales(locales.iter().copied().map(Into::into).collect()); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     Ok((search, is_finite_pagination, max_total_hits, offset)) |     Ok((search, is_finite_pagination, max_total_hits, offset)) | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -917,6 +933,7 @@ pub fn perform_search( | |||||||
|         highlight_pre_tag, |         highlight_pre_tag, | ||||||
|         highlight_post_tag, |         highlight_post_tag, | ||||||
|         crop_marker, |         crop_marker, | ||||||
|  |         locales, | ||||||
|         // already used in prepare_search |         // already used in prepare_search | ||||||
|         vector: _, |         vector: _, | ||||||
|         hybrid: _, |         hybrid: _, | ||||||
| @@ -941,6 +958,7 @@ pub fn perform_search( | |||||||
|         sort, |         sort, | ||||||
|         show_ranking_score, |         show_ranking_score, | ||||||
|         show_ranking_score_details, |         show_ranking_score_details, | ||||||
|  |         locales: locales.map(|l| l.iter().copied().map(Into::into).collect()), | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     let documents = make_hits( |     let documents = make_hits( | ||||||
| @@ -1046,6 +1064,7 @@ struct AttributesFormat { | |||||||
|     sort: Option<Vec<String>>, |     sort: Option<Vec<String>>, | ||||||
|     show_ranking_score: bool, |     show_ranking_score: bool, | ||||||
|     show_ranking_score_details: bool, |     show_ranking_score_details: bool, | ||||||
|  |     locales: Option<Vec<Language>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Copy, PartialEq, Eq)] | #[derive(Debug, Clone, Copy, PartialEq, Eq)] | ||||||
| @@ -1093,19 +1112,16 @@ struct HitMaker<'a> { | |||||||
|     show_ranking_score_details: bool, |     show_ranking_score_details: bool, | ||||||
|     sort: Option<Vec<String>>, |     sort: Option<Vec<String>>, | ||||||
|     show_matches_position: bool, |     show_matches_position: bool, | ||||||
|  |     locales: Option<Vec<Language>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> HitMaker<'a> { | impl<'a> HitMaker<'a> { | ||||||
|     pub fn tokenizer<'b>( |     pub fn tokenizer<'b>( | ||||||
|         script_lang_map: &'b HashMap<milli::tokenizer::Script, Vec<milli::tokenizer::Language>>, |  | ||||||
|         dictionary: Option<&'b [&'b str]>, |         dictionary: Option<&'b [&'b str]>, | ||||||
|         separators: Option<&'b [&'b str]>, |         separators: Option<&'b [&'b str]>, | ||||||
|     ) -> milli::tokenizer::Tokenizer<'b> { |     ) -> milli::tokenizer::Tokenizer<'b> { | ||||||
|         let mut tokenizer_builder = TokenizerBuilder::default(); |         let mut tokenizer_builder = TokenizerBuilder::default(); | ||||||
|         tokenizer_builder.create_char_map(true); |         tokenizer_builder.create_char_map(true); | ||||||
|         if !script_lang_map.is_empty() { |  | ||||||
|             tokenizer_builder.allow_list(script_lang_map); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         if let Some(separators) = separators { |         if let Some(separators) = separators { | ||||||
|             tokenizer_builder.separators(separators); |             tokenizer_builder.separators(separators); | ||||||
| @@ -1218,6 +1234,7 @@ impl<'a> HitMaker<'a> { | |||||||
|             show_ranking_score_details: format.show_ranking_score_details, |             show_ranking_score_details: format.show_ranking_score_details, | ||||||
|             show_matches_position: format.show_matches_position, |             show_matches_position: format.show_matches_position, | ||||||
|             sort: format.sort, |             sort: format.sort, | ||||||
|  |             locales: format.locales, | ||||||
|         }) |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -1273,6 +1290,9 @@ impl<'a> HitMaker<'a> { | |||||||
|             document.insert("_vectors".into(), vectors.into()); |             document.insert("_vectors".into(), vectors.into()); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |         let localized_attributes = | ||||||
|  |             self.index.localized_attributes_rules(self.rtxn)?.unwrap_or_default(); | ||||||
|  |  | ||||||
|         let (matches_position, formatted) = format_fields( |         let (matches_position, formatted) = format_fields( | ||||||
|             &displayed_document, |             &displayed_document, | ||||||
|             &self.fields_ids_map, |             &self.fields_ids_map, | ||||||
| @@ -1280,6 +1300,8 @@ impl<'a> HitMaker<'a> { | |||||||
|             &self.formatted_options, |             &self.formatted_options, | ||||||
|             self.show_matches_position, |             self.show_matches_position, | ||||||
|             &self.displayed_ids, |             &self.displayed_ids, | ||||||
|  |             self.locales.as_deref(), | ||||||
|  |             &localized_attributes, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         if let Some(sort) = self.sort.as_ref() { |         if let Some(sort) = self.sort.as_ref() { | ||||||
| @@ -1312,8 +1334,6 @@ fn make_hits<'a>( | |||||||
| ) -> Result<Vec<SearchHit>, MeilisearchHttpError> { | ) -> Result<Vec<SearchHit>, MeilisearchHttpError> { | ||||||
|     let mut documents = Vec::new(); |     let mut documents = Vec::new(); | ||||||
|  |  | ||||||
|     let script_lang_map = index.script_language(rtxn)?; |  | ||||||
|  |  | ||||||
|     let dictionary = index.dictionary(rtxn)?; |     let dictionary = index.dictionary(rtxn)?; | ||||||
|     let dictionary: Option<Vec<_>> = |     let dictionary: Option<Vec<_>> = | ||||||
|         dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); |         dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); | ||||||
| @@ -1321,8 +1341,7 @@ fn make_hits<'a>( | |||||||
|     let separators: Option<Vec<_>> = |     let separators: Option<Vec<_>> = | ||||||
|         separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); |         separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); | ||||||
|  |  | ||||||
|     let tokenizer = |     let tokenizer = HitMaker::tokenizer(dictionary.as_deref(), separators.as_deref()); | ||||||
|         HitMaker::tokenizer(&script_lang_map, dictionary.as_deref(), separators.as_deref()); |  | ||||||
|  |  | ||||||
|     let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); |     let formatter_builder = HitMaker::formatter_builder(matching_words, tokenizer); | ||||||
|  |  | ||||||
| @@ -1341,6 +1360,7 @@ pub fn perform_facet_search( | |||||||
|     facet_name: String, |     facet_name: String, | ||||||
|     search_kind: SearchKind, |     search_kind: SearchKind, | ||||||
|     features: RoFeatures, |     features: RoFeatures, | ||||||
|  |     locales: Option<Vec<Language>>, | ||||||
| ) -> Result<FacetSearchResult, ResponseError> { | ) -> Result<FacetSearchResult, ResponseError> { | ||||||
|     let before_search = Instant::now(); |     let before_search = Instant::now(); | ||||||
|     let rtxn = index.read_txn()?; |     let rtxn = index.read_txn()?; | ||||||
| @@ -1349,6 +1369,14 @@ pub fn perform_facet_search( | |||||||
|         None => TimeBudget::default(), |         None => TimeBudget::default(), | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|  |     let localized_attributes = index.localized_attributes_rules(&rtxn)?.unwrap_or_default(); | ||||||
|  |     let locales = locales.or_else(|| { | ||||||
|  |         localized_attributes | ||||||
|  |             .into_iter() | ||||||
|  |             .find(|attr| attr.match_str(&facet_name)) | ||||||
|  |             .map(|attr| attr.locales) | ||||||
|  |     }); | ||||||
|  |  | ||||||
|     let (search, _, _, _) = |     let (search, _, _, _) = | ||||||
|         prepare_search(index, &rtxn, &search_query, &search_kind, time_budget, features)?; |         prepare_search(index, &rtxn, &search_query, &search_kind, time_budget, features)?; | ||||||
|     let mut facet_search = SearchForFacetValues::new( |     let mut facet_search = SearchForFacetValues::new( | ||||||
| @@ -1363,6 +1391,10 @@ pub fn perform_facet_search( | |||||||
|         facet_search.max_values(max_facets as usize); |         facet_search.max_values(max_facets as usize); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     if let Some(locales) = locales { | ||||||
|  |         facet_search.locales(locales); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     Ok(FacetSearchResult { |     Ok(FacetSearchResult { | ||||||
|         facet_hits: facet_search.execute()?, |         facet_hits: facet_search.execute()?, | ||||||
|         facet_query, |         facet_query, | ||||||
| @@ -1443,6 +1475,7 @@ pub fn perform_similar( | |||||||
|         sort: None, |         sort: None, | ||||||
|         show_ranking_score, |         show_ranking_score, | ||||||
|         show_ranking_score_details, |         show_ranking_score_details, | ||||||
|  |         locales: None, | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     let hits = make_hits( |     let hits = make_hits( | ||||||
| @@ -1624,6 +1657,7 @@ fn make_document( | |||||||
|     Ok(document) |     Ok(document) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[allow(clippy::too_many_arguments)] | ||||||
| fn format_fields( | fn format_fields( | ||||||
|     document: &Document, |     document: &Document, | ||||||
|     field_ids_map: &FieldsIdsMap, |     field_ids_map: &FieldsIdsMap, | ||||||
| @@ -1631,6 +1665,8 @@ fn format_fields( | |||||||
|     formatted_options: &BTreeMap<FieldId, FormatOptions>, |     formatted_options: &BTreeMap<FieldId, FormatOptions>, | ||||||
|     compute_matches: bool, |     compute_matches: bool, | ||||||
|     displayable_ids: &BTreeSet<FieldId>, |     displayable_ids: &BTreeSet<FieldId>, | ||||||
|  |     locales: Option<&[Language]>, | ||||||
|  |     localized_attributes: &[LocalizedAttributesRule], | ||||||
| ) -> Result<(Option<MatchesPosition>, Document), MeilisearchHttpError> { | ) -> Result<(Option<MatchesPosition>, Document), MeilisearchHttpError> { | ||||||
|     let mut matches_position = compute_matches.then(BTreeMap::new); |     let mut matches_position = compute_matches.then(BTreeMap::new); | ||||||
|     let mut document = document.clone(); |     let mut document = document.clone(); | ||||||
| @@ -1663,7 +1699,22 @@ fn format_fields( | |||||||
|             .reduce(|acc, option| acc.merge(option)); |             .reduce(|acc, option| acc.merge(option)); | ||||||
|         let mut infos = Vec::new(); |         let mut infos = Vec::new(); | ||||||
|  |  | ||||||
|         *value = format_value(std::mem::take(value), builder, format, &mut infos, compute_matches); |         // if no locales have been provided, we try to find the locales in the localized_attributes. | ||||||
|  |         let locales = locales.or_else(|| { | ||||||
|  |             localized_attributes | ||||||
|  |                 .iter() | ||||||
|  |                 .find(|rule| rule.match_str(key)) | ||||||
|  |                 .map(LocalizedAttributesRule::locales) | ||||||
|  |         }); | ||||||
|  |  | ||||||
|  |         *value = format_value( | ||||||
|  |             std::mem::take(value), | ||||||
|  |             builder, | ||||||
|  |             format, | ||||||
|  |             &mut infos, | ||||||
|  |             compute_matches, | ||||||
|  |             locales, | ||||||
|  |         ); | ||||||
|  |  | ||||||
|         if let Some(matches) = matches_position.as_mut() { |         if let Some(matches) = matches_position.as_mut() { | ||||||
|             if !infos.is_empty() { |             if !infos.is_empty() { | ||||||
| @@ -1688,10 +1739,11 @@ fn format_value( | |||||||
|     format_options: Option<FormatOptions>, |     format_options: Option<FormatOptions>, | ||||||
|     infos: &mut Vec<MatchBounds>, |     infos: &mut Vec<MatchBounds>, | ||||||
|     compute_matches: bool, |     compute_matches: bool, | ||||||
|  |     locales: Option<&[Language]>, | ||||||
| ) -> Value { | ) -> Value { | ||||||
|     match value { |     match value { | ||||||
|         Value::String(old_string) => { |         Value::String(old_string) => { | ||||||
|             let mut matcher = builder.build(&old_string); |             let mut matcher = builder.build(&old_string, locales); | ||||||
|             if compute_matches { |             if compute_matches { | ||||||
|                 let matches = matcher.matches(); |                 let matches = matcher.matches(); | ||||||
|                 infos.extend_from_slice(&matches[..]); |                 infos.extend_from_slice(&matches[..]); | ||||||
| @@ -1718,6 +1770,7 @@ fn format_value( | |||||||
|                         }), |                         }), | ||||||
|                         infos, |                         infos, | ||||||
|                         compute_matches, |                         compute_matches, | ||||||
|  |                         locales, | ||||||
|                     ) |                     ) | ||||||
|                 }) |                 }) | ||||||
|                 .collect(), |                 .collect(), | ||||||
| @@ -1737,6 +1790,7 @@ fn format_value( | |||||||
|                             }), |                             }), | ||||||
|                             infos, |                             infos, | ||||||
|                             compute_matches, |                             compute_matches, | ||||||
|  |                             locales, | ||||||
|                         ), |                         ), | ||||||
|                     ) |                     ) | ||||||
|                 }) |                 }) | ||||||
| @@ -1745,7 +1799,7 @@ fn format_value( | |||||||
|         Value::Number(number) => { |         Value::Number(number) => { | ||||||
|             let s = number.to_string(); |             let s = number.to_string(); | ||||||
|  |  | ||||||
|             let mut matcher = builder.build(&s); |             let mut matcher = builder.build(&s, locales); | ||||||
|             if compute_matches { |             if compute_matches { | ||||||
|                 let matches = matcher.matches(); |                 let matches = matcher.matches(); | ||||||
|                 infos.extend_from_slice(&matches[..]); |                 infos.extend_from_slice(&matches[..]); | ||||||
|   | |||||||
| @@ -78,7 +78,8 @@ async fn import_dump_v1_movie_raw() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -240,7 +241,8 @@ async fn import_dump_v1_movie_with_settings() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -388,7 +390,8 @@ async fn import_dump_v1_rubygems_with_settings() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -522,7 +525,8 @@ async fn import_dump_v2_movie_raw() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -668,7 +672,8 @@ async fn import_dump_v2_movie_with_settings() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -813,7 +818,8 @@ async fn import_dump_v2_rubygems_with_settings() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -947,7 +953,8 @@ async fn import_dump_v3_movie_raw() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -1093,7 +1100,8 @@ async fn import_dump_v3_movie_with_settings() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -1238,7 +1246,8 @@ async fn import_dump_v3_rubygems_with_settings() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -1372,7 +1381,8 @@ async fn import_dump_v4_movie_raw() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -1518,7 +1528,8 @@ async fn import_dump_v4_movie_with_settings() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -1663,7 +1674,8 @@ async fn import_dump_v4_rubygems_with_settings() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "### |     "### | ||||||
|     ); |     ); | ||||||
| @@ -1909,7 +1921,8 @@ async fn import_dump_v6_containing_experimental_features() { | |||||||
|       "pagination": { |       "pagination": { | ||||||
|         "maxTotalHits": 1000 |         "maxTotalHits": 1000 | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "###); |     "###); | ||||||
|  |  | ||||||
| @@ -2087,7 +2100,8 @@ async fn generate_and_import_dump_containing_vectors() { | |||||||
|           "documentTemplate": "{{doc.doggo}}" |           "documentTemplate": "{{doc.doggo}}" | ||||||
|         } |         } | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "###); |     "###); | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
1255 meilisearch/tests/search/locales.rs (Normal file)
File diff suppressed because it is too large — Load Diff
											
										
									
								
							| @@ -7,6 +7,7 @@ mod facet_search; | |||||||
| mod formatted; | mod formatted; | ||||||
| mod geo; | mod geo; | ||||||
| mod hybrid; | mod hybrid; | ||||||
|  | mod locales; | ||||||
| mod matching_strategy; | mod matching_strategy; | ||||||
| mod multi; | mod multi; | ||||||
| mod pagination; | mod pagination; | ||||||
|   | |||||||
| @@ -55,7 +55,7 @@ async fn get_settings() { | |||||||
|     let (response, code) = index.settings().await; |     let (response, code) = index.settings().await; | ||||||
|     assert_eq!(code, 200); |     assert_eq!(code, 200); | ||||||
|     let settings = response.as_object().unwrap(); |     let settings = response.as_object().unwrap(); | ||||||
|     assert_eq!(settings.keys().len(), 16); |     assert_eq!(settings.keys().len(), 17); | ||||||
|     assert_eq!(settings["displayedAttributes"], json!(["*"])); |     assert_eq!(settings["displayedAttributes"], json!(["*"])); | ||||||
|     assert_eq!(settings["searchableAttributes"], json!(["*"])); |     assert_eq!(settings["searchableAttributes"], json!(["*"])); | ||||||
|     assert_eq!(settings["filterableAttributes"], json!([])); |     assert_eq!(settings["filterableAttributes"], json!([])); | ||||||
| @@ -195,7 +195,8 @@ async fn secrets_are_hidden_in_settings() { | |||||||
|           "response": "{{embedding}}" |           "response": "{{embedding}}" | ||||||
|         } |         } | ||||||
|       }, |       }, | ||||||
|       "searchCutoffMs": null |       "searchCutoffMs": null, | ||||||
|  |       "localizedAttributes": null | ||||||
|     } |     } | ||||||
|     "###); |     "###); | ||||||
|  |  | ||||||
|   | |||||||
| @@ -17,7 +17,7 @@ bincode = "1.3.3" | |||||||
| bstr = "1.9.1" | bstr = "1.9.1" | ||||||
| bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] } | bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] } | ||||||
| byteorder = "1.5.0" | byteorder = "1.5.0" | ||||||
| charabia = { version = "0.8.12", default-features = false } | charabia = { git = "https://github.com/meilisearch/charabia.git", branch = "simplify-lang-detection", default-features = false } | ||||||
| concat-arrays = "0.1.2" | concat-arrays = "0.1.2" | ||||||
| crossbeam-channel = "0.5.13" | crossbeam-channel = "0.5.13" | ||||||
| deserr = "0.6.2" | deserr = "0.6.2" | ||||||
|   | |||||||
| @@ -68,6 +68,7 @@ fn main() -> Result<(), Box<dyn Error>> { | |||||||
|                 logger, |                 logger, | ||||||
|                 TimeBudget::max(), |                 TimeBudget::max(), | ||||||
|                 None, |                 None, | ||||||
|  |                 None, | ||||||
|             )?; |             )?; | ||||||
|             if let Some((logger, dir)) = detailed_logger { |             if let Some((logger, dir)) = detailed_logger { | ||||||
|                 logger.finish(&mut ctx, Path::new(dir))?; |                 logger.finish(&mut ctx, Path::new(dir))?; | ||||||
|   | |||||||
| @@ -7,7 +7,6 @@ mod fst_set_codec; | |||||||
| mod obkv_codec; | mod obkv_codec; | ||||||
| mod roaring_bitmap; | mod roaring_bitmap; | ||||||
| mod roaring_bitmap_length; | mod roaring_bitmap_length; | ||||||
| mod script_language_codec; |  | ||||||
| mod str_beu32_codec; | mod str_beu32_codec; | ||||||
| mod str_ref; | mod str_ref; | ||||||
| mod str_str_u8_codec; | mod str_str_u8_codec; | ||||||
| @@ -26,7 +25,6 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar | |||||||
| pub use self::roaring_bitmap_length::{ | pub use self::roaring_bitmap_length::{ | ||||||
|     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, |     BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, | ||||||
| }; | }; | ||||||
| pub use self::script_language_codec::ScriptLanguageCodec; |  | ||||||
| pub use self::str_beu32_codec::{StrBEU16Codec, StrBEU32Codec}; | pub use self::str_beu32_codec::{StrBEU16Codec, StrBEU32Codec}; | ||||||
| pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; | pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; | ||||||
|  |  | ||||||
|   | |||||||
| @@ -1,39 +0,0 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::ffi::CStr; |  | ||||||
| use std::str; |  | ||||||
|  |  | ||||||
| use charabia::{Language, Script}; |  | ||||||
| use heed::BoxedError; |  | ||||||
|  |  | ||||||
| pub struct ScriptLanguageCodec; |  | ||||||
|  |  | ||||||
| impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec { |  | ||||||
|     type DItem = (Script, Language); |  | ||||||
|  |  | ||||||
|     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> { |  | ||||||
|         let cstr = CStr::from_bytes_until_nul(bytes)?; |  | ||||||
|         let script = cstr.to_str()?; |  | ||||||
|         let script_name = Script::from_name(script); |  | ||||||
|         // skip '\0' byte between the two strings. |  | ||||||
|         let lan = str::from_utf8(&bytes[script.len() + 1..])?; |  | ||||||
|         let lan_name = Language::from_name(lan); |  | ||||||
|  |  | ||||||
|         Ok((script_name, lan_name)) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec { |  | ||||||
|     type EItem = (Script, Language); |  | ||||||
|  |  | ||||||
|     fn bytes_encode((script, lan): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> { |  | ||||||
|         let script_name = script.name().as_bytes(); |  | ||||||
|         let lan_name = lan.name().as_bytes(); |  | ||||||
|  |  | ||||||
|         let mut bytes = Vec::with_capacity(script_name.len() + lan_name.len() + 1); |  | ||||||
|         bytes.extend_from_slice(script_name); |  | ||||||
|         bytes.push(0); |  | ||||||
|         bytes.extend_from_slice(lan_name); |  | ||||||
|  |  | ||||||
|         Ok(Cow::Owned(bytes)) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| @@ -4,7 +4,6 @@ use std::convert::TryInto; | |||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::path::Path; | use std::path::Path; | ||||||
|  |  | ||||||
| use charabia::{Language, Script}; |  | ||||||
| use heed::types::*; | use heed::types::*; | ||||||
| use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; | use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| @@ -19,9 +18,7 @@ use crate::heed_codec::facet::{ | |||||||
|     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, |     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, | ||||||
|     FieldIdCodec, OrderedF64Codec, |     FieldIdCodec, OrderedF64Codec, | ||||||
| }; | }; | ||||||
| use crate::heed_codec::{ | use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec}; | ||||||
|     BEU16StrCodec, FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec, |  | ||||||
| }; |  | ||||||
| use crate::order_by_map::OrderByMap; | use crate::order_by_map::OrderByMap; | ||||||
| use crate::proximity::ProximityPrecision; | use crate::proximity::ProximityPrecision; | ||||||
| use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; | use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; | ||||||
| @@ -29,8 +26,8 @@ use crate::vector::{Embedding, EmbeddingConfig}; | |||||||
| use crate::{ | use crate::{ | ||||||
|     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, |     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, | ||||||
|     FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, |     FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, | ||||||
|     FieldidsWeightsMap, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, |     FieldidsWeightsMap, GeoPoint, LocalizedAttributesRule, ObkvCodec, Result, RoaringBitmapCodec, | ||||||
|     Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64, |     RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64, | ||||||
| }; | }; | ||||||
|  |  | ||||||
| pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; | pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; | ||||||
| @@ -73,6 +70,7 @@ pub mod main_key { | |||||||
|     pub const PROXIMITY_PRECISION: &str = "proximity-precision"; |     pub const PROXIMITY_PRECISION: &str = "proximity-precision"; | ||||||
|     pub const EMBEDDING_CONFIGS: &str = "embedding_configs"; |     pub const EMBEDDING_CONFIGS: &str = "embedding_configs"; | ||||||
|     pub const SEARCH_CUTOFF: &str = "search_cutoff"; |     pub const SEARCH_CUTOFF: &str = "search_cutoff"; | ||||||
|  |     pub const LOCALIZED_ATTRIBUTES_RULES: &str = "localized_attributes_rules"; | ||||||
| } | } | ||||||
|  |  | ||||||
| pub mod db_name { | pub mod db_name { | ||||||
| @@ -101,7 +99,6 @@ pub mod db_name { | |||||||
|     pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id"; |     pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id"; | ||||||
|     pub const VECTOR_ARROY: &str = "vector-arroy"; |     pub const VECTOR_ARROY: &str = "vector-arroy"; | ||||||
|     pub const DOCUMENTS: &str = "documents"; |     pub const DOCUMENTS: &str = "documents"; | ||||||
|     pub const SCRIPT_LANGUAGE_DOCIDS: &str = "script_language_docids"; |  | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Clone)] | #[derive(Clone)] | ||||||
| @@ -142,9 +139,6 @@ pub struct Index { | |||||||
|     /// Maps the word prefix and a field id with all the docids where the prefix appears inside the field |     /// Maps the word prefix and a field id with all the docids where the prefix appears inside the field | ||||||
|     pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, |     pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>, | ||||||
|  |  | ||||||
|     /// Maps the script and language with all the docids that corresponds to it. |  | ||||||
|     pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>, |  | ||||||
|  |  | ||||||
|     /// Maps the facet field id and the docids for which this field exists |     /// Maps the facet field id and the docids for which this field exists | ||||||
|     pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, |     pub facet_id_exists_docids: Database<FieldIdCodec, CboRoaringBitmapCodec>, | ||||||
|     /// Maps the facet field id and the docids for which this field is set as null |     /// Maps the facet field id and the docids for which this field is set as null | ||||||
| @@ -198,8 +192,6 @@ impl Index { | |||||||
|             env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?; |             env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?; | ||||||
|         let word_pair_proximity_docids = |         let word_pair_proximity_docids = | ||||||
|             env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; |             env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; | ||||||
|         let script_language_docids = |  | ||||||
|             env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?; |  | ||||||
|         let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?; |         let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?; | ||||||
|         let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?; |         let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?; | ||||||
|         let field_id_word_count_docids = |         let field_id_word_count_docids = | ||||||
| @@ -243,7 +235,6 @@ impl Index { | |||||||
|             word_prefix_docids, |             word_prefix_docids, | ||||||
|             exact_word_prefix_docids, |             exact_word_prefix_docids, | ||||||
|             word_pair_proximity_docids, |             word_pair_proximity_docids, | ||||||
|             script_language_docids, |  | ||||||
|             word_position_docids, |             word_position_docids, | ||||||
|             word_fid_docids, |             word_fid_docids, | ||||||
|             word_prefix_position_docids, |             word_prefix_position_docids, | ||||||
| @@ -1562,46 +1553,32 @@ impl Index { | |||||||
|         self.main.remap_key_type::<Str>().delete(txn, main_key::PROXIMITY_PRECISION) |         self.main.remap_key_type::<Str>().delete(txn, main_key::PROXIMITY_PRECISION) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /* script  language docids */ |     pub fn localized_attributes_rules( | ||||||
|     /// Retrieve all the documents ids that correspond with (Script, Language) key, `None` if it is any. |  | ||||||
|     pub fn script_language_documents_ids( |  | ||||||
|         &self, |         &self, | ||||||
|         rtxn: &RoTxn<'_>, |         rtxn: &RoTxn<'_>, | ||||||
|         key: &(Script, Language), |     ) -> heed::Result<Option<Vec<LocalizedAttributesRule>>> { | ||||||
|     ) -> heed::Result<Option<RoaringBitmap>> { |         self.main | ||||||
|         self.script_language_docids.get(rtxn, key) |             .remap_types::<Str, SerdeJson<Vec<LocalizedAttributesRule>>>() | ||||||
|  |             .get(rtxn, main_key::LOCALIZED_ATTRIBUTES_RULES) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn script_language( |     pub(crate) fn put_localized_attributes_rules( | ||||||
|         &self, |         &self, | ||||||
|         rtxn: &RoTxn<'_>, |         txn: &mut RwTxn<'_>, | ||||||
|     ) -> heed::Result<HashMap<Script, Vec<Language>>> { |         val: Vec<LocalizedAttributesRule>, | ||||||
|         let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new(); |     ) -> heed::Result<()> { | ||||||
|         let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new(); |         self.main.remap_types::<Str, SerdeJson<Vec<LocalizedAttributesRule>>>().put( | ||||||
|         let mut total = 0; |             txn, | ||||||
|         for sl in self.script_language_docids.iter(rtxn)? { |             main_key::LOCALIZED_ATTRIBUTES_RULES, | ||||||
|             let ((script, language), docids) = sl?; |             &val, | ||||||
|  |         ) | ||||||
|  |     } | ||||||
|  |  | ||||||
|             // keep only Languages that contains at least 1 document. |     pub(crate) fn delete_localized_attributes_rules( | ||||||
|             let remaining_documents_count = docids.len(); |         &self, | ||||||
|             total += remaining_documents_count; |         txn: &mut RwTxn<'_>, | ||||||
|             if remaining_documents_count > 0 { |     ) -> heed::Result<bool> { | ||||||
|                 script_language_doc_count.push((script, language, remaining_documents_count)); |         self.main.remap_key_type::<Str>().delete(txn, main_key::LOCALIZED_ATTRIBUTES_RULES) | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let threshold = total / 20; // 5% (arbitrary) |  | ||||||
|         for (script, language, count) in script_language_doc_count { |  | ||||||
|             if count > threshold { |  | ||||||
|                 if let Some(languages) = script_language.get_mut(&script) { |  | ||||||
|                     (*languages).push(language); |  | ||||||
|                 } else { |  | ||||||
|                     script_language.insert(script, vec![language]); |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         Ok(script_language) |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     /// Put the embedding configs: |     /// Put the embedding configs: | ||||||
|   | |||||||
| @@ -16,6 +16,7 @@ pub mod facet; | |||||||
| mod fields_ids_map; | mod fields_ids_map; | ||||||
| pub mod heed_codec; | pub mod heed_codec; | ||||||
| pub mod index; | pub mod index; | ||||||
|  | mod localized_attributes_rules; | ||||||
| pub mod order_by_map; | pub mod order_by_map; | ||||||
| pub mod prompt; | pub mod prompt; | ||||||
| pub mod proximity; | pub mod proximity; | ||||||
| @@ -62,6 +63,8 @@ pub use self::heed_codec::{ | |||||||
|     UncheckedU8StrStrCodec, |     UncheckedU8StrStrCodec, | ||||||
| }; | }; | ||||||
| pub use self::index::Index; | pub use self::index::Index; | ||||||
|  | pub use self::localized_attributes_rules::LocalizedAttributesRule; | ||||||
|  | use self::localized_attributes_rules::LocalizedFieldIds; | ||||||
| pub use self::search::facet::{FacetValueHit, SearchForFacetValues}; | pub use self::search::facet::{FacetValueHit, SearchForFacetValues}; | ||||||
| pub use self::search::similar::Similar; | pub use self::search::similar::Similar; | ||||||
| pub use self::search::{ | pub use self::search::{ | ||||||
|   | |||||||
							
								
								
									
										114
									
								
								milli/src/localized_attributes_rules.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										114
									
								
								milli/src/localized_attributes_rules.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,114 @@ | |||||||
|  | use std::collections::HashMap; | ||||||
|  |  | ||||||
|  | use charabia::Language; | ||||||
|  | use serde::{Deserialize, Serialize}; | ||||||
|  |  | ||||||
|  | use crate::fields_ids_map::FieldsIdsMap; | ||||||
|  | use crate::FieldId; | ||||||
|  |  | ||||||
|  | /// A rule that defines which locales are supported for a given attribute. | ||||||
|  | /// | ||||||
|  | /// The rule is a list of attribute patterns and a list of locales. | ||||||
|  | /// The attribute patterns are matched against the attribute name. | ||||||
|  | /// The pattern `*` matches any attribute name. | ||||||
|  | /// The pattern `attribute_name*` matches any attribute name that starts with `attribute_name`. | ||||||
|  | /// The pattern `*attribute_name` matches any attribute name that ends with `attribute_name`. | ||||||
|  | /// The pattern `*attribute_name*` matches any attribute name that contains `attribute_name`. | ||||||
|  | #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] | ||||||
|  | pub struct LocalizedAttributesRule { | ||||||
|  |     pub attribute_patterns: Vec<String>, | ||||||
|  |     pub locales: Vec<Language>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl LocalizedAttributesRule { | ||||||
|  |     pub fn new(attribute_patterns: Vec<String>, locales: Vec<Language>) -> Self { | ||||||
|  |         Self { attribute_patterns, locales } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn match_str(&self, str: &str) -> bool { | ||||||
|  |         self.attribute_patterns.iter().any(|pattern| match_pattern(pattern.as_str(), str)) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn locales(&self) -> &[Language] { | ||||||
|  |         &self.locales | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn match_pattern(pattern: &str, str: &str) -> bool { | ||||||
|  |     if pattern == "*" { | ||||||
|  |         true | ||||||
|  |     } else if pattern.starts_with('*') && pattern.ends_with('*') { | ||||||
|  |         str.contains(&pattern[1..pattern.len() - 1]) | ||||||
|  |     } else if let Some(pattern) = pattern.strip_prefix('*') { | ||||||
|  |         str.ends_with(pattern) | ||||||
|  |     } else if let Some(pattern) = pattern.strip_suffix('*') { | ||||||
|  |         str.starts_with(pattern) | ||||||
|  |     } else { | ||||||
|  |         pattern == str | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, Clone, PartialEq, Eq)] | ||||||
|  | pub struct LocalizedFieldIds { | ||||||
|  |     field_id_to_locales: HashMap<FieldId, Vec<Language>>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl LocalizedFieldIds { | ||||||
|  |     pub fn new<I: Iterator<Item = FieldId>>( | ||||||
|  |         rules: &Option<Vec<LocalizedAttributesRule>>, | ||||||
|  |         fields_ids_map: &FieldsIdsMap, | ||||||
|  |         fields_ids: I, | ||||||
|  |     ) -> Self { | ||||||
|  |         let mut field_id_to_locales = HashMap::new(); | ||||||
|  |  | ||||||
|  |         if let Some(rules) = rules { | ||||||
|  |             let fields = fields_ids.filter_map(|field_id| { | ||||||
|  |                 fields_ids_map.name(field_id).map(|field_name| (field_id, field_name)) | ||||||
|  |             }); | ||||||
|  |  | ||||||
|  |             for (field_id, field_name) in fields { | ||||||
|  |                 let mut locales = Vec::new(); | ||||||
|  |                 for rule in rules { | ||||||
|  |                     if rule.match_str(field_name) { | ||||||
|  |                         locales.extend(rule.locales.iter()); | ||||||
|  |                         // Take the first rule that matches | ||||||
|  |                         break; | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 if !locales.is_empty() { | ||||||
|  |                     locales.sort(); | ||||||
|  |                     locales.dedup(); | ||||||
|  |                     field_id_to_locales.insert(field_id, locales); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Self { field_id_to_locales } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn locales(&self, fields_id: FieldId) -> Option<&[Language]> { | ||||||
|  |         self.field_id_to_locales.get(&fields_id).map(Vec::as_slice) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[cfg(test)] | ||||||
|  | mod tests { | ||||||
|  |     use super::*; | ||||||
|  |  | ||||||
|  |     #[test] | ||||||
|  |     fn test_match_pattern() { | ||||||
|  |         assert!(match_pattern("*", "test")); | ||||||
|  |         assert!(match_pattern("test*", "test")); | ||||||
|  |         assert!(match_pattern("test*", "testa")); | ||||||
|  |         assert!(match_pattern("*test", "test")); | ||||||
|  |         assert!(match_pattern("*test", "atest")); | ||||||
|  |         assert!(match_pattern("*test*", "test")); | ||||||
|  |         assert!(match_pattern("*test*", "atesta")); | ||||||
|  |         assert!(match_pattern("*test*", "atest")); | ||||||
|  |         assert!(match_pattern("*test*", "testa")); | ||||||
|  |         assert!(!match_pattern("test*test", "test")); | ||||||
|  |         assert!(!match_pattern("*test", "testa")); | ||||||
|  |         assert!(!match_pattern("test*", "atest")); | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -3,7 +3,7 @@ use std::collections::BinaryHeap; | |||||||
| use std::ops::ControlFlow; | use std::ops::ControlFlow; | ||||||
|  |  | ||||||
| use charabia::normalizer::NormalizerOption; | use charabia::normalizer::NormalizerOption; | ||||||
| use charabia::Normalize; | use charabia::{Language, Normalize, StrDetection, Token}; | ||||||
| use fst::automaton::{Automaton, Str}; | use fst::automaton::{Automaton, Str}; | ||||||
| use fst::{IntoStreamer, Streamer}; | use fst::{IntoStreamer, Streamer}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| @@ -23,6 +23,7 @@ pub struct SearchForFacetValues<'a> { | |||||||
|     search_query: Search<'a>, |     search_query: Search<'a>, | ||||||
|     max_values: usize, |     max_values: usize, | ||||||
|     is_hybrid: bool, |     is_hybrid: bool, | ||||||
|  |     locales: Option<Vec<Language>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> SearchForFacetValues<'a> { | impl<'a> SearchForFacetValues<'a> { | ||||||
| @@ -37,6 +38,7 @@ impl<'a> SearchForFacetValues<'a> { | |||||||
|             search_query, |             search_query, | ||||||
|             max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, |             max_values: DEFAULT_MAX_NUMBER_OF_VALUES_PER_FACET, | ||||||
|             is_hybrid, |             is_hybrid, | ||||||
|  |             locales: None, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -50,6 +52,11 @@ impl<'a> SearchForFacetValues<'a> { | |||||||
|         self |         self | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     pub fn locales(&mut self, locales: Vec<Language>) -> &mut Self { | ||||||
|  |         self.locales = Some(locales); | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  |  | ||||||
|     fn one_original_value_of( |     fn one_original_value_of( | ||||||
|         &self, |         &self, | ||||||
|         field_id: FieldId, |         field_id: FieldId, | ||||||
| @@ -109,8 +116,7 @@ impl<'a> SearchForFacetValues<'a> { | |||||||
|  |  | ||||||
|         match self.query.as_ref() { |         match self.query.as_ref() { | ||||||
|             Some(query) => { |             Some(query) => { | ||||||
|                 let options = NormalizerOption { lossy: true, ..Default::default() }; |                 let query = normalize_facet_string(query, self.locales.as_deref()); | ||||||
|                 let query = query.normalize(&options); |  | ||||||
|                 let query = query.as_ref(); |                 let query = query.as_ref(); | ||||||
|  |  | ||||||
|                 let authorize_typos = self.search_query.index.authorize_typos(rtxn)?; |                 let authorize_typos = self.search_query.index.authorize_typos(rtxn)?; | ||||||
| @@ -330,3 +336,15 @@ impl ValuesCollection { | |||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String { | ||||||
|  |     let options = NormalizerOption { lossy: true, ..Default::default() }; | ||||||
|  |     let mut detection = StrDetection::new(facet_string, locales); | ||||||
|  |     let token = Token { | ||||||
|  |         lemma: std::borrow::Cow::Borrowed(facet_string), | ||||||
|  |         script: detection.script(), | ||||||
|  |         language: detection.language(), | ||||||
|  |         ..Default::default() | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     token.normalize(&options).lemma.into_owned() | ||||||
|  | } | ||||||
|   | |||||||
| @@ -174,6 +174,7 @@ impl<'a> Search<'a> { | |||||||
|             semantic: self.semantic.clone(), |             semantic: self.semantic.clone(), | ||||||
|             time_budget: self.time_budget.clone(), |             time_budget: self.time_budget.clone(), | ||||||
|             ranking_score_threshold: self.ranking_score_threshold, |             ranking_score_threshold: self.ranking_score_threshold, | ||||||
|  |             locales: self.locales.clone(), | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let semantic = search.semantic.take(); |         let semantic = search.semantic.take(); | ||||||
|   | |||||||
| @@ -1,6 +1,7 @@ | |||||||
| use std::fmt; | use std::fmt; | ||||||
| use std::sync::Arc; | use std::sync::Arc; | ||||||
|  |  | ||||||
|  | use charabia::Language; | ||||||
| use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; | use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; | ||||||
| use once_cell::sync::Lazy; | use once_cell::sync::Lazy; | ||||||
| use roaring::bitmap::RoaringBitmap; | use roaring::bitmap::RoaringBitmap; | ||||||
| @@ -52,6 +53,7 @@ pub struct Search<'a> { | |||||||
|     semantic: Option<SemanticSearch>, |     semantic: Option<SemanticSearch>, | ||||||
|     time_budget: TimeBudget, |     time_budget: TimeBudget, | ||||||
|     ranking_score_threshold: Option<f64>, |     ranking_score_threshold: Option<f64>, | ||||||
|  |     locales: Option<Vec<Language>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a> Search<'a> { | impl<'a> Search<'a> { | ||||||
| @@ -72,6 +74,7 @@ impl<'a> Search<'a> { | |||||||
|             rtxn, |             rtxn, | ||||||
|             index, |             index, | ||||||
|             semantic: None, |             semantic: None, | ||||||
|  |             locales: None, | ||||||
|             time_budget: TimeBudget::max(), |             time_budget: TimeBudget::max(), | ||||||
|             ranking_score_threshold: None, |             ranking_score_threshold: None, | ||||||
|         } |         } | ||||||
| @@ -160,6 +163,11 @@ impl<'a> Search<'a> { | |||||||
|         self |         self | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     pub fn locales(&mut self, locales: Vec<Language>) -> &mut Search<'a> { | ||||||
|  |         self.locales = Some(locales); | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  |  | ||||||
|     pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> { |     pub fn execute_for_candidates(&self, has_vector_search: bool) -> Result<RoaringBitmap> { | ||||||
|         if has_vector_search { |         if has_vector_search { | ||||||
|             let ctx = SearchContext::new(self.index, self.rtxn)?; |             let ctx = SearchContext::new(self.index, self.rtxn)?; | ||||||
| @@ -232,6 +240,7 @@ impl<'a> Search<'a> { | |||||||
|                 &mut DefaultSearchLogger, |                 &mut DefaultSearchLogger, | ||||||
|                 self.time_budget.clone(), |                 self.time_budget.clone(), | ||||||
|                 self.ranking_score_threshold, |                 self.ranking_score_threshold, | ||||||
|  |                 self.locales.as_ref(), | ||||||
|             )?, |             )?, | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
| @@ -272,6 +281,7 @@ impl fmt::Debug for Search<'_> { | |||||||
|             semantic, |             semantic, | ||||||
|             time_budget, |             time_budget, | ||||||
|             ranking_score_threshold, |             ranking_score_threshold, | ||||||
|  |             locales, | ||||||
|         } = self; |         } = self; | ||||||
|         f.debug_struct("Search") |         f.debug_struct("Search") | ||||||
|             .field("query", query) |             .field("query", query) | ||||||
| @@ -292,6 +302,7 @@ impl fmt::Debug for Search<'_> { | |||||||
|             ) |             ) | ||||||
|             .field("time_budget", time_budget) |             .field("time_budget", time_budget) | ||||||
|             .field("ranking_score_threshold", ranking_score_threshold) |             .field("ranking_score_threshold", ranking_score_threshold) | ||||||
|  |             .field("locales", locales) | ||||||
|             .finish() |             .finish() | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
|  |  | ||||||
| use charabia::{SeparatorKind, Token, Tokenizer}; | use charabia::{Language, SeparatorKind, Token, Tokenizer}; | ||||||
| pub use matching_words::MatchingWords; | pub use matching_words::MatchingWords; | ||||||
| use matching_words::{MatchType, PartialMatch, WordId}; | use matching_words::{MatchType, PartialMatch, WordId}; | ||||||
| use serde::Serialize; | use serde::Serialize; | ||||||
| @@ -46,7 +46,11 @@ impl<'m> MatcherBuilder<'m> { | |||||||
|         self |         self | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn build<'t>(&self, text: &'t str) -> Matcher<'t, 'm, '_> { |     pub fn build<'t, 'lang>( | ||||||
|  |         &self, | ||||||
|  |         text: &'t str, | ||||||
|  |         locales: Option<&'lang [Language]>, | ||||||
|  |     ) -> Matcher<'t, 'm, '_, 'lang> { | ||||||
|         let crop_marker = match &self.crop_marker { |         let crop_marker = match &self.crop_marker { | ||||||
|             Some(marker) => marker.as_str(), |             Some(marker) => marker.as_str(), | ||||||
|             None => DEFAULT_CROP_MARKER, |             None => DEFAULT_CROP_MARKER, | ||||||
| @@ -68,6 +72,7 @@ impl<'m> MatcherBuilder<'m> { | |||||||
|             highlight_prefix, |             highlight_prefix, | ||||||
|             highlight_suffix, |             highlight_suffix, | ||||||
|             matches: None, |             matches: None, | ||||||
|  |             locales, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -107,17 +112,18 @@ pub struct MatchBounds { | |||||||
|  |  | ||||||
| /// Structure used to analyze a string, compute words that match, | /// Structure used to analyze a string, compute words that match, | ||||||
| /// and format the source string, returning a highlighted and cropped sub-string. | /// and format the source string, returning a highlighted and cropped sub-string. | ||||||
| pub struct Matcher<'t, 'tokenizer, 'b> { | pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { | ||||||
|     text: &'t str, |     text: &'t str, | ||||||
|     matching_words: &'b MatchingWords, |     matching_words: &'b MatchingWords, | ||||||
|     tokenizer: &'b Tokenizer<'tokenizer>, |     tokenizer: &'b Tokenizer<'tokenizer>, | ||||||
|  |     locales: Option<&'lang [Language]>, | ||||||
|     crop_marker: &'b str, |     crop_marker: &'b str, | ||||||
|     highlight_prefix: &'b str, |     highlight_prefix: &'b str, | ||||||
|     highlight_suffix: &'b str, |     highlight_suffix: &'b str, | ||||||
|     matches: Option<(Vec<Token<'t>>, Vec<Match>)>, |     matches: Option<(Vec<Token<'t>>, Vec<Match>)>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_> { | impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { | ||||||
|     /// Iterates over tokens and save any of them that matches the query. |     /// Iterates over tokens and save any of them that matches the query. | ||||||
|     fn compute_matches(&mut self) -> &mut Self { |     fn compute_matches(&mut self) -> &mut Self { | ||||||
|         /// some words are counted as matches only if they are close together and in the good order, |         /// some words are counted as matches only if they are close together and in the good order, | ||||||
| @@ -173,7 +179,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_> { | |||||||
|             false |             false | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect(); |         let tokens: Vec<_> = | ||||||
|  |             self.tokenizer.tokenize_with_allow_list(self.text, self.locales).collect(); | ||||||
|         let mut matches = Vec::new(); |         let mut matches = Vec::new(); | ||||||
|  |  | ||||||
|         let mut words_positions = tokens |         let mut words_positions = tokens | ||||||
| @@ -530,6 +537,7 @@ mod tests { | |||||||
|                 &mut crate::DefaultSearchLogger, |                 &mut crate::DefaultSearchLogger, | ||||||
|                 TimeBudget::max(), |                 TimeBudget::max(), | ||||||
|                 None, |                 None, | ||||||
|  |                 None, | ||||||
|             ) |             ) | ||||||
|             .unwrap(); |             .unwrap(); | ||||||
|  |  | ||||||
| @@ -553,19 +561,19 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text without any match. |         // Text without any match. | ||||||
|         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; |         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no crop and no highlight should return complete text. |         // no crop and no highlight should return complete text. | ||||||
|         assert_eq!(&matcher.format(format_options), &text); |         assert_eq!(&matcher.format(format_options), &text); | ||||||
|  |  | ||||||
|         // Text containing all matches. |         // Text containing all matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; |         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no crop and no highlight should return complete text. |         // no crop and no highlight should return complete text. | ||||||
|         assert_eq!(&matcher.format(format_options), &text); |         assert_eq!(&matcher.format(format_options), &text); | ||||||
|  |  | ||||||
|         // Text containing some matches. |         // Text containing some matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves."; |         let text = "Natalie risk her future to build a world with the boy she loves."; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no crop and no highlight should return complete text. |         // no crop and no highlight should return complete text. | ||||||
|         assert_eq!(&matcher.format(format_options), &text); |         assert_eq!(&matcher.format(format_options), &text); | ||||||
|     } |     } | ||||||
| @@ -580,23 +588,23 @@ mod tests { | |||||||
|  |  | ||||||
|         // empty text. |         // empty text. | ||||||
|         let text = ""; |         let text = ""; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         assert_eq!(&matcher.format(format_options), ""); |         assert_eq!(&matcher.format(format_options), ""); | ||||||
|  |  | ||||||
|         // text containing only separators. |         // text containing only separators. | ||||||
|         let text = ":-)"; |         let text = ":-)"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         assert_eq!(&matcher.format(format_options), ":-)"); |         assert_eq!(&matcher.format(format_options), ":-)"); | ||||||
|  |  | ||||||
|         // Text without any match. |         // Text without any match. | ||||||
|         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; |         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no crop should return complete text, because there is no matches. |         // no crop should return complete text, because there is no matches. | ||||||
|         assert_eq!(&matcher.format(format_options), &text); |         assert_eq!(&matcher.format(format_options), &text); | ||||||
|  |  | ||||||
|         // Text containing all matches. |         // Text containing all matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; |         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no crop should return complete text with highlighted matches. |         // no crop should return complete text with highlighted matches. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -605,7 +613,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing some matches. |         // Text containing some matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves."; |         let text = "Natalie risk her future to build a world with the boy she loves."; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no crop should return complete text with highlighted matches. |         // no crop should return complete text with highlighted matches. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -622,7 +630,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing prefix match. |         // Text containing prefix match. | ||||||
|         let text = "Ŵôřlḑôle"; |         let text = "Ŵôřlḑôle"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no crop should return complete text with highlighted matches. |         // no crop should return complete text with highlighted matches. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -631,7 +639,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing unicode match. |         // Text containing unicode match. | ||||||
|         let text = "Ŵôřlḑ"; |         let text = "Ŵôřlḑ"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no crop should return complete text with highlighted matches. |         // no crop should return complete text with highlighted matches. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -643,7 +651,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing unicode match. |         // Text containing unicode match. | ||||||
|         let text = "Westfália"; |         let text = "Westfália"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no crop should return complete text with highlighted matches. |         // no crop should return complete text with highlighted matches. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -661,7 +669,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // empty text. |         // empty text. | ||||||
|         let text = ""; |         let text = ""; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
|             @"" |             @"" | ||||||
| @@ -669,7 +677,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // text containing only separators. |         // text containing only separators. | ||||||
|         let text = ":-)"; |         let text = ":-)"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
|             @":-)" |             @":-)" | ||||||
| @@ -677,7 +685,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text without any match. |         // Text without any match. | ||||||
|         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; |         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no highlight should return 10 first words with a marker at the end. |         // no highlight should return 10 first words with a marker at the end. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -686,7 +694,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text without any match starting by a separator. |         // Text without any match starting by a separator. | ||||||
|         let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; |         let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no highlight should return 10 first words with a marker at the end. |         // no highlight should return 10 first words with a marker at the end. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -695,7 +703,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Test phrase propagation |         // Test phrase propagation | ||||||
|         let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; |         let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // should crop the phrase instead of croping around the match. |         // should crop the phrase instead of croping around the match. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -704,7 +712,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing some matches. |         // Text containing some matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves."; |         let text = "Natalie risk her future to build a world with the boy she loves."; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no highlight should return 10 last words with a marker at the start. |         // no highlight should return 10 last words with a marker at the start. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -713,7 +721,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing all matches. |         // Text containing all matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; |         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // no highlight should return 10 last words with a marker at the start. |         // no highlight should return 10 last words with a marker at the start. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -722,7 +730,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing a match unordered and a match ordered. |         // Text containing a match unordered and a match ordered. | ||||||
|         let text = "The world split void void void void void void void void void split the world void void"; |         let text = "The world split void void void void void void void void void split the world void void"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // crop should return 10 last words with a marker at the start. |         // crop should return 10 last words with a marker at the start. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -731,7 +739,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing matches with different density. |         // Text containing matches with different density. | ||||||
|         let text = "split void the void void world void void void void void void void void void void split the world void void"; |         let text = "split void the void void world void void void void void void void void void void split the world void void"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // crop should return 10 last words with a marker at the start. |         // crop should return 10 last words with a marker at the start. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -740,7 +748,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing matches with same word. |         // Text containing matches with same word. | ||||||
|         let text = "split split split split split split void void void void void void void void void void split the world void void"; |         let text = "split split split split split split void void void void void void void void void void split the world void void"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // crop should return 10 last words with a marker at the start. |         // crop should return 10 last words with a marker at the start. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -758,7 +766,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // empty text. |         // empty text. | ||||||
|         let text = ""; |         let text = ""; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
|             @"" |             @"" | ||||||
| @@ -766,7 +774,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // text containing only separators. |         // text containing only separators. | ||||||
|         let text = ":-)"; |         let text = ":-)"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
|             @":-)" |             @":-)" | ||||||
| @@ -774,7 +782,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text without any match. |         // Text without any match. | ||||||
|         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; |         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // both should return 10 first words with a marker at the end. |         // both should return 10 first words with a marker at the end. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -783,7 +791,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing some matches. |         // Text containing some matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves."; |         let text = "Natalie risk her future to build a world with the boy she loves."; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // both should return 10 last words with a marker at the start and highlighted matches. |         // both should return 10 last words with a marker at the start and highlighted matches. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -792,7 +800,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing all matches. |         // Text containing all matches. | ||||||
|         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; |         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // both should return 10 last words with a marker at the start and highlighted matches. |         // both should return 10 last words with a marker at the start and highlighted matches. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -801,7 +809,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // Text containing a match unordered and a match ordered. |         // Text containing a match unordered and a match ordered. | ||||||
|         let text = "The world split void void void void void void void void void split the world void void"; |         let text = "The world split void void void void void void void void void split the world void void"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // crop should return 10 last words with a marker at the start. |         // crop should return 10 last words with a marker at the start. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -824,7 +832,7 @@ mod tests { | |||||||
|         let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; |         let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; | ||||||
|  |  | ||||||
|         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); |         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // should return 10 words with a marker at the start as well the end, and the highlighted matches. |         // should return 10 words with a marker at the start as well the end, and the highlighted matches. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -832,7 +840,7 @@ mod tests { | |||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); |         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // should highlight "those" and the phrase "and those". |         // should highlight "those" and the phrase "and those". | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -851,7 +859,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // set a smaller crop size |         // set a smaller crop size | ||||||
|         let format_options = FormatOptions { highlight: false, crop: Some(2) }; |         let format_options = FormatOptions { highlight: false, crop: Some(2) }; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // because crop size < query size, partially format matches. |         // because crop size < query size, partially format matches. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -860,7 +868,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // set a smaller crop size |         // set a smaller crop size | ||||||
|         let format_options = FormatOptions { highlight: false, crop: Some(1) }; |         let format_options = FormatOptions { highlight: false, crop: Some(1) }; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // because crop size < query size, partially format matches. |         // because crop size < query size, partially format matches. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -869,7 +877,7 @@ mod tests { | |||||||
|  |  | ||||||
|         // set  crop size to 0 |         // set  crop size to 0 | ||||||
|         let format_options = FormatOptions { highlight: false, crop: Some(0) }; |         let format_options = FormatOptions { highlight: false, crop: Some(0) }; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         // because crop size is 0, crop is ignored. |         // because crop size is 0, crop is ignored. | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
| @@ -889,7 +897,7 @@ mod tests { | |||||||
|         let format_options = FormatOptions { highlight: true, crop: None }; |         let format_options = FormatOptions { highlight: true, crop: None }; | ||||||
|  |  | ||||||
|         let text = "the do or die can't be he do and or isn't he"; |         let text = "the do or die can't be he do and or isn't he"; | ||||||
|         let mut matcher = builder.build(text); |         let mut matcher = builder.build(text, None); | ||||||
|         insta::assert_snapshot!( |         insta::assert_snapshot!( | ||||||
|             matcher.format(format_options), |             matcher.format(format_options), | ||||||
|             @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_" |             @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_" | ||||||
|   | |||||||
| @@ -24,7 +24,7 @@ mod tests; | |||||||
| use std::collections::HashSet; | use std::collections::HashSet; | ||||||
|  |  | ||||||
| use bucket_sort::{bucket_sort, BucketSortOutput}; | use bucket_sort::{bucket_sort, BucketSortOutput}; | ||||||
| use charabia::TokenizerBuilder; | use charabia::{Language, TokenizerBuilder}; | ||||||
| use db_cache::DatabaseCache; | use db_cache::DatabaseCache; | ||||||
| use exact_attribute::ExactAttribute; | use exact_attribute::ExactAttribute; | ||||||
| use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo}; | use graph_based_ranking_rule::{Exactness, Fid, Position, Proximity, Typo}; | ||||||
| @@ -639,6 +639,7 @@ pub fn execute_search( | |||||||
|     query_graph_logger: &mut dyn SearchLogger<QueryGraph>, |     query_graph_logger: &mut dyn SearchLogger<QueryGraph>, | ||||||
|     time_budget: TimeBudget, |     time_budget: TimeBudget, | ||||||
|     ranking_score_threshold: Option<f64>, |     ranking_score_threshold: Option<f64>, | ||||||
|  |     locales: Option<&Vec<Language>>, | ||||||
| ) -> Result<PartialSearchResult> { | ) -> Result<PartialSearchResult> { | ||||||
|     check_sort_criteria(ctx, sort_criteria.as_ref())?; |     check_sort_criteria(ctx, sort_criteria.as_ref())?; | ||||||
|  |  | ||||||
| @@ -670,9 +671,8 @@ pub fn execute_search( | |||||||
|             tokbuilder.words_dict(dictionary); |             tokbuilder.words_dict(dictionary); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let script_lang_map = ctx.index.script_language(ctx.txn)?; |         if let Some(locales) = locales { | ||||||
|         if !script_lang_map.is_empty() { |             tokbuilder.allow_list(locales); | ||||||
|             tokbuilder.allow_list(&script_lang_map); |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let tokenizer = tokbuilder.build(); |         let tokenizer = tokbuilder.build(); | ||||||
|   | |||||||
| @@ -24,7 +24,7 @@ pub struct ExtractedTokens { | |||||||
| #[tracing::instrument(level = "trace", skip_all, target = "search::query")] | #[tracing::instrument(level = "trace", skip_all, target = "search::query")] | ||||||
| pub fn located_query_terms_from_tokens( | pub fn located_query_terms_from_tokens( | ||||||
|     ctx: &mut SearchContext<'_>, |     ctx: &mut SearchContext<'_>, | ||||||
|     query: NormalizedTokenIter<'_, '_>, |     query: NormalizedTokenIter<'_, '_, '_, '_>, | ||||||
|     words_limit: Option<usize>, |     words_limit: Option<usize>, | ||||||
| ) -> Result<ExtractedTokens> { | ) -> Result<ExtractedTokens> { | ||||||
|     let nbr_typos = number_of_typos_allowed(ctx)?; |     let nbr_typos = number_of_typos_allowed(ctx)?; | ||||||
|   | |||||||
| @@ -36,7 +36,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { | |||||||
|             field_id_word_count_docids, |             field_id_word_count_docids, | ||||||
|             word_prefix_position_docids, |             word_prefix_position_docids, | ||||||
|             word_prefix_fid_docids, |             word_prefix_fid_docids, | ||||||
|             script_language_docids, |  | ||||||
|             facet_id_f64_docids, |             facet_id_f64_docids, | ||||||
|             facet_id_string_docids, |             facet_id_string_docids, | ||||||
|             facet_id_normalized_string_strings, |             facet_id_normalized_string_strings, | ||||||
| @@ -83,7 +82,6 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { | |||||||
|         field_id_word_count_docids.clear(self.wtxn)?; |         field_id_word_count_docids.clear(self.wtxn)?; | ||||||
|         word_prefix_position_docids.clear(self.wtxn)?; |         word_prefix_position_docids.clear(self.wtxn)?; | ||||||
|         word_prefix_fid_docids.clear(self.wtxn)?; |         word_prefix_fid_docids.clear(self.wtxn)?; | ||||||
|         script_language_docids.clear(self.wtxn)?; |  | ||||||
|         facet_id_f64_docids.clear(self.wtxn)?; |         facet_id_f64_docids.clear(self.wtxn)?; | ||||||
|         facet_id_normalized_string_strings.clear(self.wtxn)?; |         facet_id_normalized_string_strings.clear(self.wtxn)?; | ||||||
|         facet_id_string_fst.clear(self.wtxn)?; |         facet_id_string_fst.clear(self.wtxn)?; | ||||||
|   | |||||||
| @@ -1,10 +1,9 @@ | |||||||
| use std::collections::HashMap; |  | ||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::BufReader; | use std::io::BufReader; | ||||||
| use std::{io, mem, str}; | use std::{io, mem, str}; | ||||||
|  |  | ||||||
| use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; | use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder}; | ||||||
| use obkv::{KvReader, KvWriterU16}; | use obkv::{KvReader, KvWriterU16}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
| @@ -15,8 +14,6 @@ use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; | |||||||
| use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; | use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; | ||||||
| use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; | use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; | ||||||
|  |  | ||||||
| pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>; |  | ||||||
|  |  | ||||||
| /// Extracts the word and positions where this word appear and | /// Extracts the word and positions where this word appear and | ||||||
| /// prefixes it by the document id. | /// prefixes it by the document id. | ||||||
| /// | /// | ||||||
| @@ -28,7 +25,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     settings_diff: &InnerIndexSettingsDiff, |     settings_diff: &InnerIndexSettingsDiff, | ||||||
|     max_positions_per_attributes: Option<u32>, |     max_positions_per_attributes: Option<u32>, | ||||||
| ) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> { | ) -> Result<grenad::Reader<BufReader<File>>> { | ||||||
|     let max_positions_per_attributes = max_positions_per_attributes |     let max_positions_per_attributes = max_positions_per_attributes | ||||||
|         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); |         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); | ||||||
|     let max_memory = indexer.max_memory_by_thread(); |     let max_memory = indexer.max_memory_by_thread(); | ||||||
| @@ -36,7 +33,6 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|  |  | ||||||
|     // initialize destination values. |     // initialize destination values. | ||||||
|     let mut documents_ids = RoaringBitmap::new(); |     let mut documents_ids = RoaringBitmap::new(); | ||||||
|     let mut script_language_docids = HashMap::new(); |  | ||||||
|     let mut docid_word_positions_sorter = create_sorter( |     let mut docid_word_positions_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Stable, |         grenad::SortAlgorithm::Stable, | ||||||
|         keep_latest_obkv, |         keep_latest_obkv, | ||||||
| @@ -61,13 +57,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|         .map(|s| s.iter().map(String::as_str).collect()); |         .map(|s| s.iter().map(String::as_str).collect()); | ||||||
|     let old_dictionary: Option<Vec<_>> = |     let old_dictionary: Option<Vec<_>> = | ||||||
|         settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); |         settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); | ||||||
|     let mut del_builder = tokenizer_builder( |     let del_builder = | ||||||
|         old_stop_words, |         tokenizer_builder(old_stop_words, old_separators.as_deref(), old_dictionary.as_deref()); | ||||||
|         old_separators.as_deref(), |     let del_tokenizer = del_builder.into_tokenizer(); | ||||||
|         old_dictionary.as_deref(), |  | ||||||
|         None, |  | ||||||
|     ); |  | ||||||
|     let del_tokenizer = del_builder.build(); |  | ||||||
|  |  | ||||||
|     let new_stop_words = settings_diff.new.stop_words.as_ref(); |     let new_stop_words = settings_diff.new.stop_words.as_ref(); | ||||||
|     let new_separators: Option<Vec<_>> = settings_diff |     let new_separators: Option<Vec<_>> = settings_diff | ||||||
| @@ -77,13 +69,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|         .map(|s| s.iter().map(String::as_str).collect()); |         .map(|s| s.iter().map(String::as_str).collect()); | ||||||
|     let new_dictionary: Option<Vec<_>> = |     let new_dictionary: Option<Vec<_>> = | ||||||
|         settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); |         settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); | ||||||
|     let mut add_builder = tokenizer_builder( |     let add_builder = | ||||||
|         new_stop_words, |         tokenizer_builder(new_stop_words, new_separators.as_deref(), new_dictionary.as_deref()); | ||||||
|         new_separators.as_deref(), |     let add_tokenizer = add_builder.into_tokenizer(); | ||||||
|         new_dictionary.as_deref(), |  | ||||||
|         None, |  | ||||||
|     ); |  | ||||||
|     let add_tokenizer = add_builder.build(); |  | ||||||
|  |  | ||||||
|     // iterate over documents. |     // iterate over documents. | ||||||
|     let mut cursor = obkv_documents.into_cursor()?; |     let mut cursor = obkv_documents.into_cursor()?; | ||||||
| @@ -109,7 +97,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|         let (del, add): (Result<_>, Result<_>) = rayon::join( |         let (del, add): (Result<_>, Result<_>) = rayon::join( | ||||||
|             || { |             || { | ||||||
|                 // deletions |                 // deletions | ||||||
|                 lang_safe_tokens_from_document( |                 tokens_from_document( | ||||||
|                     &obkv, |                     &obkv, | ||||||
|                     &settings_diff.old, |                     &settings_diff.old, | ||||||
|                     &del_tokenizer, |                     &del_tokenizer, | ||||||
| @@ -120,7 +108,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|             }, |             }, | ||||||
|             || { |             || { | ||||||
|                 // additions |                 // additions | ||||||
|                 lang_safe_tokens_from_document( |                 tokens_from_document( | ||||||
|                     &obkv, |                     &obkv, | ||||||
|                     &settings_diff.new, |                     &settings_diff.new, | ||||||
|                     &add_tokenizer, |                     &add_tokenizer, | ||||||
| @@ -131,8 +119,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|             }, |             }, | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         let (del_obkv, del_script_language_word_count) = del?; |         let del_obkv = del?; | ||||||
|         let (add_obkv, add_script_language_word_count) = add?; |         let add_obkv = add?; | ||||||
|  |  | ||||||
|         // merge deletions and additions. |         // merge deletions and additions. | ||||||
|         // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>> |         // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>> | ||||||
| @@ -150,31 +138,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|             key_buffer.extend_from_slice(&field_id.to_be_bytes()); |             key_buffer.extend_from_slice(&field_id.to_be_bytes()); | ||||||
|             docid_word_positions_sorter.insert(&key_buffer, value)?; |             docid_word_positions_sorter.insert(&key_buffer, value)?; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         // update script_language_docids deletions. |  | ||||||
|         for (script, languages_frequency) in del_script_language_word_count { |  | ||||||
|             for (language, _) in languages_frequency { |  | ||||||
|                 let entry = script_language_docids |  | ||||||
|                     .entry((script, language)) |  | ||||||
|                     .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); |  | ||||||
|                 entry.0.push(document_id); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         // update script_language_docids additions. |  | ||||||
|         for (script, languages_frequency) in add_script_language_word_count { |  | ||||||
|             for (language, _) in languages_frequency { |  | ||||||
|                 let entry = script_language_docids |  | ||||||
|                     .entry((script, language)) |  | ||||||
|                     .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new())); |  | ||||||
|                 entry.1.push(document_id); |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>. |     // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>. | ||||||
|     sorter_into_reader(docid_word_positions_sorter, indexer) |     sorter_into_reader(docid_word_positions_sorter, indexer) | ||||||
|         .map(|reader| (reader, script_language_docids)) |  | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Check if any searchable fields of a document changed. | /// Check if any searchable fields of a document changed. | ||||||
| @@ -205,7 +172,6 @@ fn tokenizer_builder<'a>( | |||||||
|     stop_words: Option<&'a fst::Set<Vec<u8>>>, |     stop_words: Option<&'a fst::Set<Vec<u8>>>, | ||||||
|     allowed_separators: Option<&'a [&str]>, |     allowed_separators: Option<&'a [&str]>, | ||||||
|     dictionary: Option<&'a [&str]>, |     dictionary: Option<&'a [&str]>, | ||||||
|     script_language: Option<&'a HashMap<Script, Vec<Language>>>, |  | ||||||
| ) -> TokenizerBuilder<'a, Vec<u8>> { | ) -> TokenizerBuilder<'a, Vec<u8>> { | ||||||
|     let mut tokenizer_builder = TokenizerBuilder::new(); |     let mut tokenizer_builder = TokenizerBuilder::new(); | ||||||
|     if let Some(stop_words) = stop_words { |     if let Some(stop_words) = stop_words { | ||||||
| @@ -218,96 +184,23 @@ fn tokenizer_builder<'a>( | |||||||
|         tokenizer_builder.separators(separators); |         tokenizer_builder.separators(separators); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     if let Some(script_language) = script_language { |  | ||||||
|         tokenizer_builder.allow_list(script_language); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     tokenizer_builder |     tokenizer_builder | ||||||
| } | } | ||||||
|  |  | ||||||
| /// Extract words mapped with their positions of a document, |  | ||||||
| /// ensuring no Language detection mistakes was made. |  | ||||||
| fn lang_safe_tokens_from_document<'a>( |  | ||||||
|     obkv: &KvReader<'_, FieldId>, |  | ||||||
|     settings: &InnerIndexSettings, |  | ||||||
|     tokenizer: &Tokenizer<'_>, |  | ||||||
|     max_positions_per_attributes: u32, |  | ||||||
|     del_add: DelAdd, |  | ||||||
|     buffers: &'a mut Buffers, |  | ||||||
| ) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> { |  | ||||||
|     let mut script_language_word_count = HashMap::new(); |  | ||||||
|  |  | ||||||
|     tokens_from_document( |  | ||||||
|         obkv, |  | ||||||
|         &settings.searchable_fields_ids, |  | ||||||
|         tokenizer, |  | ||||||
|         max_positions_per_attributes, |  | ||||||
|         del_add, |  | ||||||
|         buffers, |  | ||||||
|         &mut script_language_word_count, |  | ||||||
|     )?; |  | ||||||
|  |  | ||||||
|     // if we detect a potetial mistake in the language detection, |  | ||||||
|     // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages. |  | ||||||
|     // context: https://github.com/meilisearch/meilisearch/issues/3565 |  | ||||||
|     if script_language_word_count |  | ||||||
|         .values() |  | ||||||
|         .map(Vec::as_slice) |  | ||||||
|         .any(potential_language_detection_error) |  | ||||||
|     { |  | ||||||
|         // build an allow list with the most frequent detected languages in the document. |  | ||||||
|         let script_language: HashMap<_, _> = |  | ||||||
|             script_language_word_count.iter().filter_map(most_frequent_languages).collect(); |  | ||||||
|  |  | ||||||
|         // if the allow list is empty, meaning that no Language is considered frequent, |  | ||||||
|         // then we don't rerun the extraction. |  | ||||||
|         if !script_language.is_empty() { |  | ||||||
|             // build a new temporary tokenizer including the allow list. |  | ||||||
|             let stop_words = settings.stop_words.as_ref(); |  | ||||||
|             let separators: Option<Vec<_>> = settings |  | ||||||
|                 .allowed_separators |  | ||||||
|                 .as_ref() |  | ||||||
|                 .map(|s| s.iter().map(String::as_str).collect()); |  | ||||||
|             let dictionary: Option<Vec<_>> = |  | ||||||
|                 settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); |  | ||||||
|             let mut builder = |  | ||||||
|                 tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None); |  | ||||||
|             let tokenizer = builder.build(); |  | ||||||
|  |  | ||||||
|             script_language_word_count.clear(); |  | ||||||
|  |  | ||||||
|             // rerun the extraction. |  | ||||||
|             tokens_from_document( |  | ||||||
|                 obkv, |  | ||||||
|                 &settings.searchable_fields_ids, |  | ||||||
|                 &tokenizer, |  | ||||||
|                 max_positions_per_attributes, |  | ||||||
|                 del_add, |  | ||||||
|                 buffers, |  | ||||||
|                 &mut script_language_word_count, |  | ||||||
|             )?; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     // returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>) |  | ||||||
|     Ok((&buffers.obkv_buffer, script_language_word_count)) |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /// Extract words mapped with their positions of a document. | /// Extract words mapped with their positions of a document. | ||||||
| fn tokens_from_document<'a>( | fn tokens_from_document<'a>( | ||||||
|     obkv: &KvReader<'a, FieldId>, |     obkv: &KvReader<'a, FieldId>, | ||||||
|     searchable_fields: &[FieldId], |     settings: &InnerIndexSettings, | ||||||
|     tokenizer: &Tokenizer<'_>, |     tokenizer: &Tokenizer<'_>, | ||||||
|     max_positions_per_attributes: u32, |     max_positions_per_attributes: u32, | ||||||
|     del_add: DelAdd, |     del_add: DelAdd, | ||||||
|     buffers: &'a mut Buffers, |     buffers: &'a mut Buffers, | ||||||
|     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>, |  | ||||||
| ) -> Result<&'a [u8]> { | ) -> Result<&'a [u8]> { | ||||||
|     buffers.obkv_buffer.clear(); |     buffers.obkv_buffer.clear(); | ||||||
|     let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); |     let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); | ||||||
|     for (field_id, field_bytes) in obkv.iter() { |     for (field_id, field_bytes) in obkv.iter() { | ||||||
|         // if field is searchable. |         // if field is searchable. | ||||||
|         if searchable_fields.as_ref().contains(&field_id) { |         if settings.searchable_fields_ids.contains(&field_id) { | ||||||
|             // extract deletion or addition only. |             // extract deletion or addition only. | ||||||
|             if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { |             if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) { | ||||||
|                 // parse json. |                 // parse json. | ||||||
| @@ -322,20 +215,11 @@ fn tokens_from_document<'a>( | |||||||
|                 buffers.field_buffer.clear(); |                 buffers.field_buffer.clear(); | ||||||
|                 if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { |                 if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { | ||||||
|                     // create an iterator of token with their positions. |                     // create an iterator of token with their positions. | ||||||
|                     let tokens = process_tokens(tokenizer.tokenize(field)) |                     let locales = settings.localized_searchable_fields_ids.locales(field_id); | ||||||
|  |                     let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales)) | ||||||
|                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); |                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); | ||||||
|  |  | ||||||
|                     for (index, token) in tokens { |                     for (index, token) in tokens { | ||||||
|                         // if a language has been detected for the token, we update the counter. |  | ||||||
|                         if let Some(language) = token.language { |  | ||||||
|                             let script = token.script; |  | ||||||
|                             let entry = script_language_word_count.entry(script).or_default(); |  | ||||||
|                             match entry.iter_mut().find(|(l, _)| *l == language) { |  | ||||||
|                                 Some((_, n)) => *n += 1, |  | ||||||
|                                 None => entry.push((language, 1)), |  | ||||||
|                             } |  | ||||||
|                         } |  | ||||||
|  |  | ||||||
|                         // keep a word only if it is not empty and fit in a LMDB key. |                         // keep a word only if it is not empty and fit in a LMDB key. | ||||||
|                         let token = token.lemma().trim(); |                         let token = token.lemma().trim(); | ||||||
|                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { |                         if !token.is_empty() && token.len() <= MAX_WORD_LENGTH { | ||||||
| @@ -423,39 +307,6 @@ fn process_tokens<'a>( | |||||||
|         .filter(|(_, t)| t.is_word()) |         .filter(|(_, t)| t.is_word()) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn potential_language_detection_error(languages_frequency: &[(Language, usize)]) -> bool { |  | ||||||
|     if languages_frequency.len() > 1 { |  | ||||||
|         let threshold = compute_language_frequency_threshold(languages_frequency); |  | ||||||
|         languages_frequency.iter().any(|(_, c)| *c <= threshold) |  | ||||||
|     } else { |  | ||||||
|         false |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn most_frequent_languages( |  | ||||||
|     (script, languages_frequency): (&Script, &Vec<(Language, usize)>), |  | ||||||
| ) -> Option<(Script, Vec<Language>)> { |  | ||||||
|     if languages_frequency.len() > 1 { |  | ||||||
|         let threshold = compute_language_frequency_threshold(languages_frequency); |  | ||||||
|  |  | ||||||
|         let languages: Vec<_> = |  | ||||||
|             languages_frequency.iter().filter(|(_, c)| *c > threshold).map(|(l, _)| *l).collect(); |  | ||||||
|  |  | ||||||
|         if languages.is_empty() { |  | ||||||
|             None |  | ||||||
|         } else { |  | ||||||
|             Some((*script, languages)) |  | ||||||
|         } |  | ||||||
|     } else { |  | ||||||
|         None |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)]) -> usize { |  | ||||||
|     let total: usize = languages_frequency.iter().map(|(_, c)| c).sum(); |  | ||||||
|     total / 10 // 10% is a completely arbitrary value. |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Default)] | #[derive(Default)] | ||||||
| struct Buffers { | struct Buffers { | ||||||
|     // the field buffer for each fields desserialization, and must be cleared between each field. |     // the field buffer for each fields desserialization, and must be cleared between each field. | ||||||
|   | |||||||
| @@ -5,6 +5,7 @@ use std::iter::FromIterator; | |||||||
| use std::{io, str}; | use std::{io, str}; | ||||||
|  |  | ||||||
| use charabia::normalizer::{Normalize, NormalizerOption}; | use charabia::normalizer::{Normalize, NormalizerOption}; | ||||||
|  | use charabia::{Language, StrDetection, Token}; | ||||||
| use heed::types::SerdeJson; | use heed::types::SerdeJson; | ||||||
| use heed::BytesEncode; | use heed::BytesEncode; | ||||||
|  |  | ||||||
| @@ -26,10 +27,9 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; | |||||||
| pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | ||||||
|     docid_fid_facet_string: grenad::Reader<R>, |     docid_fid_facet_string: grenad::Reader<R>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     _settings_diff: &InnerIndexSettingsDiff, |     settings_diff: &InnerIndexSettingsDiff, | ||||||
| ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> { | ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> { | ||||||
|     let max_memory = indexer.max_memory_by_thread(); |     let max_memory = indexer.max_memory_by_thread(); | ||||||
|     let options = NormalizerOption { lossy: true, ..Default::default() }; |  | ||||||
|  |  | ||||||
|     let mut facet_string_docids_sorter = create_sorter( |     let mut facet_string_docids_sorter = create_sorter( | ||||||
|         grenad::SortAlgorithm::Stable, |         grenad::SortAlgorithm::Stable, | ||||||
| @@ -54,12 +54,8 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | |||||||
|     while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { |     while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? { | ||||||
|         let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes); |         let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes); | ||||||
|  |  | ||||||
|         // nothing to do if we delete and re-add the value. |         let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some() | ||||||
|         if deladd_reader.get(DelAdd::Deletion).is_some() |             && deladd_reader.get(DelAdd::Addition).is_some(); | ||||||
|             && deladd_reader.get(DelAdd::Addition).is_some() |  | ||||||
|         { |  | ||||||
|             continue; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); |         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); | ||||||
|         let field_id = FieldId::from_be_bytes(field_id_bytes); |         let field_id = FieldId::from_be_bytes(field_id_bytes); | ||||||
| @@ -72,29 +68,66 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | |||||||
|  |  | ||||||
|         // Facet search normalization |         // Facet search normalization | ||||||
|         { |         { | ||||||
|             let mut hyper_normalized_value = normalized_value.normalize(&options); |             let locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); | ||||||
|             let normalized_truncated_facet: String; |             let old_hyper_normalized_value = normalize_facet_string(normalized_value, locales); | ||||||
|             if hyper_normalized_value.len() > MAX_FACET_VALUE_LENGTH { |             let locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); | ||||||
|                 normalized_truncated_facet = hyper_normalized_value |             let new_hyper_normalized_value = normalize_facet_string(normalized_value, locales); | ||||||
|                     .char_indices() |  | ||||||
|                     .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) |  | ||||||
|                     .map(|(_, c)| c) |  | ||||||
|                     .collect(); |  | ||||||
|                 hyper_normalized_value = normalized_truncated_facet.into(); |  | ||||||
|             } |  | ||||||
|             let set = BTreeSet::from_iter(std::iter::once(normalized_value)); |             let set = BTreeSet::from_iter(std::iter::once(normalized_value)); | ||||||
|  |  | ||||||
|             buffer.clear(); |             // if the facet string is the same, we can put the deletion and addition in the same obkv. | ||||||
|             let mut obkv = KvWriterDelAdd::new(&mut buffer); |             if old_hyper_normalized_value == new_hyper_normalized_value { | ||||||
|             for (deladd_key, _) in deladd_reader.iter() { |                 // nothing to do if we delete and re-add the value. | ||||||
|                 let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; |                 if is_same_value { | ||||||
|                 obkv.insert(deladd_key, val)?; |                     continue; | ||||||
|             } |                 } | ||||||
|             obkv.finish()?; |  | ||||||
|  |  | ||||||
|             let key = (field_id, hyper_normalized_value.as_ref()); |                 buffer.clear(); | ||||||
|             let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; |                 let mut obkv = KvWriterDelAdd::new(&mut buffer); | ||||||
|             normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?; |                 for (deladd_key, _) in deladd_reader.iter() { | ||||||
|  |                     let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; | ||||||
|  |                     obkv.insert(deladd_key, val)?; | ||||||
|  |                 } | ||||||
|  |                 obkv.finish()?; | ||||||
|  |  | ||||||
|  |                 let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref()); | ||||||
|  |                 let key_bytes = BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; | ||||||
|  |                 normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?; | ||||||
|  |             } else { | ||||||
|  |                 // if the facet string is different, we need to insert the deletion and addition in different obkv because the related key is different. | ||||||
|  |                 // deletion | ||||||
|  |                 if deladd_reader.get(DelAdd::Deletion).is_some() { | ||||||
|  |                     // insert old value | ||||||
|  |                     let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; | ||||||
|  |                     buffer.clear(); | ||||||
|  |                     let mut obkv = KvWriterDelAdd::new(&mut buffer); | ||||||
|  |                     obkv.insert(DelAdd::Deletion, val)?; | ||||||
|  |                     obkv.finish()?; | ||||||
|  |                     let key: (u16, &str) = (field_id, old_hyper_normalized_value.as_ref()); | ||||||
|  |                     let key_bytes = | ||||||
|  |                         BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; | ||||||
|  |                     normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?; | ||||||
|  |                 } | ||||||
|  |  | ||||||
|  |                 // addition | ||||||
|  |                 if deladd_reader.get(DelAdd::Addition).is_some() { | ||||||
|  |                     // insert new value | ||||||
|  |                     let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; | ||||||
|  |                     buffer.clear(); | ||||||
|  |                     let mut obkv = KvWriterDelAdd::new(&mut buffer); | ||||||
|  |                     obkv.insert(DelAdd::Addition, val)?; | ||||||
|  |                     obkv.finish()?; | ||||||
|  |                     let key: (u16, &str) = (field_id, new_hyper_normalized_value.as_ref()); | ||||||
|  |                     let key_bytes = | ||||||
|  |                         BEU16StrCodec::bytes_encode(&key).map_err(heed::Error::Encoding)?; | ||||||
|  |                     normalized_facet_string_docids_sorter.insert(key_bytes, &buffer)?; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // nothing to do if we delete and re-add the value. | ||||||
|  |         if is_same_value { | ||||||
|  |             continue; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; |         let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value }; | ||||||
| @@ -112,3 +145,24 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>( | |||||||
|     let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?; |     let normalized = sorter_into_reader(normalized_facet_string_docids_sorter, indexer)?; | ||||||
|     sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized)) |     sorter_into_reader(facet_string_docids_sorter, indexer).map(|s| (s, normalized)) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Normalizes the facet string and truncates it to the max length. | ||||||
|  | fn normalize_facet_string(facet_string: &str, locales: Option<&[Language]>) -> String { | ||||||
|  |     let options = NormalizerOption { lossy: true, ..Default::default() }; | ||||||
|  |     let mut detection = StrDetection::new(facet_string, locales); | ||||||
|  |     let token = Token { | ||||||
|  |         lemma: std::borrow::Cow::Borrowed(facet_string), | ||||||
|  |         script: detection.script(), | ||||||
|  |         language: detection.language(), | ||||||
|  |         ..Default::default() | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     // truncate the facet string to the max length | ||||||
|  |     token | ||||||
|  |         .normalize(&options) | ||||||
|  |         .lemma | ||||||
|  |         .char_indices() | ||||||
|  |         .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH) | ||||||
|  |         .map(|(_, c)| c) | ||||||
|  |         .collect() | ||||||
|  | } | ||||||
|   | |||||||
| @@ -345,21 +345,17 @@ fn send_and_extract_flattened_documents_data( | |||||||
|     let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) = |     let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) = | ||||||
|         rayon::join( |         rayon::join( | ||||||
|             || { |             || { | ||||||
|                 let (docid_word_positions_chunk, script_language_pair) = |                 let docid_word_positions_chunk = extract_docid_word_positions( | ||||||
|                     extract_docid_word_positions( |                     flattened_documents_chunk.clone(), | ||||||
|                         flattened_documents_chunk.clone(), |                     indexer, | ||||||
|                         indexer, |                     &settings_diff, | ||||||
|                         &settings_diff, |                     max_positions_per_attributes, | ||||||
|                         max_positions_per_attributes, |                 )?; | ||||||
|                     )?; |  | ||||||
|  |  | ||||||
|                 // send docid_word_positions_chunk to DB writer |                 // send docid_word_positions_chunk to DB writer | ||||||
|                 let docid_word_positions_chunk = |                 let docid_word_positions_chunk = | ||||||
|                     unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? }; |                     unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? }; | ||||||
|  |  | ||||||
|                 let _ = |  | ||||||
|                     lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair))); |  | ||||||
|  |  | ||||||
|                 Ok(docid_word_positions_chunk) |                 Ok(docid_word_positions_chunk) | ||||||
|             }, |             }, | ||||||
|             || { |             || { | ||||||
|   | |||||||
| @@ -3388,44 +3388,6 @@ mod tests { | |||||||
|         wtxn.commit().unwrap(); |         wtxn.commit().unwrap(); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     #[cfg(feature = "all-tokenizations")] |  | ||||||
|     fn stored_detected_script_and_language_should_not_return_deleted_documents() { |  | ||||||
|         use charabia::{Language, Script}; |  | ||||||
|         let index = TempIndex::new(); |  | ||||||
|         let mut wtxn = index.write_txn().unwrap(); |  | ||||||
|         index |  | ||||||
|             .add_documents_using_wtxn( |  | ||||||
|                 &mut wtxn, |  | ||||||
|                 documents!([ |  | ||||||
|                 { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, |  | ||||||
|                 { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" }, |  | ||||||
|                 { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" }, |  | ||||||
|                 { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" }, |  | ||||||
|                 { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" }, |  | ||||||
|                 { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" }, |  | ||||||
|             ])) |  | ||||||
|             .unwrap(); |  | ||||||
|  |  | ||||||
|         let key_cmn = (Script::Cj, Language::Cmn); |  | ||||||
|         let cj_cmn_docs = |  | ||||||
|             index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default(); |  | ||||||
|         let mut expected_cj_cmn_docids = RoaringBitmap::new(); |  | ||||||
|         expected_cj_cmn_docids.push(1); |  | ||||||
|         expected_cj_cmn_docids.push(5); |  | ||||||
|         assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); |  | ||||||
|  |  | ||||||
|         delete_documents(&mut wtxn, &index, &["1"]); |  | ||||||
|         wtxn.commit().unwrap(); |  | ||||||
|  |  | ||||||
|         let rtxn = index.read_txn().unwrap(); |  | ||||||
|         let cj_cmn_docs = |  | ||||||
|             index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default(); |  | ||||||
|         let mut expected_cj_cmn_docids = RoaringBitmap::new(); |  | ||||||
|         expected_cj_cmn_docids.push(5); |  | ||||||
|         assert_eq!(cj_cmn_docs, expected_cj_cmn_docids); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn delete_words_exact_attributes() { |     fn delete_words_exact_attributes() { | ||||||
|         let index = TempIndex::new(); |         let index = TempIndex::new(); | ||||||
|   | |||||||
| @@ -1,10 +1,9 @@ | |||||||
| use std::collections::{BTreeSet, HashMap}; | use std::collections::BTreeSet; | ||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, BufReader}; | use std::io::{self, BufReader}; | ||||||
|  |  | ||||||
| use bytemuck::allocation::pod_collect_to_vec; | use bytemuck::allocation::pod_collect_to_vec; | ||||||
| use charabia::{Language, Script}; |  | ||||||
| use grenad::{Merger, MergerBuilder}; | use grenad::{Merger, MergerBuilder}; | ||||||
| use heed::types::Bytes; | use heed::types::Bytes; | ||||||
| use heed::{BytesDecode, RwTxn}; | use heed::{BytesDecode, RwTxn}; | ||||||
| @@ -94,7 +93,6 @@ pub(crate) enum TypedChunk { | |||||||
|         add_to_user_provided: RoaringBitmap, |         add_to_user_provided: RoaringBitmap, | ||||||
|         remove_from_user_provided: RoaringBitmap, |         remove_from_user_provided: RoaringBitmap, | ||||||
|     }, |     }, | ||||||
|     ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), |  | ||||||
| } | } | ||||||
|  |  | ||||||
| impl TypedChunk { | impl TypedChunk { | ||||||
| @@ -113,8 +111,7 @@ impl TypedChunk { | |||||||
|             | (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_)) |             | (FieldIdFacetExistsDocids(_), FieldIdFacetExistsDocids(_)) | ||||||
|             | (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_)) |             | (FieldIdFacetIsNullDocids(_), FieldIdFacetIsNullDocids(_)) | ||||||
|             | (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_)) |             | (FieldIdFacetIsEmptyDocids(_), FieldIdFacetIsEmptyDocids(_)) | ||||||
|             | (GeoPoints(_), GeoPoints(_)) |             | (GeoPoints(_), GeoPoints(_)) => true, | ||||||
|             | (ScriptLanguageDocids(_), ScriptLanguageDocids(_)) => true, |  | ||||||
|             ( |             ( | ||||||
|                 VectorPoints { embedder_name: left, expected_dimension: left_dim, .. }, |                 VectorPoints { embedder_name: left, expected_dimension: left_dim, .. }, | ||||||
|                 VectorPoints { embedder_name: right, expected_dimension: right_dim, .. }, |                 VectorPoints { embedder_name: right, expected_dimension: right_dim, .. }, | ||||||
| @@ -775,33 +772,6 @@ pub(crate) fn write_typed_chunk_into_index( | |||||||
|  |  | ||||||
|             tracing::debug!("Finished vector chunk for {}", embedder_name); |             tracing::debug!("Finished vector chunk for {}", embedder_name); | ||||||
|         } |         } | ||||||
|         TypedChunk::ScriptLanguageDocids(_) => { |  | ||||||
|             let span = tracing::trace_span!(target: "indexing::write_db", "script_language_docids"); |  | ||||||
|             let _entered = span.enter(); |  | ||||||
|  |  | ||||||
|             for typed_chunk in typed_chunks { |  | ||||||
|                 let TypedChunk::ScriptLanguageDocids(sl_map) = typed_chunk else { unreachable!() }; |  | ||||||
|                 for (key, (deletion, addition)) in sl_map { |  | ||||||
|                     let mut db_key_exists = false; |  | ||||||
|                     let final_value = match index.script_language_docids.get(wtxn, &key)? { |  | ||||||
|                         Some(db_values) => { |  | ||||||
|                             db_key_exists = true; |  | ||||||
|                             (db_values - deletion) | addition |  | ||||||
|                         } |  | ||||||
|                         None => addition, |  | ||||||
|                     }; |  | ||||||
|  |  | ||||||
|                     if final_value.is_empty() { |  | ||||||
|                         // If the database entry exists, delete it. |  | ||||||
|                         if db_key_exists { |  | ||||||
|                             index.script_language_docids.delete(wtxn, &key)?; |  | ||||||
|                         } |  | ||||||
|                     } else { |  | ||||||
|                         index.script_language_docids.put(wtxn, &key, &final_value)?; |  | ||||||
|                     } |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|         } |  | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok((RoaringBitmap::new(), is_merged_database)) |     Ok((RoaringBitmap::new(), is_merged_database)) | ||||||
|   | |||||||
| @@ -28,7 +28,7 @@ use crate::vector::settings::{ | |||||||
|     WriteBackToDocuments, |     WriteBackToDocuments, | ||||||
| }; | }; | ||||||
| use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; | use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; | ||||||
| use crate::{FieldId, FieldsIdsMap, Index, Result}; | use crate::{FieldId, FieldsIdsMap, Index, LocalizedAttributesRule, LocalizedFieldIds, Result}; | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, PartialEq, Eq, Copy)] | #[derive(Debug, Clone, PartialEq, Eq, Copy)] | ||||||
| pub enum Setting<T> { | pub enum Setting<T> { | ||||||
| @@ -159,6 +159,7 @@ pub struct Settings<'a, 't, 'i> { | |||||||
|     proximity_precision: Setting<ProximityPrecision>, |     proximity_precision: Setting<ProximityPrecision>, | ||||||
|     embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>, |     embedder_settings: Setting<BTreeMap<String, Setting<EmbeddingSettings>>>, | ||||||
|     search_cutoff: Setting<u64>, |     search_cutoff: Setting<u64>, | ||||||
|  |     localized_attributes_rules: Setting<Vec<LocalizedAttributesRule>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a, 't, 'i> Settings<'a, 't, 'i> { | impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||||
| @@ -193,6 +194,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|             proximity_precision: Setting::NotSet, |             proximity_precision: Setting::NotSet, | ||||||
|             embedder_settings: Setting::NotSet, |             embedder_settings: Setting::NotSet, | ||||||
|             search_cutoff: Setting::NotSet, |             search_cutoff: Setting::NotSet, | ||||||
|  |             localized_attributes_rules: Setting::NotSet, | ||||||
|             indexer_config, |             indexer_config, | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| @@ -391,6 +393,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|         self.search_cutoff = Setting::Reset; |         self.search_cutoff = Setting::Reset; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     pub fn set_localized_attributes_rules(&mut self, value: Vec<LocalizedAttributesRule>) { | ||||||
|  |         self.localized_attributes_rules = Setting::Set(value); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn reset_localized_attributes_rules(&mut self) { | ||||||
|  |         self.localized_attributes_rules = Setting::Reset; | ||||||
|  |     } | ||||||
|  |  | ||||||
|     #[tracing::instrument( |     #[tracing::instrument( | ||||||
|         level = "trace" |         level = "trace" | ||||||
|         skip(self, progress_callback, should_abort, settings_diff), |         skip(self, progress_callback, should_abort, settings_diff), | ||||||
| @@ -1118,6 +1128,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|         Ok(changed) |         Ok(changed) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     fn update_localized_attributes_rules(&mut self) -> Result<()> { | ||||||
|  |         match &self.localized_attributes_rules { | ||||||
|  |             Setting::Set(new) => { | ||||||
|  |                 let old = self.index.localized_attributes_rules(self.wtxn)?; | ||||||
|  |                 if old.as_ref() != Some(new) { | ||||||
|  |                     self.index.put_localized_attributes_rules(self.wtxn, new.clone())?; | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |             Setting::Reset => { | ||||||
|  |                 self.index.delete_localized_attributes_rules(self.wtxn)?; | ||||||
|  |             } | ||||||
|  |             Setting::NotSet => (), | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|     pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()> |     pub fn execute<FP, FA>(mut self, progress_callback: FP, should_abort: FA) -> Result<()> | ||||||
|     where |     where | ||||||
|         FP: Fn(UpdateIndexingStep) + Sync, |         FP: Fn(UpdateIndexingStep) + Sync, | ||||||
| @@ -1151,6 +1178,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|         self.update_searchable()?; |         self.update_searchable()?; | ||||||
|         self.update_exact_attributes()?; |         self.update_exact_attributes()?; | ||||||
|         self.update_proximity_precision()?; |         self.update_proximity_precision()?; | ||||||
|  |         self.update_localized_attributes_rules()?; | ||||||
|  |  | ||||||
|         let embedding_config_updates = self.update_embedding_configs()?; |         let embedding_config_updates = self.update_embedding_configs()?; | ||||||
|  |  | ||||||
| @@ -1229,6 +1257,8 @@ impl InnerIndexSettingsDiff { | |||||||
|                 || old_settings.allowed_separators != new_settings.allowed_separators |                 || old_settings.allowed_separators != new_settings.allowed_separators | ||||||
|                 || old_settings.dictionary != new_settings.dictionary |                 || old_settings.dictionary != new_settings.dictionary | ||||||
|                 || old_settings.proximity_precision != new_settings.proximity_precision |                 || old_settings.proximity_precision != new_settings.proximity_precision | ||||||
|  |                 || old_settings.localized_searchable_fields_ids | ||||||
|  |                     != new_settings.localized_searchable_fields_ids | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; |         let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; | ||||||
| @@ -1304,6 +1334,7 @@ impl InnerIndexSettingsDiff { | |||||||
|         } |         } | ||||||
|  |  | ||||||
|         (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) |         (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) | ||||||
|  |             || self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn reindex_vectors(&self) -> bool { |     pub fn reindex_vectors(&self) -> bool { | ||||||
| @@ -1341,6 +1372,8 @@ pub(crate) struct InnerIndexSettings { | |||||||
|     pub geo_fields_ids: Option<(FieldId, FieldId)>, |     pub geo_fields_ids: Option<(FieldId, FieldId)>, | ||||||
|     pub non_searchable_fields_ids: Vec<FieldId>, |     pub non_searchable_fields_ids: Vec<FieldId>, | ||||||
|     pub non_faceted_fields_ids: Vec<FieldId>, |     pub non_faceted_fields_ids: Vec<FieldId>, | ||||||
|  |     pub localized_searchable_fields_ids: LocalizedFieldIds, | ||||||
|  |     pub localized_faceted_fields_ids: LocalizedFieldIds, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl InnerIndexSettings { | impl InnerIndexSettings { | ||||||
| @@ -1382,6 +1415,17 @@ impl InnerIndexSettings { | |||||||
|             } |             } | ||||||
|             None => None, |             None => None, | ||||||
|         }; |         }; | ||||||
|  |         let localized_attributes_rules = index.localized_attributes_rules(rtxn)?; | ||||||
|  |         let localized_searchable_fields_ids = LocalizedFieldIds::new( | ||||||
|  |             &localized_attributes_rules, | ||||||
|  |             &fields_ids_map, | ||||||
|  |             searchable_fields_ids.iter().cloned(), | ||||||
|  |         ); | ||||||
|  |         let localized_faceted_fields_ids = LocalizedFieldIds::new( | ||||||
|  |             &localized_attributes_rules, | ||||||
|  |             &fields_ids_map, | ||||||
|  |             faceted_fields_ids.iter().cloned(), | ||||||
|  |         ); | ||||||
|  |  | ||||||
|         let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); |         let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); | ||||||
|         searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); |         searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); | ||||||
| @@ -1403,6 +1447,8 @@ impl InnerIndexSettings { | |||||||
|             geo_fields_ids, |             geo_fields_ids, | ||||||
|             non_searchable_fields_ids: vectors_fids.clone(), |             non_searchable_fields_ids: vectors_fids.clone(), | ||||||
|             non_faceted_fields_ids: vectors_fids.clone(), |             non_faceted_fields_ids: vectors_fids.clone(), | ||||||
|  |             localized_searchable_fields_ids, | ||||||
|  |             localized_faceted_fields_ids, | ||||||
|         }) |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -1418,6 +1464,12 @@ impl InnerIndexSettings { | |||||||
|         index.put_faceted_fields(wtxn, &new_facets)?; |         index.put_faceted_fields(wtxn, &new_facets)?; | ||||||
|  |  | ||||||
|         self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?; |         self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?; | ||||||
|  |         let localized_attributes_rules = index.localized_attributes_rules(wtxn)?; | ||||||
|  |         self.localized_faceted_fields_ids = LocalizedFieldIds::new( | ||||||
|  |             &localized_attributes_rules, | ||||||
|  |             &self.fields_ids_map, | ||||||
|  |             self.faceted_fields_ids.iter().cloned(), | ||||||
|  |         ); | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -1441,8 +1493,13 @@ impl InnerIndexSettings { | |||||||
|                 &self.fields_ids_map, |                 &self.fields_ids_map, | ||||||
|             )?; |             )?; | ||||||
|         } |         } | ||||||
|         let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; |         self.searchable_fields_ids = index.searchable_fields_ids(wtxn)?; | ||||||
|         self.searchable_fields_ids = searchable_fields_ids; |         let localized_attributes_rules = index.localized_attributes_rules(wtxn)?; | ||||||
|  |         self.localized_searchable_fields_ids = LocalizedFieldIds::new( | ||||||
|  |             &localized_attributes_rules, | ||||||
|  |             &self.fields_ids_map, | ||||||
|  |             self.searchable_fields_ids.iter().cloned(), | ||||||
|  |         ); | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
| @@ -2573,6 +2630,7 @@ mod tests { | |||||||
|                     proximity_precision, |                     proximity_precision, | ||||||
|                     embedder_settings, |                     embedder_settings, | ||||||
|                     search_cutoff, |                     search_cutoff, | ||||||
|  |                     localized_attributes_rules, | ||||||
|                 } = settings; |                 } = settings; | ||||||
|                 assert!(matches!(searchable_fields, Setting::NotSet)); |                 assert!(matches!(searchable_fields, Setting::NotSet)); | ||||||
|                 assert!(matches!(displayed_fields, Setting::NotSet)); |                 assert!(matches!(displayed_fields, Setting::NotSet)); | ||||||
| @@ -2597,6 +2655,7 @@ mod tests { | |||||||
|                 assert!(matches!(proximity_precision, Setting::NotSet)); |                 assert!(matches!(proximity_precision, Setting::NotSet)); | ||||||
|                 assert!(matches!(embedder_settings, Setting::NotSet)); |                 assert!(matches!(embedder_settings, Setting::NotSet)); | ||||||
|                 assert!(matches!(search_cutoff, Setting::NotSet)); |                 assert!(matches!(search_cutoff, Setting::NotSet)); | ||||||
|  |                 assert!(matches!(localized_attributes_rules, Setting::NotSet)); | ||||||
|             }) |             }) | ||||||
|             .unwrap(); |             .unwrap(); | ||||||
|     } |     } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user