mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-31 16:06:31 +00:00 
			
		
		
		
	Merge pull request #5494 from meilisearch/deactivate-numbers-in-typos
Deactivate numbers in typos
This commit is contained in:
		| @@ -373,6 +373,7 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> { | ||||
|                     }, | ||||
|                     disable_on_words: typo.disable_on_words.into(), | ||||
|                     disable_on_attributes: typo.disable_on_attributes.into(), | ||||
|                     disable_on_numbers: v6::Setting::NotSet, | ||||
|                 }), | ||||
|                 v5::Setting::Reset => v6::Setting::Reset, | ||||
|                 v5::Setting::NotSet => v6::Setting::NotSet, | ||||
|   | ||||
| @@ -8,6 +8,7 @@ use std::str::FromStr; | ||||
|  | ||||
| use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; | ||||
| use fst::IntoStreamer; | ||||
| use milli::disabled_typos_terms::DisabledTyposTerms; | ||||
| use milli::index::{IndexEmbeddingConfig, PrefixSearch}; | ||||
| use milli::proximity::ProximityPrecision; | ||||
| use milli::update::Setting; | ||||
| @@ -104,6 +105,10 @@ pub struct TypoSettings { | ||||
|     #[deserr(default)] | ||||
|     #[schema(value_type = Option<BTreeSet<String>>, example = json!(["uuid", "url"]))] | ||||
|     pub disable_on_attributes: Setting<BTreeSet<String>>, | ||||
|     #[serde(default, skip_serializing_if = "Setting::is_not_set")] | ||||
|     #[deserr(default)] | ||||
|     #[schema(value_type = Option<bool>, example = json!(true))] | ||||
|     pub disable_on_numbers: Setting<bool>, | ||||
| } | ||||
|  | ||||
| #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] | ||||
| @@ -701,6 +706,12 @@ pub fn apply_settings_to_builder( | ||||
|                 Setting::Reset => builder.reset_exact_attributes(), | ||||
|                 Setting::NotSet => (), | ||||
|             } | ||||
|  | ||||
|             match value.disable_on_numbers { | ||||
|                 Setting::Set(val) => builder.set_disable_on_numbers(val), | ||||
|                 Setting::Reset => builder.reset_disable_on_numbers(), | ||||
|                 Setting::NotSet => (), | ||||
|             } | ||||
|         } | ||||
|         Setting::Reset => { | ||||
|             // all typo settings need to be reset here. | ||||
| @@ -826,12 +837,14 @@ pub fn settings( | ||||
|     }; | ||||
|  | ||||
|     let disabled_attributes = index.exact_attributes(rtxn)?.into_iter().map(String::from).collect(); | ||||
|     let DisabledTyposTerms { disable_on_numbers } = index.disabled_typos_terms(rtxn)?; | ||||
|  | ||||
|     let typo_tolerance = TypoSettings { | ||||
|         enabled: Setting::Set(index.authorize_typos(rtxn)?), | ||||
|         min_word_size_for_typos: Setting::Set(min_typo_word_len), | ||||
|         disable_on_words: Setting::Set(disabled_words), | ||||
|         disable_on_attributes: Setting::Set(disabled_attributes), | ||||
|         disable_on_numbers: Setting::Set(disable_on_numbers), | ||||
|     }; | ||||
|  | ||||
|     let faceting = FacetingSettings { | ||||
|   | ||||
| @@ -87,7 +87,8 @@ async fn import_dump_v1_movie_raw() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -260,7 +261,8 @@ async fn import_dump_v1_movie_with_settings() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -432,7 +434,8 @@ async fn import_dump_v1_rubygems_with_settings() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -590,7 +593,8 @@ async fn import_dump_v2_movie_raw() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -760,7 +764,8 @@ async fn import_dump_v2_movie_with_settings() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -929,7 +934,8 @@ async fn import_dump_v2_rubygems_with_settings() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -1087,7 +1093,8 @@ async fn import_dump_v3_movie_raw() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -1257,7 +1264,8 @@ async fn import_dump_v3_movie_with_settings() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -1426,7 +1434,8 @@ async fn import_dump_v3_rubygems_with_settings() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -1584,7 +1593,8 @@ async fn import_dump_v4_movie_raw() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -1754,7 +1764,8 @@ async fn import_dump_v4_movie_with_settings() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -1923,7 +1934,8 @@ async fn import_dump_v4_rubygems_with_settings() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -2212,7 +2224,8 @@ async fn import_dump_v6_containing_experimental_features() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -2444,7 +2457,8 @@ async fn generate_and_import_dump_containing_vectors() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
|   | ||||
| @@ -1976,3 +1976,93 @@ async fn change_facet_casing() { | ||||
|         }) | ||||
|         .await; | ||||
| } | ||||
|  | ||||
| #[actix_rt::test] | ||||
| async fn test_exact_typos_terms() { | ||||
|     let documents = json!([ | ||||
|         { | ||||
|             "id": 0, | ||||
|             "title": "The zeroth document 1298484", | ||||
|         }, | ||||
|         { | ||||
|             "id": 1, | ||||
|             "title": "The first document 234342", | ||||
|             "nested": { | ||||
|                 "object": "field 22231", | ||||
|                 "machin": "bidule 23443.32111", | ||||
|             }, | ||||
|         }, | ||||
|         { | ||||
|             "id": 2, | ||||
|             "title": "The second document 3398499", | ||||
|             "nested": [ | ||||
|                 "array", | ||||
|                 { | ||||
|                     "object": "field 23245121,23223", | ||||
|                 }, | ||||
|                 { | ||||
|                     "prout": "truc 123980612321", | ||||
|                     "machin": "lol 12345645333447879", | ||||
|                 }, | ||||
|             ], | ||||
|         }, | ||||
|         { | ||||
|             "id": 3, | ||||
|             "title": "The third document 12333", | ||||
|             "nested": "I lied 98878", | ||||
|         }, | ||||
|     ]); | ||||
|  | ||||
|     // Test prefix search | ||||
|     test_settings_documents_indexing_swapping_and_search( | ||||
|         &documents, | ||||
|         &json!({ | ||||
|             "searchableAttributes": ["title", "nested.object", "nested.machin"], | ||||
|             "typoTolerance": { | ||||
|               "enabled": true, | ||||
|               "disableOnNumbers": true | ||||
|             } | ||||
|         }), | ||||
|         &json!({"q": "12345"}), | ||||
|         |response, code| { | ||||
|             assert_eq!(code, 200, "{}", response); | ||||
|             snapshot!(json_string!(response["hits"]), @r###" | ||||
|             [ | ||||
|               { | ||||
|                 "id": 2, | ||||
|                 "title": "The second document 3398499", | ||||
|                 "nested": [ | ||||
|                   "array", | ||||
|                   { | ||||
|                     "object": "field 23245121,23223" | ||||
|                   }, | ||||
|                   { | ||||
|                     "prout": "truc 123980612321", | ||||
|                     "machin": "lol 12345645333447879" | ||||
|                   } | ||||
|                 ] | ||||
|               } | ||||
|             ] | ||||
|             "###); | ||||
|         }, | ||||
|     ) | ||||
|     .await; | ||||
|  | ||||
|     // Test typo search | ||||
|     test_settings_documents_indexing_swapping_and_search( | ||||
|         &documents, | ||||
|         &json!({ | ||||
|             "searchableAttributes": ["title", "nested.object", "nested.machin"], | ||||
|             "typoTolerance": { | ||||
|               "enabled": true, | ||||
|               "disableOnNumbers": true | ||||
|             } | ||||
|         }), | ||||
|         &json!({"q": "123457"}), | ||||
|         |response, code| { | ||||
|             assert_eq!(code, 200, "{}", response); | ||||
|             snapshot!(json_string!(response["hits"]), @r###"[]"###); | ||||
|         }, | ||||
|     ) | ||||
|     .await; | ||||
| } | ||||
|   | ||||
| @@ -274,7 +274,7 @@ async fn settings_bad_typo_tolerance() { | ||||
|     snapshot!(code, @"400 Bad Request"); | ||||
|     snapshot!(json_string!(response), @r###" | ||||
|     { | ||||
|       "message": "Unknown field `typoTolerance`: expected one of `enabled`, `minWordSizeForTypos`, `disableOnWords`, `disableOnAttributes`", | ||||
|       "message": "Unknown field `typoTolerance`: expected one of `enabled`, `minWordSizeForTypos`, `disableOnWords`, `disableOnAttributes`, `disableOnNumbers`", | ||||
|       "code": "invalid_settings_typo_tolerance", | ||||
|       "type": "invalid_request", | ||||
|       "link": "https://docs.meilisearch.com/errors#invalid_settings_typo_tolerance" | ||||
|   | ||||
| @@ -179,7 +179,7 @@ test_setting_routes!( | ||||
|     { | ||||
|         setting: typo_tolerance, | ||||
|         update_verb: patch, | ||||
|         default_value: {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": []} | ||||
|         default_value: {"enabled": true, "minWordSizeForTypos": {"oneTypo": 5, "twoTypos": 9}, "disableOnWords": [], "disableOnAttributes": [], "disableOnNumbers": false} | ||||
|     }, | ||||
| ); | ||||
|  | ||||
| @@ -276,7 +276,7 @@ async fn secrets_are_hidden_in_settings() { | ||||
|  | ||||
|     let (response, code) = index.settings().await; | ||||
|     meili_snap::snapshot!(code, @"200 OK"); | ||||
|     meili_snap::snapshot!(meili_snap::json_string!(response), @r#" | ||||
|     meili_snap::snapshot!(meili_snap::json_string!(response), @r###" | ||||
|     { | ||||
|       "displayedAttributes": [ | ||||
|         "*" | ||||
| @@ -308,7 +308,8 @@ async fn secrets_are_hidden_in_settings() { | ||||
|           "twoTypos": 9 | ||||
|         }, | ||||
|         "disableOnWords": [], | ||||
|         "disableOnAttributes": [] | ||||
|         "disableOnAttributes": [], | ||||
|         "disableOnNumbers": false | ||||
|       }, | ||||
|       "faceting": { | ||||
|         "maxValuesPerFacet": 100, | ||||
| @@ -337,7 +338,7 @@ async fn secrets_are_hidden_in_settings() { | ||||
|       "facetSearch": true, | ||||
|       "prefixSearch": "indexingTime" | ||||
|     } | ||||
|     "#); | ||||
|     "###); | ||||
|  | ||||
|     let (response, code) = server.get_task(settings_update_uid).await; | ||||
|     meili_snap::snapshot!(code, @"200 OK"); | ||||
|   | ||||
| @@ -1,6 +1,5 @@ | ||||
| --- | ||||
| source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs | ||||
| snapshot_kind: text | ||||
| --- | ||||
| { | ||||
|   "displayedAttributes": [ | ||||
| @@ -49,7 +48,8 @@ snapshot_kind: text | ||||
|     ], | ||||
|     "disableOnAttributes": [ | ||||
|       "surname" | ||||
|     ] | ||||
|     ], | ||||
|     "disableOnNumbers": false | ||||
|   }, | ||||
|   "faceting": { | ||||
|     "maxValuesPerFacet": 99, | ||||
|   | ||||
							
								
								
									
										50
									
								
								crates/milli/src/disabled_typos_terms.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										50
									
								
								crates/milli/src/disabled_typos_terms.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,50 @@ | ||||
| use heed::{ | ||||
|     types::{SerdeJson, Str}, | ||||
|     RoTxn, RwTxn, | ||||
| }; | ||||
| use serde::{Deserialize, Serialize}; | ||||
|  | ||||
| use crate::{index::main_key, Index}; | ||||
|  | ||||
| #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] | ||||
| #[serde(rename_all = "camelCase")] | ||||
| pub struct DisabledTyposTerms { | ||||
|     pub disable_on_numbers: bool, | ||||
| } | ||||
|  | ||||
| impl Index { | ||||
|     pub fn disabled_typos_terms(&self, txn: &RoTxn<'_>) -> heed::Result<DisabledTyposTerms> { | ||||
|         self.main | ||||
|             .remap_types::<Str, SerdeJson<DisabledTyposTerms>>() | ||||
|             .get(txn, main_key::DISABLED_TYPOS_TERMS) | ||||
|             .map(|option| option.unwrap_or_default()) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn put_disabled_typos_terms( | ||||
|         &self, | ||||
|         txn: &mut RwTxn<'_>, | ||||
|         disabled_typos_terms: &DisabledTyposTerms, | ||||
|     ) -> heed::Result<()> { | ||||
|         self.main.remap_types::<Str, SerdeJson<DisabledTyposTerms>>().put( | ||||
|             txn, | ||||
|             main_key::DISABLED_TYPOS_TERMS, | ||||
|             disabled_typos_terms, | ||||
|         )?; | ||||
|  | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     pub(crate) fn delete_disabled_typos_terms(&self, txn: &mut RwTxn<'_>) -> heed::Result<()> { | ||||
|         self.main | ||||
|             .remap_types::<Str, SerdeJson<DisabledTyposTerms>>() | ||||
|             .delete(txn, main_key::DISABLED_TYPOS_TERMS)?; | ||||
|         Ok(()) | ||||
|     } | ||||
| } | ||||
|  | ||||
| impl DisabledTyposTerms { | ||||
|     pub fn is_exact(&self, word: &str) -> bool { | ||||
|         // If disable_on_numbers is true, we disable the word if it contains only numbers or punctuation | ||||
|         self.disable_on_numbers && word.chars().all(|c| c.is_numeric() || c.is_ascii_punctuation()) | ||||
|     } | ||||
| } | ||||
| @@ -78,6 +78,7 @@ pub mod main_key { | ||||
|     pub const FACET_SEARCH: &str = "facet_search"; | ||||
|     pub const PREFIX_SEARCH: &str = "prefix_search"; | ||||
|     pub const DOCUMENTS_STATS: &str = "documents_stats"; | ||||
|     pub const DISABLED_TYPOS_TERMS: &str = "disabled_typos_terms"; | ||||
| } | ||||
|  | ||||
| pub mod db_name { | ||||
|   | ||||
| @@ -12,6 +12,7 @@ mod asc_desc; | ||||
| mod attribute_patterns; | ||||
| mod criterion; | ||||
| pub mod database_stats; | ||||
| pub mod disabled_typos_terms; | ||||
| mod error; | ||||
| mod external_documents_ids; | ||||
| pub mod facet; | ||||
|   | ||||
| @@ -127,7 +127,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>( | ||||
|         // merge all deletions | ||||
|         let obkv = KvReaderDelAdd::from_slice(value); | ||||
|         if let Some(value) = obkv.get(DelAdd::Deletion) { | ||||
|             let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid); | ||||
|             let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid) | ||||
|                 || settings_diff.old.disabled_typos_terms.is_exact(w); | ||||
|             buffer.clear(); | ||||
|             let mut obkv = KvWriterDelAdd::new(&mut buffer); | ||||
|             obkv.insert(DelAdd::Deletion, value)?; | ||||
| @@ -139,7 +140,8 @@ pub fn extract_word_docids<R: io::Read + io::Seek>( | ||||
|         } | ||||
|         // merge all additions | ||||
|         if let Some(value) = obkv.get(DelAdd::Addition) { | ||||
|             let add_in_exact = settings_diff.new.exact_attributes.contains(&fid); | ||||
|             let add_in_exact = settings_diff.new.exact_attributes.contains(&fid) | ||||
|                 || settings_diff.new.disabled_typos_terms.is_exact(w); | ||||
|             buffer.clear(); | ||||
|             let mut obkv = KvWriterDelAdd::new(&mut buffer); | ||||
|             obkv.insert(DelAdd::Addition, value)?; | ||||
|   | ||||
| @@ -273,14 +273,11 @@ pub(crate) fn write_typed_chunk_into_index( | ||||
|                     unreachable!(); | ||||
|                 }; | ||||
|                 let clonable_word_docids = unsafe { as_cloneable_grenad(&word_docids_reader) }?; | ||||
|                 let clonable_exact_word_docids = | ||||
|                     unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; | ||||
|  | ||||
|                 word_docids_builder.push(word_docids_reader.into_cursor()?); | ||||
|                 exact_word_docids_builder.push(exact_word_docids_reader.into_cursor()?); | ||||
|                 word_fid_docids_builder.push(word_fid_docids_reader.into_cursor()?); | ||||
|                 fst_merger_builder.push(clonable_word_docids.into_cursor()?); | ||||
|                 fst_merger_builder.push(clonable_exact_word_docids.into_cursor()?); | ||||
|             } | ||||
|  | ||||
|             let word_docids_merger = word_docids_builder.build(); | ||||
|   | ||||
| @@ -319,8 +319,11 @@ impl WordDocidsExtractors { | ||||
|         let doc_alloc = &context.doc_alloc; | ||||
|  | ||||
|         let exact_attributes = index.exact_attributes(rtxn)?; | ||||
|         let is_exact_attribute = | ||||
|             |fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr)); | ||||
|         let disabled_typos_terms = index.disabled_typos_terms(rtxn)?; | ||||
|         let is_exact = |fname: &str, word: &str| { | ||||
|             exact_attributes.iter().any(|attr| contained_in(fname, attr)) | ||||
|                 || disabled_typos_terms.is_exact(word) | ||||
|         }; | ||||
|         match document_change { | ||||
|             DocumentChange::Deletion(inner) => { | ||||
|                 let mut token_fn = |fname: &str, fid, pos, word: &str| { | ||||
| @@ -328,7 +331,7 @@ impl WordDocidsExtractors { | ||||
|                         fid, | ||||
|                         pos, | ||||
|                         word, | ||||
|                         is_exact_attribute(fname), | ||||
|                         is_exact(fname, word), | ||||
|                         inner.docid(), | ||||
|                         doc_alloc, | ||||
|                     ) | ||||
| @@ -356,7 +359,7 @@ impl WordDocidsExtractors { | ||||
|                         fid, | ||||
|                         pos, | ||||
|                         word, | ||||
|                         is_exact_attribute(fname), | ||||
|                         is_exact(fname, word), | ||||
|                         inner.docid(), | ||||
|                         doc_alloc, | ||||
|                     ) | ||||
| @@ -372,7 +375,7 @@ impl WordDocidsExtractors { | ||||
|                         fid, | ||||
|                         pos, | ||||
|                         word, | ||||
|                         is_exact_attribute(fname), | ||||
|                         is_exact(fname, word), | ||||
|                         inner.docid(), | ||||
|                         doc_alloc, | ||||
|                     ) | ||||
| @@ -389,7 +392,7 @@ impl WordDocidsExtractors { | ||||
|                         fid, | ||||
|                         pos, | ||||
|                         word, | ||||
|                         is_exact_attribute(fname), | ||||
|                         is_exact(fname, word), | ||||
|                         inner.docid(), | ||||
|                         doc_alloc, | ||||
|                     ) | ||||
|   | ||||
| @@ -9,6 +9,7 @@ pub use document_operation::{DocumentOperation, PayloadStats}; | ||||
| use hashbrown::HashMap; | ||||
| use heed::RwTxn; | ||||
| pub use partial_dump::PartialDump; | ||||
| pub use post_processing::recompute_word_fst_from_word_docids_database; | ||||
| pub use update_by_function::UpdateByFunction; | ||||
| pub use write::ChannelCongestion; | ||||
| use write::{build_vectors, update_index, write_to_db}; | ||||
|   | ||||
| @@ -131,6 +131,20 @@ fn compute_word_fst( | ||||
|     } | ||||
| } | ||||
|  | ||||
| pub fn recompute_word_fst_from_word_docids_database(index: &Index, wtxn: &mut RwTxn) -> Result<()> { | ||||
|     let fst = fst::Set::default().map_data(std::borrow::Cow::Owned)?; | ||||
|     let mut word_fst_builder = WordFstBuilder::new(&fst)?; | ||||
|     let words = index.word_docids.iter(wtxn)?.remap_data_type::<DecodeIgnore>(); | ||||
|     for res in words { | ||||
|         let (word, _) = res?; | ||||
|         word_fst_builder.register_word(DelAdd::Addition, word.as_ref())?; | ||||
|     } | ||||
|     let (word_fst_mmap, _) = word_fst_builder.build(index, wtxn)?; | ||||
|     index.main.remap_types::<Str, Bytes>().put(wtxn, WORDS_FST_KEY, &word_fst_mmap)?; | ||||
|  | ||||
|     Ok(()) | ||||
| } | ||||
|  | ||||
| #[tracing::instrument(level = "trace", skip_all, target = "indexing::facet_search")] | ||||
| fn compute_facet_search_database( | ||||
|     index: &Index, | ||||
|   | ||||
| @@ -17,6 +17,7 @@ use super::IndexerConfig; | ||||
| use crate::attribute_patterns::PatternMatch; | ||||
| use crate::constants::RESERVED_GEO_FIELD_NAME; | ||||
| use crate::criterion::Criterion; | ||||
| use crate::disabled_typos_terms::DisabledTyposTerms; | ||||
| use crate::error::UserError; | ||||
| use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; | ||||
| use crate::filterable_attributes_rules::match_faceted_field; | ||||
| @@ -169,6 +170,7 @@ pub struct Settings<'a, 't, 'i> { | ||||
|     synonyms: Setting<BTreeMap<String, Vec<String>>>, | ||||
|     primary_key: Setting<String>, | ||||
|     authorize_typos: Setting<bool>, | ||||
|     disable_on_numbers: Setting<bool>, | ||||
|     min_word_len_two_typos: Setting<u8>, | ||||
|     min_word_len_one_typo: Setting<u8>, | ||||
|     exact_words: Setting<BTreeSet<String>>, | ||||
| @@ -207,6 +209,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|             synonyms: Setting::NotSet, | ||||
|             primary_key: Setting::NotSet, | ||||
|             authorize_typos: Setting::NotSet, | ||||
|             disable_on_numbers: Setting::NotSet, | ||||
|             exact_words: Setting::NotSet, | ||||
|             min_word_len_two_typos: Setting::NotSet, | ||||
|             min_word_len_one_typo: Setting::NotSet, | ||||
| @@ -354,6 +357,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|         self.min_word_len_one_typo = Setting::Reset; | ||||
|     } | ||||
|  | ||||
|     pub fn set_disable_on_numbers(&mut self, disable_on_numbers: bool) { | ||||
|         self.disable_on_numbers = Setting::Set(disable_on_numbers); | ||||
|     } | ||||
|  | ||||
|     pub fn reset_disable_on_numbers(&mut self) { | ||||
|         self.disable_on_numbers = Setting::Reset; | ||||
|     } | ||||
|  | ||||
|     pub fn set_exact_words(&mut self, words: BTreeSet<String>) { | ||||
|         self.exact_words = Setting::Set(words); | ||||
|     } | ||||
| @@ -866,6 +877,24 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn update_disabled_typos_terms(&mut self) -> Result<()> { | ||||
|         let mut disabled_typos_terms = self.index.disabled_typos_terms(self.wtxn)?; | ||||
|         match self.disable_on_numbers { | ||||
|             Setting::Set(disable_on_numbers) => { | ||||
|                 disabled_typos_terms.disable_on_numbers = disable_on_numbers; | ||||
|             } | ||||
|             Setting::Reset => { | ||||
|                 self.index.delete_disabled_typos_terms(self.wtxn)?; | ||||
|                 disabled_typos_terms.disable_on_numbers = | ||||
|                     DisabledTyposTerms::default().disable_on_numbers; | ||||
|             } | ||||
|             Setting::NotSet => (), | ||||
|         } | ||||
|  | ||||
|         self.index.put_disabled_typos_terms(self.wtxn, &disabled_typos_terms)?; | ||||
|         Ok(()) | ||||
|     } | ||||
|  | ||||
|     fn update_exact_words(&mut self) -> Result<()> { | ||||
|         match self.exact_words { | ||||
|             Setting::Set(ref mut words) => { | ||||
| @@ -1246,6 +1275,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | ||||
|         self.update_prefix_search()?; | ||||
|         self.update_facet_search()?; | ||||
|         self.update_localized_attributes_rules()?; | ||||
|         self.update_disabled_typos_terms()?; | ||||
|  | ||||
|         let embedding_config_updates = self.update_embedding_configs()?; | ||||
|  | ||||
| @@ -1327,6 +1357,7 @@ impl InnerIndexSettingsDiff { | ||||
|                 || old_settings.prefix_search != new_settings.prefix_search | ||||
|                 || old_settings.localized_attributes_rules | ||||
|                     != new_settings.localized_attributes_rules | ||||
|                 || old_settings.disabled_typos_terms != new_settings.disabled_typos_terms | ||||
|         }; | ||||
|  | ||||
|         let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; | ||||
| @@ -1526,6 +1557,7 @@ pub(crate) struct InnerIndexSettings { | ||||
|     pub user_defined_searchable_attributes: Option<Vec<String>>, | ||||
|     pub sortable_fields: HashSet<String>, | ||||
|     pub exact_attributes: HashSet<FieldId>, | ||||
|     pub disabled_typos_terms: DisabledTyposTerms, | ||||
|     pub proximity_precision: ProximityPrecision, | ||||
|     pub embedding_configs: EmbeddingConfigs, | ||||
|     pub geo_fields_ids: Option<(FieldId, FieldId)>, | ||||
| @@ -1574,7 +1606,7 @@ impl InnerIndexSettings { | ||||
|             .map(|fields| fields.into_iter().map(|f| f.to_string()).collect()); | ||||
|         let builder = MetadataBuilder::from_index(index, rtxn)?; | ||||
|         let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder); | ||||
|  | ||||
|         let disabled_typos_terms = index.disabled_typos_terms(rtxn)?; | ||||
|         Ok(Self { | ||||
|             stop_words, | ||||
|             allowed_separators, | ||||
| @@ -1592,6 +1624,7 @@ impl InnerIndexSettings { | ||||
|             geo_fields_ids, | ||||
|             prefix_search, | ||||
|             facet_search, | ||||
|             disabled_typos_terms, | ||||
|         }) | ||||
|     } | ||||
|  | ||||
|   | ||||
| @@ -896,6 +896,7 @@ fn test_correct_settings_init() { | ||||
|                 localized_attributes_rules, | ||||
|                 prefix_search, | ||||
|                 facet_search, | ||||
|                 disable_on_numbers, | ||||
|             } = settings; | ||||
|             assert!(matches!(searchable_fields, Setting::NotSet)); | ||||
|             assert!(matches!(displayed_fields, Setting::NotSet)); | ||||
| @@ -923,6 +924,7 @@ fn test_correct_settings_init() { | ||||
|             assert!(matches!(localized_attributes_rules, Setting::NotSet)); | ||||
|             assert!(matches!(prefix_search, Setting::NotSet)); | ||||
|             assert!(matches!(facet_search, Setting::NotSet)); | ||||
|             assert!(matches!(disable_on_numbers, Setting::NotSet)); | ||||
|         }) | ||||
|         .unwrap(); | ||||
| } | ||||
|   | ||||
| @@ -1,11 +1,12 @@ | ||||
| mod v1_12; | ||||
| mod v1_13; | ||||
| mod v1_14; | ||||
|  | ||||
| mod v1_15; | ||||
| use heed::RwTxn; | ||||
| use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; | ||||
| use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13}; | ||||
| use v1_14::Latest_V1_13_To_Latest_V1_14; | ||||
| use v1_15::Latest_V1_14_To_Latest_V1_15; | ||||
|  | ||||
| use crate::progress::{Progress, VariableNameStep}; | ||||
| use crate::{Index, InternalError, Result}; | ||||
| @@ -36,6 +37,7 @@ pub fn upgrade( | ||||
|         &V1_13_0_To_V1_13_1 {}, | ||||
|         &V1_13_1_To_Latest_V1_13 {}, | ||||
|         &Latest_V1_13_To_Latest_V1_14 {}, | ||||
|         &Latest_V1_14_To_Latest_V1_15 {}, | ||||
|     ]; | ||||
|  | ||||
|     let start = match from { | ||||
| @@ -43,8 +45,9 @@ pub fn upgrade( | ||||
|         (1, 12, 3..) => 1, | ||||
|         (1, 13, 0) => 2, | ||||
|         (1, 13, _) => 4, | ||||
|         (1, 14, _) => 5, | ||||
|         // We must handle the current version in the match because, in case of a failure, some indexes may have been upgraded but not others. | ||||
|         (1, 14, _) => 4, | ||||
|         (1, 15, _) => 5, | ||||
|         (major, minor, patch) => { | ||||
|             return Err(InternalError::CannotUpgradeToVersion(major, minor, patch).into()) | ||||
|         } | ||||
|   | ||||
							
								
								
									
										35
									
								
								crates/milli/src/update/upgrade/v1_15.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										35
									
								
								crates/milli/src/update/upgrade/v1_15.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,35 @@ | ||||
| use heed::RwTxn; | ||||
|  | ||||
| use super::UpgradeIndex; | ||||
| use crate::progress::Progress; | ||||
| use crate::update::new::indexer::recompute_word_fst_from_word_docids_database; | ||||
| use crate::{make_enum_progress, Index, Result}; | ||||
|  | ||||
| #[allow(non_camel_case_types)] | ||||
| pub(super) struct Latest_V1_14_To_Latest_V1_15(); | ||||
|  | ||||
| impl UpgradeIndex for Latest_V1_14_To_Latest_V1_15 { | ||||
|     fn upgrade( | ||||
|         &self, | ||||
|         wtxn: &mut RwTxn, | ||||
|         index: &Index, | ||||
|         _original: (u32, u32, u32), | ||||
|         progress: Progress, | ||||
|     ) -> Result<bool> { | ||||
|         // Recompute the word FST from the word docids database. | ||||
|         make_enum_progress! { | ||||
|             enum TypoTolerance { | ||||
|                 RecomputeWordFst, | ||||
|             } | ||||
|         }; | ||||
|  | ||||
|         progress.update_progress(TypoTolerance::RecomputeWordFst); | ||||
|         recompute_word_fst_from_word_docids_database(index, wtxn)?; | ||||
|  | ||||
|         Ok(false) | ||||
|     } | ||||
|  | ||||
|     fn target_version(&self) -> (u32, u32, u32) { | ||||
|         (1, 15, 0) | ||||
|     } | ||||
| } | ||||
		Reference in New Issue
	
	Block a user