mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 13:36:27 +00:00 
			
		
		
		
	Update Charabia
This commit is contained in:
		
							
								
								
									
										220
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										220
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -152,7 +152,7 @@ dependencies = [ | |||||||
|  "pin-project-lite", |  "pin-project-lite", | ||||||
|  "tokio-rustls 0.23.4", |  "tokio-rustls 0.23.4", | ||||||
|  "tokio-util", |  "tokio-util", | ||||||
|  "webpki-roots", |  "webpki-roots 0.22.6", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| @@ -706,23 +706,24 @@ dependencies = [ | |||||||
| [[package]] | [[package]] | ||||||
| name = "charabia" | name = "charabia" | ||||||
| version = "0.7.2" | version = "0.7.2" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "413155d93157bff9130895c3bd83970ac7f35659ca57226a96aa35cf1e8e102c" |  | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  |  "aho-corasick", | ||||||
|  "cow-utils", |  "cow-utils", | ||||||
|  "csv", |  "csv", | ||||||
|  "deunicode", |  "deunicode", | ||||||
|  |  "either", | ||||||
|  "finl_unicode", |  "finl_unicode", | ||||||
|  "fst", |  "fst", | ||||||
|  "irg-kvariants", |  "irg-kvariants", | ||||||
|  "jieba-rs", |  "jieba-rs", | ||||||
|  "lindera", |  "lindera-core", | ||||||
|  |  "lindera-dictionary", | ||||||
|  |  "lindera-tokenizer", | ||||||
|  "once_cell", |  "once_cell", | ||||||
|  "pinyin", |  "pinyin", | ||||||
|  "serde", |  "serde", | ||||||
|  "slice-group-by", |  "slice-group-by", | ||||||
|  "unicode-normalization", |  "unicode-normalization", | ||||||
|  "unicode-segmentation", |  | ||||||
|  "wana_kana", |  "wana_kana", | ||||||
|  "whatlang", |  "whatlang", | ||||||
| ] | ] | ||||||
| @@ -2135,15 +2136,6 @@ dependencies = [ | |||||||
|  "simple_asn1", |  "simple_asn1", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "kanaria" |  | ||||||
| version = "0.2.0" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" |  | ||||||
| dependencies = [ |  | ||||||
|  "bitflags", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "language-tags" | name = "language-tags" | ||||||
| version = "0.3.2" | version = "0.3.2" | ||||||
| @@ -2211,38 +2203,11 @@ dependencies = [ | |||||||
|  "vcpkg", |  "vcpkg", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "lindera" |  | ||||||
| version = "0.23.1" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "72be283281bec2768687b1784be03a678609b51f2f90f6f9d9b4f07953e6dd25" |  | ||||||
| dependencies = [ |  | ||||||
|  "anyhow", |  | ||||||
|  "bincode", |  | ||||||
|  "byteorder", |  | ||||||
|  "encoding", |  | ||||||
|  "kanaria", |  | ||||||
|  "lindera-cc-cedict-builder", |  | ||||||
|  "lindera-core", |  | ||||||
|  "lindera-dictionary", |  | ||||||
|  "lindera-filter", |  | ||||||
|  "lindera-ipadic-builder", |  | ||||||
|  "lindera-ko-dic-builder", |  | ||||||
|  "lindera-unidic-builder", |  | ||||||
|  "regex", |  | ||||||
|  "serde", |  | ||||||
|  "serde_json", |  | ||||||
|  "thiserror", |  | ||||||
|  "unicode-blocks", |  | ||||||
|  "unicode-normalization", |  | ||||||
|  "yada", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-cc-cedict-builder" | name = "lindera-cc-cedict-builder" | ||||||
| version = "0.23.0" | version = "0.25.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "10fbafd37adab44ccc2668a40fba2dbc4e665cb3c36018c15dfe2e2b830e28ce" | checksum = "4c6bf79b29a90bcd22036e494d6cc9ac3abe9ab604b21f3258ba6dc1ce501801" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -2259,9 +2224,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-compress" | name = "lindera-compress" | ||||||
| version = "0.23.0" | version = "0.25.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "ed9196bf5995503f6878a090dfee6114ba86430c72f67ef3624246b564869937" | checksum = "8f2e99e67736352bbb6ed1c273643975822505067ca32194b0981040bc50527a" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "flate2", |  "flate2", | ||||||
| @@ -2270,9 +2235,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-core" | name = "lindera-core" | ||||||
| version = "0.23.0" | version = "0.25.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "e5f0baa9932f682e9c5b388897330f155d3c40de80016e60125897fde5e0e246" | checksum = "7c3935e966409156f22cb4b334b21b0dce84b7aa1cad62214b466489d249c8e5" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -2287,9 +2252,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-decompress" | name = "lindera-decompress" | ||||||
| version = "0.23.0" | version = "0.25.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "a6e63fa6ef0bc3ce2c26d372aa6185b7a316194494a84f81678f5da2893bf4a2" | checksum = "7476406abb63c49d7f59c88b9b868ee8d2981495ea7e2c3ad129902f9916b3c6" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "flate2", |  "flate2", | ||||||
| @@ -2298,63 +2263,50 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-dictionary" | name = "lindera-dictionary" | ||||||
| version = "0.23.0" | version = "0.25.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "fd765c36166016de87a1f447ea971573e4c63e334836c46ad0020f0408c88bfc" | checksum = "808b7d2b3cabc25a4022526d484a4cfd1d5924dc76a26e0379707698841acef2" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
|  "byteorder", |  "byteorder", | ||||||
|  |  "lindera-cc-cedict-builder", | ||||||
|  "lindera-core", |  "lindera-core", | ||||||
|  "lindera-ipadic", |  | ||||||
|  "lindera-ko-dic", |  | ||||||
|  "serde", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "lindera-filter" |  | ||||||
| version = "0.23.1" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "a5345e37fb9521ab3cee19283bed135d46b3521dc1fd13a49fa0992379056203" |  | ||||||
| dependencies = [ |  | ||||||
|  "anyhow", |  | ||||||
|  "bincode", |  | ||||||
|  "byteorder", |  | ||||||
|  "kanaria", |  | ||||||
|  "lindera-core", |  | ||||||
|  "lindera-dictionary", |  | ||||||
|  "once_cell", |  | ||||||
|  "regex", |  | ||||||
|  "serde", |  | ||||||
|  "serde_json", |  | ||||||
|  "unicode-blocks", |  | ||||||
|  "unicode-normalization", |  | ||||||
|  "unicode-segmentation", |  | ||||||
|  "yada", |  | ||||||
| ] |  | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "lindera-ipadic" |  | ||||||
| version = "0.23.0" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "60eeb356295f784e7db4cfd2c6772f2bd059e565a7744e246642a07bc333a88a" |  | ||||||
| dependencies = [ |  | ||||||
|  "bincode", |  | ||||||
|  "byteorder", |  | ||||||
|  "encoding", |  | ||||||
|  "flate2", |  | ||||||
|  "lindera-core", |  | ||||||
|  "lindera-decompress", |  | ||||||
|  "lindera-ipadic-builder", |  "lindera-ipadic-builder", | ||||||
|  "once_cell", |  "lindera-ipadic-neologd-builder", | ||||||
|  "tar", |  "lindera-ko-dic", | ||||||
|  |  "lindera-ko-dic-builder", | ||||||
|  |  "lindera-unidic", | ||||||
|  |  "lindera-unidic-builder", | ||||||
|  |  "serde", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-ipadic-builder" | name = "lindera-ipadic-builder" | ||||||
| version = "0.23.0" | version = "0.25.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "0a16a2a88db9d956f5086bc976deb9951ca2dbbfef41a002df0a7bfb2c845aab" | checksum = "31f373a280958c930e5ee4a1e4db3a0ee0542afaf02d3b5cacb8cab4e298648e" | ||||||
|  | dependencies = [ | ||||||
|  |  "anyhow", | ||||||
|  |  "bincode", | ||||||
|  |  "byteorder", | ||||||
|  |  "csv", | ||||||
|  |  "encoding_rs", | ||||||
|  |  "encoding_rs_io", | ||||||
|  |  "env_logger", | ||||||
|  |  "glob", | ||||||
|  |  "lindera-core", | ||||||
|  |  "lindera-decompress", | ||||||
|  |  "log", | ||||||
|  |  "serde", | ||||||
|  |  "yada", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "lindera-ipadic-neologd-builder" | ||||||
|  | version = "0.25.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "92eff98e9ed1a7a412b91709c2343457a04ef02fa0c27c27e3a5892f5591eae9" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -2364,7 +2316,6 @@ dependencies = [ | |||||||
|  "encoding_rs_io", |  "encoding_rs_io", | ||||||
|  "env_logger", |  "env_logger", | ||||||
|  "glob", |  "glob", | ||||||
|  "lindera-compress", |  | ||||||
|  "lindera-core", |  "lindera-core", | ||||||
|  "lindera-decompress", |  "lindera-decompress", | ||||||
|  "log", |  "log", | ||||||
| @@ -2374,9 +2325,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-ko-dic" | name = "lindera-ko-dic" | ||||||
| version = "0.23.0" | version = "0.25.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "abb479b170a841b8cfbe602d772e30849ffe0562b219190a378368968b8c8f66" | checksum = "74c6d5bf7d8092bd6d10de7a5d74b70ea7cf234586235b0d6cdb903b05a6c9e2" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "bincode", |  "bincode", | ||||||
|  "byteorder", |  "byteorder", | ||||||
| @@ -2391,9 +2342,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-ko-dic-builder" | name = "lindera-ko-dic-builder" | ||||||
| version = "0.23.0" | version = "0.25.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "9b9b58213552560717c48e7833444a20d2d7fe26a6e565f7ce0cbbf85784c7cf" | checksum = "f0a4add6d3c1e41ec9e2690d33e287d0223fb59a30ccee4980c23f31368cae1e" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -2410,10 +2361,42 @@ dependencies = [ | |||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-unidic-builder" | name = "lindera-tokenizer" | ||||||
| version = "0.23.0" | version = "0.25.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "6858147cdaf4a7b564c08a247449d3aca38e9b4812499651af08afbf85324596" | checksum = "cb6a8acbd068019d1cdac7316f0dcb87f8e33ede2b13aa237f45114f9750afb8" | ||||||
|  | dependencies = [ | ||||||
|  |  "bincode", | ||||||
|  |  "byteorder", | ||||||
|  |  "lindera-core", | ||||||
|  |  "lindera-dictionary", | ||||||
|  |  "once_cell", | ||||||
|  |  "serde", | ||||||
|  |  "serde_json", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "lindera-unidic" | ||||||
|  | version = "0.25.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "14abf0613d350b30d3b0406a33b1de8fa8d829f26516909421702174785991c8" | ||||||
|  | dependencies = [ | ||||||
|  |  "bincode", | ||||||
|  |  "byteorder", | ||||||
|  |  "encoding", | ||||||
|  |  "lindera-core", | ||||||
|  |  "lindera-decompress", | ||||||
|  |  "lindera-unidic-builder", | ||||||
|  |  "once_cell", | ||||||
|  |  "ureq", | ||||||
|  |  "zip", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "lindera-unidic-builder" | ||||||
|  | version = "0.25.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "e204ed53d9bd63227d1e6a6c1f122ca039e00a8634ac32e7fb0281eeec8615c4" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -2422,6 +2405,7 @@ dependencies = [ | |||||||
|  "encoding", |  "encoding", | ||||||
|  "env_logger", |  "env_logger", | ||||||
|  "glob", |  "glob", | ||||||
|  |  "lindera-compress", | ||||||
|  "lindera-core", |  "lindera-core", | ||||||
|  "lindera-decompress", |  "lindera-decompress", | ||||||
|  "log", |  "log", | ||||||
| @@ -3427,7 +3411,7 @@ dependencies = [ | |||||||
|  "wasm-bindgen", |  "wasm-bindgen", | ||||||
|  "wasm-bindgen-futures", |  "wasm-bindgen-futures", | ||||||
|  "web-sys", |  "web-sys", | ||||||
|  "webpki-roots", |  "webpki-roots 0.22.6", | ||||||
|  "winreg", |  "winreg", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| @@ -4210,12 +4194,6 @@ version = "0.3.13" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" | checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" | ||||||
|  |  | ||||||
| [[package]] |  | ||||||
| name = "unicode-blocks" |  | ||||||
| version = "0.1.6" |  | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" |  | ||||||
| checksum = "943e3f1f50cc455d072e0801ccb71ff893b0c88060b1169f92e35fb5bb881cc6" |  | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "unicode-ident" | name = "unicode-ident" | ||||||
| version = "1.0.9" | version = "1.0.9" | ||||||
| @@ -4249,6 +4227,21 @@ version = "0.7.1" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" | checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "ureq" | ||||||
|  | version = "2.7.1" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "0b11c96ac7ee530603dcdf68ed1557050f374ce55a5a07193ebf8cbc9f8927e9" | ||||||
|  | dependencies = [ | ||||||
|  |  "base64 0.21.2", | ||||||
|  |  "log", | ||||||
|  |  "once_cell", | ||||||
|  |  "rustls 0.21.1", | ||||||
|  |  "rustls-webpki", | ||||||
|  |  "url", | ||||||
|  |  "webpki-roots 0.23.1", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "url" | name = "url" | ||||||
| version = "2.3.1" | version = "2.3.1" | ||||||
| @@ -4457,6 +4450,15 @@ dependencies = [ | |||||||
|  "webpki", |  "webpki", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "webpki-roots" | ||||||
|  | version = "0.23.1" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "b03058f88386e5ff5310d9111d53f48b17d732b401aeb83a8d5190f2ac459338" | ||||||
|  | dependencies = [ | ||||||
|  |  "rustls-webpki", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "whatlang" | name = "whatlang" | ||||||
| version = "0.16.2" | version = "0.16.2" | ||||||
|   | |||||||
| @@ -727,10 +727,10 @@ fn extract_field( | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| fn format_fields<A: AsRef<[u8]>>( | fn format_fields( | ||||||
|     document: &Document, |     document: &Document, | ||||||
|     field_ids_map: &FieldsIdsMap, |     field_ids_map: &FieldsIdsMap, | ||||||
|     builder: &MatcherBuilder<'_, A>, |     builder: &MatcherBuilder, | ||||||
|     formatted_options: &BTreeMap<FieldId, FormatOptions>, |     formatted_options: &BTreeMap<FieldId, FormatOptions>, | ||||||
|     compute_matches: bool, |     compute_matches: bool, | ||||||
|     displayable_ids: &BTreeSet<FieldId>, |     displayable_ids: &BTreeSet<FieldId>, | ||||||
| @@ -775,9 +775,9 @@ fn format_fields<A: AsRef<[u8]>>( | |||||||
|     Ok((matches_position, document)) |     Ok((matches_position, document)) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn format_value<A: AsRef<[u8]>>( | fn format_value( | ||||||
|     value: Value, |     value: Value, | ||||||
|     builder: &MatcherBuilder<'_, A>, |     builder: &MatcherBuilder, | ||||||
|     format_options: Option<FormatOptions>, |     format_options: Option<FormatOptions>, | ||||||
|     infos: &mut Vec<MatchBounds>, |     infos: &mut Vec<MatchBounds>, | ||||||
|     compute_matches: bool, |     compute_matches: bool, | ||||||
|   | |||||||
| @@ -17,7 +17,7 @@ bincode = "1.3.3" | |||||||
| bstr = "1.4.0" | bstr = "1.4.0" | ||||||
| bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] } | bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] } | ||||||
| byteorder = "1.4.3" | byteorder = "1.4.3" | ||||||
| charabia = { version = "0.7.2", default-features = false } | charabia = { version = "0.8.1", default-features = false } | ||||||
| concat-arrays = "0.1.2" | concat-arrays = "0.1.2" | ||||||
| crossbeam-channel = "0.5.8" | crossbeam-channel = "0.5.8" | ||||||
| deserr = "0.5.0" | deserr = "0.5.0" | ||||||
|   | |||||||
| @@ -256,7 +256,8 @@ pub(crate) mod tests { | |||||||
|         let temp_index = temp_index_with_documents(); |         let temp_index = temp_index_with_documents(); | ||||||
|         let rtxn = temp_index.read_txn().unwrap(); |         let rtxn = temp_index.read_txn().unwrap(); | ||||||
|         let mut ctx = SearchContext::new(&temp_index, &rtxn); |         let mut ctx = SearchContext::new(&temp_index, &rtxn); | ||||||
|         let tokenizer = TokenizerBuilder::new().build(); |         let mut builder = TokenizerBuilder::default(); | ||||||
|  |         let tokenizer = builder.build(); | ||||||
|         let tokens = tokenizer.tokenize("split this world"); |         let tokens = tokenizer.tokenize("split this world"); | ||||||
|         let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); |         let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); | ||||||
|         let matching_words = MatchingWords::new(ctx, query_terms); |         let matching_words = MatchingWords::new(ctx, query_terms); | ||||||
|   | |||||||
| @@ -12,16 +12,16 @@ const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>"; | |||||||
| const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>"; | const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>"; | ||||||
|  |  | ||||||
| /// Structure used to build a Matcher allowing to customize formating tags. | /// Structure used to build a Matcher allowing to customize formating tags. | ||||||
| pub struct MatcherBuilder<'a, A> { | pub struct MatcherBuilder<'m> { | ||||||
|     matching_words: MatchingWords, |     matching_words: MatchingWords, | ||||||
|     tokenizer: Tokenizer<'a, 'a, A>, |     tokenizer: Tokenizer<'m>, | ||||||
|     crop_marker: Option<String>, |     crop_marker: Option<String>, | ||||||
|     highlight_prefix: Option<String>, |     highlight_prefix: Option<String>, | ||||||
|     highlight_suffix: Option<String>, |     highlight_suffix: Option<String>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'a, A> MatcherBuilder<'a, A> { | impl<'m> MatcherBuilder<'m> { | ||||||
|     pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { |     pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'m>) -> Self { | ||||||
|         Self { |         Self { | ||||||
|             matching_words, |             matching_words, | ||||||
|             tokenizer, |             tokenizer, | ||||||
| @@ -46,7 +46,7 @@ impl<'a, A> MatcherBuilder<'a, A> { | |||||||
|         self |         self | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { |     pub fn build<'t>(&'m self, text: &'t str) -> Matcher<'t, 'm> { | ||||||
|         let crop_marker = match &self.crop_marker { |         let crop_marker = match &self.crop_marker { | ||||||
|             Some(marker) => marker.as_str(), |             Some(marker) => marker.as_str(), | ||||||
|             None => DEFAULT_CROP_MARKER, |             None => DEFAULT_CROP_MARKER, | ||||||
| @@ -103,17 +103,17 @@ pub struct MatchBounds { | |||||||
|  |  | ||||||
| /// Structure used to analize a string, compute words that match, | /// Structure used to analize a string, compute words that match, | ||||||
| /// and format the source string, returning a highlighted and cropped sub-string. | /// and format the source string, returning a highlighted and cropped sub-string. | ||||||
| pub struct Matcher<'t, 'm, A> { | pub struct Matcher<'t, 'm> { | ||||||
|     text: &'t str, |     text: &'t str, | ||||||
|     matching_words: &'m MatchingWords, |     matching_words: &'m MatchingWords, | ||||||
|     tokenizer: &'m Tokenizer<'m, 'm, A>, |     tokenizer: &'m Tokenizer<'m>, | ||||||
|     crop_marker: &'m str, |     crop_marker: &'m str, | ||||||
|     highlight_prefix: &'m str, |     highlight_prefix: &'m str, | ||||||
|     highlight_suffix: &'m str, |     highlight_suffix: &'m str, | ||||||
|     matches: Option<(Vec<Token<'t>>, Vec<Match>)>, |     matches: Option<(Vec<Token<'t>>, Vec<Match>)>, | ||||||
| } | } | ||||||
|  |  | ||||||
| impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { | impl<'t> Matcher<'t, '_> { | ||||||
|     /// Iterates over tokens and save any of them that matches the query. |     /// Iterates over tokens and save any of them that matches the query. | ||||||
|     fn compute_matches(&mut self) -> &mut Self { |     fn compute_matches(&mut self) -> &mut Self { | ||||||
|         /// some words are counted as matches only if they are close together and in the good order, |         /// some words are counted as matches only if they are close together and in the good order, | ||||||
| @@ -503,7 +503,7 @@ mod tests { | |||||||
|     use crate::index::tests::TempIndex; |     use crate::index::tests::TempIndex; | ||||||
|     use crate::{execute_search, SearchContext}; |     use crate::{execute_search, SearchContext}; | ||||||
|  |  | ||||||
|     impl<'a> MatcherBuilder<'a, &[u8]> { |     impl<'a> MatcherBuilder<'a> { | ||||||
|         fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self { |         fn new_test(rtxn: &'a heed::RoTxn, index: &'a TempIndex, query: &str) -> Self { | ||||||
|             let mut ctx = SearchContext::new(index, rtxn); |             let mut ctx = SearchContext::new(index, rtxn); | ||||||
|             let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search( |             let crate::search::PartialSearchResult { located_query_terms, .. } = execute_search( | ||||||
| @@ -530,7 +530,7 @@ mod tests { | |||||||
|                 None => MatchingWords::default(), |                 None => MatchingWords::default(), | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
|             MatcherBuilder::new(matching_words, TokenizerBuilder::new().build()) |             MatcherBuilder::new(matching_words, TokenizerBuilder::default().into_tokenizer()) | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -7,7 +7,7 @@ use crate::{Result, SearchContext, MAX_WORD_LENGTH}; | |||||||
| /// Convert the tokenised search query into a list of located query terms. | /// Convert the tokenised search query into a list of located query terms. | ||||||
| pub fn located_query_terms_from_tokens( | pub fn located_query_terms_from_tokens( | ||||||
|     ctx: &mut SearchContext, |     ctx: &mut SearchContext, | ||||||
|     query: NormalizedTokenIter<&[u8]>, |     query: NormalizedTokenIter, | ||||||
|     words_limit: Option<usize>, |     words_limit: Option<usize>, | ||||||
| ) -> Result<Vec<LocatedQueryTerm>> { | ) -> Result<Vec<LocatedQueryTerm>> { | ||||||
|     let nbr_typos = number_of_typos_allowed(ctx)?; |     let nbr_typos = number_of_typos_allowed(ctx)?; | ||||||
| @@ -303,7 +303,8 @@ mod tests { | |||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|     fn start_with_hard_separator() -> Result<()> { |     fn start_with_hard_separator() -> Result<()> { | ||||||
|         let tokenizer = TokenizerBuilder::new().build(); |         let mut builder = TokenizerBuilder::default(); | ||||||
|  |         let tokenizer = builder.build(); | ||||||
|         let tokens = tokenizer.tokenize("."); |         let tokens = tokenizer.tokenize("."); | ||||||
|         let index = temp_index_with_documents(); |         let index = temp_index_with_documents(); | ||||||
|         let rtxn = index.read_txn()?; |         let rtxn = index.read_txn()?; | ||||||
|   | |||||||
| @@ -113,7 +113,7 @@ fn test_ignore_stop_words() { | |||||||
|             ), |             ), | ||||||
|             Position( |             Position( | ||||||
|                 Rank { |                 Rank { | ||||||
|                     rank: 9, |                     rank: 7, | ||||||
|                     max_rank: 11, |                     max_rank: 11, | ||||||
|                 }, |                 }, | ||||||
|             ), |             ), | ||||||
| @@ -166,7 +166,7 @@ fn test_ignore_stop_words() { | |||||||
|             ), |             ), | ||||||
|             Position( |             Position( | ||||||
|                 Rank { |                 Rank { | ||||||
|                     rank: 9, |                     rank: 7, | ||||||
|                     max_rank: 11, |                     max_rank: 11, | ||||||
|                 }, |                 }, | ||||||
|             ), |             ), | ||||||
| @@ -219,7 +219,7 @@ fn test_ignore_stop_words() { | |||||||
|             ), |             ), | ||||||
|             Position( |             Position( | ||||||
|                 Rank { |                 Rank { | ||||||
|                     rank: 9, |                     rank: 7, | ||||||
|                     max_rank: 11, |                     max_rank: 11, | ||||||
|                 }, |                 }, | ||||||
|             ), |             ), | ||||||
| @@ -259,7 +259,7 @@ fn test_ignore_stop_words() { | |||||||
|             ), |             ), | ||||||
|             Proximity( |             Proximity( | ||||||
|                 Rank { |                 Rank { | ||||||
|                     rank: 7, |                     rank: 1, | ||||||
|                     max_rank: 8, |                     max_rank: 8, | ||||||
|                 }, |                 }, | ||||||
|             ), |             ), | ||||||
| @@ -271,7 +271,7 @@ fn test_ignore_stop_words() { | |||||||
|             ), |             ), | ||||||
|             Position( |             Position( | ||||||
|                 Rank { |                 Rank { | ||||||
|                     rank: 17, |                     rank: 15, | ||||||
|                     max_rank: 21, |                     max_rank: 21, | ||||||
|                 }, |                 }, | ||||||
|             ), |             ), | ||||||
| @@ -411,7 +411,7 @@ fn test_stop_words_in_phrase() { | |||||||
|             ), |             ), | ||||||
|             Proximity( |             Proximity( | ||||||
|                 Rank { |                 Rank { | ||||||
|                     rank: 6, |                     rank: 1, | ||||||
|                     max_rank: 8, |                     max_rank: 8, | ||||||
|                 }, |                 }, | ||||||
|             ), |             ), | ||||||
| @@ -423,7 +423,7 @@ fn test_stop_words_in_phrase() { | |||||||
|             ), |             ), | ||||||
|             Position( |             Position( | ||||||
|                 Rank { |                 Rank { | ||||||
|                     rank: 29, |                     rank: 27, | ||||||
|                     max_rank: 31, |                     max_rank: 31, | ||||||
|                 }, |                 }, | ||||||
|             ), |             ), | ||||||
|   | |||||||
| @@ -128,10 +128,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|         .map(|reader| (documents_ids, reader, script_language_docids)) |         .map(|reader| (documents_ids, reader, script_language_docids)) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn extract_tokens_from_document<T: AsRef<[u8]>>( | fn extract_tokens_from_document( | ||||||
|     obkv: &KvReader<FieldId>, |     obkv: &KvReader<FieldId>, | ||||||
|     searchable_fields: &Option<HashSet<FieldId>>, |     searchable_fields: &Option<HashSet<FieldId>>, | ||||||
|     tokenizer: &Tokenizer<T>, |     tokenizer: &Tokenizer, | ||||||
|     max_positions_per_attributes: u32, |     max_positions_per_attributes: u32, | ||||||
|     buffers: &mut Buffers, |     buffers: &mut Buffers, | ||||||
|     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>, |     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>, | ||||||
|   | |||||||
| @@ -1,7 +1,7 @@ | |||||||
| use std::collections::{BTreeSet, HashMap, HashSet}; | use std::collections::{BTreeSet, HashMap, HashSet}; | ||||||
| use std::result::Result as StdResult; | use std::result::Result as StdResult; | ||||||
|  |  | ||||||
| use charabia::{Tokenizer, TokenizerBuilder}; | use charabia::{Normalize, Tokenizer, TokenizerBuilder}; | ||||||
| use deserr::{DeserializeError, Deserr}; | use deserr::{DeserializeError, Deserr}; | ||||||
| use itertools::Itertools; | use itertools::Itertools; | ||||||
| use serde::{Deserialize, Deserializer, Serialize, Serializer}; | use serde::{Deserialize, Deserializer, Serialize, Serializer}; | ||||||
| @@ -413,6 +413,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|         match self.stop_words { |         match self.stop_words { | ||||||
|             Setting::Set(ref stop_words) => { |             Setting::Set(ref stop_words) => { | ||||||
|                 let current = self.index.stop_words(self.wtxn)?; |                 let current = self.index.stop_words(self.wtxn)?; | ||||||
|  |  | ||||||
|  |                 // Apply an unlossy normalization on stop_words | ||||||
|  |                 let stop_words = stop_words | ||||||
|  |                     .iter() | ||||||
|  |                     .map(|w| w.as_str().normalize(&Default::default()).into_owned()); | ||||||
|  |  | ||||||
|                 // since we can't compare a BTreeSet with an FST we are going to convert the |                 // since we can't compare a BTreeSet with an FST we are going to convert the | ||||||
|                 // BTreeSet to an FST and then compare bytes per bytes the two FSTs. |                 // BTreeSet to an FST and then compare bytes per bytes the two FSTs. | ||||||
|                 let fst = fst::Set::from_iter(stop_words)?; |                 let fst = fst::Set::from_iter(stop_words)?; | ||||||
| @@ -436,7 +442,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|     fn update_synonyms(&mut self) -> Result<bool> { |     fn update_synonyms(&mut self) -> Result<bool> { | ||||||
|         match self.synonyms { |         match self.synonyms { | ||||||
|             Setting::Set(ref synonyms) => { |             Setting::Set(ref synonyms) => { | ||||||
|                 fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> { |                 fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> { | ||||||
|                     tokenizer |                     tokenizer | ||||||
|                         .tokenize(text) |                         .tokenize(text) | ||||||
|                         .filter_map(|token| { |                         .filter_map(|token| { | ||||||
| @@ -637,7 +643,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { | |||||||
|     fn update_exact_words(&mut self) -> Result<()> { |     fn update_exact_words(&mut self) -> Result<()> { | ||||||
|         match self.exact_words { |         match self.exact_words { | ||||||
|             Setting::Set(ref mut words) => { |             Setting::Set(ref mut words) => { | ||||||
|                 fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String { |                 fn normalize(tokenizer: &Tokenizer, text: &str) -> String { | ||||||
|                     tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect() |                     tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect() | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user