Mirror of https://github.com/meilisearch/meilisearch.git
Synced 2025-10-31 16:06:31 +00:00
Merge #4621

4621: Bring back changes from v1.8.0 into main r=curquiza a=curquiza

Co-authored-by: ManyTheFish <many@meilisearch.com>
Co-authored-by: Tamo <tamo@meilisearch.com>
Co-authored-by: meili-bors[bot] <89034592+meili-bors[bot]@users.noreply.github.com>
Co-authored-by: Clément Renault <clement@meilisearch.com>
Changed files include Cargo.lock (generated): 247 changed lines.

							| @@ -354,9 +354,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "anyhow" | name = "anyhow" | ||||||
| version = "1.0.80" | version = "1.0.82" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "5ad32ce52e4161730f7098c077cd2ed6229b5804ccf99e5366be1ab72a98b4e1" | checksum = "f538837af36e6f6a9be0faa67f9a314f8119e4e4b5867c6ab40ed60360142519" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "backtrace", |  "backtrace", | ||||||
| ] | ] | ||||||
| @@ -889,9 +889,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "charabia" | name = "charabia" | ||||||
| version = "0.8.8" | version = "0.8.10" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "60dc1a562fc8cb53d552d371758a4ecd76d15cc7489d2b968529cd9cadcbd854" | checksum = "933f20f2269b24d32fd5503e7b3c268af902190daf8d9d2b73ed2e75d77c00b4" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "aho-corasick", |  "aho-corasick", | ||||||
|  "cow-utils", |  "cow-utils", | ||||||
| @@ -901,9 +901,7 @@ dependencies = [ | |||||||
|  "fst", |  "fst", | ||||||
|  "irg-kvariants", |  "irg-kvariants", | ||||||
|  "jieba-rs", |  "jieba-rs", | ||||||
|  "lindera-core", |  "lindera", | ||||||
|  "lindera-dictionary", |  | ||||||
|  "lindera-tokenizer", |  | ||||||
|  "litemap", |  "litemap", | ||||||
|  "once_cell", |  "once_cell", | ||||||
|  "pinyin", |  "pinyin", | ||||||
| @@ -1715,9 +1713,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "env_logger" | name = "env_logger" | ||||||
| version = "0.11.2" | version = "0.11.3" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "6c012a26a7f605efc424dd53697843a72be7dc86ad2d01f7814337794a12231d" | checksum = "38b35839ba51819680ba087cd351788c9a3c476841207e0b8cee0b04722343b9" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anstream", |  "anstream", | ||||||
|  "anstyle", |  "anstyle", | ||||||
| @@ -2661,6 +2659,15 @@ dependencies = [ | |||||||
|  "simple_asn1", |  "simple_asn1", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "kanaria" | ||||||
|  | version = "0.2.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "c0f9d9652540055ac4fded998a73aca97d965899077ab1212587437da44196ff" | ||||||
|  | dependencies = [ | ||||||
|  |  "bitflags 1.3.2", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "kstring" | name = "kstring" | ||||||
| version = "2.0.0" | version = "2.0.0" | ||||||
| @@ -2766,10 +2773,67 @@ dependencies = [ | |||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-cc-cedict-builder" | name = "lindera" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "ca21f2ee3ca40e7f3ebbd568d041be1531c2c28dbf540e737aeba934ab53f330" | checksum = "a1bbf252ea3490053dc397539ece0b510924f2f72605fa28d3e858d86f43ec88" | ||||||
|  | dependencies = [ | ||||||
|  |  "lindera-analyzer", | ||||||
|  |  "lindera-core", | ||||||
|  |  "lindera-dictionary", | ||||||
|  |  "lindera-filter", | ||||||
|  |  "lindera-tokenizer", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "lindera-analyzer" | ||||||
|  | version = "0.30.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "87febfec0e2859ce2154fb90dd6f66b774ddb0b6e264b44f8e3d1303c9dcedd7" | ||||||
|  | dependencies = [ | ||||||
|  |  "anyhow", | ||||||
|  |  "bincode", | ||||||
|  |  "byteorder", | ||||||
|  |  "encoding", | ||||||
|  |  "kanaria", | ||||||
|  |  "lindera-cc-cedict-builder", | ||||||
|  |  "lindera-core", | ||||||
|  |  "lindera-dictionary", | ||||||
|  |  "lindera-filter", | ||||||
|  |  "lindera-ipadic-builder", | ||||||
|  |  "lindera-ko-dic-builder", | ||||||
|  |  "lindera-tokenizer", | ||||||
|  |  "lindera-unidic-builder", | ||||||
|  |  "once_cell", | ||||||
|  |  "regex", | ||||||
|  |  "serde", | ||||||
|  |  "serde_json", | ||||||
|  |  "thiserror", | ||||||
|  |  "unicode-blocks", | ||||||
|  |  "unicode-normalization", | ||||||
|  |  "unicode-segmentation", | ||||||
|  |  "yada", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "lindera-cc-cedict" | ||||||
|  | version = "0.30.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "fcb91bb8a93ab0f95dbc3c43b5105354bb059134ef731154f75a64b5d919e71d" | ||||||
|  | dependencies = [ | ||||||
|  |  "bincode", | ||||||
|  |  "byteorder", | ||||||
|  |  "lindera-cc-cedict-builder", | ||||||
|  |  "lindera-core", | ||||||
|  |  "lindera-decompress", | ||||||
|  |  "once_cell", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "lindera-cc-cedict-builder" | ||||||
|  | version = "0.30.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "f6022a8309a287dbef425fd09a61585351670c83001d74f6c089979e2330b683" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -2778,6 +2842,7 @@ dependencies = [ | |||||||
|  "encoding", |  "encoding", | ||||||
|  "env_logger", |  "env_logger", | ||||||
|  "glob", |  "glob", | ||||||
|  |  "lindera-compress", | ||||||
|  "lindera-core", |  "lindera-core", | ||||||
|  "lindera-decompress", |  "lindera-decompress", | ||||||
|  "log", |  "log", | ||||||
| @@ -2786,9 +2851,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-compress" | name = "lindera-compress" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "34da125091f3b3a49351f418484a16cb2a23f6888cd53fe219edad19d263da5d" | checksum = "32363cbcf433f915e7d77c2a0c410db2d6b23442e80715cf2cf6b9864078a500" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "flate2", |  "flate2", | ||||||
| @@ -2797,9 +2862,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-core" | name = "lindera-core" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "09d4b717a8a31b73a3cbd3552e0abda14e0c85d97dc8b911035342533defdbad" | checksum = "d9a0e858753a02b1a3524fae4fbb11ca4b3a947128fd7854b797386562678be8" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -2814,9 +2879,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-decompress" | name = "lindera-decompress" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "98f4476c99cb4ffa54fbfc42953adf69ada7276cfbb594bce9829547de012058" | checksum = "0e406345f6f8b665b9a129c67079c18ca9d97e9d171d102b4106a64a592c285e" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "flate2", |  "flate2", | ||||||
| @@ -2825,29 +2890,73 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-dictionary" | name = "lindera-dictionary" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "a45b92f0ce331c2202c6cec3135e4bfce29525ab3bb97a613c27c8e0a29fa967" | checksum = "3e2a3ec0e5fd6768a27c6ec1040e8470d3a5926418f7afe065859e98aabb3bfe" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
|  "byteorder", |  "byteorder", | ||||||
|  |  "lindera-cc-cedict", | ||||||
|  "lindera-cc-cedict-builder", |  "lindera-cc-cedict-builder", | ||||||
|  "lindera-core", |  "lindera-core", | ||||||
|  |  "lindera-ipadic", | ||||||
|  "lindera-ipadic-builder", |  "lindera-ipadic-builder", | ||||||
|  |  "lindera-ipadic-neologd", | ||||||
|  "lindera-ipadic-neologd-builder", |  "lindera-ipadic-neologd-builder", | ||||||
|  "lindera-ko-dic", |  "lindera-ko-dic", | ||||||
|  "lindera-ko-dic-builder", |  "lindera-ko-dic-builder", | ||||||
|  "lindera-unidic", |  "lindera-unidic", | ||||||
|  "lindera-unidic-builder", |  "lindera-unidic-builder", | ||||||
|  "serde", |  "serde", | ||||||
|  |  "strum", | ||||||
|  |  "strum_macros", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "lindera-filter" | ||||||
|  | version = "0.30.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "1badaf51bad051185ea4917ba91bbbf2d6f8167e155647e21e0eaaef0982a95d" | ||||||
|  | dependencies = [ | ||||||
|  |  "anyhow", | ||||||
|  |  "csv", | ||||||
|  |  "kanaria", | ||||||
|  |  "lindera-cc-cedict-builder", | ||||||
|  |  "lindera-core", | ||||||
|  |  "lindera-dictionary", | ||||||
|  |  "lindera-ipadic-builder", | ||||||
|  |  "lindera-ko-dic-builder", | ||||||
|  |  "lindera-unidic-builder", | ||||||
|  |  "once_cell", | ||||||
|  |  "regex", | ||||||
|  |  "serde", | ||||||
|  |  "serde_json", | ||||||
|  |  "unicode-blocks", | ||||||
|  |  "unicode-normalization", | ||||||
|  |  "unicode-segmentation", | ||||||
|  |  "yada", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "lindera-ipadic" | ||||||
|  | version = "0.30.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "129ec16366354998f9791467ad38731539197747f649e573ead845358271ce25" | ||||||
|  | dependencies = [ | ||||||
|  |  "bincode", | ||||||
|  |  "byteorder", | ||||||
|  |  "lindera-core", | ||||||
|  |  "lindera-decompress", | ||||||
|  |  "lindera-ipadic-builder", | ||||||
|  |  "once_cell", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-ipadic-builder" | name = "lindera-ipadic-builder" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "642dee52201852df209cb43423ff1ca4d161a329f5cdba049a7b5820118345f2" | checksum = "7f0979a56bc57e9c9be2996dff232c47aa146a2e7baebf5dd567e388eba3dd90" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -2857,6 +2966,7 @@ dependencies = [ | |||||||
|  "encoding_rs_io", |  "encoding_rs_io", | ||||||
|  "env_logger", |  "env_logger", | ||||||
|  "glob", |  "glob", | ||||||
|  |  "lindera-compress", | ||||||
|  "lindera-core", |  "lindera-core", | ||||||
|  "lindera-decompress", |  "lindera-decompress", | ||||||
|  "log", |  "log", | ||||||
| @@ -2865,10 +2975,24 @@ dependencies = [ | |||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-ipadic-neologd-builder" | name = "lindera-ipadic-neologd" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "325144b154e68159373e944d1cd7f67c6ff9965a2af41240a8e41732b3fdb3af" | checksum = "20076660c4e79ef0316735b44e18ec7644e54786acdee8946c972d5f97086d0f" | ||||||
|  | dependencies = [ | ||||||
|  |  "bincode", | ||||||
|  |  "byteorder", | ||||||
|  |  "lindera-core", | ||||||
|  |  "lindera-decompress", | ||||||
|  |  "lindera-ipadic-neologd-builder", | ||||||
|  |  "once_cell", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "lindera-ipadic-neologd-builder" | ||||||
|  | version = "0.30.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "eccd18ed5f65d1d64ac0cbfa1d6827bfbbaf6530520ae6847e6a91ee38f47e20" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -2878,6 +3002,7 @@ dependencies = [ | |||||||
|  "encoding_rs_io", |  "encoding_rs_io", | ||||||
|  "env_logger", |  "env_logger", | ||||||
|  "glob", |  "glob", | ||||||
|  |  "lindera-compress", | ||||||
|  "lindera-core", |  "lindera-core", | ||||||
|  "lindera-decompress", |  "lindera-decompress", | ||||||
|  "log", |  "log", | ||||||
| @@ -2887,9 +3012,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-ko-dic" | name = "lindera-ko-dic" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "b484a2f9964e7424264fda304beb6ff6ad883c347accfe1115e777dedef3661d" | checksum = "59073171566c3e498ca048e84c2d0a7e117a42f36c8eb7d7163e65ac38bd6d48" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "bincode", |  "bincode", | ||||||
|  "byteorder", |  "byteorder", | ||||||
| @@ -2900,13 +3025,14 @@ dependencies = [ | |||||||
|  "lindera-ko-dic-builder", |  "lindera-ko-dic-builder", | ||||||
|  "once_cell", |  "once_cell", | ||||||
|  "tar", |  "tar", | ||||||
|  |  "ureq", | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-ko-dic-builder" | name = "lindera-ko-dic-builder" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "b9413d4d9bf7af921f5ac64414a290c7ba81695e8ba08dd2f6c950b57c281a69" | checksum = "ae176afa8535ca2a5ee9471873f85d531db0a6c32a3c42b41084506aac22b577" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -2924,9 +3050,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-tokenizer" | name = "lindera-tokenizer" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "9987c818462d51ca67e131e40f0386e25e8c557e195059b1257f95731561185d" | checksum = "457285bdde84571aa510c9e05371904305a55e8a541fa1473d4393062f06932d" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "bincode", |  "bincode", | ||||||
|  "lindera-core", |  "lindera-core", | ||||||
| @@ -2938,26 +3064,27 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-unidic" | name = "lindera-unidic" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "0c379cf436b2627cd7d3498642e491eadbff9b3e01231c516ce9f9b1893ab7c3" | checksum = "5839980be552dfa639b70964c61914a9ad014148663679b0e148aa72e5e30f23" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "bincode", |  "bincode", | ||||||
|  "byteorder", |  "byteorder", | ||||||
|  "encoding", |  "encoding", | ||||||
|  |  "flate2", | ||||||
|  "lindera-core", |  "lindera-core", | ||||||
|  "lindera-decompress", |  "lindera-decompress", | ||||||
|  "lindera-unidic-builder", |  "lindera-unidic-builder", | ||||||
|  "once_cell", |  "once_cell", | ||||||
|  |  "tar", | ||||||
|  "ureq", |  "ureq", | ||||||
|  "zip", |  | ||||||
| ] | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "lindera-unidic-builder" | name = "lindera-unidic-builder" | ||||||
| version = "0.28.0" | version = "0.30.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "601ec33b5174141396a7a4ca066278863840221fec32d0be19091e7fae91ed94" | checksum = "dcaab8f061d5b944b1e424f49c7efbf8f276e8a72e4f4ff956d01e46d481f008" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "anyhow", |  "anyhow", | ||||||
|  "bincode", |  "bincode", | ||||||
| @@ -4214,9 +4341,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "regex" | name = "regex" | ||||||
| version = "1.10.2" | version = "1.10.4" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "380b951a9c5e80ddfd6136919eef32310721aa4aacd4889a8d39124b026ab343" | checksum = "c117dbdfde9c8308975b6a18d71f3f385c89461f7b3fb054288ecf2a2058ba4c" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "aho-corasick", |  "aho-corasick", | ||||||
|  "memchr", |  "memchr", | ||||||
| @@ -4226,9 +4353,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "regex-automata" | name = "regex-automata" | ||||||
| version = "0.4.3" | version = "0.4.6" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" | checksum = "86b83b8b9847f9bf95ef68afb0b8e6cdb80f498442f5179a29fad448fcc1eaea" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "aho-corasick", |  "aho-corasick", | ||||||
|  "memchr", |  "memchr", | ||||||
| @@ -4795,6 +4922,28 @@ version = "0.10.0" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "strum" | ||||||
|  | version = "0.26.2" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29" | ||||||
|  | dependencies = [ | ||||||
|  |  "strum_macros", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "strum_macros" | ||||||
|  | version = "0.26.2" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946" | ||||||
|  | dependencies = [ | ||||||
|  |  "heck", | ||||||
|  |  "proc-macro2", | ||||||
|  |  "quote", | ||||||
|  |  "rustversion", | ||||||
|  |  "syn 2.0.58", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "subtle" | name = "subtle" | ||||||
| version = "2.5.0" | version = "2.5.0" | ||||||
| @@ -5324,6 +5473,12 @@ version = "0.3.13" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" | checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "unicode-blocks" | ||||||
|  | version = "0.1.9" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "6b12e05d9e06373163a9bb6bb8c263c261b396643a99445fe6b9811fd376581b" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "unicode-ident" | name = "unicode-ident" | ||||||
| version = "1.0.12" | version = "1.0.12" | ||||||
| @@ -5332,9 +5487,9 @@ checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "unicode-normalization" | name = "unicode-normalization" | ||||||
| version = "0.1.22" | version = "0.1.23" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "5c5713f0fc4b5db668a2ac63cdb7bb4469d8c9fed047b1d0292cc7b0ce2ba921" | checksum = "a56d1686db2308d901306f92a263857ef59ea39678a5458e7cb17f01415101f5" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "tinyvec", |  "tinyvec", | ||||||
| ] | ] | ||||||
| @@ -5350,9 +5505,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "unicode-segmentation" | name = "unicode-segmentation" | ||||||
| version = "1.10.1" | version = "1.11.0" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" | checksum = "d4c87d22b6e3f4a18d4d40ef354e97c90fcb14dd91d7dc0aa9d8a1172ebf7202" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "unicode-width" | name = "unicode-width" | ||||||
| @@ -5942,9 +6097,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "yada" | name = "yada" | ||||||
| version = "0.5.0" | version = "0.5.1" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "b6d12cb7a57bbf2ab670ed9545bae3648048547f9039279a89ce000208e585c1" | checksum = "aed111bd9e48a802518765906cbdadf0b45afb72b9c81ab049a3b86252adffdd" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "yaml-rust" | name = "yaml-rust" | ||||||
|   | |||||||
| @@ -256,8 +256,8 @@ pub(crate) mod test { | |||||||
|  |  | ||||||
|     pub fn create_test_settings() -> Settings<Checked> { |     pub fn create_test_settings() -> Settings<Checked> { | ||||||
|         let settings = Settings { |         let settings = Settings { | ||||||
|             displayed_attributes: Setting::Set(vec![S("race"), S("name")]), |             displayed_attributes: Setting::Set(vec![S("race"), S("name")]).into(), | ||||||
|             searchable_attributes: Setting::Set(vec![S("name"), S("race")]), |             searchable_attributes: Setting::Set(vec![S("name"), S("race")]).into(), | ||||||
|             filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }), |             filterable_attributes: Setting::Set(btreeset! { S("race"), S("age") }), | ||||||
|             sortable_attributes: Setting::Set(btreeset! { S("age") }), |             sortable_attributes: Setting::Set(btreeset! { S("age") }), | ||||||
|             ranking_rules: Setting::NotSet, |             ranking_rules: Setting::NotSet, | ||||||
|   | |||||||
| @@ -315,8 +315,8 @@ impl From<v5::ResponseError> for v6::ResponseError { | |||||||
| impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> { | impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> { | ||||||
|     fn from(settings: v5::Settings<T>) -> Self { |     fn from(settings: v5::Settings<T>) -> Self { | ||||||
|         v6::Settings { |         v6::Settings { | ||||||
|             displayed_attributes: settings.displayed_attributes.into(), |             displayed_attributes: v6::Setting::from(settings.displayed_attributes).into(), | ||||||
|             searchable_attributes: settings.searchable_attributes.into(), |             searchable_attributes: v6::Setting::from(settings.searchable_attributes).into(), | ||||||
|             filterable_attributes: settings.filterable_attributes.into(), |             filterable_attributes: settings.filterable_attributes.into(), | ||||||
|             sortable_attributes: settings.sortable_attributes.into(), |             sortable_attributes: settings.sortable_attributes.into(), | ||||||
|             ranking_rules: { |             ranking_rules: { | ||||||
|   | |||||||
| @@ -3041,6 +3041,7 @@ mod tests { | |||||||
|             source: Setting::Set(milli::vector::settings::EmbedderSource::Rest), |             source: Setting::Set(milli::vector::settings::EmbedderSource::Rest), | ||||||
|             api_key: Setting::Set(S("My super secret")), |             api_key: Setting::Set(S("My super secret")), | ||||||
|             url: Setting::Set(S("http://localhost:7777")), |             url: Setting::Set(S("http://localhost:7777")), | ||||||
|  |             dimensions: Setting::Set(4), | ||||||
|             ..Default::default() |             ..Default::default() | ||||||
|         }; |         }; | ||||||
|         embedders.insert(S("default"), Setting::Set(embedding_settings)); |         embedders.insert(S("default"), Setting::Set(embedding_settings)); | ||||||
|   | |||||||
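For context, a hedged sketch of the embedder configuration this test now exercises, expressed as the settings payload it corresponds to. The field names (`source`, `apiKey`, `dimensions`, `url`) and the `embedders`/`default` keys are taken from the test and the snapshots around this hunk; the use of `serde_json` is an assumption of the example, not part of the change.

```rust
// Sketch only: a REST-embedder settings object including the `dimensions`
// field exercised by the test above. Requires the `serde_json` crate.
use serde_json::json;

fn main() {
    let settings = json!({
        "embedders": {
            "default": {
                "source": "rest",
                "apiKey": "My super secret",
                "dimensions": 4,
                "url": "http://localhost:7777"
            }
        }
    });
    println!("{}", serde_json::to_string_pretty(&settings).unwrap());
}
```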
| @@ -7,6 +7,7 @@ expression: task.details | |||||||
|     "default": { |     "default": { | ||||||
|       "source": "rest", |       "source": "rest", | ||||||
|       "apiKey": "MyXXXX...", |       "apiKey": "MyXXXX...", | ||||||
|  |       "dimensions": 4, | ||||||
|       "url": "http://localhost:7777" |       "url": "http://localhost:7777" | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ expression: embedding_config.embedder_options | |||||||
|   "Rest": { |   "Rest": { | ||||||
|     "api_key": "My super secret", |     "api_key": "My super secret", | ||||||
|     "distribution": null, |     "distribution": null, | ||||||
|     "dimensions": null, |     "dimensions": 4, | ||||||
|     "url": "http://localhost:7777", |     "url": "http://localhost:7777", | ||||||
|     "query": null, |     "query": null, | ||||||
|     "input_field": [ |     "input_field": [ | ||||||
|   | |||||||
| @@ -7,6 +7,7 @@ expression: task.details | |||||||
|     "default": { |     "default": { | ||||||
|       "source": "rest", |       "source": "rest", | ||||||
|       "apiKey": "MyXXXX...", |       "apiKey": "MyXXXX...", | ||||||
|  |       "dimensions": 4, | ||||||
|       "url": "http://localhost:7777" |       "url": "http://localhost:7777" | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs | |||||||
| [] | [] | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### All Tasks: | ### All Tasks: | ||||||
| 0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | 0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, 
distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### Status: | ### Status: | ||||||
| enqueued [0,] | enqueued [0,] | ||||||
|   | |||||||
| @@ -6,7 +6,7 @@ source: index-scheduler/src/lib.rs | |||||||
| [] | [] | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### All Tasks: | ### All Tasks: | ||||||
| 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: NotSet, searchable_attributes: NotSet, filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: NotSet, document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | 0 {uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, api_key: Set("My super secret"), dimensions: Set(4), document_template: NotSet, url: Set("http://localhost:7777"), query: NotSet, input_field: NotSet, path_to_embeddings: NotSet, embedding_object: NotSet, input_type: NotSet, 
distribution: NotSet })}), search_cutoff_ms: NotSet, _kind: PhantomData<meilisearch_types::settings::Unchecked> }, is_deletion: false, allow_index_creation: true }} | ||||||
| ---------------------------------------------------------------------- | ---------------------------------------------------------------------- | ||||||
| ### Status: | ### Status: | ||||||
| enqueued [] | enqueued [] | ||||||
|   | |||||||
| @@ -44,6 +44,7 @@ all-tokenizations = ["milli/all-tokenizations"] | |||||||
|  |  | ||||||
| # chinese specialized tokenization | # chinese specialized tokenization | ||||||
| chinese = ["milli/chinese"] | chinese = ["milli/chinese"] | ||||||
|  | chinese-pinyin = ["milli/chinese-pinyin"] | ||||||
| # hebrew specialized tokenization | # hebrew specialized tokenization | ||||||
| hebrew = ["milli/hebrew"] | hebrew = ["milli/hebrew"] | ||||||
| # japanese specialized tokenization | # japanese specialized tokenization | ||||||
| @@ -56,3 +57,5 @@ greek = ["milli/greek"] | |||||||
| khmer = ["milli/khmer"] | khmer = ["milli/khmer"] | ||||||
| # allow vietnamese specialized tokenization | # allow vietnamese specialized tokenization | ||||||
| vietnamese = ["milli/vietnamese"] | vietnamese = ["milli/vietnamese"] | ||||||
|  | # force swedish character recomposition | ||||||
|  | swedish-recomposition = ["milli/swedish-recomposition"] | ||||||
|   | |||||||
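The two new features above (`chinese-pinyin`, `swedish-recomposition`) simply forward to the matching `milli` features. As a hedged illustration of how such a Cargo feature is typically consumed in code, here is a hypothetical snippet: only the feature name is taken from the diff, and it assumes the feature is declared in the consuming crate's Cargo.toml.

```rust
// Hypothetical example: compile-time gating on the `swedish-recomposition`
// feature declared above. Nothing here is Meilisearch code.
#[cfg(feature = "swedish-recomposition")]
fn swedish_recomposition_enabled() -> bool {
    true
}

#[cfg(not(feature = "swedish-recomposition"))]
fn swedish_recomposition_enabled() -> bool {
    false
}

fn main() {
    println!("swedish-recomposition: {}", swedish_recomposition_enabled());
}
```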
| @@ -3,7 +3,7 @@ use std::convert::Infallible; | |||||||
| use std::fmt; | use std::fmt; | ||||||
| use std::marker::PhantomData; | use std::marker::PhantomData; | ||||||
| use std::num::NonZeroUsize; | use std::num::NonZeroUsize; | ||||||
| use std::ops::ControlFlow; | use std::ops::{ControlFlow, Deref}; | ||||||
| use std::str::FromStr; | use std::str::FromStr; | ||||||
|  |  | ||||||
| use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; | use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; | ||||||
| @@ -143,21 +143,13 @@ impl MergeWithError<milli::CriterionError> for DeserrJsonError<InvalidSettingsRa | |||||||
| )] | )] | ||||||
| #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] | #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] | ||||||
| pub struct Settings<T> { | pub struct Settings<T> { | ||||||
|     #[serde( |     #[serde(default, skip_serializing_if = "Setting::is_not_set")] | ||||||
|         default, |  | ||||||
|         serialize_with = "serialize_with_wildcard", |  | ||||||
|         skip_serializing_if = "Setting::is_not_set" |  | ||||||
|     )] |  | ||||||
|     #[deserr(default, error = DeserrJsonError<InvalidSettingsDisplayedAttributes>)] |     #[deserr(default, error = DeserrJsonError<InvalidSettingsDisplayedAttributes>)] | ||||||
|     pub displayed_attributes: Setting<Vec<String>>, |     pub displayed_attributes: WildcardSetting, | ||||||
|  |  | ||||||
|     #[serde( |     #[serde(default, skip_serializing_if = "Setting::is_not_set")] | ||||||
|         default, |  | ||||||
|         serialize_with = "serialize_with_wildcard", |  | ||||||
|         skip_serializing_if = "Setting::is_not_set" |  | ||||||
|     )] |  | ||||||
|     #[deserr(default, error = DeserrJsonError<InvalidSettingsSearchableAttributes>)] |     #[deserr(default, error = DeserrJsonError<InvalidSettingsSearchableAttributes>)] | ||||||
|     pub searchable_attributes: Setting<Vec<String>>, |     pub searchable_attributes: WildcardSetting, | ||||||
|  |  | ||||||
|     #[serde(default, skip_serializing_if = "Setting::is_not_set")] |     #[serde(default, skip_serializing_if = "Setting::is_not_set")] | ||||||
|     #[deserr(default, error = DeserrJsonError<InvalidSettingsFilterableAttributes>)] |     #[deserr(default, error = DeserrJsonError<InvalidSettingsFilterableAttributes>)] | ||||||
| @@ -251,8 +243,8 @@ impl<T> Settings<T> { | |||||||
| impl Settings<Checked> { | impl Settings<Checked> { | ||||||
|     pub fn cleared() -> Settings<Checked> { |     pub fn cleared() -> Settings<Checked> { | ||||||
|         Settings { |         Settings { | ||||||
|             displayed_attributes: Setting::Reset, |             displayed_attributes: Setting::Reset.into(), | ||||||
|             searchable_attributes: Setting::Reset, |             searchable_attributes: Setting::Reset.into(), | ||||||
|             filterable_attributes: Setting::Reset, |             filterable_attributes: Setting::Reset, | ||||||
|             sortable_attributes: Setting::Reset, |             sortable_attributes: Setting::Reset, | ||||||
|             ranking_rules: Setting::Reset, |             ranking_rules: Setting::Reset, | ||||||
| @@ -319,7 +311,7 @@ impl Settings<Checked> { | |||||||
|  |  | ||||||
| impl Settings<Unchecked> { | impl Settings<Unchecked> { | ||||||
|     pub fn check(self) -> Settings<Checked> { |     pub fn check(self) -> Settings<Checked> { | ||||||
|         let displayed_attributes = match self.displayed_attributes { |         let displayed_attributes = match self.displayed_attributes.0 { | ||||||
|             Setting::Set(fields) => { |             Setting::Set(fields) => { | ||||||
|                 if fields.iter().any(|f| f == "*") { |                 if fields.iter().any(|f| f == "*") { | ||||||
|                     Setting::Reset |                     Setting::Reset | ||||||
| @@ -330,7 +322,7 @@ impl Settings<Unchecked> { | |||||||
|             otherwise => otherwise, |             otherwise => otherwise, | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let searchable_attributes = match self.searchable_attributes { |         let searchable_attributes = match self.searchable_attributes.0 { | ||||||
|             Setting::Set(fields) => { |             Setting::Set(fields) => { | ||||||
|                 if fields.iter().any(|f| f == "*") { |                 if fields.iter().any(|f| f == "*") { | ||||||
|                     Setting::Reset |                     Setting::Reset | ||||||
| @@ -342,8 +334,8 @@ impl Settings<Unchecked> { | |||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         Settings { |         Settings { | ||||||
|             displayed_attributes, |             displayed_attributes: displayed_attributes.into(), | ||||||
|             searchable_attributes, |             searchable_attributes: searchable_attributes.into(), | ||||||
|             filterable_attributes: self.filterable_attributes, |             filterable_attributes: self.filterable_attributes, | ||||||
|             sortable_attributes: self.sortable_attributes, |             sortable_attributes: self.sortable_attributes, | ||||||
|             ranking_rules: self.ranking_rules, |             ranking_rules: self.ranking_rules, | ||||||
| @@ -412,13 +404,13 @@ pub fn apply_settings_to_builder( | |||||||
|         _kind, |         _kind, | ||||||
|     } = settings; |     } = settings; | ||||||
|  |  | ||||||
|     match searchable_attributes { |     match searchable_attributes.deref() { | ||||||
|         Setting::Set(ref names) => builder.set_searchable_fields(names.clone()), |         Setting::Set(ref names) => builder.set_searchable_fields(names.clone()), | ||||||
|         Setting::Reset => builder.reset_searchable_fields(), |         Setting::Reset => builder.reset_searchable_fields(), | ||||||
|         Setting::NotSet => (), |         Setting::NotSet => (), | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     match displayed_attributes { |     match displayed_attributes.deref() { | ||||||
|         Setting::Set(ref names) => builder.set_displayed_fields(names.clone()), |         Setting::Set(ref names) => builder.set_displayed_fields(names.clone()), | ||||||
|         Setting::Reset => builder.reset_displayed_fields(), |         Setting::Reset => builder.reset_displayed_fields(), | ||||||
|         Setting::NotSet => (), |         Setting::NotSet => (), | ||||||
| @@ -690,11 +682,13 @@ pub fn settings( | |||||||
|         displayed_attributes: match displayed_attributes { |         displayed_attributes: match displayed_attributes { | ||||||
|             Some(attrs) => Setting::Set(attrs), |             Some(attrs) => Setting::Set(attrs), | ||||||
|             None => Setting::Reset, |             None => Setting::Reset, | ||||||
|         }, |         } | ||||||
|  |         .into(), | ||||||
|         searchable_attributes: match searchable_attributes { |         searchable_attributes: match searchable_attributes { | ||||||
|             Some(attrs) => Setting::Set(attrs), |             Some(attrs) => Setting::Set(attrs), | ||||||
|             None => Setting::Reset, |             None => Setting::Reset, | ||||||
|         }, |         } | ||||||
|  |         .into(), | ||||||
|         filterable_attributes: Setting::Set(filterable_attributes), |         filterable_attributes: Setting::Set(filterable_attributes), | ||||||
|         sortable_attributes: Setting::Set(sortable_attributes), |         sortable_attributes: Setting::Set(sortable_attributes), | ||||||
|         ranking_rules: Setting::Set(criteria.iter().map(|c| c.clone().into()).collect()), |         ranking_rules: Setting::Set(criteria.iter().map(|c| c.clone().into()).collect()), | ||||||
| @@ -848,6 +842,41 @@ impl From<ProximityPrecisionView> for ProximityPrecision { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, Clone, Default, Deserialize, PartialEq, Eq)] | ||||||
|  | pub struct WildcardSetting(Setting<Vec<String>>); | ||||||
|  |  | ||||||
|  | impl From<Setting<Vec<String>>> for WildcardSetting { | ||||||
|  |     fn from(setting: Setting<Vec<String>>) -> Self { | ||||||
|  |         Self(setting) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl Serialize for WildcardSetting { | ||||||
|  |     fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> | ||||||
|  |     where | ||||||
|  |         S: Serializer, | ||||||
|  |     { | ||||||
|  |         serialize_with_wildcard(&self.0, serializer) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<E: deserr::DeserializeError> Deserr<E> for WildcardSetting { | ||||||
|  |     fn deserialize_from_value<V: deserr::IntoValue>( | ||||||
|  |         value: deserr::Value<V>, | ||||||
|  |         location: ValuePointerRef<'_>, | ||||||
|  |     ) -> Result<Self, E> { | ||||||
|  |         Ok(Self(Setting::deserialize_from_value(value, location)?)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl std::ops::Deref for WildcardSetting { | ||||||
|  |     type Target = Setting<Vec<String>>; | ||||||
|  |  | ||||||
|  |     fn deref(&self) -> &Self::Target { | ||||||
|  |         &self.0 | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| pub(crate) mod test { | pub(crate) mod test { | ||||||
|     use super::*; |     use super::*; | ||||||
| @@ -856,8 +885,8 @@ pub(crate) mod test { | |||||||
|     fn test_setting_check() { |     fn test_setting_check() { | ||||||
|         // test no changes |         // test no changes | ||||||
|         let settings = Settings { |         let settings = Settings { | ||||||
|             displayed_attributes: Setting::Set(vec![String::from("hello")]), |             displayed_attributes: Setting::Set(vec![String::from("hello")]).into(), | ||||||
|             searchable_attributes: Setting::Set(vec![String::from("hello")]), |             searchable_attributes: Setting::Set(vec![String::from("hello")]).into(), | ||||||
|             filterable_attributes: Setting::NotSet, |             filterable_attributes: Setting::NotSet, | ||||||
|             sortable_attributes: Setting::NotSet, |             sortable_attributes: Setting::NotSet, | ||||||
|             ranking_rules: Setting::NotSet, |             ranking_rules: Setting::NotSet, | ||||||
| @@ -883,8 +912,9 @@ pub(crate) mod test { | |||||||
|         // test wildcard |         // test wildcard | ||||||
|         // test no changes |         // test no changes | ||||||
|         let settings = Settings { |         let settings = Settings { | ||||||
|             displayed_attributes: Setting::Set(vec![String::from("*")]), |             displayed_attributes: Setting::Set(vec![String::from("*")]).into(), | ||||||
|             searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]), |             searchable_attributes: Setting::Set(vec![String::from("hello"), String::from("*")]) | ||||||
|  |                 .into(), | ||||||
|             filterable_attributes: Setting::NotSet, |             filterable_attributes: Setting::NotSet, | ||||||
|             sortable_attributes: Setting::NotSet, |             sortable_attributes: Setting::NotSet, | ||||||
|             ranking_rules: Setting::NotSet, |             ranking_rules: Setting::NotSet, | ||||||
| @@ -904,7 +934,7 @@ pub(crate) mod test { | |||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let checked = settings.check(); |         let checked = settings.check(); | ||||||
|         assert_eq!(checked.displayed_attributes, Setting::Reset); |         assert_eq!(checked.displayed_attributes, Setting::Reset.into()); | ||||||
|         assert_eq!(checked.searchable_attributes, Setting::Reset); |         assert_eq!(checked.searchable_attributes, Setting::Reset.into()); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
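The settings change above centers on the new `WildcardSetting` newtype. Below is a minimal, self-contained sketch of that pattern with simplified stand-in types: it keeps only the `From` and `Deref` plumbing the call sites rely on and omits the serde/deserr implementations, so it is illustrative rather than Meilisearch's actual code.

```rust
use std::ops::Deref;

// Simplified stand-in for the real `Setting<T>`: a value can be set, reset, or untouched.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
enum Setting<T> {
    Set(T),
    Reset,
    #[default]
    NotSet,
}

// Newtype wrapper: same data as `Setting<Vec<String>>`, but free to carry its own
// serialization behaviour (wildcard-aware in the real code).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
struct WildcardSetting(Setting<Vec<String>>);

impl From<Setting<Vec<String>>> for WildcardSetting {
    fn from(setting: Setting<Vec<String>>) -> Self {
        Self(setting)
    }
}

impl Deref for WildcardSetting {
    type Target = Setting<Vec<String>>;
    fn deref(&self) -> &Self::Target {
        &self.0
    }
}

fn main() {
    // Call sites keep constructing plain `Setting` values and convert with `.into()`,
    // as the updated tests in the diff do.
    let displayed: WildcardSetting = Setting::Set(vec!["*".to_string()]).into();

    // Reads go through `Deref`, so matching on the inner `Setting` still works.
    match displayed.deref() {
        Setting::Set(fields) => println!("set: {fields:?}"),
        Setting::Reset => println!("reset"),
        Setting::NotSet => println!("not set"),
    }
}
```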
| @@ -149,12 +149,14 @@ mini-dashboard = [ | |||||||
|     "zip", |     "zip", | ||||||
| ] | ] | ||||||
| chinese = ["meilisearch-types/chinese"] | chinese = ["meilisearch-types/chinese"] | ||||||
|  | chinese-pinyin = ["meilisearch-types/chinese-pinyin"] | ||||||
| hebrew = ["meilisearch-types/hebrew"] | hebrew = ["meilisearch-types/hebrew"] | ||||||
| japanese = ["meilisearch-types/japanese"] | japanese = ["meilisearch-types/japanese"] | ||||||
| thai = ["meilisearch-types/thai"] | thai = ["meilisearch-types/thai"] | ||||||
| greek = ["meilisearch-types/greek"] | greek = ["meilisearch-types/greek"] | ||||||
| khmer = ["meilisearch-types/khmer"] | khmer = ["meilisearch-types/khmer"] | ||||||
| vietnamese = ["meilisearch-types/vietnamese"] | vietnamese = ["meilisearch-types/vietnamese"] | ||||||
|  | swedish-recomposition = ["meilisearch-types/swedish-recomposition"] | ||||||
|  |  | ||||||
| [package.metadata.mini-dashboard] | [package.metadata.mini-dashboard] | ||||||
| assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip" | assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip" | ||||||
|   | |||||||
| @@ -7,7 +7,6 @@ use serde_json::Value; | |||||||
|  |  | ||||||
| use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind}; | use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind}; | ||||||
| use crate::routes::indexes::documents::UpdateDocumentsQuery; | use crate::routes::indexes::documents::UpdateDocumentsQuery; | ||||||
| use crate::routes::tasks::TasksFilterQuery; |  | ||||||
| use crate::Opt; | use crate::Opt; | ||||||
|  |  | ||||||
| pub struct MockAnalytics { | pub struct MockAnalytics { | ||||||
| @@ -86,6 +85,4 @@ impl Analytics for MockAnalytics { | |||||||
|     } |     } | ||||||
|     fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} |     fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} | ||||||
|     fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} |     fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} | ||||||
|     fn get_tasks(&self, _query: &TasksFilterQuery, _request: &HttpRequest) {} |  | ||||||
|     fn health_seen(&self, _request: &HttpRequest) {} |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -14,7 +14,6 @@ use platform_dirs::AppDirs; | |||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
|  |  | ||||||
| use crate::routes::indexes::documents::UpdateDocumentsQuery; | use crate::routes::indexes::documents::UpdateDocumentsQuery; | ||||||
| use crate::routes::tasks::TasksFilterQuery; |  | ||||||
|  |  | ||||||
| // if the analytics feature is disabled | // if the analytics feature is disabled | ||||||
| // the `SegmentAnalytics` point to the mock instead of the real analytics | // the `SegmentAnalytics` point to the mock instead of the real analytics | ||||||
| @@ -117,10 +116,4 @@ pub trait Analytics: Sync + Send { | |||||||
|         index_creation: bool, |         index_creation: bool, | ||||||
|         request: &HttpRequest, |         request: &HttpRequest, | ||||||
|     ); |     ); | ||||||
|  |  | ||||||
|     // this method should be called to aggregate the get tasks requests. |  | ||||||
|     fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest); |  | ||||||
|  |  | ||||||
|     // this method should be called to aggregate a add documents request |  | ||||||
|     fn health_seen(&self, request: &HttpRequest); |  | ||||||
| } | } | ||||||
|   | |||||||
| @@ -33,7 +33,6 @@ use crate::option::{ | |||||||
| }; | }; | ||||||
| use crate::routes::indexes::documents::UpdateDocumentsQuery; | use crate::routes::indexes::documents::UpdateDocumentsQuery; | ||||||
| use crate::routes::indexes::facet_search::FacetSearchQuery; | use crate::routes::indexes::facet_search::FacetSearchQuery; | ||||||
| use crate::routes::tasks::TasksFilterQuery; |  | ||||||
| use crate::routes::{create_all_stats, Stats}; | use crate::routes::{create_all_stats, Stats}; | ||||||
| use crate::search::{ | use crate::search::{ | ||||||
|     FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult, |     FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult, | ||||||
| @@ -81,8 +80,6 @@ pub enum AnalyticsMsg { | |||||||
|     AggregateUpdateDocuments(DocumentsAggregator), |     AggregateUpdateDocuments(DocumentsAggregator), | ||||||
|     AggregateGetFetchDocuments(DocumentsFetchAggregator), |     AggregateGetFetchDocuments(DocumentsFetchAggregator), | ||||||
|     AggregatePostFetchDocuments(DocumentsFetchAggregator), |     AggregatePostFetchDocuments(DocumentsFetchAggregator), | ||||||
|     AggregateTasks(TasksAggregator), |  | ||||||
|     AggregateHealth(HealthAggregator), |  | ||||||
| } | } | ||||||
|  |  | ||||||
| pub struct SegmentAnalytics { | pub struct SegmentAnalytics { | ||||||
| @@ -152,8 +149,6 @@ impl SegmentAnalytics { | |||||||
|             update_documents_aggregator: DocumentsAggregator::default(), |             update_documents_aggregator: DocumentsAggregator::default(), | ||||||
|             get_fetch_documents_aggregator: DocumentsFetchAggregator::default(), |             get_fetch_documents_aggregator: DocumentsFetchAggregator::default(), | ||||||
|             post_fetch_documents_aggregator: DocumentsFetchAggregator::default(), |             post_fetch_documents_aggregator: DocumentsFetchAggregator::default(), | ||||||
|             get_tasks_aggregator: TasksAggregator::default(), |  | ||||||
|             health_aggregator: HealthAggregator::default(), |  | ||||||
|         }); |         }); | ||||||
|         tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone())); |         tokio::spawn(segment.run(index_scheduler.clone(), auth_controller.clone())); | ||||||
|  |  | ||||||
| @@ -231,16 +226,6 @@ impl super::Analytics for SegmentAnalytics { | |||||||
|         let aggregate = DocumentsFetchAggregator::from_query(documents_query, request); |         let aggregate = DocumentsFetchAggregator::from_query(documents_query, request); | ||||||
|         let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate)); |         let _ = self.sender.try_send(AnalyticsMsg::AggregatePostFetchDocuments(aggregate)); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn get_tasks(&self, query: &TasksFilterQuery, request: &HttpRequest) { |  | ||||||
|         let aggregate = TasksAggregator::from_query(query, request); |  | ||||||
|         let _ = self.sender.try_send(AnalyticsMsg::AggregateTasks(aggregate)); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn health_seen(&self, request: &HttpRequest) { |  | ||||||
|         let aggregate = HealthAggregator::from_query(request); |  | ||||||
|         let _ = self.sender.try_send(AnalyticsMsg::AggregateHealth(aggregate)); |  | ||||||
|     } |  | ||||||
| } | } | ||||||
|  |  | ||||||
| /// This structure represent the `infos` field we send in the analytics. | /// This structure represent the `infos` field we send in the analytics. | ||||||
| @@ -394,8 +379,6 @@ pub struct Segment { | |||||||
|     update_documents_aggregator: DocumentsAggregator, |     update_documents_aggregator: DocumentsAggregator, | ||||||
|     get_fetch_documents_aggregator: DocumentsFetchAggregator, |     get_fetch_documents_aggregator: DocumentsFetchAggregator, | ||||||
|     post_fetch_documents_aggregator: DocumentsFetchAggregator, |     post_fetch_documents_aggregator: DocumentsFetchAggregator, | ||||||
|     get_tasks_aggregator: TasksAggregator, |  | ||||||
|     health_aggregator: HealthAggregator, |  | ||||||
| } | } | ||||||
|  |  | ||||||
| impl Segment { | impl Segment { | ||||||
| @@ -458,8 +441,6 @@ impl Segment { | |||||||
|                         Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg), |                         Some(AnalyticsMsg::AggregateUpdateDocuments(agreg)) => self.update_documents_aggregator.aggregate(agreg), | ||||||
|                         Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg), |                         Some(AnalyticsMsg::AggregateGetFetchDocuments(agreg)) => self.get_fetch_documents_aggregator.aggregate(agreg), | ||||||
|                         Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg), |                         Some(AnalyticsMsg::AggregatePostFetchDocuments(agreg)) => self.post_fetch_documents_aggregator.aggregate(agreg), | ||||||
|                         Some(AnalyticsMsg::AggregateTasks(agreg)) => self.get_tasks_aggregator.aggregate(agreg), |  | ||||||
|                         Some(AnalyticsMsg::AggregateHealth(agreg)) => self.health_aggregator.aggregate(agreg), |  | ||||||
|                         None => (), |                         None => (), | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
| @@ -513,8 +494,6 @@ impl Segment { | |||||||
|             update_documents_aggregator, |             update_documents_aggregator, | ||||||
|             get_fetch_documents_aggregator, |             get_fetch_documents_aggregator, | ||||||
|             post_fetch_documents_aggregator, |             post_fetch_documents_aggregator, | ||||||
|             get_tasks_aggregator, |  | ||||||
|             health_aggregator, |  | ||||||
|         } = self; |         } = self; | ||||||
|  |  | ||||||
|         if let Some(get_search) = |         if let Some(get_search) = | ||||||
| @@ -562,12 +541,6 @@ impl Segment { | |||||||
|         { |         { | ||||||
|             let _ = self.batcher.push(post_fetch_documents).await; |             let _ = self.batcher.push(post_fetch_documents).await; | ||||||
|         } |         } | ||||||
|         if let Some(get_tasks) = take(get_tasks_aggregator).into_event(user, "Tasks Seen") { |  | ||||||
|             let _ = self.batcher.push(get_tasks).await; |  | ||||||
|         } |  | ||||||
|         if let Some(health) = take(health_aggregator).into_event(user, "Health Seen") { |  | ||||||
|             let _ = self.batcher.push(health).await; |  | ||||||
|         } |  | ||||||
|         let _ = self.batcher.flush().await; |         let _ = self.batcher.flush().await; | ||||||
|     } |     } | ||||||
| } | } | ||||||
| @@ -1503,176 +1476,6 @@ impl DocumentsDeletionAggregator { | |||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Default, Serialize)] |  | ||||||
| pub struct TasksAggregator { |  | ||||||
|     #[serde(skip)] |  | ||||||
|     timestamp: Option<OffsetDateTime>, |  | ||||||
|  |  | ||||||
|     // context |  | ||||||
|     #[serde(rename = "user-agent")] |  | ||||||
|     user_agents: HashSet<String>, |  | ||||||
|  |  | ||||||
|     filtered_by_uid: bool, |  | ||||||
|     filtered_by_index_uid: bool, |  | ||||||
|     filtered_by_type: bool, |  | ||||||
|     filtered_by_status: bool, |  | ||||||
|     filtered_by_canceled_by: bool, |  | ||||||
|     filtered_by_before_enqueued_at: bool, |  | ||||||
|     filtered_by_after_enqueued_at: bool, |  | ||||||
|     filtered_by_before_started_at: bool, |  | ||||||
|     filtered_by_after_started_at: bool, |  | ||||||
|     filtered_by_before_finished_at: bool, |  | ||||||
|     filtered_by_after_finished_at: bool, |  | ||||||
|     total_received: usize, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl TasksAggregator { |  | ||||||
|     pub fn from_query(query: &TasksFilterQuery, request: &HttpRequest) -> Self { |  | ||||||
|         let TasksFilterQuery { |  | ||||||
|             limit: _, |  | ||||||
|             from: _, |  | ||||||
|             uids, |  | ||||||
|             index_uids, |  | ||||||
|             types, |  | ||||||
|             statuses, |  | ||||||
|             canceled_by, |  | ||||||
|             before_enqueued_at, |  | ||||||
|             after_enqueued_at, |  | ||||||
|             before_started_at, |  | ||||||
|             after_started_at, |  | ||||||
|             before_finished_at, |  | ||||||
|             after_finished_at, |  | ||||||
|         } = query; |  | ||||||
|  |  | ||||||
|         Self { |  | ||||||
|             timestamp: Some(OffsetDateTime::now_utc()), |  | ||||||
|             user_agents: extract_user_agents(request).into_iter().collect(), |  | ||||||
|             filtered_by_uid: uids.is_some(), |  | ||||||
|             filtered_by_index_uid: index_uids.is_some(), |  | ||||||
|             filtered_by_type: types.is_some(), |  | ||||||
|             filtered_by_status: statuses.is_some(), |  | ||||||
|             filtered_by_canceled_by: canceled_by.is_some(), |  | ||||||
|             filtered_by_before_enqueued_at: before_enqueued_at.is_some(), |  | ||||||
|             filtered_by_after_enqueued_at: after_enqueued_at.is_some(), |  | ||||||
|             filtered_by_before_started_at: before_started_at.is_some(), |  | ||||||
|             filtered_by_after_started_at: after_started_at.is_some(), |  | ||||||
|             filtered_by_before_finished_at: before_finished_at.is_some(), |  | ||||||
|             filtered_by_after_finished_at: after_finished_at.is_some(), |  | ||||||
|             total_received: 1, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Aggregate one [TasksAggregator] into another. |  | ||||||
|     pub fn aggregate(&mut self, other: Self) { |  | ||||||
|         let Self { |  | ||||||
|             timestamp, |  | ||||||
|             user_agents, |  | ||||||
|             total_received, |  | ||||||
|             filtered_by_uid, |  | ||||||
|             filtered_by_index_uid, |  | ||||||
|             filtered_by_type, |  | ||||||
|             filtered_by_status, |  | ||||||
|             filtered_by_canceled_by, |  | ||||||
|             filtered_by_before_enqueued_at, |  | ||||||
|             filtered_by_after_enqueued_at, |  | ||||||
|             filtered_by_before_started_at, |  | ||||||
|             filtered_by_after_started_at, |  | ||||||
|             filtered_by_before_finished_at, |  | ||||||
|             filtered_by_after_finished_at, |  | ||||||
|         } = other; |  | ||||||
|  |  | ||||||
|         if self.timestamp.is_none() { |  | ||||||
|             self.timestamp = timestamp; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         // we can't create a union because there is no `into_union` method |  | ||||||
|         for user_agent in user_agents { |  | ||||||
|             self.user_agents.insert(user_agent); |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         self.filtered_by_uid |= filtered_by_uid; |  | ||||||
|         self.filtered_by_index_uid |= filtered_by_index_uid; |  | ||||||
|         self.filtered_by_type |= filtered_by_type; |  | ||||||
|         self.filtered_by_status |= filtered_by_status; |  | ||||||
|         self.filtered_by_canceled_by |= filtered_by_canceled_by; |  | ||||||
|         self.filtered_by_before_enqueued_at |= filtered_by_before_enqueued_at; |  | ||||||
|         self.filtered_by_after_enqueued_at |= filtered_by_after_enqueued_at; |  | ||||||
|         self.filtered_by_before_started_at |= filtered_by_before_started_at; |  | ||||||
|         self.filtered_by_after_started_at |= filtered_by_after_started_at; |  | ||||||
|         self.filtered_by_before_finished_at |= filtered_by_before_finished_at; |  | ||||||
|         self.filtered_by_after_finished_at |= filtered_by_after_finished_at; |  | ||||||
|         self.filtered_by_after_finished_at |= filtered_by_after_finished_at; |  | ||||||
|  |  | ||||||
|         self.total_received = self.total_received.saturating_add(total_received); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> { |  | ||||||
|         // if we had no timestamp it means we never encountered any events and |  | ||||||
|         // thus we don't need to send this event. |  | ||||||
|         let timestamp = self.timestamp?; |  | ||||||
|  |  | ||||||
|         Some(Track { |  | ||||||
|             timestamp: Some(timestamp), |  | ||||||
|             user: user.clone(), |  | ||||||
|             event: event_name.to_string(), |  | ||||||
|             properties: serde_json::to_value(self).ok()?, |  | ||||||
|             ..Default::default() |  | ||||||
|         }) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Default, Serialize)] |  | ||||||
| pub struct HealthAggregator { |  | ||||||
|     #[serde(skip)] |  | ||||||
|     timestamp: Option<OffsetDateTime>, |  | ||||||
|  |  | ||||||
|     // context |  | ||||||
|     #[serde(rename = "user-agent")] |  | ||||||
|     user_agents: HashSet<String>, |  | ||||||
|  |  | ||||||
|     #[serde(rename = "requests.total_received")] |  | ||||||
|     total_received: usize, |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl HealthAggregator { |  | ||||||
|     pub fn from_query(request: &HttpRequest) -> Self { |  | ||||||
|         Self { |  | ||||||
|             timestamp: Some(OffsetDateTime::now_utc()), |  | ||||||
|             user_agents: extract_user_agents(request).into_iter().collect(), |  | ||||||
|             total_received: 1, |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     /// Aggregate one [HealthAggregator] into another. |  | ||||||
|     pub fn aggregate(&mut self, other: Self) { |  | ||||||
|         let Self { timestamp, user_agents, total_received } = other; |  | ||||||
|  |  | ||||||
|         if self.timestamp.is_none() { |  | ||||||
|             self.timestamp = timestamp; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         // we can't create a union because there is no `into_union` method |  | ||||||
|         for user_agent in user_agents { |  | ||||||
|             self.user_agents.insert(user_agent); |  | ||||||
|         } |  | ||||||
|         self.total_received = self.total_received.saturating_add(total_received); |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> { |  | ||||||
|         // if we had no timestamp it means we never encountered any events and |  | ||||||
|         // thus we don't need to send this event. |  | ||||||
|         let timestamp = self.timestamp?; |  | ||||||
|  |  | ||||||
|         Some(Track { |  | ||||||
|             timestamp: Some(timestamp), |  | ||||||
|             user: user.clone(), |  | ||||||
|             event: event_name.to_string(), |  | ||||||
|             properties: serde_json::to_value(self).ok()?, |  | ||||||
|             ..Default::default() |  | ||||||
|         }) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[derive(Default, Serialize)] | #[derive(Default, Serialize)] | ||||||
| pub struct DocumentsFetchAggregator { | pub struct DocumentsFetchAggregator { | ||||||
|     #[serde(skip)] |     #[serde(skip)] | ||||||
|   | |||||||
| @@ -13,6 +13,7 @@ use byte_unit::{Byte, ByteError}; | |||||||
| use clap::Parser; | use clap::Parser; | ||||||
| use meilisearch_types::features::InstanceTogglableFeatures; | use meilisearch_types::features::InstanceTogglableFeatures; | ||||||
| use meilisearch_types::milli::update::IndexerConfig; | use meilisearch_types::milli::update::IndexerConfig; | ||||||
|  | use meilisearch_types::milli::ThreadPoolNoAbortBuilder; | ||||||
| use rustls::server::{ | use rustls::server::{ | ||||||
|     AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, ServerSessionMemoryCache, |     AllowAnyAnonymousOrAuthenticatedClient, AllowAnyAuthenticatedClient, ServerSessionMemoryCache, | ||||||
| }; | }; | ||||||
| @@ -666,7 +667,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig { | |||||||
|     type Error = anyhow::Error; |     type Error = anyhow::Error; | ||||||
|  |  | ||||||
|     fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> { |     fn try_from(other: &IndexerOpts) -> Result<Self, Self::Error> { | ||||||
|         let thread_pool = rayon::ThreadPoolBuilder::new() |         let thread_pool = ThreadPoolNoAbortBuilder::new() | ||||||
|             .thread_name(|index| format!("indexing-thread:{index}")) |             .thread_name(|index| format!("indexing-thread:{index}")) | ||||||
|             .num_threads(*other.max_indexing_threads) |             .num_threads(*other.max_indexing_threads) | ||||||
|             .build()?; |             .build()?; | ||||||
|   | |||||||
| @@ -269,12 +269,8 @@ impl From<index_scheduler::IndexStats> for IndexStats { | |||||||
| pub async fn get_index_stats( | pub async fn get_index_stats( | ||||||
|     index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>, |     index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>, | ||||||
|     index_uid: web::Path<String>, |     index_uid: web::Path<String>, | ||||||
|     req: HttpRequest, |  | ||||||
|     analytics: web::Data<dyn Analytics>, |  | ||||||
| ) -> Result<HttpResponse, ResponseError> { | ) -> Result<HttpResponse, ResponseError> { | ||||||
|     let index_uid = IndexUid::try_from(index_uid.into_inner())?; |     let index_uid = IndexUid::try_from(index_uid.into_inner())?; | ||||||
|     analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": true }), Some(&req)); |  | ||||||
|  |  | ||||||
|     let stats = IndexStats::from(index_scheduler.index_stats(&index_uid)?); |     let stats = IndexStats::from(index_scheduler.index_stats(&index_uid)?); | ||||||
|  |  | ||||||
|     debug!(returns = ?stats, "Get index stats"); |     debug!(returns = ?stats, "Get index stats"); | ||||||
|   | |||||||
| @@ -137,10 +137,8 @@ macro_rules! make_setting_route { | |||||||
|                 let settings = settings(&index, &rtxn, meilisearch_types::settings::SecretPolicy::HideSecrets)?; |                 let settings = settings(&index, &rtxn, meilisearch_types::settings::SecretPolicy::HideSecrets)?; | ||||||
|  |  | ||||||
|                 debug!(returns = ?settings, "Update settings"); |                 debug!(returns = ?settings, "Update settings"); | ||||||
|                 let mut json = serde_json::json!(&settings); |  | ||||||
|                 let val = json[$camelcase_attr].take(); |  | ||||||
|  |  | ||||||
|                 Ok(HttpResponse::Ok().json(val)) |                 Ok(HttpResponse::Ok().json(settings.$attr)) | ||||||
|             } |             } | ||||||
|  |  | ||||||
|             pub fn resources() -> Resource { |             pub fn resources() -> Resource { | ||||||
|   | |||||||
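For context on the `make_setting_route` change above: instead of serializing the whole settings object to JSON and then taking one camelCase key back out, the route now serializes only the typed field it owns. A minimal sketch of the difference, assuming serde/serde_json and a hypothetical `AllSettings` struct (not Meilisearch's actual types):

```rust
use serde::Serialize;
use serde_json::json;

#[derive(Serialize)]
#[serde(rename_all = "camelCase")]
struct AllSettings {
    displayed_attributes: Option<Vec<String>>,
    ranking_rules: Option<Vec<String>>,
}

fn main() {
    let settings = AllSettings {
        displayed_attributes: Some(vec!["title".into()]),
        ranking_rules: None,
    };

    // Old approach: serialize everything, then take one key back out of the JSON tree.
    let mut all = json!(&settings);
    let old = all["displayedAttributes"].take();

    // New approach: serialize the single typed field directly.
    let new = json!(settings.displayed_attributes);

    assert_eq!(old, new);
    println!("{new}");
}
```

Both produce the same JSON body; the new form simply skips the intermediate `serde_json::Value` round-trip.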
| @@ -8,11 +8,9 @@ use meilisearch_types::error::{Code, ResponseError}; | |||||||
| use meilisearch_types::settings::{Settings, Unchecked}; | use meilisearch_types::settings::{Settings, Unchecked}; | ||||||
| use meilisearch_types::tasks::{Kind, Status, Task, TaskId}; | use meilisearch_types::tasks::{Kind, Status, Task, TaskId}; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
| use serde_json::json; |  | ||||||
| use time::OffsetDateTime; | use time::OffsetDateTime; | ||||||
| use tracing::debug; | use tracing::debug; | ||||||
|  |  | ||||||
| use crate::analytics::Analytics; |  | ||||||
| use crate::extractors::authentication::policies::*; | use crate::extractors::authentication::policies::*; | ||||||
| use crate::extractors::authentication::GuardedData; | use crate::extractors::authentication::GuardedData; | ||||||
| use crate::search_queue::SearchQueue; | use crate::search_queue::SearchQueue; | ||||||
| @@ -296,10 +294,7 @@ pub struct Stats { | |||||||
| async fn get_stats( | async fn get_stats( | ||||||
|     index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>, |     index_scheduler: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<IndexScheduler>>, | ||||||
|     auth_controller: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<AuthController>>, |     auth_controller: GuardedData<ActionPolicy<{ actions::STATS_GET }>, Data<AuthController>>, | ||||||
|     req: HttpRequest, |  | ||||||
|     analytics: web::Data<dyn Analytics>, |  | ||||||
| ) -> Result<HttpResponse, ResponseError> { | ) -> Result<HttpResponse, ResponseError> { | ||||||
|     analytics.publish("Stats Seen".to_string(), json!({ "per_index_uid": false }), Some(&req)); |  | ||||||
|     let filters = index_scheduler.filters(); |     let filters = index_scheduler.filters(); | ||||||
|  |  | ||||||
|     let stats = create_all_stats((*index_scheduler).clone(), (*auth_controller).clone(), filters)?; |     let stats = create_all_stats((*index_scheduler).clone(), (*auth_controller).clone(), filters)?; | ||||||
| @@ -355,11 +350,7 @@ struct VersionResponse { | |||||||
|  |  | ||||||
| async fn get_version( | async fn get_version( | ||||||
|     _index_scheduler: GuardedData<ActionPolicy<{ actions::VERSION }>, Data<IndexScheduler>>, |     _index_scheduler: GuardedData<ActionPolicy<{ actions::VERSION }>, Data<IndexScheduler>>, | ||||||
|     req: HttpRequest, |  | ||||||
|     analytics: web::Data<dyn Analytics>, |  | ||||||
| ) -> HttpResponse { | ) -> HttpResponse { | ||||||
|     analytics.publish("Version Seen".to_string(), json!(null), Some(&req)); |  | ||||||
|  |  | ||||||
|     let build_info = build_info::BuildInfo::from_build(); |     let build_info = build_info::BuildInfo::from_build(); | ||||||
|  |  | ||||||
|     HttpResponse::Ok().json(VersionResponse { |     HttpResponse::Ok().json(VersionResponse { | ||||||
| @@ -377,14 +368,10 @@ async fn get_version( | |||||||
| } | } | ||||||
|  |  | ||||||
| pub async fn get_health( | pub async fn get_health( | ||||||
|     req: HttpRequest, |  | ||||||
|     index_scheduler: Data<IndexScheduler>, |     index_scheduler: Data<IndexScheduler>, | ||||||
|     auth_controller: Data<AuthController>, |     auth_controller: Data<AuthController>, | ||||||
|     search_queue: Data<SearchQueue>, |     search_queue: Data<SearchQueue>, | ||||||
|     analytics: web::Data<dyn Analytics>, |  | ||||||
| ) -> Result<HttpResponse, ResponseError> { | ) -> Result<HttpResponse, ResponseError> { | ||||||
|     analytics.health_seen(&req); |  | ||||||
|  |  | ||||||
|     search_queue.health().unwrap(); |     search_queue.health().unwrap(); | ||||||
|     index_scheduler.health().unwrap(); |     index_scheduler.health().unwrap(); | ||||||
|     auth_controller.health().unwrap(); |     auth_controller.health().unwrap(); | ||||||
|   | |||||||
| @@ -270,12 +270,8 @@ pub struct AllTasks { | |||||||
| async fn get_tasks( | async fn get_tasks( | ||||||
|     index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>, |     index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>, | ||||||
|     params: AwebQueryParameter<TasksFilterQuery, DeserrQueryParamError>, |     params: AwebQueryParameter<TasksFilterQuery, DeserrQueryParamError>, | ||||||
|     req: HttpRequest, |  | ||||||
|     analytics: web::Data<dyn Analytics>, |  | ||||||
| ) -> Result<HttpResponse, ResponseError> { | ) -> Result<HttpResponse, ResponseError> { | ||||||
|     let mut params = params.into_inner(); |     let mut params = params.into_inner(); | ||||||
|     analytics.get_tasks(¶ms, &req); |  | ||||||
|  |  | ||||||
|     // We +1 just to know if there is more after this "page" or not. |     // We +1 just to know if there is more after this "page" or not. | ||||||
|     params.limit.0 = params.limit.0.saturating_add(1); |     params.limit.0 = params.limit.0.saturating_add(1); | ||||||
|     let limit = params.limit.0; |     let limit = params.limit.0; | ||||||
| @@ -298,8 +294,6 @@ async fn get_tasks( | |||||||
| async fn get_task( | async fn get_task( | ||||||
|     index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>, |     index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_GET }>, Data<IndexScheduler>>, | ||||||
|     task_uid: web::Path<String>, |     task_uid: web::Path<String>, | ||||||
|     req: HttpRequest, |  | ||||||
|     analytics: web::Data<dyn Analytics>, |  | ||||||
| ) -> Result<HttpResponse, ResponseError> { | ) -> Result<HttpResponse, ResponseError> { | ||||||
|     let task_uid_string = task_uid.into_inner(); |     let task_uid_string = task_uid.into_inner(); | ||||||
|  |  | ||||||
| @@ -310,8 +304,6 @@ async fn get_task( | |||||||
|         } |         } | ||||||
|     }; |     }; | ||||||
|  |  | ||||||
|     analytics.publish("Tasks Seen".to_string(), json!({ "per_task_uid": true }), Some(&req)); |  | ||||||
|  |  | ||||||
|     let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() }; |     let query = index_scheduler::Query { uids: Some(vec![task_uid]), ..Query::default() }; | ||||||
|     let filters = index_scheduler.filters(); |     let filters = index_scheduler.filters(); | ||||||
|     let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(query, filters)?; |     let (tasks, _) = index_scheduler.get_tasks_from_authorized_indexes(query, filters)?; | ||||||
|   | |||||||
| @@ -1,3 +1,4 @@ | |||||||
|  | use core::fmt; | ||||||
| use std::cmp::min; | use std::cmp::min; | ||||||
| use std::collections::{BTreeMap, BTreeSet, HashSet}; | use std::collections::{BTreeMap, BTreeSet, HashSet}; | ||||||
| use std::str::FromStr; | use std::str::FromStr; | ||||||
| @@ -39,7 +40,7 @@ pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "<em>".to_string(); | |||||||
| pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string(); | pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string(); | ||||||
| pub const DEFAULT_SEMANTIC_RATIO: fn() -> SemanticRatio = || SemanticRatio(0.5); | pub const DEFAULT_SEMANTIC_RATIO: fn() -> SemanticRatio = || SemanticRatio(0.5); | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Default, PartialEq, Deserr)] | #[derive(Clone, Default, PartialEq, Deserr)] | ||||||
| #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] | #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] | ||||||
| pub struct SearchQuery { | pub struct SearchQuery { | ||||||
|     #[deserr(default, error = DeserrJsonError<InvalidSearchQ>)] |     #[deserr(default, error = DeserrJsonError<InvalidSearchQ>)] | ||||||
| @@ -88,6 +89,110 @@ pub struct SearchQuery { | |||||||
|     pub attributes_to_search_on: Option<Vec<String>>, |     pub attributes_to_search_on: Option<Vec<String>>, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // Since this structure is logged A LOT, we reduce what it logs to the bare minimum: | ||||||
|  | // - only log what IS used; everything else is None, so there is no need to print it | ||||||
|  | // - re-order so that the most important fields are printed first | ||||||
|  | impl fmt::Debug for SearchQuery { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||||||
|  |         let Self { | ||||||
|  |             q, | ||||||
|  |             vector, | ||||||
|  |             hybrid, | ||||||
|  |             offset, | ||||||
|  |             limit, | ||||||
|  |             page, | ||||||
|  |             hits_per_page, | ||||||
|  |             attributes_to_retrieve, | ||||||
|  |             attributes_to_crop, | ||||||
|  |             crop_length, | ||||||
|  |             attributes_to_highlight, | ||||||
|  |             show_matches_position, | ||||||
|  |             show_ranking_score, | ||||||
|  |             show_ranking_score_details, | ||||||
|  |             filter, | ||||||
|  |             sort, | ||||||
|  |             facets, | ||||||
|  |             highlight_pre_tag, | ||||||
|  |             highlight_post_tag, | ||||||
|  |             crop_marker, | ||||||
|  |             matching_strategy, | ||||||
|  |             attributes_to_search_on, | ||||||
|  |         } = self; | ||||||
|  |  | ||||||
|  |         let mut debug = f.debug_struct("SearchQuery"); | ||||||
|  |  | ||||||
|  |         // First, everything related to the number of documents to retrieve | ||||||
|  |         debug.field("limit", &limit).field("offset", &offset); | ||||||
|  |         if let Some(page) = page { | ||||||
|  |             debug.field("page", &page); | ||||||
|  |         } | ||||||
|  |         if let Some(hits_per_page) = hits_per_page { | ||||||
|  |             debug.field("hits_per_page", &hits_per_page); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // Then, everything related to the queries | ||||||
|  |         if let Some(q) = q { | ||||||
|  |             debug.field("q", &q); | ||||||
|  |         } | ||||||
|  |         if let Some(v) = vector { | ||||||
|  |             if v.len() < 10 { | ||||||
|  |                 debug.field("vector", &v); | ||||||
|  |             } else { | ||||||
|  |                 debug.field( | ||||||
|  |                     "vector", | ||||||
|  |                     &format!("[{}, {}, {}, ... {} dimensions]", v[0], v[1], v[2], v.len()), | ||||||
|  |                 ); | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         if let Some(hybrid) = hybrid { | ||||||
|  |             debug.field("hybrid", &hybrid); | ||||||
|  |         } | ||||||
|  |         if let Some(attributes_to_search_on) = attributes_to_search_on { | ||||||
|  |             debug.field("attributes_to_search_on", &attributes_to_search_on); | ||||||
|  |         } | ||||||
|  |         if let Some(filter) = filter { | ||||||
|  |             debug.field("filter", &filter); | ||||||
|  |         } | ||||||
|  |         if let Some(sort) = sort { | ||||||
|  |             debug.field("sort", &sort); | ||||||
|  |         } | ||||||
|  |         if let Some(facets) = facets { | ||||||
|  |             debug.field("facets", &facets); | ||||||
|  |         } | ||||||
|  |         debug.field("matching_strategy", &matching_strategy); | ||||||
|  |  | ||||||
|  |         // Then everything related to the formatting | ||||||
|  |         debug.field("crop_length", &crop_length); | ||||||
|  |         if *show_matches_position { | ||||||
|  |             debug.field("show_matches_position", show_matches_position); | ||||||
|  |         } | ||||||
|  |         if *show_ranking_score { | ||||||
|  |             debug.field("show_ranking_score", show_ranking_score); | ||||||
|  |         } | ||||||
|  |         if *show_ranking_score_details { | ||||||
|  |             debug.field("self.show_ranking_score_details", show_ranking_score_details); | ||||||
|  |         } | ||||||
|  |         debug.field("crop_length", &crop_length); | ||||||
|  |         if let Some(facets) = facets { | ||||||
|  |             debug.field("facets", &facets); | ||||||
|  |         } | ||||||
|  |         if let Some(attributes_to_retrieve) = attributes_to_retrieve { | ||||||
|  |             debug.field("attributes_to_retrieve", &attributes_to_retrieve); | ||||||
|  |         } | ||||||
|  |         if let Some(attributes_to_crop) = attributes_to_crop { | ||||||
|  |             debug.field("attributes_to_crop", &attributes_to_crop); | ||||||
|  |         } | ||||||
|  |         if let Some(attributes_to_highlight) = attributes_to_highlight { | ||||||
|  |             debug.field("attributes_to_highlight", &attributes_to_highlight); | ||||||
|  |         } | ||||||
|  |         debug.field("highlight_pre_tag", &highlight_pre_tag); | ||||||
|  |         debug.field("highlight_post_tag", &highlight_post_tag); | ||||||
|  |         debug.field("crop_marker", &crop_marker); | ||||||
|  |  | ||||||
|  |         debug.finish() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
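The pattern used in this `Debug` impl — destructure `self`, then only `.field(...)` the options that are actually set — keeps log lines short without losing information, and the destructuring fails to compile if a new field is added but forgotten here. A tiny standalone sketch of the same idea (the struct and its fields are illustrative, not Meilisearch's):

```rust
use std::fmt;

struct Query {
    q: Option<String>,
    limit: usize,
    filter: Option<String>,
}

impl fmt::Debug for Query {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Destructuring forces a compile error if a field is added but not handled below.
        let Query { q, limit, filter } = self;
        let mut debug = f.debug_struct("Query");
        debug.field("limit", limit);
        if let Some(q) = q {
            debug.field("q", q);
        }
        if let Some(filter) = filter {
            debug.field("filter", filter);
        }
        debug.finish()
    }
}

fn main() {
    let query = Query { q: Some("hello".into()), limit: 20, filter: None };
    // Prints: Query { limit: 20, q: "hello" } — unset fields stay out of the logs.
    println!("{query:?}");
}
```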
| #[derive(Debug, Clone, Default, PartialEq, Deserr)] | #[derive(Debug, Clone, Default, PartialEq, Deserr)] | ||||||
| #[deserr(error = DeserrJsonError<InvalidHybridQuery>, rename_all = camelCase, deny_unknown_fields)] | #[deserr(error = DeserrJsonError<InvalidHybridQuery>, rename_all = camelCase, deny_unknown_fields)] | ||||||
| pub struct HybridQuery { | pub struct HybridQuery { | ||||||
| @@ -370,7 +475,7 @@ pub struct SearchHit { | |||||||
|     pub ranking_score_details: Option<serde_json::Map<String, serde_json::Value>>, |     pub ranking_score_details: Option<serde_json::Map<String, serde_json::Value>>, | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Serialize, Debug, Clone, PartialEq)] | #[derive(Serialize, Clone, PartialEq)] | ||||||
| #[serde(rename_all = "camelCase")] | #[serde(rename_all = "camelCase")] | ||||||
| pub struct SearchResult { | pub struct SearchResult { | ||||||
|     pub hits: Vec<SearchHit>, |     pub hits: Vec<SearchHit>, | ||||||
| @@ -393,6 +498,46 @@ pub struct SearchResult { | |||||||
|     pub used_negative_operator: bool, |     pub used_negative_operator: bool, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | impl fmt::Debug for SearchResult { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { | ||||||
|  |         let SearchResult { | ||||||
|  |             hits, | ||||||
|  |             query, | ||||||
|  |             processing_time_ms, | ||||||
|  |             hits_info, | ||||||
|  |             facet_distribution, | ||||||
|  |             facet_stats, | ||||||
|  |             semantic_hit_count, | ||||||
|  |             degraded, | ||||||
|  |             used_negative_operator, | ||||||
|  |         } = self; | ||||||
|  |  | ||||||
|  |         let mut debug = f.debug_struct("SearchResult"); | ||||||
|  |         // The most important thing when looking at a search result is the time it took to process | ||||||
|  |         debug.field("processing_time_ms", &processing_time_ms); | ||||||
|  |         debug.field("hits", &format!("[{} hits returned]", hits.len())); | ||||||
|  |         debug.field("query", &query); | ||||||
|  |         debug.field("hits_info", &hits_info); | ||||||
|  |         if *used_negative_operator { | ||||||
|  |             debug.field("used_negative_operator", used_negative_operator); | ||||||
|  |         } | ||||||
|  |         if *degraded { | ||||||
|  |             debug.field("degraded", degraded); | ||||||
|  |         } | ||||||
|  |         if let Some(facet_distribution) = facet_distribution { | ||||||
|  |             debug.field("facet_distribution", &facet_distribution); | ||||||
|  |         } | ||||||
|  |         if let Some(facet_stats) = facet_stats { | ||||||
|  |             debug.field("facet_stats", &facet_stats); | ||||||
|  |         } | ||||||
|  |         if let Some(semantic_hit_count) = semantic_hit_count { | ||||||
|  |             debug.field("semantic_hit_count", &semantic_hit_count); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         debug.finish() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| #[derive(Serialize, Debug, Clone, PartialEq)] | #[derive(Serialize, Debug, Clone, PartialEq)] | ||||||
| #[serde(rename_all = "camelCase")] | #[serde(rename_all = "camelCase")] | ||||||
| pub struct SearchResultWithIndex { | pub struct SearchResultWithIndex { | ||||||
|   | |||||||
| @@ -113,7 +113,8 @@ async fn secrets_are_hidden_in_settings() { | |||||||
|                 "default": { |                 "default": { | ||||||
|                     "source": "rest", |                     "source": "rest", | ||||||
|                     "url": "https://localhost:7777", |                     "url": "https://localhost:7777", | ||||||
|                     "apiKey": "My super secret value you will never guess" |                     "apiKey": "My super secret value you will never guess", | ||||||
|  |                     "dimensions": 4, | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         })) |         })) | ||||||
| @@ -184,6 +185,7 @@ async fn secrets_are_hidden_in_settings() { | |||||||
|         "default": { |         "default": { | ||||||
|           "source": "rest", |           "source": "rest", | ||||||
|           "apiKey": "My suXXXXXX...", |           "apiKey": "My suXXXXXX...", | ||||||
|  |           "dimensions": 4, | ||||||
|           "documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", |           "documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", | ||||||
|           "url": "https://localhost:7777", |           "url": "https://localhost:7777", | ||||||
|           "query": null, |           "query": null, | ||||||
| @@ -211,6 +213,7 @@ async fn secrets_are_hidden_in_settings() { | |||||||
|         "default": { |         "default": { | ||||||
|           "source": "rest", |           "source": "rest", | ||||||
|           "apiKey": "My suXXXXXX...", |           "apiKey": "My suXXXXXX...", | ||||||
|  |           "dimensions": 4, | ||||||
|           "url": "https://localhost:7777" |           "url": "https://localhost:7777" | ||||||
|         } |         } | ||||||
|       } |       } | ||||||
|   | |||||||
| @@ -17,7 +17,7 @@ bincode = "1.3.3" | |||||||
| bstr = "1.9.0" | bstr = "1.9.0" | ||||||
| bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] } | bytemuck = { version = "1.14.0", features = ["extern_crate_alloc"] } | ||||||
| byteorder = "1.5.0" | byteorder = "1.5.0" | ||||||
| charabia = { version = "0.8.8", default-features = false } | charabia = { version = "0.8.10", default-features = false } | ||||||
| concat-arrays = "0.1.2" | concat-arrays = "0.1.2" | ||||||
| crossbeam-channel = "0.5.11" | crossbeam-channel = "0.5.11" | ||||||
| deserr = "0.6.1" | deserr = "0.6.1" | ||||||
| @@ -115,6 +115,7 @@ lmdb-posix-sem = ["heed/posix-sem"] | |||||||
|  |  | ||||||
| # allow chinese specialized tokenization | # allow chinese specialized tokenization | ||||||
| chinese = ["charabia/chinese"] | chinese = ["charabia/chinese"] | ||||||
|  | chinese-pinyin = ["chinese", "charabia/chinese-normalization-pinyin"] | ||||||
|  |  | ||||||
| # allow hebrew specialized tokenization | # allow hebrew specialized tokenization | ||||||
| hebrew = ["charabia/hebrew"] | hebrew = ["charabia/hebrew"] | ||||||
| @@ -135,7 +136,11 @@ greek = ["charabia/greek"] | |||||||
| # allow khmer specialized tokenization | # allow khmer specialized tokenization | ||||||
| khmer = ["charabia/khmer"] | khmer = ["charabia/khmer"] | ||||||
|  |  | ||||||
|  | # allow vietnamese specialized tokenization | ||||||
| vietnamese = ["charabia/vietnamese"] | vietnamese = ["charabia/vietnamese"] | ||||||
|  |  | ||||||
|  | # force swedish character recomposition | ||||||
|  | swedish-recomposition = ["charabia/swedish-recomposition"] | ||||||
|  |  | ||||||
| # allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306> | # allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306> | ||||||
| cuda = ["candle-core/cuda"] | cuda = ["candle-core/cuda"] | ||||||
|   | |||||||
| @@ -9,6 +9,7 @@ use serde_json::Value; | |||||||
| use thiserror::Error; | use thiserror::Error; | ||||||
|  |  | ||||||
| use crate::documents::{self, DocumentsBatchCursorError}; | use crate::documents::{self, DocumentsBatchCursorError}; | ||||||
|  | use crate::thread_pool_no_abort::PanicCatched; | ||||||
| use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; | use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; | ||||||
|  |  | ||||||
| pub fn is_reserved_keyword(keyword: &str) -> bool { | pub fn is_reserved_keyword(keyword: &str) -> bool { | ||||||
| @@ -39,17 +40,19 @@ pub enum InternalError { | |||||||
|     Fst(#[from] fst::Error), |     Fst(#[from] fst::Error), | ||||||
|     #[error(transparent)] |     #[error(transparent)] | ||||||
|     DocumentsError(#[from] documents::Error), |     DocumentsError(#[from] documents::Error), | ||||||
|     #[error("Invalid compression type have been specified to grenad.")] |     #[error("Invalid compression type have been specified to grenad")] | ||||||
|     GrenadInvalidCompressionType, |     GrenadInvalidCompressionType, | ||||||
|     #[error("Invalid grenad file with an invalid version format.")] |     #[error("Invalid grenad file with an invalid version format")] | ||||||
|     GrenadInvalidFormatVersion, |     GrenadInvalidFormatVersion, | ||||||
|     #[error("Invalid merge while processing {process}.")] |     #[error("Invalid merge while processing {process}")] | ||||||
|     IndexingMergingKeys { process: &'static str }, |     IndexingMergingKeys { process: &'static str }, | ||||||
|     #[error("{}", HeedError::InvalidDatabaseTyping)] |     #[error("{}", HeedError::InvalidDatabaseTyping)] | ||||||
|     InvalidDatabaseTyping, |     InvalidDatabaseTyping, | ||||||
|     #[error(transparent)] |     #[error(transparent)] | ||||||
|     RayonThreadPool(#[from] ThreadPoolBuildError), |     RayonThreadPool(#[from] ThreadPoolBuildError), | ||||||
|     #[error(transparent)] |     #[error(transparent)] | ||||||
|  |     PanicInThreadPool(#[from] PanicCatched), | ||||||
|  |     #[error(transparent)] | ||||||
|     SerdeJson(#[from] serde_json::Error), |     SerdeJson(#[from] serde_json::Error), | ||||||
|     #[error(transparent)] |     #[error(transparent)] | ||||||
|     Serialization(#[from] SerializationError), |     Serialization(#[from] SerializationError), | ||||||
| @@ -57,9 +60,9 @@ pub enum InternalError { | |||||||
|     Store(#[from] MdbError), |     Store(#[from] MdbError), | ||||||
|     #[error(transparent)] |     #[error(transparent)] | ||||||
|     Utf8(#[from] str::Utf8Error), |     Utf8(#[from] str::Utf8Error), | ||||||
|     #[error("An indexation process was explicitly aborted.")] |     #[error("An indexation process was explicitly aborted")] | ||||||
|     AbortedIndexation, |     AbortedIndexation, | ||||||
|     #[error("The matching words list contains at least one invalid member.")] |     #[error("The matching words list contains at least one invalid member")] | ||||||
|     InvalidMatchingWords, |     InvalidMatchingWords, | ||||||
|     #[error(transparent)] |     #[error(transparent)] | ||||||
|     ArroyError(#[from] arroy::Error), |     ArroyError(#[from] arroy::Error), | ||||||
|   | |||||||
| @@ -678,6 +678,23 @@ impl Index { | |||||||
|             .get(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) |             .get(rtxn, main_key::USER_DEFINED_SEARCHABLE_FIELDS_KEY) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     /// Identical to `user_defined_searchable_fields`, but returns ids instead. | ||||||
|  |     pub fn user_defined_searchable_fields_ids(&self, rtxn: &RoTxn) -> Result<Option<Vec<FieldId>>> { | ||||||
|  |         match self.user_defined_searchable_fields(rtxn)? { | ||||||
|  |             Some(fields) => { | ||||||
|  |                 let fields_ids_map = self.fields_ids_map(rtxn)?; | ||||||
|  |                 let mut fields_ids = Vec::new(); | ||||||
|  |                 for name in fields { | ||||||
|  |                     if let Some(field_id) = fields_ids_map.id(name) { | ||||||
|  |                         fields_ids.push(field_id); | ||||||
|  |                     } | ||||||
|  |                 } | ||||||
|  |                 Ok(Some(fields_ids)) | ||||||
|  |             } | ||||||
|  |             None => Ok(None), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|     /* filterable fields */ |     /* filterable fields */ | ||||||
|  |  | ||||||
|     /// Writes the filterable fields names in the database. |     /// Writes the filterable fields names in the database. | ||||||
| @@ -824,11 +841,11 @@ impl Index { | |||||||
|  |  | ||||||
|     /// Identical to `user_defined_faceted_fields`, but returns ids instead. |     /// Identical to `user_defined_faceted_fields`, but returns ids instead. | ||||||
|     pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> { |     pub fn user_defined_faceted_fields_ids(&self, rtxn: &RoTxn) -> Result<HashSet<FieldId>> { | ||||||
|         let fields = self.faceted_fields(rtxn)?; |         let fields = self.user_defined_faceted_fields(rtxn)?; | ||||||
|         let fields_ids_map = self.fields_ids_map(rtxn)?; |         let fields_ids_map = self.fields_ids_map(rtxn)?; | ||||||
|  |  | ||||||
|         let mut fields_ids = HashSet::new(); |         let mut fields_ids = HashSet::new(); | ||||||
|         for name in fields.into_iter() { |         for name in fields { | ||||||
|             if let Some(field_id) = fields_ids_map.id(&name) { |             if let Some(field_id) = fields_ids_map.id(&name) { | ||||||
|                 fields_ids.insert(field_id); |                 fields_ids.insert(field_id); | ||||||
|             } |             } | ||||||
|   | |||||||
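The new `user_defined_searchable_fields_ids` helper mirrors the faceted-fields one: field names stored in the settings are translated through the index's fields-ids map, silently skipping names that have no id yet. A rough standalone sketch of that name-to-id translation, using a plain `HashMap` in place of the real `FieldsIdsMap`:

```rust
use std::collections::HashMap;

type FieldId = u16;

/// Stand-in for the index's FieldsIdsMap: maps field names to internal ids.
fn fields_ids_map() -> HashMap<String, FieldId> {
    HashMap::from([("title".to_string(), 0), ("overview".to_string(), 1)])
}

/// Same shape as the new helper: None means "all fields",
/// Some(names) is filtered down to the ids that are known.
fn searchable_fields_ids(user_defined: Option<Vec<String>>) -> Option<Vec<FieldId>> {
    let map = fields_ids_map();
    user_defined
        .map(|fields| fields.iter().filter_map(|name| map.get(name).copied()).collect())
}

fn main() {
    // "genres" has no id yet, so it is skipped rather than reported as an error.
    let ids = searchable_fields_ids(Some(vec!["title".into(), "genres".into()]));
    assert_eq!(ids, Some(vec![0]));
    assert_eq!(searchable_fields_ids(None), None);
}
```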
| @@ -21,6 +21,7 @@ pub mod prompt; | |||||||
| pub mod proximity; | pub mod proximity; | ||||||
| pub mod score_details; | pub mod score_details; | ||||||
| mod search; | mod search; | ||||||
|  | mod thread_pool_no_abort; | ||||||
| pub mod update; | pub mod update; | ||||||
| pub mod vector; | pub mod vector; | ||||||
|  |  | ||||||
| @@ -42,6 +43,7 @@ pub use search::new::{ | |||||||
|     SearchLogger, VisualSearchLogger, |     SearchLogger, VisualSearchLogger, | ||||||
| }; | }; | ||||||
| use serde_json::Value; | use serde_json::Value; | ||||||
|  | pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; | ||||||
| pub use {charabia as tokenizer, heed}; | pub use {charabia as tokenizer, heed}; | ||||||
|  |  | ||||||
| pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; | pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; | ||||||
| @@ -128,7 +130,7 @@ impl fmt::Debug for TimeBudget { | |||||||
|  |  | ||||||
| impl Default for TimeBudget { | impl Default for TimeBudget { | ||||||
|     fn default() -> Self { |     fn default() -> Self { | ||||||
|         Self::new(std::time::Duration::from_millis(150)) |         Self::new(std::time::Duration::from_millis(1500)) | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
|   | |||||||
| @@ -97,6 +97,7 @@ impl<'a> FacetDistribution<'a> { | |||||||
|     ) -> heed::Result<()> { |     ) -> heed::Result<()> { | ||||||
|         match facet_type { |         match facet_type { | ||||||
|             FacetType::Number => { |             FacetType::Number => { | ||||||
|  |                 let mut lexicographic_distribution = BTreeMap::new(); | ||||||
|                 let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); |                 let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); | ||||||
|  |  | ||||||
|                 let distribution_prelength = distribution.len(); |                 let distribution_prelength = distribution.len(); | ||||||
| @@ -111,14 +112,17 @@ impl<'a> FacetDistribution<'a> { | |||||||
|  |  | ||||||
|                     for result in iter { |                     for result in iter { | ||||||
|                         let ((_, _, value), ()) = result?; |                         let ((_, _, value), ()) = result?; | ||||||
|                         *distribution.entry(value.to_string()).or_insert(0) += 1; |                         *lexicographic_distribution.entry(value.to_string()).or_insert(0) += 1; | ||||||
|  |  | ||||||
|                         if distribution.len() - distribution_prelength == self.max_values_per_facet |                         if lexicographic_distribution.len() - distribution_prelength | ||||||
|  |                             == self.max_values_per_facet | ||||||
|                         { |                         { | ||||||
|                             break; |                             break; | ||||||
|                         } |                         } | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|  |  | ||||||
|  |                 distribution.extend(lexicographic_distribution); | ||||||
|             } |             } | ||||||
|             FacetType::String => { |             FacetType::String => { | ||||||
|                 let mut normalized_distribution = BTreeMap::new(); |                 let mut normalized_distribution = BTreeMap::new(); | ||||||
|   | |||||||
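In the facet-distribution hunk above, numeric facet values are now accumulated in their own `BTreeMap` keyed by the stringified value and only merged into `distribution` after the per-facet loop, which keeps the merged keys in lexicographic order. A small standalone sketch of the collect-then-extend pattern (the values and cap are illustrative only):

```rust
use std::collections::BTreeMap;

fn main() {
    let max_values_per_facet = 3;
    // Values as they might come back from the database iterator.
    let values = [2.0_f64, 10.0, 1.0, 30.0, 7.0];

    let mut distribution: BTreeMap<String, u64> = BTreeMap::new();
    let mut lexicographic_distribution: BTreeMap<String, u64> = BTreeMap::new();

    for value in values {
        *lexicographic_distribution.entry(value.to_string()).or_insert(0) += 1;
        // Stop once enough distinct values were collected for this facet.
        if lexicographic_distribution.len() == max_values_per_facet {
            break;
        }
    }

    distribution.extend(lexicographic_distribution);

    // BTreeMap<String, _> orders keys lexicographically: "1" < "10" < "2".
    let keys: Vec<_> = distribution.keys().cloned().collect();
    assert_eq!(keys, ["1", "10", "2"]);
}
```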
							
								
								
									
milli/src/thread_pool_no_abort.rs (new file, +69 lines)
							| @@ -0,0 +1,69 @@ | |||||||
|  | use std::sync::atomic::{AtomicBool, Ordering}; | ||||||
|  | use std::sync::Arc; | ||||||
|  |  | ||||||
|  | use rayon::{ThreadPool, ThreadPoolBuilder}; | ||||||
|  | use thiserror::Error; | ||||||
|  |  | ||||||
|  | /// A rayon ThreadPool wrapper that can catch panics in the pool | ||||||
|  | /// and surfaces them as errors from the `install` function. | ||||||
|  | #[derive(Debug)] | ||||||
|  | pub struct ThreadPoolNoAbort { | ||||||
|  |     thread_pool: ThreadPool, | ||||||
|  |     /// Set to true if the thread pool caught a panic. | ||||||
|  |     pool_catched_panic: Arc<AtomicBool>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl ThreadPoolNoAbort { | ||||||
|  |     pub fn install<OP, R>(&self, op: OP) -> Result<R, PanicCatched> | ||||||
|  |     where | ||||||
|  |         OP: FnOnce() -> R + Send, | ||||||
|  |         R: Send, | ||||||
|  |     { | ||||||
|  |         let output = self.thread_pool.install(op); | ||||||
|  |         // While resetting the pool panic catcher, we return an error if we caught one. | ||||||
|  |         if self.pool_catched_panic.swap(false, Ordering::SeqCst) { | ||||||
|  |             Err(PanicCatched) | ||||||
|  |         } else { | ||||||
|  |             Ok(output) | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn current_num_threads(&self) -> usize { | ||||||
|  |         self.thread_pool.current_num_threads() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Error, Debug)] | ||||||
|  | #[error("A panic occured. Read the logs to find more information about it")] | ||||||
|  | pub struct PanicCatched; | ||||||
|  |  | ||||||
|  | #[derive(Default)] | ||||||
|  | pub struct ThreadPoolNoAbortBuilder(ThreadPoolBuilder); | ||||||
|  |  | ||||||
|  | impl ThreadPoolNoAbortBuilder { | ||||||
|  |     pub fn new() -> ThreadPoolNoAbortBuilder { | ||||||
|  |         ThreadPoolNoAbortBuilder::default() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn thread_name<F>(mut self, closure: F) -> Self | ||||||
|  |     where | ||||||
|  |         F: FnMut(usize) -> String + 'static, | ||||||
|  |     { | ||||||
|  |         self.0 = self.0.thread_name(closure); | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn num_threads(mut self, num_threads: usize) -> ThreadPoolNoAbortBuilder { | ||||||
|  |         self.0 = self.0.num_threads(num_threads); | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn build(mut self) -> Result<ThreadPoolNoAbort, rayon::ThreadPoolBuildError> { | ||||||
|  |         let pool_catched_panic = Arc::new(AtomicBool::new(false)); | ||||||
|  |         self.0 = self.0.panic_handler({ | ||||||
|  |             let catched_panic = pool_catched_panic.clone(); | ||||||
|  |             move |_result| catched_panic.store(true, Ordering::SeqCst) | ||||||
|  |         }); | ||||||
|  |         Ok(ThreadPoolNoAbort { thread_pool: self.0.build()?, pool_catched_panic }) | ||||||
|  |     } | ||||||
|  | } | ||||||
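Since the `lib.rs` hunk above re-exports `ThreadPoolNoAbort`, `ThreadPoolNoAbortBuilder`, and `PanicCatched`, a caller can use the wrapper much like a plain rayon pool, except that `install` returns a `Result` and reports `PanicCatched` when the pool's panic handler has fired. A rough usage sketch, not how Meilisearch itself invokes it (the parallel workload is only illustrative):

```rust
use milli::ThreadPoolNoAbortBuilder;
use rayon::prelude::*;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let pool = ThreadPoolNoAbortBuilder::new()
        .thread_name(|index| format!("indexing-thread:{index}"))
        .num_threads(4)
        .build()?;

    // `install` runs the closure on the wrapped rayon pool and returns
    // Err(PanicCatched) if the pool's panic handler recorded a panic.
    let sum: u64 = pool.install(|| (0..1_000u64).into_par_iter().sum())?;
    assert_eq!(sum, 499_500);

    println!("computed on {} threads: {sum}", pool.current_num_threads());
    Ok(())
}
```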
| @@ -71,8 +71,8 @@ pub enum DelAddOperation { | |||||||
| /// putting each deletion obkv's keys under a DelAdd::Deletion | /// putting each deletion obkv's keys under a DelAdd::Deletion | ||||||
| /// and putting each addition obkv's keys under a DelAdd::Addition | /// and putting each addition obkv's keys under a DelAdd::Addition | ||||||
| pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>( | pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>( | ||||||
|     deletion: obkv::KvReader<K>, |     deletion: &obkv::KvReader<K>, | ||||||
|     addition: obkv::KvReader<K>, |     addition: &obkv::KvReader<K>, | ||||||
|     buffer: &mut Vec<u8>, |     buffer: &mut Vec<u8>, | ||||||
| ) -> Result<(), std::io::Error> { | ) -> Result<(), std::io::Error> { | ||||||
|     use itertools::merge_join_by; |     use itertools::merge_join_by; | ||||||
|   | |||||||
| @@ -1,4 +1,4 @@ | |||||||
| use std::collections::{HashMap, HashSet}; | use std::collections::HashMap; | ||||||
| use std::convert::TryInto; | use std::convert::TryInto; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::BufReader; | use std::io::BufReader; | ||||||
| @@ -12,6 +12,7 @@ use serde_json::Value; | |||||||
| use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; | use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters}; | ||||||
| use crate::error::{InternalError, SerializationError}; | use crate::error::{InternalError, SerializationError}; | ||||||
| use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; | use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd}; | ||||||
|  | use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; | ||||||
| use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; | use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH}; | ||||||
|  |  | ||||||
| pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>; | pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>; | ||||||
| @@ -25,10 +26,7 @@ pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, R | |||||||
| pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | ||||||
|     obkv_documents: grenad::Reader<R>, |     obkv_documents: grenad::Reader<R>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     searchable_fields: &Option<HashSet<FieldId>>, |     settings_diff: &InnerIndexSettingsDiff, | ||||||
|     stop_words: Option<&fst::Set<Vec<u8>>>, |  | ||||||
|     allowed_separators: Option<&[&str]>, |  | ||||||
|     dictionary: Option<&[&str]>, |  | ||||||
|     max_positions_per_attributes: Option<u32>, |     max_positions_per_attributes: Option<u32>, | ||||||
| ) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> { | ) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> { | ||||||
|     puffin::profile_function!(); |     puffin::profile_function!(); | ||||||
| @@ -36,6 +34,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|     let max_positions_per_attributes = max_positions_per_attributes |     let max_positions_per_attributes = max_positions_per_attributes | ||||||
|         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); |         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE)); | ||||||
|     let max_memory = indexer.max_memory_by_thread(); |     let max_memory = indexer.max_memory_by_thread(); | ||||||
|  |     let force_reindexing = settings_diff.reindex_searchable(); | ||||||
|  |  | ||||||
|     // initialize destination values. |     // initialize destination values. | ||||||
|     let mut documents_ids = RoaringBitmap::new(); |     let mut documents_ids = RoaringBitmap::new(); | ||||||
| @@ -56,8 +55,37 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>( | |||||||
|     let mut value_buffer = Vec::new(); |     let mut value_buffer = Vec::new(); | ||||||
|  |  | ||||||
|     // initialize tokenizer. |     // initialize tokenizer. | ||||||
|     let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None); |     let old_stop_words = settings_diff.old.stop_words.as_ref(); | ||||||
|     let tokenizer = builder.build(); |     let old_separators: Option<Vec<_>> = settings_diff | ||||||
|  |         .old | ||||||
|  |         .allowed_separators | ||||||
|  |         .as_ref() | ||||||
|  |         .map(|s| s.iter().map(String::as_str).collect()); | ||||||
|  |     let old_dictionary: Option<Vec<_>> = | ||||||
|  |         settings_diff.old.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect()); | ||||||
+    let mut del_builder = tokenizer_builder(
+        old_stop_words,
+        old_separators.as_deref(),
+        old_dictionary.as_deref(),
+        None,
+    );
+    let del_tokenizer = del_builder.build();
+
+    let new_stop_words = settings_diff.new.stop_words.as_ref();
+    let new_separators: Option<Vec<_>> = settings_diff
+        .new
+        .allowed_separators
+        .as_ref()
+        .map(|s| s.iter().map(String::as_str).collect());
+    let new_dictionary: Option<Vec<_>> =
+        settings_diff.new.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+    let mut add_builder = tokenizer_builder(
+        new_stop_words,
+        new_separators.as_deref(),
+        new_dictionary.as_deref(),
+        None,
+    );
+    let add_tokenizer = add_builder.build();
 
     // iterate over documents.
     let mut cursor = obkv_documents.into_cursor()?;
@@ -69,7 +97,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         let obkv = KvReader::<FieldId>::new(value);
 
         // if the searchable fields didn't change, skip the searchable indexing for this document.
-        if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
+        if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
             continue;
         }
 
@@ -85,11 +113,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                 // deletions
                 lang_safe_tokens_from_document(
                     &obkv,
-                    searchable_fields,
-                    &tokenizer,
-                    stop_words,
-                    allowed_separators,
-                    dictionary,
+                    &settings_diff.old,
+                    &del_tokenizer,
                     max_positions_per_attributes,
                     DelAdd::Deletion,
                     &mut del_buffers,
@@ -99,11 +124,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                 // additions
                 lang_safe_tokens_from_document(
                     &obkv,
-                    searchable_fields,
-                    &tokenizer,
-                    stop_words,
-                    allowed_separators,
-                    dictionary,
+                    &settings_diff.new,
+                    &add_tokenizer,
                     max_positions_per_attributes,
                     DelAdd::Addition,
                     &mut add_buffers,
@@ -118,8 +140,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
         value_buffer.clear();
         del_add_from_two_obkvs(
-            KvReader::<FieldId>::new(del_obkv),
-            KvReader::<FieldId>::new(add_obkv),
+            &KvReader::<FieldId>::new(del_obkv),
+            &KvReader::<FieldId>::new(add_obkv),
             &mut value_buffer,
         )?;
 
@@ -160,8 +182,9 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
 /// Check if any searchable fields of a document changed.
 fn searchable_fields_changed(
     obkv: &KvReader<FieldId>,
-    searchable_fields: &Option<HashSet<FieldId>>,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> bool {
+    let searchable_fields = &settings_diff.new.searchable_fields_ids;
     for (field_id, field_bytes) in obkv.iter() {
         if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
             let del_add = KvReaderDelAdd::new(field_bytes);
@@ -206,14 +229,10 @@ fn tokenizer_builder<'a>(
 
 /// Extract words mapped with their positions of a document,
 /// ensuring no Language detection mistakes was made.
-#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
 fn lang_safe_tokens_from_document<'a>(
     obkv: &KvReader<FieldId>,
-    searchable_fields: &Option<HashSet<FieldId>>,
+    settings: &InnerIndexSettings,
     tokenizer: &Tokenizer,
-    stop_words: Option<&fst::Set<Vec<u8>>>,
-    allowed_separators: Option<&[&str]>,
-    dictionary: Option<&[&str]>,
     max_positions_per_attributes: u32,
     del_add: DelAdd,
     buffers: &'a mut Buffers,
@@ -222,7 +241,7 @@ fn lang_safe_tokens_from_document<'a>(
 
     tokens_from_document(
         obkv,
-        searchable_fields,
+        &settings.searchable_fields_ids,
         tokenizer,
         max_positions_per_attributes,
         del_add,
@@ -246,12 +265,15 @@ fn lang_safe_tokens_from_document<'a>(
         // then we don't rerun the extraction.
         if !script_language.is_empty() {
             // build a new temporary tokenizer including the allow list.
-            let mut builder = tokenizer_builder(
-                stop_words,
-                allowed_separators,
-                dictionary,
-                Some(&script_language),
-            );
+            let stop_words = settings.stop_words.as_ref();
+            let separators: Option<Vec<_>> = settings
+                .allowed_separators
+                .as_ref()
+                .map(|s| s.iter().map(String::as_str).collect());
+            let dictionary: Option<Vec<_>> =
+                settings.dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
+            let mut builder =
+                tokenizer_builder(stop_words, separators.as_deref(), dictionary.as_deref(), None);
             let tokenizer = builder.build();
 
             script_language_word_count.clear();
@@ -259,7 +281,7 @@ fn lang_safe_tokens_from_document<'a>(
             // rerun the extraction.
             tokens_from_document(
                 obkv,
-                searchable_fields,
+                &settings.searchable_fields_ids,
                 &tokenizer,
                 max_positions_per_attributes,
                 del_add,
@@ -276,7 +298,7 @@ fn lang_safe_tokens_from_document<'a>(
 /// Extract words mapped with their positions of a document.
 fn tokens_from_document<'a>(
     obkv: &KvReader<FieldId>,
-    searchable_fields: &Option<HashSet<FieldId>>,
+    searchable_fields: &Option<Vec<FieldId>>,
     tokenizer: &Tokenizer,
     max_positions_per_attributes: u32,
     del_add: DelAdd,
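The hunk above builds two tokenizers, one from the old settings and one from the new, so deletions are re-tokenized exactly as they were originally indexed while additions use the updated configuration. Below is a minimal, self-contained sketch of that idea; the `Settings` struct and `tokenizer_for` helper are illustrative stand-ins (plain whitespace/separator splitting), not the charabia tokenizer or any Meilisearch API.

    use std::collections::HashSet;

    #[derive(Clone)]
    struct Settings {
        stop_words: HashSet<String>,
        separators: Vec<char>,
    }

    // Build a closure that tokenizes according to one version of the settings.
    fn tokenizer_for(settings: &Settings) -> impl Fn(&str) -> Vec<String> + '_ {
        move |text: &str| {
            text.split(|c: char| c.is_whitespace() || settings.separators.contains(&c))
                .filter(|t| !t.is_empty() && !settings.stop_words.contains(*t))
                .map(str::to_owned)
                .collect()
        }
    }

    fn main() {
        let old = Settings { stop_words: HashSet::new(), separators: vec![','] };
        let new = Settings { stop_words: HashSet::from(["the".to_owned()]), separators: vec![',', '-'] };

        // One tokenizer per side: deletions replay the old configuration,
        // additions use the new one, mirroring `del_tokenizer` / `add_tokenizer`.
        let del_tokenizer = tokenizer_for(&old);
        let add_tokenizer = tokenizer_for(&new);

        let doc = "the quick-brown fox";
        println!("del side: {:?}", del_tokenizer(doc)); // ["the", "quick-brown", "fox"]
        println!("add side: {:?}", add_tokenizer(doc)); // ["quick", "brown", "fox"]
    }

Keeping both tokenizers alive for the whole pass avoids rebuilding them per document, which is the same design choice the diff makes with `del_builder` and `add_builder`.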
@@ -10,6 +10,7 @@ use crate::heed_codec::facet::{
     FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
 };
 use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::Result;
 
 /// Extracts the facet number and the documents ids where this facet number appear.
@@ -20,6 +21,7 @@ use crate::Result;
 pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
     fid_docid_facet_number: grenad::Reader<R>,
     indexer: GrenadParameters,
+    _settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
 
@@ -15,6 +15,7 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::helpers::{
     merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
 };
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 
 /// Extracts the facet string and the documents ids where this facet string appear.
@@ -25,6 +26,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
     docid_fid_facet_string: grenad::Reader<R>,
     indexer: GrenadParameters,
+    _settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
     puffin::profile_function!();
 
@@ -1,5 +1,5 @@
 use std::borrow::Cow;
-use std::collections::{BTreeMap, HashSet};
+use std::collections::BTreeMap;
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::{self, BufReader};
@@ -20,6 +20,7 @@ use crate::error::InternalError;
 use crate::facet::value_encoding::f64_into_bytes;
 use crate::update::del_add::{DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{create_writer, writer_into_reader};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
 
 /// The length of the elements that are always in the buffer when inserting new values.
@@ -43,7 +44,7 @@ pub struct ExtractedFacetValues {
 pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
-    faceted_fields: &HashSet<FieldId>,
+    settings_diff: &InnerIndexSettingsDiff,
     geo_fields_ids: Option<(FieldId, FieldId)>,
 ) -> Result<ExtractedFacetValues> {
     puffin::profile_function!();
@@ -82,7 +83,9 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         let obkv = obkv::KvReader::new(value);
 
         for (field_id, field_bytes) in obkv.iter() {
-            if faceted_fields.contains(&field_id) {
+            let delete_faceted = settings_diff.old.faceted_fields_ids.contains(&field_id);
+            let add_faceted = settings_diff.new.faceted_fields_ids.contains(&field_id);
+            if delete_faceted || add_faceted {
                 numbers_key_buffer.clear();
                 strings_key_buffer.clear();
 
@@ -99,11 +102,12 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                 strings_key_buffer.extend_from_slice(docid_bytes);
 
                 let del_add_obkv = obkv::KvReader::new(field_bytes);
-                let del_value = match del_add_obkv.get(DelAdd::Deletion) {
+                let del_value = match del_add_obkv.get(DelAdd::Deletion).filter(|_| delete_faceted)
+                {
                     Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
                     None => None,
                 };
-                let add_value = match del_add_obkv.get(DelAdd::Addition) {
+                let add_value = match del_add_obkv.get(DelAdd::Addition).filter(|_| add_faceted) {
                     Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
                     None => None,
                 };
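In the facet-value extractor above, a field now contributes a deletion only if it was faceted under the old settings and an addition only if it is faceted under the new ones. A small self-contained sketch of that filtering logic follows; the `FacetDiff` type and the tuple-based del/add value are simplifications for illustration, not the internal `InnerIndexSettingsDiff` type.

    use std::collections::HashSet;

    // Stand-in for the old/new faceted field ids carried by the settings diff.
    struct FacetDiff {
        old_faceted: HashSet<u16>,
        new_faceted: HashSet<u16>,
    }

    // Given a field id and its (deletion, addition) values, keep only the sides
    // that are faceted in the corresponding settings version.
    fn filter_facet_values<'a>(
        diff: &FacetDiff,
        field_id: u16,
        del: Option<&'a str>,
        add: Option<&'a str>,
    ) -> (Option<&'a str>, Option<&'a str>) {
        let delete_faceted = diff.old_faceted.contains(&field_id);
        let add_faceted = diff.new_faceted.contains(&field_id);
        (del.filter(|_| delete_faceted), add.filter(|_| add_faceted))
    }

    fn main() {
        let diff = FacetDiff {
            old_faceted: HashSet::from([1]),
            new_faceted: HashSet::from([1, 2]),
        };
        // Field 2 just became faceted: only its addition side is extracted.
        assert_eq!(filter_facet_values(&diff, 2, Some("red"), Some("red")), (None, Some("red")));
        // Field 1 stays faceted on both sides.
        assert_eq!(filter_facet_values(&diff, 1, Some("old"), Some("new")), (Some("old"), Some("new")));
        println!("facet del/add filtering behaves as expected");
    }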
@@ -10,6 +10,7 @@ use super::helpers::{
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::Result;
 
 const MAX_COUNTED_WORDS: usize = 30;
@@ -23,6 +24,7 @@ const MAX_COUNTED_WORDS: usize = 30;
 pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
+    _settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
 
@@ -17,8 +17,9 @@ use crate::error::UserError;
 use crate::prompt::Prompt;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::helpers::try_split_at;
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::vector::Embedder;
-use crate::{DocumentId, FieldsIdsMap, InternalError, Result, VectorOrArrayOfVectors};
+use crate::{DocumentId, InternalError, Result, ThreadPoolNoAbort, VectorOrArrayOfVectors};
 
 /// The length of the elements that are always in the buffer when inserting new values.
 const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
@@ -71,12 +72,15 @@ impl VectorStateDelta {
 pub fn extract_vector_points<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
-    field_id_map: &FieldsIdsMap,
+    settings_diff: &InnerIndexSettingsDiff,
     prompt: &Prompt,
     embedder_name: &str,
 ) -> Result<ExtractedVectorPoints> {
     puffin::profile_function!();
 
+    let old_fields_ids_map = &settings_diff.old.fields_ids_map;
+    let new_fields_ids_map = &settings_diff.new.fields_ids_map;
+
     // (docid, _index) -> KvWriterDelAdd -> Vector
     let mut manual_vectors_writer = create_writer(
         indexer.chunk_compression_type,
@@ -98,8 +102,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
         tempfile::tempfile()?,
     );
 
-    let vectors_fid = field_id_map.id("_vectors");
-
     let mut key_buffer = Vec::new();
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
@@ -116,15 +118,29 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
         // lazily get it when needed
         let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
 
-        let vectors_field = vectors_fid
+        // the vector field id may have changed
+        let old_vectors_fid = old_fields_ids_map.id("_vectors");
+        // filter the old vector fid if the settings has been changed forcing reindexing.
+        let old_vectors_fid = old_vectors_fid.filter(|_| !settings_diff.reindex_vectors());
+
+        let new_vectors_fid = new_fields_ids_map.id("_vectors");
+        let vectors_field = {
+            let del = old_vectors_fid
                 .and_then(|vectors_fid| obkv.get(vectors_fid))
                 .map(KvReaderDelAdd::new)
-            .map(|obkv| to_vector_maps(obkv, document_id))
-            .transpose()?;
+                .map(|obkv| to_vector_map(obkv, DelAdd::Deletion, &document_id))
+                .transpose()?
+                .flatten();
+            let add = new_vectors_fid
+                .and_then(|vectors_fid| obkv.get(vectors_fid))
+                .map(KvReaderDelAdd::new)
+                .map(|obkv| to_vector_map(obkv, DelAdd::Addition, &document_id))
+                .transpose()?
+                .flatten();
+            (del, add)
+        };
+
-        let (del_map, add_map) = vectors_field.unzip();
-        let del_map = del_map.flatten();
-        let add_map = add_map.flatten();
+        let (del_map, add_map) = vectors_field;
 
         let del_value = del_map.and_then(|mut map| map.remove(embedder_name));
         let add_value = add_map.and_then(|mut map| map.remove(embedder_name));
@@ -155,7 +171,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
                     VectorStateDelta::NowGenerated(prompt.render(
                         obkv,
                         DelAdd::Addition,
-                        field_id_map,
+                        new_fields_ids_map,
                     )?)
                 } else {
                     VectorStateDelta::NowRemoved
@@ -182,10 +198,16 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
 
                 if document_is_kept {
                     // Don't give up if the old prompt was failing
-                    let old_prompt =
-                        prompt.render(obkv, DelAdd::Deletion, field_id_map).unwrap_or_default();
-                    let new_prompt = prompt.render(obkv, DelAdd::Addition, field_id_map)?;
-                    if old_prompt != new_prompt {
+                    let old_prompt = Some(prompt)
+                        // TODO: this filter works because we erase the vec database when a embedding setting changes.
+                        // When vector pipeline will be optimized, this should be removed.
+                        .filter(|_| !settings_diff.reindex_vectors())
+                        .map(|p| {
+                            p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default()
+                        });
+                    let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?;
+                    if old_prompt.as_ref() != Some(&new_prompt) {
+                        let old_prompt = old_prompt.unwrap_or_default();
                         tracing::trace!(
                             "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}"
                         );
@@ -207,6 +229,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
             &mut manual_vectors_writer,
             &mut key_buffer,
             delta,
+            settings_diff,
         )?;
     }
 
@@ -220,15 +243,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
     })
 }
 
-fn to_vector_maps(
-    obkv: KvReaderDelAdd,
-    document_id: impl Fn() -> Value,
-) -> Result<(Option<serde_json::Map<String, Value>>, Option<serde_json::Map<String, Value>>)> {
-    let del = to_vector_map(obkv, DelAdd::Deletion, &document_id)?;
-    let add = to_vector_map(obkv, DelAdd::Addition, &document_id)?;
-    Ok((del, add))
-}
-
 fn to_vector_map(
     obkv: KvReaderDelAdd,
     side: DelAdd,
@@ -256,10 +270,15 @@ fn push_vectors_diff(
     manual_vectors_writer: &mut Writer<BufWriter<File>>,
     key_buffer: &mut Vec<u8>,
     delta: VectorStateDelta,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<()> {
     puffin::profile_function!();
     let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
-    if must_remove {
+    if must_remove
+    // TODO: the below condition works because we erase the vec database when a embedding setting changes.
+    // When vector pipeline will be optimized, this should be removed.
+    && !settings_diff.reindex_vectors()
+    {
         key_buffer.truncate(TRUNCATE_SIZE);
         remove_vectors_writer.insert(&key_buffer, [])?;
     }
@@ -287,6 +306,9 @@ fn push_vectors_diff(
         match eob {
             EitherOrBoth::Both(_, _) => (), // no need to touch anything
             EitherOrBoth::Left(vector) => {
+                // TODO: the below condition works because we erase the vec database when a embedding setting changes.
+                // When vector pipeline will be optimized, this should be removed.
+                if !settings_diff.reindex_vectors() {
                    // We insert only the Del part of the Obkv to inform
                    // that we only want to remove all those vectors.
                    let mut obkv = KvWriterDelAdd::memory();
@@ -294,6 +316,7 @@ fn push_vectors_diff(
                    let bytes = obkv.into_inner()?;
                    manual_vectors_writer.insert(&key_buffer, bytes)?;
                }
+            }
             EitherOrBoth::Right(vector) => {
                 // We insert only the Add part of the Obkv to inform
                 // that we only want to remove all those vectors.
@@ -339,7 +362,7 @@ pub fn extract_embeddings<R: io::Read + io::Seek>(
     prompt_reader: grenad::Reader<R>,
     indexer: GrenadParameters,
     embedder: Arc<Embedder>,
-    request_threads: &rayon::ThreadPool,
+    request_threads: &ThreadPoolNoAbort,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
     let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism
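In the vector extractor above, the `_vectors` field is now looked up in both the old and the new fields-ids maps, and the old side is ignored when a settings change already forces a vector reindex. The sketch below restates that resolution step with stand-in types; `VectorDiff`, its fields, and `vectors_fids` are illustrative names, not Meilisearch APIs.

    use std::collections::HashMap;

    // Stand-ins for the old/new fields-ids maps and the reindex flag from the diff above.
    struct VectorDiff {
        old_fields: HashMap<String, u16>,
        new_fields: HashMap<String, u16>,
        reindex_vectors: bool,
    }

    // Resolve the `_vectors` field id on both sides; the old side is dropped when a
    // settings change means the vector database will be rebuilt anyway.
    fn vectors_fids(diff: &VectorDiff) -> (Option<u16>, Option<u16>) {
        let old = diff.old_fields.get("_vectors").copied().filter(|_| !diff.reindex_vectors);
        let new = diff.new_fields.get("_vectors").copied();
        (old, new)
    }

    fn main() {
        let diff = VectorDiff {
            old_fields: HashMap::from([("_vectors".to_owned(), 3)]),
            new_fields: HashMap::from([("_vectors".to_owned(), 5)]),
            reindex_vectors: true,
        };
        // With a forced reindex, only the addition side keeps a field id.
        assert_eq!(vectors_fids(&diff), (None, Some(5)));
    }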
@@ -1,20 +1,23 @@
-use std::collections::{BTreeSet, HashSet};
+use std::collections::BTreeSet;
 use std::fs::File;
 use std::io::{self, BufReader};
 
-use heed::BytesDecode;
+use heed::{BytesDecode, BytesEncode};
 use obkv::KvReaderU16;
+use roaring::RoaringBitmap;
 
 use super::helpers::{
-    create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
-    try_split_array_at, writer_into_reader, GrenadParameters,
+    create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
+    writer_into_reader, GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::index_documents::helpers::sorter_into_reader;
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::update::MergeFn;
-use crate::{DocumentId, FieldId, Result};
+use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
 
 /// Extracts the word and the documents ids where this word appear.
 ///
@@ -27,7 +30,7 @@ use crate::{DocumentId, FieldId, Result};
 pub fn extract_word_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
-    exact_attributes: &HashSet<FieldId>,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<(
     grenad::Reader<BufReader<File>>,
     grenad::Reader<BufReader<File>>,
@@ -43,7 +46,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory.map(|x| x / 3),
+        max_memory.map(|m| m / 3),
     );
     let mut key_buffer = Vec::new();
     let mut del_words = BTreeSet::new();
@@ -85,13 +88,19 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         add_words.clear();
     }
 
+    let mut word_fid_docids_writer = create_writer(
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        tempfile::tempfile()?,
+    );
+
     let mut word_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
         merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory.map(|x| x / 3),
+        max_memory.map(|m| m / 3),
     );
 
     let mut exact_word_docids_sorter = create_sorter(
@@ -100,31 +109,45 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory.map(|x| x / 3),
-    );
-
-    let mut word_fid_docids_writer = create_writer(
-        indexer.chunk_compression_type,
-        indexer.chunk_compression_level,
-        tempfile::tempfile()?,
+        max_memory.map(|m| m / 3),
     );
 
     let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
-    // TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
+    let mut buffer = Vec::new();
+    // NOTE: replacing sorters by bitmap merging is less efficient, so, use sorters.
     while let Some((key, value)) = iter.next()? {
         // only keep the value if their is a change to apply in the DB.
         if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
             word_fid_docids_writer.insert(key, value)?;
         }
 
-        let (word, fid) = StrBEU16Codec::bytes_decode(key)
+        let (w, fid) = StrBEU16Codec::bytes_decode(key)
             .map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
 
-        // every words contained in an attribute set to exact must be pushed in the exact_words list.
-        if exact_attributes.contains(&fid) {
-            exact_word_docids_sorter.insert(word.as_bytes(), value)?;
+        // merge all deletions
+        let obkv = KvReaderDelAdd::new(value);
+        if let Some(value) = obkv.get(DelAdd::Deletion) {
+            let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
+            buffer.clear();
+            let mut obkv = KvWriterDelAdd::new(&mut buffer);
+            obkv.insert(DelAdd::Deletion, value)?;
+            if delete_from_exact {
+                exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
             } else {
-            word_docids_sorter.insert(word.as_bytes(), value)?;
+                word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
+            }
+        }
+        // merge all additions
+        if let Some(value) = obkv.get(DelAdd::Addition) {
+            let add_in_exact = settings_diff.new.exact_attributes.contains(&fid);
+            buffer.clear();
+            let mut obkv = KvWriterDelAdd::new(&mut buffer);
+            obkv.insert(DelAdd::Addition, value)?;
+            if add_in_exact {
+                exact_word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
+            } else {
+                word_docids_sorter.insert(w, obkv.into_inner().unwrap())?;
+            }
         }
     }
 
@@ -178,3 +201,45 @@ fn words_into_sorter(
 
     Ok(())
 }
+
+#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")]
+fn docids_into_writers<W>(
+    word: &str,
+    deletions: &RoaringBitmap,
+    additions: &RoaringBitmap,
+    writer: &mut grenad::Writer<W>,
+) -> Result<()>
+where
+    W: std::io::Write,
+{
+    if deletions == additions {
+        // if the same value is deleted and added, do nothing.
+        return Ok(());
+    }
+
+    // Write each value in the same KvDelAdd before inserting it in the final writer.
+    let mut obkv = KvWriterDelAdd::memory();
+    // deletions:
+    if !deletions.is_empty() && !deletions.is_subset(additions) {
+        obkv.insert(
+            DelAdd::Deletion,
+            CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| {
+                SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
+            })?,
+        )?;
+    }
+    // additions:
+    if !additions.is_empty() {
+        obkv.insert(
+            DelAdd::Addition,
+            CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| {
+                SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) }
+            })?,
+        )?;
+    }
+
+    // insert everything in the same writer.
+    writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?;
+
+    Ok(())
+}
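The key change in the word-docids extractor above is that each word's deletion is routed according to the old exact attributes and its addition according to the new ones, so a settings change can move entries between the exact and regular databases. Below is a compact sketch of that routing decision only; the `Target` enum and `ExactDiff` struct are illustrative stand-ins for the grenad sorters and the settings diff.

    use std::collections::HashSet;

    #[derive(Debug, PartialEq)]
    enum Target {
        ExactWordDocids,
        WordDocids,
    }

    // Stand-in for the old/new exact attributes carried by the settings diff.
    struct ExactDiff {
        old_exact: HashSet<u16>,
        new_exact: HashSet<u16>,
    }

    // A deletion goes where the entry used to live, an addition where it must live now.
    fn route(diff: &ExactDiff, fid: u16) -> (Target, Target) {
        let del = if diff.old_exact.contains(&fid) { Target::ExactWordDocids } else { Target::WordDocids };
        let add = if diff.new_exact.contains(&fid) { Target::ExactWordDocids } else { Target::WordDocids };
        (del, add)
    }

    fn main() {
        let diff = ExactDiff { old_exact: HashSet::from([7]), new_exact: HashSet::new() };
        // Field 7 is no longer exact: its old entry is removed from the exact database
        // and re-added to the regular word docids database.
        assert_eq!(route(&diff, 7), (Target::ExactWordDocids, Target::WordDocids));
    }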
@@ -11,8 +11,9 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::proximity::{index_proximity, MAX_DISTANCE};
+use crate::proximity::{index_proximity, ProximityPrecision, MAX_DISTANCE};
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::{DocumentId, Result};
 
 /// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
@@ -23,8 +24,21 @@ use crate::{DocumentId, Result};
 pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
+    settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
+    let any_deletion = settings_diff.old.proximity_precision == ProximityPrecision::ByWord;
+    let any_addition = settings_diff.new.proximity_precision == ProximityPrecision::ByWord;
 
+    // early return if the data shouldn't be deleted nor created.
+    if !any_deletion && !any_addition {
+        let writer = create_writer(
+            indexer.chunk_compression_type,
+            indexer.chunk_compression_level,
+            tempfile::tempfile()?,
+        );
+        return writer_into_reader(writer);
+    }
+
     let max_memory = indexer.max_memory_by_thread();
 
@@ -77,6 +91,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 
         let (del, add): (Result<_>, Result<_>) = rayon::join(
             || {
+                if !any_deletion {
+                    return Ok(());
+                }
+
                 // deletions
                 if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
                     for (position, word) in KvReaderU16::new(deletion).iter() {
@@ -106,6 +124,10 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
                 Ok(())
             },
             || {
+                if !any_addition {
+                    return Ok(());
+                }
+
                 // additions
                 if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
                     for (position, word) in KvReaderU16::new(addition).iter() {
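The proximity extractor above now returns an empty reader immediately when neither the old nor the new proximity precision is the by-word setting, instead of scanning every document. A tiny self-contained sketch of that gate follows; the `Precision` enum and `extract_if_needed` function are illustrative stand-ins, and a plain `Vec` stands in for the grenad writer.

    #[derive(Clone, Copy, PartialEq)]
    enum Precision {
        ByWord,
        ByAttribute,
    }

    // Skip the whole pass when neither side stores word-pair proximity data.
    fn extract_if_needed(old: Precision, new: Precision) -> Vec<(String, Vec<u32>)> {
        let any_deletion = old == Precision::ByWord;
        let any_addition = new == Precision::ByWord;
        if !any_deletion && !any_addition {
            // Equivalent of returning an empty writer: no work, empty output.
            return Vec::new();
        }
        // ... otherwise the (expensive) word-pair extraction would run here.
        vec![("example proximity entry".to_owned(), vec![1, 2, 3])]
    }

    fn main() {
        assert!(extract_if_needed(Precision::ByAttribute, Precision::ByAttribute).is_empty());
        assert!(!extract_if_needed(Precision::ByWord, Precision::ByAttribute).is_empty());
    }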
@@ -11,6 +11,7 @@ use super::helpers::{
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::settings::InnerIndexSettingsDiff;
 use crate::update::MergeFn;
 use crate::{bucketed_position, DocumentId, Result};
 
@@ -22,6 +23,7 @@ use crate::{bucketed_position, DocumentId, Result};
 pub fn extract_word_position_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
+    _settings_diff: &InnerIndexSettingsDiff,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
 
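Across these extractors, every function now takes the settings diff as a third argument, even when it is unused (the `_settings_diff` parameters), so they all satisfy the same `FE` closure bound in `run_extraction_task` in the hunks that follow. A generic sketch of that dispatch pattern with stand-in types (`SettingsDiff`, `Chunk`, `Output`, and `run_task` are illustrative names, not the real grenad- and channel-based signatures):

    use std::sync::Arc;

    struct SettingsDiff { proximity_changed: bool }
    struct Chunk(Vec<u8>);
    struct Output(usize);

    // Every extractor shares the same shape, so each can be passed as a plain fn item.
    fn run_task<F>(chunk: Chunk, diff: Arc<SettingsDiff>, extract_fn: F) -> Output
    where
        F: Fn(Chunk, &SettingsDiff) -> Output,
    {
        extract_fn(chunk, &diff)
    }

    fn extract_word_count(chunk: Chunk, _diff: &SettingsDiff) -> Output {
        Output(chunk.0.len()) // ignores the diff, like the `_settings_diff` extractors
    }

    fn extract_proximity(chunk: Chunk, diff: &SettingsDiff) -> Output {
        if !diff.proximity_changed { Output(0) } else { Output(chunk.0.len()) }
    }

    fn main() {
        let diff = Arc::new(SettingsDiff { proximity_changed: false });
        println!("{}", run_task(Chunk(vec![1, 2, 3]), diff.clone(), extract_word_count).0); // 3
        println!("{}", run_task(Chunk(vec![1, 2, 3]), diff, extract_proximity).0); // 0
    }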
| @@ -9,9 +9,9 @@ mod extract_word_docids; | |||||||
| mod extract_word_pair_proximity_docids; | mod extract_word_pair_proximity_docids; | ||||||
| mod extract_word_position_docids; | mod extract_word_position_docids; | ||||||
|  |  | ||||||
| use std::collections::HashSet; |  | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::BufReader; | use std::io::BufReader; | ||||||
|  | use std::sync::Arc; | ||||||
|  |  | ||||||
| use crossbeam_channel::Sender; | use crossbeam_channel::Sender; | ||||||
| use rayon::prelude::*; | use rayon::prelude::*; | ||||||
| @@ -30,9 +30,8 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids | |||||||
| use self::extract_word_position_docids::extract_word_position_docids; | use self::extract_word_position_docids::extract_word_position_docids; | ||||||
| use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; | use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; | ||||||
| use super::{helpers, TypedChunk}; | use super::{helpers, TypedChunk}; | ||||||
| use crate::proximity::ProximityPrecision; | use crate::update::settings::InnerIndexSettingsDiff; | ||||||
| use crate::vector::EmbeddingConfigs; | use crate::{FieldId, Result, ThreadPoolNoAbortBuilder}; | ||||||
| use crate::{FieldId, FieldsIdsMap, Result}; |  | ||||||
|  |  | ||||||
| /// Extract data for each databases from obkv documents in parallel. | /// Extract data for each databases from obkv documents in parallel. | ||||||
| /// Send data in grenad file over provided Sender. | /// Send data in grenad file over provided Sender. | ||||||
| @@ -43,18 +42,10 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|     flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send, |     flattened_obkv_chunks: impl Iterator<Item = Result<grenad::Reader<BufReader<File>>>> + Send, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, |     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||||
|     searchable_fields: Option<HashSet<FieldId>>, |  | ||||||
|     faceted_fields: HashSet<FieldId>, |  | ||||||
|     primary_key_id: FieldId, |     primary_key_id: FieldId, | ||||||
|     geo_fields_ids: Option<(FieldId, FieldId)>, |     geo_fields_ids: Option<(FieldId, FieldId)>, | ||||||
|     field_id_map: FieldsIdsMap, |     settings_diff: Arc<InnerIndexSettingsDiff>, | ||||||
|     stop_words: Option<fst::Set<Vec<u8>>>, |  | ||||||
|     allowed_separators: Option<&[&str]>, |  | ||||||
|     dictionary: Option<&[&str]>, |  | ||||||
|     max_positions_per_attributes: Option<u32>, |     max_positions_per_attributes: Option<u32>, | ||||||
|     exact_attributes: HashSet<FieldId>, |  | ||||||
|     proximity_precision: ProximityPrecision, |  | ||||||
|     embedders: EmbeddingConfigs, |  | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     puffin::profile_function!(); |     puffin::profile_function!(); | ||||||
|  |  | ||||||
| @@ -67,8 +58,7 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|                         original_documents_chunk, |                         original_documents_chunk, | ||||||
|                         indexer, |                         indexer, | ||||||
|                         lmdb_writer_sx.clone(), |                         lmdb_writer_sx.clone(), | ||||||
|                         field_id_map.clone(), |                         settings_diff.clone(), | ||||||
|                         embedders.clone(), |  | ||||||
|                     ) |                     ) | ||||||
|                 }) |                 }) | ||||||
|                 .collect::<Result<()>>() |                 .collect::<Result<()>>() | ||||||
| @@ -81,13 +71,9 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|                         flattened_obkv_chunks, |                         flattened_obkv_chunks, | ||||||
|                         indexer, |                         indexer, | ||||||
|                         lmdb_writer_sx.clone(), |                         lmdb_writer_sx.clone(), | ||||||
|                         &searchable_fields, |  | ||||||
|                         &faceted_fields, |  | ||||||
|                         primary_key_id, |                         primary_key_id, | ||||||
|                         geo_fields_ids, |                         geo_fields_ids, | ||||||
|                         &stop_words, |                         settings_diff.clone(), | ||||||
|                         &allowed_separators, |  | ||||||
|                         &dictionary, |  | ||||||
|                         max_positions_per_attributes, |                         max_positions_per_attributes, | ||||||
|                     ) |                     ) | ||||||
|                 }) |                 }) | ||||||
| @@ -100,13 +86,12 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( |                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( | ||||||
|                             docid_word_positions_chunk.clone(), |                             docid_word_positions_chunk.clone(), | ||||||
|                             indexer, |                             indexer, | ||||||
|  |                             settings_diff.clone(), | ||||||
|                             lmdb_writer_sx.clone(), |                             lmdb_writer_sx.clone(), | ||||||
|                             extract_fid_word_count_docids, |                             extract_fid_word_count_docids, | ||||||
|                             TypedChunk::FieldIdWordCountDocids, |                             TypedChunk::FieldIdWordCountDocids, | ||||||
|                             "field-id-wordcount-docids", |                             "field-id-wordcount-docids", | ||||||
|                         ); |                         ); | ||||||
|  |  | ||||||
|                         let exact_attributes = exact_attributes.clone(); |  | ||||||
|                         run_extraction_task::< |                         run_extraction_task::< | ||||||
|                             _, |                             _, | ||||||
|                             _, |                             _, | ||||||
| @@ -118,10 +103,9 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|                         >( |                         >( | ||||||
|                             docid_word_positions_chunk.clone(), |                             docid_word_positions_chunk.clone(), | ||||||
|                             indexer, |                             indexer, | ||||||
|  |                             settings_diff.clone(), | ||||||
|                             lmdb_writer_sx.clone(), |                             lmdb_writer_sx.clone(), | ||||||
|                             move |doc_word_pos, indexer| { |                             extract_word_docids, | ||||||
|                                 extract_word_docids(doc_word_pos, indexer, &exact_attributes) |  | ||||||
|                             }, |  | ||||||
|                             |( |                             |( | ||||||
|                                 word_docids_reader, |                                 word_docids_reader, | ||||||
|                                 exact_word_docids_reader, |                                 exact_word_docids_reader, | ||||||
| @@ -139,6 +123,7 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( |                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( | ||||||
|                             docid_word_positions_chunk.clone(), |                             docid_word_positions_chunk.clone(), | ||||||
|                             indexer, |                             indexer, | ||||||
|  |                             settings_diff.clone(), | ||||||
|                             lmdb_writer_sx.clone(), |                             lmdb_writer_sx.clone(), | ||||||
|                             extract_word_position_docids, |                             extract_word_position_docids, | ||||||
|                             TypedChunk::WordPositionDocids, |                             TypedChunk::WordPositionDocids, | ||||||
| @@ -152,6 +137,7 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|                         >( |                         >( | ||||||
|                             fid_docid_facet_strings_chunk.clone(), |                             fid_docid_facet_strings_chunk.clone(), | ||||||
|                             indexer, |                             indexer, | ||||||
|  |                             settings_diff.clone(), | ||||||
|                             lmdb_writer_sx.clone(), |                             lmdb_writer_sx.clone(), | ||||||
|                             extract_facet_string_docids, |                             extract_facet_string_docids, | ||||||
|                             TypedChunk::FieldIdFacetStringDocids, |                             TypedChunk::FieldIdFacetStringDocids, | ||||||
| @@ -161,23 +147,23 @@ pub(crate) fn data_from_obkv_documents( | |||||||
|                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( |                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( | ||||||
|                             fid_docid_facet_numbers_chunk.clone(), |                             fid_docid_facet_numbers_chunk.clone(), | ||||||
|                             indexer, |                             indexer, | ||||||
|  |                             settings_diff.clone(), | ||||||
|                             lmdb_writer_sx.clone(), |                             lmdb_writer_sx.clone(), | ||||||
|                             extract_facet_number_docids, |                             extract_facet_number_docids, | ||||||
|                             TypedChunk::FieldIdFacetNumberDocids, |                             TypedChunk::FieldIdFacetNumberDocids, | ||||||
|                             "field-id-facet-number-docids", |                             "field-id-facet-number-docids", | ||||||
|                         ); |                         ); | ||||||
|  |  | ||||||
|                         if proximity_precision == ProximityPrecision::ByWord { |  | ||||||
|                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( |                         run_extraction_task::<_, _, grenad::Reader<BufReader<File>>>( | ||||||
|                             docid_word_positions_chunk.clone(), |                             docid_word_positions_chunk.clone(), | ||||||
|                             indexer, |                             indexer, | ||||||
|  |                             settings_diff.clone(), | ||||||
|                             lmdb_writer_sx.clone(), |                             lmdb_writer_sx.clone(), | ||||||
|                             extract_word_pair_proximity_docids, |                             extract_word_pair_proximity_docids, | ||||||
|                             TypedChunk::WordPairProximityDocids, |                             TypedChunk::WordPairProximityDocids, | ||||||
|                             "word-pair-proximity-docids", |                             "word-pair-proximity-docids", | ||||||
|                         ); |                         ); | ||||||
|                     } |                     } | ||||||
|                     } |  | ||||||
|  |  | ||||||
|                     Ok(()) |                     Ok(()) | ||||||
|                 }) |                 }) | ||||||
| @@ -195,12 +181,17 @@ pub(crate) fn data_from_obkv_documents( | |||||||
| fn run_extraction_task<FE, FS, M>( | fn run_extraction_task<FE, FS, M>( | ||||||
|     chunk: grenad::Reader<CursorClonableMmap>, |     chunk: grenad::Reader<CursorClonableMmap>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|  |     settings_diff: Arc<InnerIndexSettingsDiff>, | ||||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, |     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||||
|     extract_fn: FE, |     extract_fn: FE, | ||||||
|     serialize_fn: FS, |     serialize_fn: FS, | ||||||
|     name: &'static str, |     name: &'static str, | ||||||
| ) where | ) where | ||||||
|     FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M> |     FE: Fn( | ||||||
|  |             grenad::Reader<CursorClonableMmap>, | ||||||
|  |             GrenadParameters, | ||||||
|  |             &InnerIndexSettingsDiff, | ||||||
|  |         ) -> Result<M> | ||||||
|         + Sync |         + Sync | ||||||
|         + Send |         + Send | ||||||
|         + 'static, |         + 'static, | ||||||
| @@ -213,7 +204,7 @@ fn run_extraction_task<FE, FS, M>( | |||||||
|         let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks"); |         let child_span = tracing::trace_span!(target: "indexing::extract::details", parent: &current_span, "extract_multiple_chunks"); | ||||||
|         let _entered = child_span.enter(); |         let _entered = child_span.enter(); | ||||||
|         puffin::profile_scope!("extract_multiple_chunks", name); |         puffin::profile_scope!("extract_multiple_chunks", name); | ||||||
|         match extract_fn(chunk, indexer) { |         match extract_fn(chunk, indexer, &settings_diff) { | ||||||
|             Ok(chunk) => { |             Ok(chunk) => { | ||||||
|                 let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk))); |                 let _ = lmdb_writer_sx.send(Ok(serialize_fn(chunk))); | ||||||
|             } |             } | ||||||
| @@ -230,8 +221,7 @@ fn send_original_documents_data( | |||||||
|     original_documents_chunk: Result<grenad::Reader<BufReader<File>>>, |     original_documents_chunk: Result<grenad::Reader<BufReader<File>>>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, |     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||||
|     field_id_map: FieldsIdsMap, |     settings_diff: Arc<InnerIndexSettingsDiff>, | ||||||
|     embedders: EmbeddingConfigs, |  | ||||||
| ) -> Result<()> { | ) -> Result<()> { | ||||||
|     let original_documents_chunk = |     let original_documents_chunk = | ||||||
|         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; |         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?; | ||||||
| @@ -239,17 +229,19 @@ fn send_original_documents_data( | |||||||
|     let documents_chunk_cloned = original_documents_chunk.clone(); |     let documents_chunk_cloned = original_documents_chunk.clone(); | ||||||
|     let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); |     let lmdb_writer_sx_cloned = lmdb_writer_sx.clone(); | ||||||
|  |  | ||||||
|     let request_threads = rayon::ThreadPoolBuilder::new() |     let request_threads = ThreadPoolNoAbortBuilder::new() | ||||||
|         .num_threads(crate::vector::REQUEST_PARALLELISM) |         .num_threads(crate::vector::REQUEST_PARALLELISM) | ||||||
|         .thread_name(|index| format!("embedding-request-{index}")) |         .thread_name(|index| format!("embedding-request-{index}")) | ||||||
|         .build()?; |         .build()?; | ||||||
|  |  | ||||||
|  |     if settings_diff.reindex_vectors() || !settings_diff.settings_update_only() { | ||||||
|  |         let settings_diff = settings_diff.clone(); | ||||||
|         rayon::spawn(move || { |         rayon::spawn(move || { | ||||||
|         for (name, (embedder, prompt)) in embedders { |             for (name, (embedder, prompt)) in settings_diff.new.embedding_configs.clone() { | ||||||
|                 let result = extract_vector_points( |                 let result = extract_vector_points( | ||||||
|                     documents_chunk_cloned.clone(), |                     documents_chunk_cloned.clone(), | ||||||
|                     indexer, |                     indexer, | ||||||
|                 &field_id_map, |                     &settings_diff, | ||||||
|                     &prompt, |                     &prompt, | ||||||
|                     &name, |                     &name, | ||||||
|                 ); |                 ); | ||||||
| @@ -288,6 +280,7 @@ fn send_original_documents_data( | |||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         }); |         }); | ||||||
|  |     } | ||||||
|  |  | ||||||
|     // TODO: create a custom internal error |     // TODO: create a custom internal error | ||||||
|     let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))); |     let _ = lmdb_writer_sx.send(Ok(TypedChunk::Documents(original_documents_chunk))); | ||||||
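
The hunk above now spawns the embedding extraction only when the settings diff asks for it, via `reindex_vectors() || !settings_update_only()`: either the embedders changed, or real documents are being added. A minimal sketch of that guard, using a hypothetical two-flag `SettingsDiff` rather than the real `InnerIndexSettingsDiff`:

// Hypothetical flags mirroring `InnerIndexSettingsDiff::reindex_vectors()`
// and `settings_update_only()`; not the real milli type.
struct SettingsDiff {
    embedding_configs_updated: bool,
    settings_update_only: bool,
}

impl SettingsDiff {
    fn reindex_vectors(&self) -> bool {
        self.embedding_configs_updated
    }
}

fn maybe_extract_vectors(diff: &SettingsDiff) -> bool {
    // Run the (expensive) embedding extraction only when the embedders
    // changed, or when this is a document addition rather than a pure
    // settings update.
    if diff.reindex_vectors() || !diff.settings_update_only {
        // In the real code this is where `extract_vector_points` runs on a
        // dedicated request thread pool.
        return true;
    }
    false
}

fn main() {
    // A settings-only update that does not touch embedders skips the work.
    let skip = SettingsDiff { embedding_configs_updated: false, settings_update_only: true };
    assert!(!maybe_extract_vectors(&skip));

    // A document addition always goes through vector extraction.
    let docs = SettingsDiff { embedding_configs_updated: false, settings_update_only: false };
    assert!(maybe_extract_vectors(&docs));
}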
| @@ -306,13 +299,9 @@ fn send_and_extract_flattened_documents_data( | |||||||
|     flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>, |     flattened_documents_chunk: Result<grenad::Reader<BufReader<File>>>, | ||||||
|     indexer: GrenadParameters, |     indexer: GrenadParameters, | ||||||
|     lmdb_writer_sx: Sender<Result<TypedChunk>>, |     lmdb_writer_sx: Sender<Result<TypedChunk>>, | ||||||
|     searchable_fields: &Option<HashSet<FieldId>>, |  | ||||||
|     faceted_fields: &HashSet<FieldId>, |  | ||||||
|     primary_key_id: FieldId, |     primary_key_id: FieldId, | ||||||
|     geo_fields_ids: Option<(FieldId, FieldId)>, |     geo_fields_ids: Option<(FieldId, FieldId)>, | ||||||
|     stop_words: &Option<fst::Set<Vec<u8>>>, |     settings_diff: Arc<InnerIndexSettingsDiff>, | ||||||
|     allowed_separators: &Option<&[&str]>, |  | ||||||
|     dictionary: &Option<&[&str]>, |  | ||||||
|     max_positions_per_attributes: Option<u32>, |     max_positions_per_attributes: Option<u32>, | ||||||
| ) -> Result<( | ) -> Result<( | ||||||
|     grenad::Reader<CursorClonableMmap>, |     grenad::Reader<CursorClonableMmap>, | ||||||
| @@ -341,10 +330,7 @@ fn send_and_extract_flattened_documents_data( | |||||||
|                     extract_docid_word_positions( |                     extract_docid_word_positions( | ||||||
|                         flattened_documents_chunk.clone(), |                         flattened_documents_chunk.clone(), | ||||||
|                         indexer, |                         indexer, | ||||||
|                         searchable_fields, |                         &settings_diff, | ||||||
|                         stop_words.as_ref(), |  | ||||||
|                         *allowed_separators, |  | ||||||
|                         *dictionary, |  | ||||||
|                         max_positions_per_attributes, |                         max_positions_per_attributes, | ||||||
|                     )?; |                     )?; | ||||||
|  |  | ||||||
| @@ -367,7 +353,7 @@ fn send_and_extract_flattened_documents_data( | |||||||
|                 } = extract_fid_docid_facet_values( |                 } = extract_fid_docid_facet_values( | ||||||
|                     flattened_documents_chunk.clone(), |                     flattened_documents_chunk.clone(), | ||||||
|                     indexer, |                     indexer, | ||||||
|                     faceted_fields, |                     &settings_diff, | ||||||
|                     geo_fields_ids, |                     geo_fields_ids, | ||||||
|                 )?; |                 )?; | ||||||
|  |  | ||||||
|   | |||||||
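
The extract/mod.rs hunks above replace the long list of per-extractor parameters (searchable fields, faceted fields, stop words, separators, dictionary, proximity precision, embedders, ...) with a single shared `Arc<InnerIndexSettingsDiff>` that every extraction closure borrows. A self-contained sketch of that pattern, with hypothetical `SettingsSnapshot`/`SettingsDiff` stand-ins instead of the real milli types:

use std::sync::Arc;
use std::thread;

// Hypothetical, trimmed-down stand-ins for `InnerIndexSettings` and
// `InnerIndexSettingsDiff`; only meant to show how one shared value replaces
// the many per-extractor parameters removed above.
#[derive(Clone, Default)]
struct SettingsSnapshot {
    stop_words: Vec<String>,
    searchable_fields: Vec<String>,
}

struct SettingsDiff {
    old: SettingsSnapshot,
    new: SettingsSnapshot,
}

// Mirrors the new `FE: Fn(..., &InnerIndexSettingsDiff) -> Result<M>` bound:
// every extraction task borrows the same immutable diff.
fn run_extraction_task<F>(
    chunk: Vec<u8>,
    settings_diff: Arc<SettingsDiff>,
    extract_fn: F,
) -> thread::JoinHandle<()>
where
    F: Fn(&[u8], &SettingsDiff) + Send + 'static,
{
    thread::spawn(move || extract_fn(&chunk, &settings_diff))
}

fn main() {
    let diff = Arc::new(SettingsDiff {
        old: SettingsSnapshot::default(),
        new: SettingsSnapshot {
            stop_words: vec!["the".to_string()],
            searchable_fields: vec!["title".to_string()],
        },
    });

    // Cloning the `Arc` is cheap, so each chunk gets its own handle to the
    // same settings diff instead of a copy of every individual setting.
    let handles: Vec<_> = [vec![1u8, 2], vec![3, 4, 5]]
        .into_iter()
        .map(|chunk| {
            run_extraction_task(chunk, diff.clone(), |chunk: &[u8], diff: &SettingsDiff| {
                println!("{} bytes, {} stop words", chunk.len(), diff.new.stop_words.len());
            })
        })
        .collect();

    for handle in handles {
        handle.join().unwrap();
    }
}

Sharing one `Arc` keeps the extractors' signatures stable when a new setting becomes relevant: only the diff type grows, not every call site.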
| @@ -6,9 +6,9 @@ mod typed_chunk; | |||||||
|  |  | ||||||
| use std::collections::{HashMap, HashSet}; | use std::collections::{HashMap, HashSet}; | ||||||
| use std::io::{Read, Seek}; | use std::io::{Read, Seek}; | ||||||
| use std::iter::FromIterator; |  | ||||||
| use std::num::NonZeroU32; | use std::num::NonZeroU32; | ||||||
| use std::result::Result as StdResult; | use std::result::Result as StdResult; | ||||||
|  | use std::sync::Arc; | ||||||
|  |  | ||||||
| use crossbeam_channel::{Receiver, Sender}; | use crossbeam_channel::{Receiver, Sender}; | ||||||
| use grenad::{Merger, MergerBuilder}; | use grenad::{Merger, MergerBuilder}; | ||||||
| @@ -33,6 +33,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; | |||||||
| pub use self::transform::{Transform, TransformOutput}; | pub use self::transform::{Transform, TransformOutput}; | ||||||
| use crate::documents::{obkv_to_object, DocumentsBatchReader}; | use crate::documents::{obkv_to_object, DocumentsBatchReader}; | ||||||
| use crate::error::{Error, InternalError, UserError}; | use crate::error::{Error, InternalError, UserError}; | ||||||
|  | use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder; | ||||||
| pub use crate::update::index_documents::helpers::CursorClonableMmap; | pub use crate::update::index_documents::helpers::CursorClonableMmap; | ||||||
| use crate::update::{ | use crate::update::{ | ||||||
|     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, |     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, | ||||||
| @@ -259,21 +260,6 @@ where | |||||||
|             .expect("Invalid document addition state") |             .expect("Invalid document addition state") | ||||||
|             .output_from_sorter(self.wtxn, &self.progress)?; |             .output_from_sorter(self.wtxn, &self.progress)?; | ||||||
|  |  | ||||||
|         let new_facets = output.compute_real_facets(self.wtxn, self.index)?; |  | ||||||
|         self.index.put_faceted_fields(self.wtxn, &new_facets)?; |  | ||||||
|  |  | ||||||
|         // in case new fields were introduced we're going to recreate the searchable fields. |  | ||||||
|         if let Some(faceted_fields) = self.index.user_defined_searchable_fields(self.wtxn)? { |  | ||||||
|             // we can't keep references on the faceted fields while we update the index thus we need to own it. |  | ||||||
|             let faceted_fields: Vec<String> = |  | ||||||
|                 faceted_fields.into_iter().map(str::to_string).collect(); |  | ||||||
|             self.index.put_all_searchable_fields_from_fields_ids_map( |  | ||||||
|                 self.wtxn, |  | ||||||
|                 &faceted_fields.iter().map(String::as_ref).collect::<Vec<_>>(), |  | ||||||
|                 &output.fields_ids_map, |  | ||||||
|             )?; |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         let indexed_documents = output.documents_count as u64; |         let indexed_documents = output.documents_count as u64; | ||||||
|         let number_of_documents = self.execute_raw(output)?; |         let number_of_documents = self.execute_raw(output)?; | ||||||
|  |  | ||||||
| @@ -296,32 +282,35 @@ where | |||||||
|  |  | ||||||
|         let TransformOutput { |         let TransformOutput { | ||||||
|             primary_key, |             primary_key, | ||||||
|             fields_ids_map, |             mut settings_diff, | ||||||
|             field_distribution, |             field_distribution, | ||||||
|             documents_count, |             documents_count, | ||||||
|             original_documents, |             original_documents, | ||||||
|             flattened_documents, |             flattened_documents, | ||||||
|         } = output; |         } = output; | ||||||
|  |  | ||||||
|         // The fields_ids_map is put back to the store now so the rest of the transaction sees an |         // update the internal facet and searchable list, | ||||||
|         // up to date field map. |         // because they might have changed due to the nested documents flattening. | ||||||
|         self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; |         settings_diff.new.recompute_facets(self.wtxn, self.index)?; | ||||||
|  |         settings_diff.new.recompute_searchables(self.wtxn, self.index)?; | ||||||
|  |  | ||||||
|  |         let settings_diff = Arc::new(settings_diff); | ||||||
|  |  | ||||||
|         let backup_pool; |         let backup_pool; | ||||||
|         let pool = match self.indexer_config.thread_pool { |         let pool = match self.indexer_config.thread_pool { | ||||||
|             Some(ref pool) => pool, |             Some(ref pool) => pool, | ||||||
|             #[cfg(not(test))] |  | ||||||
|             None => { |             None => { | ||||||
|                 // We initialize a bakcup pool with the default |                 // We initialize a backup pool with the default | ||||||
|                 // settings if none have already been set. |                 // settings if none have already been set. | ||||||
|                 backup_pool = rayon::ThreadPoolBuilder::new().build()?; |                 #[allow(unused_mut)] | ||||||
|                 &backup_pool |                 let mut pool_builder = ThreadPoolNoAbortBuilder::new(); | ||||||
|             } |  | ||||||
|                 #[cfg(test)] |                 #[cfg(test)] | ||||||
|             None => { |                 { | ||||||
|                 // We initialize a bakcup pool with the default |                     pool_builder = pool_builder.num_threads(1); | ||||||
|                 // settings if none have already been set. |                 } | ||||||
|                 backup_pool = rayon::ThreadPoolBuilder::new().num_threads(1).build()?; |  | ||||||
|  |                 backup_pool = pool_builder.build()?; | ||||||
|                 &backup_pool |                 &backup_pool | ||||||
|             } |             } | ||||||
|         }; |         }; | ||||||
| @@ -333,13 +322,8 @@ where | |||||||
|         ) = crossbeam_channel::unbounded(); |         ) = crossbeam_channel::unbounded(); | ||||||
|  |  | ||||||
|         // get the primary key field id |         // get the primary key field id | ||||||
|         let primary_key_id = fields_ids_map.id(&primary_key).unwrap(); |         let primary_key_id = settings_diff.new.fields_ids_map.id(&primary_key).unwrap(); | ||||||
|  |  | ||||||
|         // get searchable fields for word databases |  | ||||||
|         let searchable_fields = |  | ||||||
|             self.index.searchable_fields_ids(self.wtxn)?.map(HashSet::from_iter); |  | ||||||
|         // get filterable fields for facet databases |  | ||||||
|         let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; |  | ||||||
|         // get the fid of the `_geo.lat` and `_geo.lng` fields. |         // get the fid of the `_geo.lat` and `_geo.lng` fields. | ||||||
|         let mut field_id_map = self.index.fields_ids_map(self.wtxn)?; |         let mut field_id_map = self.index.fields_ids_map(self.wtxn)?; | ||||||
|  |  | ||||||
| @@ -362,12 +346,6 @@ where | |||||||
|             None => None, |             None => None, | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|         let stop_words = self.index.stop_words(self.wtxn)?; |  | ||||||
|         let separators = self.index.allowed_separators(self.wtxn)?; |  | ||||||
|         let dictionary = self.index.dictionary(self.wtxn)?; |  | ||||||
|         let exact_attributes = self.index.exact_attributes_ids(self.wtxn)?; |  | ||||||
|         let proximity_precision = self.index.proximity_precision(self.wtxn)?.unwrap_or_default(); |  | ||||||
|  |  | ||||||
|         let pool_params = GrenadParameters { |         let pool_params = GrenadParameters { | ||||||
|             chunk_compression_type: self.indexer_config.chunk_compression_type, |             chunk_compression_type: self.indexer_config.chunk_compression_type, | ||||||
|             chunk_compression_level: self.indexer_config.chunk_compression_level, |             chunk_compression_level: self.indexer_config.chunk_compression_level, | ||||||
| @@ -400,8 +378,6 @@ where | |||||||
|  |  | ||||||
|         let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; |         let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; | ||||||
|  |  | ||||||
|         let cloned_embedder = self.embedders.clone(); |  | ||||||
|  |  | ||||||
|         let mut final_documents_ids = RoaringBitmap::new(); |         let mut final_documents_ids = RoaringBitmap::new(); | ||||||
|         let mut databases_seen = 0; |         let mut databases_seen = 0; | ||||||
|         let mut word_position_docids = None; |         let mut word_position_docids = None; | ||||||
| @@ -410,7 +386,6 @@ where | |||||||
|         let mut exact_word_docids = None; |         let mut exact_word_docids = None; | ||||||
|         let mut chunk_accumulator = ChunkAccumulator::default(); |         let mut chunk_accumulator = ChunkAccumulator::default(); | ||||||
|         let mut dimension = HashMap::new(); |         let mut dimension = HashMap::new(); | ||||||
|         let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); |  | ||||||
|  |  | ||||||
|         let current_span = tracing::Span::current(); |         let current_span = tracing::Span::current(); | ||||||
|  |  | ||||||
| @@ -428,10 +403,6 @@ where | |||||||
|                 let flattened_chunk_iter = |                 let flattened_chunk_iter = | ||||||
|                     grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); |                     grenad_obkv_into_chunks(flattened_documents, pool_params, documents_chunk_size); | ||||||
|  |  | ||||||
|                 let separators: Option<Vec<_>> = |  | ||||||
|                     separators.as_ref().map(|x| x.iter().map(String::as_str).collect()); |  | ||||||
|                 let dictionary: Option<Vec<_>> = |  | ||||||
|                     dictionary.as_ref().map(|x| x.iter().map(String::as_str).collect()); |  | ||||||
|                 let result = original_chunk_iter.and_then(|original_chunk| { |                 let result = original_chunk_iter.and_then(|original_chunk| { | ||||||
|                     let flattened_chunk = flattened_chunk_iter?; |                     let flattened_chunk = flattened_chunk_iter?; | ||||||
|                     // extract all databases from the chunked obkv documents |                     // extract all databases from the chunked obkv documents | ||||||
| @@ -440,18 +411,10 @@ where | |||||||
|                         flattened_chunk, |                         flattened_chunk, | ||||||
|                         pool_params, |                         pool_params, | ||||||
|                         lmdb_writer_sx.clone(), |                         lmdb_writer_sx.clone(), | ||||||
|                         searchable_fields, |  | ||||||
|                         faceted_fields, |  | ||||||
|                         primary_key_id, |                         primary_key_id, | ||||||
|                         geo_fields_ids, |                         geo_fields_ids, | ||||||
|                         field_id_map, |                         settings_diff.clone(), | ||||||
|                         stop_words, |  | ||||||
|                         separators.as_deref(), |  | ||||||
|                         dictionary.as_deref(), |  | ||||||
|                         max_positions_per_attributes, |                         max_positions_per_attributes, | ||||||
|                         exact_attributes, |  | ||||||
|                         proximity_precision, |  | ||||||
|                         cloned_embedder, |  | ||||||
|                     ) |                     ) | ||||||
|                 }); |                 }); | ||||||
|  |  | ||||||
| @@ -571,7 +534,7 @@ where | |||||||
|             } |             } | ||||||
|  |  | ||||||
|             Ok(()) |             Ok(()) | ||||||
|         })?; |         }).map_err(InternalError::from)??; | ||||||
|  |  | ||||||
|         // We write the field distribution into the main database |         // We write the field distribution into the main database | ||||||
|         self.index.put_field_distribution(self.wtxn, &field_distribution)?; |         self.index.put_field_distribution(self.wtxn, &field_distribution)?; | ||||||
| @@ -600,7 +563,8 @@ where | |||||||
|                     writer.build(wtxn, &mut rng, None)?; |                     writer.build(wtxn, &mut rng, None)?; | ||||||
|                 } |                 } | ||||||
|                 Result::Ok(()) |                 Result::Ok(()) | ||||||
|             })?; |             }) | ||||||
|  |             .map_err(InternalError::from)??; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         self.execute_prefix_databases( |         self.execute_prefix_databases( | ||||||
|   | |||||||
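
This file also swaps `rayon::ThreadPoolBuilder` for `ThreadPoolNoAbortBuilder` when building the embedding-request pool and the backup indexing pool. The sketch below is not the actual `ThreadPoolNoAbort` implementation, only one plausible shape for the idea, built on rayon's real `panic_handler` hook so a panicking worker is recorded instead of aborting the process:

use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

use rayon::{ThreadPool, ThreadPoolBuilder};

// Hypothetical wrapper, only to illustrate the idea behind a "no abort"
// pool: a panic in spawned work is recorded in a flag instead of taking the
// whole process down, and callers can check the flag afterwards.
struct PanicTrackingPool {
    pool: ThreadPool,
    panicked: Arc<AtomicBool>,
}

impl PanicTrackingPool {
    fn new(num_threads: usize) -> Result<Self, rayon::ThreadPoolBuildError> {
        let panicked = Arc::new(AtomicBool::new(false));
        let flag = panicked.clone();
        let pool = ThreadPoolBuilder::new()
            .num_threads(num_threads)
            .thread_name(|index| format!("embedding-request-{index}"))
            // With a `panic_handler`, rayon hands the panic payload to the
            // closure instead of aborting on a panicking `spawn`.
            .panic_handler(move |_payload| flag.store(true, Ordering::SeqCst))
            .build()?;
        Ok(Self { pool, panicked })
    }
}

fn main() -> Result<(), rayon::ThreadPoolBuildError> {
    let pool = PanicTrackingPool::new(2)?;
    pool.pool.spawn(|| panic!("boom"));

    // The sleep is only here to let the spawned job run in this sketch; the
    // real code synchronises through channels and join handles instead.
    std::thread::sleep(std::time::Duration::from_millis(100));
    println!("a worker panicked: {}", pool.panicked.load(Ordering::SeqCst));
    Ok(())
}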
| @@ -1,12 +1,11 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::collections::btree_map::Entry as BEntry; | use std::collections::btree_map::Entry as BEntry; | ||||||
| use std::collections::hash_map::Entry as HEntry; | use std::collections::hash_map::Entry as HEntry; | ||||||
| use std::collections::{HashMap, HashSet}; | use std::collections::HashMap; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{Read, Seek}; | use std::io::{Read, Seek}; | ||||||
|  |  | ||||||
| use fxhash::FxHashMap; | use fxhash::FxHashMap; | ||||||
| use heed::RoTxn; |  | ||||||
| use itertools::Itertools; | use itertools::Itertools; | ||||||
| use obkv::{KvReader, KvReaderU16, KvWriter}; | use obkv::{KvReader, KvReaderU16, KvWriter}; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
| @@ -21,14 +20,17 @@ use super::{IndexDocumentsMethod, IndexerConfig}; | |||||||
| use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; | use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; | ||||||
| use crate::error::{Error, InternalError, UserError}; | use crate::error::{Error, InternalError, UserError}; | ||||||
| use crate::index::{db_name, main_key}; | use crate::index::{db_name, main_key}; | ||||||
| use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; | use crate::update::del_add::{ | ||||||
|  |     del_add_from_two_obkvs, into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd, | ||||||
|  | }; | ||||||
| use crate::update::index_documents::GrenadParameters; | use crate::update::index_documents::GrenadParameters; | ||||||
| use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; | use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; | ||||||
|  | use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; | ||||||
| use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; | use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; | ||||||
|  |  | ||||||
| pub struct TransformOutput { | pub struct TransformOutput { | ||||||
|     pub primary_key: String, |     pub primary_key: String, | ||||||
|     pub fields_ids_map: FieldsIdsMap, |     pub settings_diff: InnerIndexSettingsDiff, | ||||||
|     pub field_distribution: FieldDistribution, |     pub field_distribution: FieldDistribution, | ||||||
|     pub documents_count: usize, |     pub documents_count: usize, | ||||||
|     pub original_documents: File, |     pub original_documents: File, | ||||||
| @@ -282,7 +284,9 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|                     self.original_sorter |                     self.original_sorter | ||||||
|                         .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; |                         .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||||
|                     let base_obkv = KvReader::new(base_obkv); |                     let base_obkv = KvReader::new(base_obkv); | ||||||
|                     if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? { |                     if let Some(flattened_obkv) = | ||||||
|  |                         Self::flatten_from_fields_ids_map(&base_obkv, &mut self.fields_ids_map)? | ||||||
|  |                     { | ||||||
|                         // we recreate our buffer with the flattened documents |                         // we recreate our buffer with the flattened documents | ||||||
|                         document_sorter_value_buffer.clear(); |                         document_sorter_value_buffer.clear(); | ||||||
|                         document_sorter_value_buffer.push(Operation::Addition as u8); |                         document_sorter_value_buffer.push(Operation::Addition as u8); | ||||||
| @@ -315,7 +319,9 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|                     .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; |                     .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; | ||||||
|  |  | ||||||
|                 let flattened_obkv = KvReader::new(&obkv_buffer); |                 let flattened_obkv = KvReader::new(&obkv_buffer); | ||||||
|                 if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { |                 if let Some(obkv) = | ||||||
|  |                     Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)? | ||||||
|  |                 { | ||||||
|                     document_sorter_value_buffer.clear(); |                     document_sorter_value_buffer.clear(); | ||||||
|                     document_sorter_value_buffer.push(Operation::Addition as u8); |                     document_sorter_value_buffer.push(Operation::Addition as u8); | ||||||
|                     into_del_add_obkv( |                     into_del_add_obkv( | ||||||
| @@ -524,7 +530,9 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|  |  | ||||||
|         // flatten it and push it as to delete in the flattened_sorter |         // flatten it and push it as to delete in the flattened_sorter | ||||||
|         let flattened_obkv = KvReader::new(base_obkv); |         let flattened_obkv = KvReader::new(base_obkv); | ||||||
|         if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? { |         if let Some(obkv) = | ||||||
|  |             Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)? | ||||||
|  |         { | ||||||
|             // we recreate our buffer with the flattened documents |             // we recreate our buffer with the flattened documents | ||||||
|             document_sorter_value_buffer.clear(); |             document_sorter_value_buffer.clear(); | ||||||
|             document_sorter_value_buffer.push(Operation::Deletion as u8); |             document_sorter_value_buffer.push(Operation::Deletion as u8); | ||||||
| @@ -541,8 +549,15 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|  |  | ||||||
|     // Flatten a document from the fields ids map contained in self and insert the new |     // Flatten a document from the fields ids map contained in self and insert the new | ||||||
|     // created fields. Returns `None` if the document doesn't need to be flattened. |     // created fields. Returns `None` if the document doesn't need to be flattened. | ||||||
|     #[tracing::instrument(level = "trace", skip(self, obkv), target = "indexing::transform")] |     #[tracing::instrument( | ||||||
|     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> { |         level = "trace", | ||||||
|  |         skip(obkv, fields_ids_map), | ||||||
|  |         target = "indexing::transform" | ||||||
|  |     )] | ||||||
|  |     fn flatten_from_fields_ids_map( | ||||||
|  |         obkv: &KvReader<FieldId>, | ||||||
|  |         fields_ids_map: &mut FieldsIdsMap, | ||||||
|  |     ) -> Result<Option<Vec<u8>>> { | ||||||
|         if obkv |         if obkv | ||||||
|             .iter() |             .iter() | ||||||
|             .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value)) |             .all(|(_, value)| !json_depth_checker::should_flatten_from_unchecked_slice(value)) | ||||||
| @@ -563,7 +578,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         // all the raw values get inserted directly in the `key_value` vec. |         // all the raw values get inserted directly in the `key_value` vec. | ||||||
|         for (key, value) in obkv.iter() { |         for (key, value) in obkv.iter() { | ||||||
|             if json_depth_checker::should_flatten_from_unchecked_slice(value) { |             if json_depth_checker::should_flatten_from_unchecked_slice(value) { | ||||||
|                 let key = self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { |                 let key = fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId { | ||||||
|                     field_id: key, |                     field_id: key, | ||||||
|                     process: "Flatten from fields ids map.", |                     process: "Flatten from fields ids map.", | ||||||
|                 })?; |                 })?; | ||||||
| @@ -581,7 +596,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         // Once we have the flattened version we insert all the new generated fields_ids |         // Once we have the flattened version we insert all the new generated fields_ids | ||||||
|         // (if any) in the fields ids map and serialize the value. |         // (if any) in the fields ids map and serialize the value. | ||||||
|         for (key, value) in flattened.into_iter() { |         for (key, value) in flattened.into_iter() { | ||||||
|             let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; |             let fid = fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; | ||||||
|             let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; |             let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; | ||||||
|             key_value.push((fid, value.into())); |             key_value.push((fid, value.into())); | ||||||
|         } |         } | ||||||
| @@ -792,9 +807,19 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             fst_new_external_documents_ids_builder.insert(key, value) |             fst_new_external_documents_ids_builder.insert(key, value) | ||||||
|         })?; |         })?; | ||||||
|  |  | ||||||
|  |         let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?; | ||||||
|  |         let mut new_inner_settings = old_inner_settings.clone(); | ||||||
|  |         new_inner_settings.fields_ids_map = self.fields_ids_map; | ||||||
|  |         let settings_diff = InnerIndexSettingsDiff { | ||||||
|  |             old: old_inner_settings, | ||||||
|  |             new: new_inner_settings, | ||||||
|  |             embedding_configs_updated: false, | ||||||
|  |             settings_update_only: false, | ||||||
|  |         }; | ||||||
|  |  | ||||||
|         Ok(TransformOutput { |         Ok(TransformOutput { | ||||||
|             primary_key, |             primary_key, | ||||||
|             fields_ids_map: self.fields_ids_map, |             settings_diff, | ||||||
|             field_distribution, |             field_distribution, | ||||||
|             documents_count: self.documents_count, |             documents_count: self.documents_count, | ||||||
|             original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, |             original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, | ||||||
| @@ -804,6 +829,44 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|         }) |         }) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     /// Rebind the field_ids of the provided document to their values | ||||||
|  |     /// based on the field_ids_maps difference between the old and the new settings, | ||||||
|  |     /// then fill the provided buffers with delta documents using KvWriterDelAdd. | ||||||
|  |     fn rebind_existing_document( | ||||||
|  |         old_obkv: KvReader<FieldId>, | ||||||
|  |         settings_diff: &InnerIndexSettingsDiff, | ||||||
|  |         original_obkv_buffer: &mut Vec<u8>, | ||||||
|  |         flattened_obkv_buffer: &mut Vec<u8>, | ||||||
|  |     ) -> Result<()> { | ||||||
|  |         let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone(); | ||||||
|  |         let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone(); | ||||||
|  |         let mut obkv_writer = KvWriter::<_, FieldId>::memory(); | ||||||
|  |         // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. | ||||||
|  |         for (id, name) in new_fields_ids_map.iter() { | ||||||
|  |             if let Some(val) = old_fields_ids_map.id(name).and_then(|id| old_obkv.get(id)) { | ||||||
|  |                 obkv_writer.insert(id, val)?; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         let data = obkv_writer.into_inner()?; | ||||||
|  |         let new_obkv = KvReader::<FieldId>::new(&data); | ||||||
|  |  | ||||||
|  |         // take the non-flattened version if flatten_from_fields_ids_map returns None. | ||||||
|  |         let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?; | ||||||
|  |         let old_flattened = | ||||||
|  |             old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::<FieldId>::new); | ||||||
|  |         let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?; | ||||||
|  |         let new_flattened = | ||||||
|  |             new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::<FieldId>::new); | ||||||
|  |  | ||||||
|  |         original_obkv_buffer.clear(); | ||||||
|  |         flattened_obkv_buffer.clear(); | ||||||
|  |  | ||||||
|  |         del_add_from_two_obkvs(&old_obkv, &new_obkv, original_obkv_buffer)?; | ||||||
|  |         del_add_from_two_obkvs(&old_flattened, &new_flattened, flattened_obkv_buffer)?; | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|     /// Clear all databases. Returns a `TransformOutput` with a file that contains the documents |     /// Clear all databases. Returns a `TransformOutput` with a file that contains the documents | ||||||
|     /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. |     /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. | ||||||
|     /// |     /// | ||||||
| @@ -811,8 +874,7 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|     pub fn prepare_for_documents_reindexing( |     pub fn prepare_for_documents_reindexing( | ||||||
|         self, |         self, | ||||||
|         wtxn: &mut heed::RwTxn<'i>, |         wtxn: &mut heed::RwTxn<'i>, | ||||||
|         old_fields_ids_map: FieldsIdsMap, |         settings_diff: InnerIndexSettingsDiff, | ||||||
|         mut new_fields_ids_map: FieldsIdsMap, |  | ||||||
|     ) -> Result<TransformOutput> { |     ) -> Result<TransformOutput> { | ||||||
|         // There already has been a document addition, the primary key should be set by now. |         // There already has been a document addition, the primary key should be set by now. | ||||||
|         let primary_key = self |         let primary_key = self | ||||||
| @@ -848,78 +910,27 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|             self.indexer_settings.max_memory.map(|mem| mem / 2), |             self.indexer_settings.max_memory.map(|mem| mem / 2), | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|         let mut obkv_buffer = Vec::new(); |         let mut original_obkv_buffer = Vec::new(); | ||||||
|  |         let mut flattened_obkv_buffer = Vec::new(); | ||||||
|         let mut document_sorter_key_buffer = Vec::new(); |         let mut document_sorter_key_buffer = Vec::new(); | ||||||
|         let mut document_sorter_value_buffer = Vec::new(); |  | ||||||
|         for result in self.index.external_documents_ids().iter(wtxn)? { |         for result in self.index.external_documents_ids().iter(wtxn)? { | ||||||
|             let (external_id, docid) = result?; |             let (external_id, docid) = result?; | ||||||
|             let obkv = self.index.documents.get(wtxn, &docid)?.ok_or( |             let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or( | ||||||
|                 InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, |                 InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, | ||||||
|             )?; |             )?; | ||||||
|  |  | ||||||
|             obkv_buffer.clear(); |             Self::rebind_existing_document( | ||||||
|             let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer); |                 old_obkv, | ||||||
|  |                 &settings_diff, | ||||||
|             // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. |                 &mut original_obkv_buffer, | ||||||
|             for (id, name) in new_fields_ids_map.iter() { |                 &mut flattened_obkv_buffer, | ||||||
|                 if let Some(val) = old_fields_ids_map.id(name).and_then(|id| obkv.get(id)) { |             )?; | ||||||
|                     obkv_writer.insert(id, val)?; |  | ||||||
|                 } |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             let buffer = obkv_writer.into_inner()?; |  | ||||||
|  |  | ||||||
|             document_sorter_key_buffer.clear(); |             document_sorter_key_buffer.clear(); | ||||||
|             document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); |             document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); | ||||||
|             document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); |             document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); | ||||||
|             document_sorter_value_buffer.clear(); |             original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; | ||||||
|             into_del_add_obkv( |             flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; | ||||||
|                 KvReaderU16::new(buffer), |  | ||||||
|                 DelAddOperation::Addition, |  | ||||||
|                 &mut document_sorter_value_buffer, |  | ||||||
|             )?; |  | ||||||
|             original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?; |  | ||||||
|  |  | ||||||
|             // Once we have the document. We're going to flatten it |  | ||||||
|             // and insert it in the flattened sorter. |  | ||||||
|             let mut doc = serde_json::Map::new(); |  | ||||||
|  |  | ||||||
|             let reader = obkv::KvReader::new(buffer); |  | ||||||
|             for (k, v) in reader.iter() { |  | ||||||
|                 let key = new_fields_ids_map.name(k).ok_or(FieldIdMapMissingEntry::FieldId { |  | ||||||
|                     field_id: k, |  | ||||||
|                     process: "Accessing field distribution in transform.", |  | ||||||
|                 })?; |  | ||||||
|                 let value = serde_json::from_slice::<serde_json::Value>(v) |  | ||||||
|                     .map_err(InternalError::SerdeJson)?; |  | ||||||
|                 doc.insert(key.to_string(), value); |  | ||||||
|             } |  | ||||||
|  |  | ||||||
|             let flattened = flatten_serde_json::flatten(&doc); |  | ||||||
|  |  | ||||||
|             // Once we have the flattened version we can convert it back to obkv and |  | ||||||
|             // insert all the new generated fields_ids (if any) in the fields ids map. |  | ||||||
|             let mut buffer: Vec<u8> = Vec::new(); |  | ||||||
|             let mut writer = KvWriter::new(&mut buffer); |  | ||||||
|             let mut flattened: Vec<_> = flattened.into_iter().collect(); |  | ||||||
|             // we reorder the field to get all the known field first |  | ||||||
|             flattened.sort_unstable_by_key(|(key, _)| { |  | ||||||
|                 new_fields_ids_map.id(key).unwrap_or(FieldId::MAX) |  | ||||||
|             }); |  | ||||||
|  |  | ||||||
|             for (key, value) in flattened { |  | ||||||
|                 let fid = |  | ||||||
|                     new_fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; |  | ||||||
|                 let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; |  | ||||||
|                 writer.insert(fid, &value)?; |  | ||||||
|             } |  | ||||||
|             document_sorter_value_buffer.clear(); |  | ||||||
|             into_del_add_obkv( |  | ||||||
|                 KvReaderU16::new(&buffer), |  | ||||||
|                 DelAddOperation::Addition, |  | ||||||
|                 &mut document_sorter_value_buffer, |  | ||||||
|             )?; |  | ||||||
|             flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?; |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let grenad_params = GrenadParameters { |         let grenad_params = GrenadParameters { | ||||||
| @@ -934,22 +945,14 @@ impl<'a, 'i> Transform<'a, 'i> { | |||||||
|  |  | ||||||
|         let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; |         let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; | ||||||
|  |  | ||||||
|         let output = TransformOutput { |         Ok(TransformOutput { | ||||||
|             primary_key, |             primary_key, | ||||||
|             fields_ids_map: new_fields_ids_map, |  | ||||||
|             field_distribution, |             field_distribution, | ||||||
|  |             settings_diff, | ||||||
|             documents_count, |             documents_count, | ||||||
|             original_documents: original_documents.into_inner().into_inner(), |             original_documents: original_documents.into_inner().into_inner(), | ||||||
|             flattened_documents: flattened_documents.into_inner().into_inner(), |             flattened_documents: flattened_documents.into_inner().into_inner(), | ||||||
|         }; |         }) | ||||||
|  |  | ||||||
|         let new_facets = output.compute_real_facets(wtxn, self.index)?; |  | ||||||
|         self.index.put_faceted_fields(wtxn, &new_facets)?; |  | ||||||
|  |  | ||||||
|         // We clear the full database (words-fst, documents ids and documents content). |  | ||||||
|         ClearDocuments::new(wtxn, self.index).execute()?; |  | ||||||
|  |  | ||||||
|         Ok(output) |  | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -964,20 +967,6 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> { | |||||||
|     vec.into_iter().map(|_| unreachable!()).collect() |     vec.into_iter().map(|_| unreachable!()).collect() | ||||||
| } | } | ||||||
|  |  | ||||||
| impl TransformOutput { |  | ||||||
|     // find and insert the new field ids |  | ||||||
|     pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result<HashSet<String>> { |  | ||||||
|         let user_defined_facets = index.user_defined_faceted_fields(rtxn)?; |  | ||||||
|  |  | ||||||
|         Ok(self |  | ||||||
|             .fields_ids_map |  | ||||||
|             .names() |  | ||||||
|             .filter(|&field| crate::is_faceted(field, &user_defined_facets)) |  | ||||||
|             .map(|field| field.to_string()) |  | ||||||
|             .collect()) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[cfg(test)] | #[cfg(test)] | ||||||
| mod test { | mod test { | ||||||
|     use super::*; |     use super::*; | ||||||
|   | |||||||
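
The transform.rs changes above introduce `rebind_existing_document`, which rewrites each stored document from the old `FieldsIdsMap` to the new one and then derives del/add deltas from the two versions via `del_add_from_two_obkvs`. A minimal, self-contained sketch of both steps, using plain `BTreeMap`s as hypothetical stand-ins for the obkv readers and fields-ids maps (`rebind_document` and `del_add_delta` are illustrative names, not the real milli helpers):

use std::collections::BTreeMap;

type FieldId = u16;
type Doc = BTreeMap<FieldId, String>;

// Rewrite a document keyed by old field ids into one keyed by new field ids,
// mirroring the obkv rewrite in `rebind_existing_document`.
fn rebind_document(
    old_doc: &Doc,
    old_map: &BTreeMap<String, FieldId>,
    new_map: &BTreeMap<String, FieldId>,
) -> Doc {
    let mut rebound = Doc::new();
    // Walk the new map and look each field's value up through its old id.
    for (name, &new_id) in new_map {
        if let Some(value) = old_map.get(name).and_then(|old_id| old_doc.get(old_id)) {
            rebound.insert(new_id, value.clone());
        }
    }
    rebound
}

// Pair the old and new value of every field: the rough shape of the
// del/add deltas built from the two obkv versions.
fn del_add_delta(old: &Doc, new: &Doc) -> BTreeMap<FieldId, (Option<String>, Option<String>)> {
    let mut delta = BTreeMap::new();
    for (&id, value) in old {
        delta.insert(id, (Some(value.clone()), None));
    }
    for (&id, value) in new {
        delta.entry(id).or_insert((None, None)).1 = Some(value.clone());
    }
    delta
}

fn main() {
    let old_map: BTreeMap<String, FieldId> =
        [("title".to_string(), 0), ("overview".to_string(), 1)].into_iter().collect();
    // After a settings change the same fields get different ids.
    let new_map: BTreeMap<String, FieldId> =
        [("overview".to_string(), 0), ("title".to_string(), 1)].into_iter().collect();

    let old_doc: Doc = [(0, "Dune".to_string()), (1, "A desert planet".to_string())]
        .into_iter()
        .collect();

    let new_doc = rebind_document(&old_doc, &old_map, &new_map);
    let delta = del_add_delta(&old_doc, &new_doc);

    // Field 0 held "Dune" (title) and now holds "A desert planet" (overview).
    assert_eq!(
        delta.get(&0),
        Some(&(Some("Dune".to_string()), Some("A desert planet".to_string())))
    );
}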
| @@ -1,5 +1,6 @@ | |||||||
| use grenad::CompressionType; | use grenad::CompressionType; | ||||||
| use rayon::ThreadPool; |  | ||||||
|  | use crate::thread_pool_no_abort::ThreadPoolNoAbort; | ||||||
|  |  | ||||||
| #[derive(Debug)] | #[derive(Debug)] | ||||||
| pub struct IndexerConfig { | pub struct IndexerConfig { | ||||||
| @@ -9,7 +10,7 @@ pub struct IndexerConfig { | |||||||
|     pub max_memory: Option<usize>, |     pub max_memory: Option<usize>, | ||||||
|     pub chunk_compression_type: CompressionType, |     pub chunk_compression_type: CompressionType, | ||||||
|     pub chunk_compression_level: Option<u32>, |     pub chunk_compression_level: Option<u32>, | ||||||
|     pub thread_pool: Option<ThreadPool>, |     pub thread_pool: Option<ThreadPoolNoAbort>, | ||||||
|     pub max_positions_per_attributes: Option<u32>, |     pub max_positions_per_attributes: Option<u32>, | ||||||
|     pub skip_index_budget: bool, |     pub skip_index_budget: bool, | ||||||
| } | } | ||||||
|   | |||||||
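
The settings.rs diff that follows drops the per-setting `*_updated` booleans in favour of two full `InnerIndexSettings` snapshots, taken before and after the update, plus an `InnerIndexSettingsDiff` that decides which databases must be rebuilt. A trimmed-down sketch of that decision object, with a hypothetical `Snapshot` carrying only a few of the real fields:

// Hypothetical, simplified snapshot of index settings; the real
// `InnerIndexSettings` also carries the fields-ids map, separators,
// dictionary, exact attributes, proximity precision and embedders.
#[derive(Clone, PartialEq, Eq, Default)]
struct Snapshot {
    stop_words: Vec<String>,
    searchable_fields: Option<Vec<String>>,
    faceted_fields: Vec<String>,
}

struct SettingsDiff {
    old: Snapshot,
    new: Snapshot,
    embedding_configs_updated: bool,
}

impl SettingsDiff {
    fn reindex_searchable(&self) -> bool {
        self.old.stop_words != self.new.stop_words
            || self.old.searchable_fields != self.new.searchable_fields
    }

    fn reindex_facets(&self) -> bool {
        self.old.faceted_fields != self.new.faceted_fields
    }

    fn reindex_vectors(&self) -> bool {
        self.embedding_configs_updated
    }

    // Mirrors `any_reindexing_needed()` below: the expensive document
    // reindexing only runs when at least one database family is stale.
    fn any_reindexing_needed(&self) -> bool {
        self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors()
    }
}

fn main() {
    let old = Snapshot::default();
    let mut new = old.clone();
    new.stop_words.push("the".to_string());

    let diff = SettingsDiff { old, new, embedding_configs_updated: false };
    // Only the searchable databases are stale after a stop-word change.
    assert!(diff.any_reindexing_needed());
    assert!(diff.reindex_searchable());
    assert!(!diff.reindex_facets());
    assert!(!diff.reindex_vectors());
}

The real `reindex_searchable`/`reindex_facets` shown further down additionally compare the old and new fields-ids maps, separators, dictionary, exact attributes and proximity precision.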
| @@ -20,7 +20,7 @@ use crate::update::index_documents::IndexDocumentsMethod; | |||||||
| use crate::update::{IndexDocuments, UpdateIndexingStep}; | use crate::update::{IndexDocuments, UpdateIndexingStep}; | ||||||
| use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; | use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; | ||||||
| use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; | use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; | ||||||
| use crate::{FieldsIdsMap, Index, Result}; | use crate::{FieldId, FieldsIdsMap, Index, Result}; | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, PartialEq, Eq, Copy)] | #[derive(Debug, Clone, PartialEq, Eq, Copy)] | ||||||
| pub enum Setting<T> { | pub enum Setting<T> { | ||||||
| @@ -385,14 +385,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|  |  | ||||||
|     #[tracing::instrument( |     #[tracing::instrument( | ||||||
|         level = "trace" |         level = "trace" | ||||||
|         skip(self, progress_callback, should_abort, old_fields_ids_map), |         skip(self, progress_callback, should_abort, settings_diff), | ||||||
|         target = "indexing::documents" |         target = "indexing::documents" | ||||||
|     )] |     )] | ||||||
|     fn reindex<FP, FA>( |     fn reindex<FP, FA>( | ||||||
|         &mut self, |         &mut self, | ||||||
|         progress_callback: &FP, |         progress_callback: &FP, | ||||||
|         should_abort: &FA, |         should_abort: &FA, | ||||||
|         old_fields_ids_map: FieldsIdsMap, |         settings_diff: InnerIndexSettingsDiff, | ||||||
|     ) -> Result<()> |     ) -> Result<()> | ||||||
|     where |     where | ||||||
|         FP: Fn(UpdateIndexingStep) + Sync, |         FP: Fn(UpdateIndexingStep) + Sync, | ||||||
| @@ -400,7 +400,6 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|     { |     { | ||||||
|         puffin::profile_function!(); |         puffin::profile_function!(); | ||||||
|  |  | ||||||
|         let fields_ids_map = self.index.fields_ids_map(self.wtxn)?; |  | ||||||
|         // if the settings are set before any document update, we don't need to do anything, and |         // if the settings are set before any document update, we don't need to do anything, and | ||||||
|         // will set the primary key during the first document addition. |         // will set the primary key during the first document addition. | ||||||
|         if self.index.number_of_documents(self.wtxn)? == 0 { |         if self.index.number_of_documents(self.wtxn)? == 0 { | ||||||
| @@ -416,14 +415,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         // We clear the databases and remap the documents fields based on the new `FieldsIdsMap`. |         // We clear the databases and remap the documents fields based on the new `FieldsIdsMap`. | ||||||
|         let output = transform.prepare_for_documents_reindexing( |         let output = transform.prepare_for_documents_reindexing(self.wtxn, settings_diff)?; | ||||||
|             self.wtxn, |  | ||||||
|             old_fields_ids_map, |  | ||||||
|             fields_ids_map, |  | ||||||
|         )?; |  | ||||||
|  |  | ||||||
|         let embedder_configs = self.index.embedding_configs(self.wtxn)?; |  | ||||||
|         let embedders = self.embedders(embedder_configs)?; |  | ||||||
|  |  | ||||||
|         // We index the generated `TransformOutput` which must contain |         // We index the generated `TransformOutput` which must contain | ||||||
|         // all the documents with fields in the newly defined searchable order. |         // all the documents with fields in the newly defined searchable order. | ||||||
| @@ -436,32 +428,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|             &should_abort, |             &should_abort, | ||||||
|         )?; |         )?; | ||||||
|  |  | ||||||
|         let indexing_builder = indexing_builder.with_embedders(embedders); |  | ||||||
|         indexing_builder.execute_raw(output)?; |         indexing_builder.execute_raw(output)?; | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn embedders( |  | ||||||
|         &self, |  | ||||||
|         embedding_configs: Vec<(String, EmbeddingConfig)>, |  | ||||||
|     ) -> Result<EmbeddingConfigs> { |  | ||||||
|         let res: Result<_> = embedding_configs |  | ||||||
|             .into_iter() |  | ||||||
|             .map(|(name, EmbeddingConfig { embedder_options, prompt })| { |  | ||||||
|                 let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); |  | ||||||
|  |  | ||||||
|                 let embedder = Arc::new( |  | ||||||
|                     Embedder::new(embedder_options.clone()) |  | ||||||
|                         .map_err(crate::vector::Error::from) |  | ||||||
|                         .map_err(crate::Error::from)?, |  | ||||||
|                 ); |  | ||||||
|                 Ok((name, (embedder, prompt))) |  | ||||||
|             }) |  | ||||||
|             .collect(); |  | ||||||
|         res.map(EmbeddingConfigs::new) |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     fn update_displayed(&mut self) -> Result<bool> { |     fn update_displayed(&mut self) -> Result<bool> { | ||||||
|         match self.displayed_fields { |         match self.displayed_fields { | ||||||
|             Setting::Set(ref fields) => { |             Setting::Set(ref fields) => { | ||||||
| @@ -1038,6 +1009,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|             } |             } | ||||||
|             Setting::NotSet => false, |             Setting::NotSet => false, | ||||||
|         }; |         }; | ||||||
|  |  | ||||||
|  |         // if any changes force a reindexing | ||||||
|  |         // clear the vector database. | ||||||
|  |         if update { | ||||||
|  |             self.index.vector_arroy.clear(self.wtxn)?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|         Ok(update) |         Ok(update) | ||||||
|     } |     } | ||||||
|  |  | ||||||
| @@ -1066,20 +1044,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|     { |     { | ||||||
|         self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; |         self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; | ||||||
|  |  | ||||||
|         // Note: this MUST be before `update_sortable` so that we can get the old value to compare with the updated value afterwards |         let old_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; | ||||||
|  |  | ||||||
|         let existing_fields: HashSet<_> = self |  | ||||||
|             .index |  | ||||||
|             .field_distribution(self.wtxn)? |  | ||||||
|             .into_iter() |  | ||||||
|             .filter_map(|(field, count)| (count != 0).then_some(field)) |  | ||||||
|             .collect(); |  | ||||||
|         let old_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?; |  | ||||||
|         let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; |  | ||||||
|  |  | ||||||
|  |         // never trigger re-indexing | ||||||
|         self.update_displayed()?; |         self.update_displayed()?; | ||||||
|         self.update_filterable()?; |  | ||||||
|         self.update_sortable()?; |  | ||||||
|         self.update_distinct_field()?; |         self.update_distinct_field()?; | ||||||
|         self.update_criteria()?; |         self.update_criteria()?; | ||||||
|         self.update_primary_key()?; |         self.update_primary_key()?; | ||||||
| @@ -1089,16 +1057,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|         self.update_max_values_per_facet()?; |         self.update_max_values_per_facet()?; | ||||||
|         self.update_sort_facet_values_by()?; |         self.update_sort_facet_values_by()?; | ||||||
|         self.update_pagination_max_total_hits()?; |         self.update_pagination_max_total_hits()?; | ||||||
|  |         self.update_search_cutoff()?; | ||||||
|  |  | ||||||
|         let faceted_updated = self.update_faceted(existing_fields, old_faceted_fields)?; |         // could trigger re-indexing | ||||||
|         let stop_words_updated = self.update_stop_words()?; |         self.update_filterable()?; | ||||||
|         let non_separator_tokens_updated = self.update_non_separator_tokens()?; |         self.update_sortable()?; | ||||||
|         let separator_tokens_updated = self.update_separator_tokens()?; |         self.update_stop_words()?; | ||||||
|         let dictionary_updated = self.update_dictionary()?; |         self.update_non_separator_tokens()?; | ||||||
|         let synonyms_updated = self.update_synonyms()?; |         self.update_separator_tokens()?; | ||||||
|         let searchable_updated = self.update_searchable()?; |         self.update_dictionary()?; | ||||||
|         let exact_attributes_updated = self.update_exact_attributes()?; |         self.update_synonyms()?; | ||||||
|         let proximity_precision = self.update_proximity_precision()?; |         self.update_searchable()?; | ||||||
|  |         self.update_exact_attributes()?; | ||||||
|  |         self.update_proximity_precision()?; | ||||||
|         // TODO: very rough approximation of the needs for reindexing where any change will result in |         // TODO: very rough approximation of the needs for reindexing where any change will result in | ||||||
|         // a full reindexing. |         // a full reindexing. | ||||||
|         // What can be done instead: |         // What can be done instead: | ||||||
| @@ -1107,53 +1078,193 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { | |||||||
|         // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage |         // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage | ||||||
|         let embedding_configs_updated = self.update_embedding_configs()?; |         let embedding_configs_updated = self.update_embedding_configs()?; | ||||||
|  |  | ||||||
|         // never trigger re-indexing |         let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; | ||||||
|         self.update_search_cutoff()?; |         let inner_settings_diff = InnerIndexSettingsDiff { | ||||||
|  |             old: old_inner_settings, | ||||||
|  |             new: new_inner_settings, | ||||||
|  |             embedding_configs_updated, | ||||||
|  |             settings_update_only: true, | ||||||
|  |         }; | ||||||
|  |  | ||||||
|         if stop_words_updated |         if inner_settings_diff.any_reindexing_needed() { | ||||||
|             || non_separator_tokens_updated |             self.reindex(&progress_callback, &should_abort, inner_settings_diff)?; | ||||||
|             || separator_tokens_updated |  | ||||||
|             || dictionary_updated |  | ||||||
|             || faceted_updated |  | ||||||
|             || synonyms_updated |  | ||||||
|             || searchable_updated |  | ||||||
|             || exact_attributes_updated |  | ||||||
|             || proximity_precision |  | ||||||
|             || embedding_configs_updated |  | ||||||
|         { |  | ||||||
|             self.reindex(&progress_callback, &should_abort, old_fields_ids_map)?; |  | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         Ok(()) |         Ok(()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     fn update_faceted( |  | ||||||
|         &self, |  | ||||||
|         existing_fields: HashSet<String>, |  | ||||||
|         old_faceted_fields: HashSet<String>, |  | ||||||
|     ) -> Result<bool> { |  | ||||||
|         if existing_fields.iter().any(|field| field.contains('.')) { |  | ||||||
|             return Ok(true); |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  | pub struct InnerIndexSettingsDiff { | ||||||
|  |     pub(crate) old: InnerIndexSettings, | ||||||
|  |     pub(crate) new: InnerIndexSettings, | ||||||
|  |  | ||||||
|  |     // TODO: compare directly the embedders. | ||||||
|  |     pub(crate) embedding_configs_updated: bool, | ||||||
|  |  | ||||||
|  |     pub(crate) settings_update_only: bool, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl InnerIndexSettingsDiff { | ||||||
|  |     pub fn any_reindexing_needed(&self) -> bool { | ||||||
|  |         self.reindex_searchable() || self.reindex_facets() || self.reindex_vectors() | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn reindex_searchable(&self) -> bool { | ||||||
|  |         self.old | ||||||
|  |             .fields_ids_map | ||||||
|  |             .iter() | ||||||
|  |             .zip(self.new.fields_ids_map.iter()) | ||||||
|  |             .any(|(old, new)| old != new) | ||||||
|  |             || self.old.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) | ||||||
|  |                 != self.new.stop_words.as_ref().map(|set| set.as_fst().as_bytes()) | ||||||
|  |             || self.old.allowed_separators != self.new.allowed_separators | ||||||
|  |             || self.old.dictionary != self.new.dictionary | ||||||
|  |             || self.old.user_defined_searchable_fields != self.new.user_defined_searchable_fields | ||||||
|  |             || self.old.exact_attributes != self.new.exact_attributes | ||||||
|  |             || self.old.proximity_precision != self.new.proximity_precision | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn reindex_facets(&self) -> bool { | ||||||
|  |         let existing_fields = &self.new.existing_fields; | ||||||
|  |         if existing_fields.iter().any(|field| field.contains('.')) { | ||||||
|  |             return true; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let old_faceted_fields = &self.old.user_defined_faceted_fields; | ||||||
|         if old_faceted_fields.iter().any(|field| field.contains('.')) { |         if old_faceted_fields.iter().any(|field| field.contains('.')) { | ||||||
|             return Ok(true); |             return true; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         // If there are new faceted fields, we indicate that we must reindex, as we must |         // If there are new faceted fields, we indicate that we must reindex, as we must | ||||||
|         // index the new fields as facets. It means that the distinct attribute, |         // index the new fields as facets. It means that the distinct attribute, | ||||||
|         // an Asc/Desc criterion, or a filtered attribute has been added or removed. |         // an Asc/Desc criterion, or a filtered attribute has been added or removed. | ||||||
|         let new_faceted_fields = self.index.user_defined_faceted_fields(self.wtxn)?; |         let new_faceted_fields = &self.new.user_defined_faceted_fields; | ||||||
|  |  | ||||||
|         if new_faceted_fields.iter().any(|field| field.contains('.')) { |         if new_faceted_fields.iter().any(|field| field.contains('.')) { | ||||||
|             return Ok(true); |             return true; | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let faceted_updated = |         let faceted_updated = | ||||||
|             (&existing_fields - &old_faceted_fields) != (&existing_fields - &new_faceted_fields); |             (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields); | ||||||
|  |  | ||||||
|         Ok(faceted_updated) |         self.old | ||||||
|  |             .fields_ids_map | ||||||
|  |             .iter() | ||||||
|  |             .zip(self.new.fields_ids_map.iter()) | ||||||
|  |             .any(|(old, new)| old != new) | ||||||
|  |             || faceted_updated | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     pub fn reindex_vectors(&self) -> bool { | ||||||
|  |         self.embedding_configs_updated | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn settings_update_only(&self) -> bool { | ||||||
|  |         self.settings_update_only | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[derive(Clone)] | ||||||
|  | pub(crate) struct InnerIndexSettings { | ||||||
|  |     pub stop_words: Option<fst::Set<Vec<u8>>>, | ||||||
|  |     pub allowed_separators: Option<BTreeSet<String>>, | ||||||
|  |     pub dictionary: Option<BTreeSet<String>>, | ||||||
|  |     pub fields_ids_map: FieldsIdsMap, | ||||||
|  |     pub user_defined_faceted_fields: HashSet<String>, | ||||||
|  |     pub user_defined_searchable_fields: Option<Vec<String>>, | ||||||
|  |     pub faceted_fields_ids: HashSet<FieldId>, | ||||||
|  |     pub searchable_fields_ids: Option<Vec<FieldId>>, | ||||||
|  |     pub exact_attributes: HashSet<FieldId>, | ||||||
|  |     pub proximity_precision: ProximityPrecision, | ||||||
|  |     pub embedding_configs: EmbeddingConfigs, | ||||||
|  |     pub existing_fields: HashSet<String>, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl InnerIndexSettings { | ||||||
|  |     pub fn from_index(index: &Index, rtxn: &heed::RoTxn) -> Result<Self> { | ||||||
|  |         let stop_words = index.stop_words(rtxn)?; | ||||||
|  |         let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); | ||||||
|  |         let allowed_separators = index.allowed_separators(rtxn)?; | ||||||
|  |         let dictionary = index.dictionary(rtxn)?; | ||||||
|  |         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||||
|  |         let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?; | ||||||
|  |         let user_defined_searchable_fields = | ||||||
|  |             user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); | ||||||
|  |         let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; | ||||||
|  |         let searchable_fields_ids = index.searchable_fields_ids(rtxn)?; | ||||||
|  |         let faceted_fields_ids = index.faceted_fields_ids(rtxn)?; | ||||||
|  |         let exact_attributes = index.exact_attributes_ids(rtxn)?; | ||||||
|  |         let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); | ||||||
|  |         let embedding_configs = embedders(index.embedding_configs(rtxn)?)?; | ||||||
|  |         let existing_fields: HashSet<_> = index | ||||||
|  |             .field_distribution(rtxn)? | ||||||
|  |             .into_iter() | ||||||
|  |             .filter_map(|(field, count)| (count != 0).then_some(field)) | ||||||
|  |             .collect(); | ||||||
|  |  | ||||||
|  |         Ok(Self { | ||||||
|  |             stop_words, | ||||||
|  |             allowed_separators, | ||||||
|  |             dictionary, | ||||||
|  |             fields_ids_map, | ||||||
|  |             user_defined_faceted_fields, | ||||||
|  |             user_defined_searchable_fields, | ||||||
|  |             faceted_fields_ids, | ||||||
|  |             searchable_fields_ids, | ||||||
|  |             exact_attributes, | ||||||
|  |             proximity_precision, | ||||||
|  |             embedding_configs, | ||||||
|  |             existing_fields, | ||||||
|  |         }) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // find and insert the new field ids | ||||||
|  |     pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { | ||||||
|  |         let new_facets = self | ||||||
|  |             .fields_ids_map | ||||||
|  |             .names() | ||||||
|  |             .filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields)) | ||||||
|  |             .map(|field| field.to_string()) | ||||||
|  |             .collect(); | ||||||
|  |         index.put_faceted_fields(wtxn, &new_facets)?; | ||||||
|  |  | ||||||
|  |         self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?; | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     // find and insert the new field ids | ||||||
|  |     pub fn recompute_searchables(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { | ||||||
|  |         // in case new fields were introduced we're going to recreate the searchable fields. | ||||||
|  |         if let Some(searchable_fields) = self.user_defined_searchable_fields.as_ref() { | ||||||
|  |             let searchable_fields = | ||||||
|  |                 searchable_fields.iter().map(String::as_ref).collect::<Vec<_>>(); | ||||||
|  |             index.put_all_searchable_fields_from_fields_ids_map( | ||||||
|  |                 wtxn, | ||||||
|  |                 &searchable_fields, | ||||||
|  |                 &self.fields_ids_map, | ||||||
|  |             )?; | ||||||
|  |             let searchable_fields_ids = index.searchable_fields_ids(wtxn)?; | ||||||
|  |             self.searchable_fields_ids = searchable_fields_ids; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> { | ||||||
|  |     let res: Result<_> = embedding_configs | ||||||
|  |         .into_iter() | ||||||
|  |         .map(|(name, EmbeddingConfig { embedder_options, prompt })| { | ||||||
|  |             let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); | ||||||
|  |  | ||||||
|  |             let embedder = Arc::new( | ||||||
|  |                 Embedder::new(embedder_options.clone()) | ||||||
|  |                     .map_err(crate::vector::Error::from) | ||||||
|  |                     .map_err(crate::Error::from)?, | ||||||
|  |             ); | ||||||
|  |             Ok((name, (embedder, prompt))) | ||||||
|  |         }) | ||||||
|  |         .collect(); | ||||||
|  |     res.map(EmbeddingConfigs::new) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn validate_prompt( | fn validate_prompt( | ||||||
| @@ -1643,6 +1754,70 @@ mod tests { | |||||||
|             .unwrap() |             .unwrap() | ||||||
|             .count(); |             .count(); | ||||||
|         assert_eq!(count, 4); |         assert_eq!(count, 4); | ||||||
|  |  | ||||||
|  |         // Set the filterable fields to be the age and the name. | ||||||
|  |         index | ||||||
|  |             .update_settings(|settings| { | ||||||
|  |                 settings.set_filterable_fields(hashset! { S("age"),  S("name") }); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         // Check that the displayed fields are correctly set. | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         let fields_ids = index.filterable_fields(&rtxn).unwrap(); | ||||||
|  |         assert_eq!(fields_ids, hashset! { S("age"),  S("name") }); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         // Only count the field_id 0 and level 0 facet values. | ||||||
|  |         let count = index | ||||||
|  |             .facet_id_f64_docids | ||||||
|  |             .remap_key_type::<Bytes>() | ||||||
|  |             .prefix_iter(&rtxn, &[0, 1, 0]) | ||||||
|  |             .unwrap() | ||||||
|  |             .count(); | ||||||
|  |         assert_eq!(count, 4); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         // Only count the field_id 0 and level 0 facet values. | ||||||
|  |         let count = index | ||||||
|  |             .facet_id_string_docids | ||||||
|  |             .remap_key_type::<Bytes>() | ||||||
|  |             .prefix_iter(&rtxn, &[0, 0]) | ||||||
|  |             .unwrap() | ||||||
|  |             .count(); | ||||||
|  |         assert_eq!(count, 5); | ||||||
|  |  | ||||||
|  |         // Remove the age from the filterable fields. | ||||||
|  |         index | ||||||
|  |             .update_settings(|settings| { | ||||||
|  |                 settings.set_filterable_fields(hashset! { S("name") }); | ||||||
|  |             }) | ||||||
|  |             .unwrap(); | ||||||
|  |  | ||||||
|  |         // Check that the displayed fields are correctly set. | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         let fields_ids = index.filterable_fields(&rtxn).unwrap(); | ||||||
|  |         assert_eq!(fields_ids, hashset! { S("name") }); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         // Only count the field_id 0 and level 0 facet values. | ||||||
|  |         let count = index | ||||||
|  |             .facet_id_f64_docids | ||||||
|  |             .remap_key_type::<Bytes>() | ||||||
|  |             .prefix_iter(&rtxn, &[0, 1, 0]) | ||||||
|  |             .unwrap() | ||||||
|  |             .count(); | ||||||
|  |         assert_eq!(count, 0); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         // Only count the field_id 0 and level 0 facet values. | ||||||
|  |         let count = index | ||||||
|  |             .facet_id_string_docids | ||||||
|  |             .remap_key_type::<Bytes>() | ||||||
|  |             .prefix_iter(&rtxn, &[0, 0]) | ||||||
|  |             .unwrap() | ||||||
|  |             .count(); | ||||||
|  |         assert_eq!(count, 5); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     #[test] |     #[test] | ||||||
|   | |||||||
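Looking back at the TODO near the top of the settings diff above ("very rough approximation of the needs for reindexing"), idea 3 is to keep the stored vectors and only re-embed documents for embedders whose prompt actually changed. A minimal sketch of that idea follows; it is not part of the diff, and it assumes both that the name/EmbeddingConfig pairs come from index.embedding_configs (as in the embedders() helper above) and that comparing prompt.template is a sufficient change detector:

    // Hypothetical helper (not in the diff): list the embedders whose document
    // template changed, so only their vectors would need re-embedding and storage.
    // `EmbeddingConfig` is the type destructured in `embedders()` above.
    fn embedders_with_changed_prompts(
        old: &[(String, EmbeddingConfig)],
        new: &[(String, EmbeddingConfig)],
    ) -> Vec<String> {
        new.iter()
            .filter(|(name, new_config)| {
                old.iter()
                    .find(|(old_name, _)| old_name == name)
                    // A new embedder, or the same embedder with a different
                    // template, needs its embeddings recomputed.
                    .map_or(true, |(_, old_config)| {
                        old_config.prompt.template != new_config.prompt.template
                    })
            })
            .map(|(name, _)| name.clone())
            .collect()
    }

A finer-grained list like this could eventually replace the coarse embedding_configs_updated boolean carried by InnerIndexSettingsDiff above.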
| @@ -3,6 +3,7 @@ use std::path::PathBuf; | |||||||
| use hf_hub::api::sync::ApiError; | use hf_hub::api::sync::ApiError; | ||||||
|  |  | ||||||
| use crate::error::FaultSource; | use crate::error::FaultSource; | ||||||
|  | use crate::PanicCatched; | ||||||
|  |  | ||||||
| #[derive(Debug, thiserror::Error)] | #[derive(Debug, thiserror::Error)] | ||||||
| #[error("Error while generating embeddings: {inner}")] | #[error("Error while generating embeddings: {inner}")] | ||||||
| @@ -80,6 +81,8 @@ pub enum EmbedErrorKind { | |||||||
|     OpenAiUnexpectedDimension(usize, usize), |     OpenAiUnexpectedDimension(usize, usize), | ||||||
|     #[error("no embedding was produced")] |     #[error("no embedding was produced")] | ||||||
|     MissingEmbedding, |     MissingEmbedding, | ||||||
|  |     #[error(transparent)] | ||||||
|  |     PanicInThreadPool(#[from] PanicCatched), | ||||||
| } | } | ||||||
|  |  | ||||||
| impl EmbedError { | impl EmbedError { | ||||||
|   | |||||||
| @@ -7,6 +7,7 @@ use serde::{Deserialize, Serialize}; | |||||||
|  |  | ||||||
| use self::error::{EmbedError, NewEmbedderError}; | use self::error::{EmbedError, NewEmbedderError}; | ||||||
| use crate::prompt::{Prompt, PromptData}; | use crate::prompt::{Prompt, PromptData}; | ||||||
|  | use crate::ThreadPoolNoAbort; | ||||||
|  |  | ||||||
| pub mod error; | pub mod error; | ||||||
| pub mod hf; | pub mod hf; | ||||||
| @@ -254,7 +255,7 @@ impl Embedder { | |||||||
|     pub fn embed_chunks( |     pub fn embed_chunks( | ||||||
|         &self, |         &self, | ||||||
|         text_chunks: Vec<Vec<String>>, |         text_chunks: Vec<Vec<String>>, | ||||||
|         threads: &rayon::ThreadPool, |         threads: &ThreadPoolNoAbort, | ||||||
|     ) -> std::result::Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { |     ) -> std::result::Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { | ||||||
|         match self { |         match self { | ||||||
|             Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks), |             Embedder::HuggingFace(embedder) => embedder.embed_chunks(text_chunks), | ||||||
|   | |||||||
| @@ -3,6 +3,8 @@ use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; | |||||||
| use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind}; | use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind}; | ||||||
| use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; | use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; | ||||||
| use super::{DistributionShift, Embeddings}; | use super::{DistributionShift, Embeddings}; | ||||||
|  | use crate::error::FaultSource; | ||||||
|  | use crate::ThreadPoolNoAbort; | ||||||
|  |  | ||||||
| #[derive(Debug)] | #[derive(Debug)] | ||||||
| pub struct Embedder { | pub struct Embedder { | ||||||
| @@ -71,11 +73,16 @@ impl Embedder { | |||||||
|     pub fn embed_chunks( |     pub fn embed_chunks( | ||||||
|         &self, |         &self, | ||||||
|         text_chunks: Vec<Vec<String>>, |         text_chunks: Vec<Vec<String>>, | ||||||
|         threads: &rayon::ThreadPool, |         threads: &ThreadPoolNoAbort, | ||||||
|     ) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { |     ) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { | ||||||
|         threads.install(move || { |         threads | ||||||
|  |             .install(move || { | ||||||
|                 text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() |                 text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() | ||||||
|             }) |             }) | ||||||
|  |             .map_err(|error| EmbedError { | ||||||
|  |                 kind: EmbedErrorKind::PanicInThreadPool(error), | ||||||
|  |                 fault: FaultSource::Bug, | ||||||
|  |             })? | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn chunk_count_hint(&self) -> usize { |     pub fn chunk_count_hint(&self) -> usize { | ||||||
|   | |||||||
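The hunk above changes embed_chunks to take a &ThreadPoolNoAbort instead of a &rayon::ThreadPool, and maps the Result returned by install into EmbedErrorKind::PanicInThreadPool. As a rough sketch only, assuming the wrapper simply catches panics raised inside install and reports them through the PanicCatched error imported earlier (the crate's actual implementation may differ), such a pool could look like this:

    use std::panic::{catch_unwind, AssertUnwindSafe};

    // Hypothetical stand-in for the crate's `PanicCatched` error (assumption).
    #[derive(Debug, thiserror::Error)]
    #[error("a panic occurred in the thread pool")]
    pub struct PanicCatched;

    // Wrapper around a rayon pool whose `install` reports panics as errors
    // instead of letting them unwind into (or abort) the calling thread.
    pub struct ThreadPoolNoAbort {
        inner: rayon::ThreadPool,
    }

    impl ThreadPoolNoAbort {
        pub fn install<OP, R>(&self, op: OP) -> Result<R, PanicCatched>
        where
            OP: FnOnce() -> R + Send,
            R: Send,
        {
            // A panic inside `op` (including in rayon workers it drives) unwinds
            // into this call; catch it and return it as a value the caller can handle.
            catch_unwind(AssertUnwindSafe(|| self.inner.install(op))).map_err(|_| PanicCatched)
        }
    }

With that shape, the threads.install(...).map_err(...)? pattern in each embed_chunks above turns a worker panic into an EmbedError with FaultSource::Bug instead of tearing down the whole indexing process.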
| @@ -4,7 +4,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator as _}; | |||||||
| use super::error::{EmbedError, NewEmbedderError}; | use super::error::{EmbedError, NewEmbedderError}; | ||||||
| use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; | use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; | ||||||
| use super::{DistributionShift, Embeddings}; | use super::{DistributionShift, Embeddings}; | ||||||
|  | use crate::error::FaultSource; | ||||||
| use crate::vector::error::EmbedErrorKind; | use crate::vector::error::EmbedErrorKind; | ||||||
|  | use crate::ThreadPoolNoAbort; | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] | #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] | ||||||
| pub struct EmbedderOptions { | pub struct EmbedderOptions { | ||||||
| @@ -241,11 +243,16 @@ impl Embedder { | |||||||
|     pub fn embed_chunks( |     pub fn embed_chunks( | ||||||
|         &self, |         &self, | ||||||
|         text_chunks: Vec<Vec<String>>, |         text_chunks: Vec<Vec<String>>, | ||||||
|         threads: &rayon::ThreadPool, |         threads: &ThreadPoolNoAbort, | ||||||
|     ) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { |     ) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { | ||||||
|         threads.install(move || { |         threads | ||||||
|  |             .install(move || { | ||||||
|                 text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() |                 text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() | ||||||
|             }) |             }) | ||||||
|  |             .map_err(|error| EmbedError { | ||||||
|  |                 kind: EmbedErrorKind::PanicInThreadPool(error), | ||||||
|  |                 fault: FaultSource::Bug, | ||||||
|  |             })? | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn chunk_count_hint(&self) -> usize { |     pub fn chunk_count_hint(&self) -> usize { | ||||||
|   | |||||||
| @@ -2,9 +2,12 @@ use deserr::Deserr; | |||||||
| use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; | use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; | ||||||
| use serde::{Deserialize, Serialize}; | use serde::{Deserialize, Serialize}; | ||||||
|  |  | ||||||
|  | use super::error::EmbedErrorKind; | ||||||
| use super::{ | use super::{ | ||||||
|     DistributionShift, EmbedError, Embedding, Embeddings, NewEmbedderError, REQUEST_PARALLELISM, |     DistributionShift, EmbedError, Embedding, Embeddings, NewEmbedderError, REQUEST_PARALLELISM, | ||||||
| }; | }; | ||||||
|  | use crate::error::FaultSource; | ||||||
|  | use crate::ThreadPoolNoAbort; | ||||||
|  |  | ||||||
| // retrying in case of failure | // retrying in case of failure | ||||||
|  |  | ||||||
| @@ -158,11 +161,16 @@ impl Embedder { | |||||||
|     pub fn embed_chunks( |     pub fn embed_chunks( | ||||||
|         &self, |         &self, | ||||||
|         text_chunks: Vec<Vec<String>>, |         text_chunks: Vec<Vec<String>>, | ||||||
|         threads: &rayon::ThreadPool, |         threads: &ThreadPoolNoAbort, | ||||||
|     ) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { |     ) -> Result<Vec<Vec<Embeddings<f32>>>, EmbedError> { | ||||||
|         threads.install(move || { |         threads | ||||||
|  |             .install(move || { | ||||||
|                 text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() |                 text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() | ||||||
|             }) |             }) | ||||||
|  |             .map_err(|error| EmbedError { | ||||||
|  |                 kind: EmbedErrorKind::PanicInThreadPool(error), | ||||||
|  |                 fault: FaultSource::Bug, | ||||||
|  |             })? | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     pub fn chunk_count_hint(&self) -> usize { |     pub fn chunk_count_hint(&self) -> usize { | ||||||
|   | |||||||
| @@ -301,10 +301,14 @@ impl From<EmbeddingConfig> for EmbeddingSettings { | |||||||
|     fn from(value: EmbeddingConfig) -> Self { |     fn from(value: EmbeddingConfig) -> Self { | ||||||
|         let EmbeddingConfig { embedder_options, prompt } = value; |         let EmbeddingConfig { embedder_options, prompt } = value; | ||||||
|         match embedder_options { |         match embedder_options { | ||||||
|             super::EmbedderOptions::HuggingFace(options) => Self { |             super::EmbedderOptions::HuggingFace(super::hf::EmbedderOptions { | ||||||
|  |                 model, | ||||||
|  |                 revision, | ||||||
|  |                 distribution, | ||||||
|  |             }) => Self { | ||||||
|                 source: Setting::Set(EmbedderSource::HuggingFace), |                 source: Setting::Set(EmbedderSource::HuggingFace), | ||||||
|                 model: Setting::Set(options.model), |                 model: Setting::Set(model), | ||||||
|                 revision: options.revision.map(Setting::Set).unwrap_or_default(), |                 revision: revision.map(Setting::Set).unwrap_or_default(), | ||||||
|                 api_key: Setting::NotSet, |                 api_key: Setting::NotSet, | ||||||
|                 dimensions: Setting::NotSet, |                 dimensions: Setting::NotSet, | ||||||
|                 document_template: Setting::Set(prompt.template), |                 document_template: Setting::Set(prompt.template), | ||||||
| @@ -314,14 +318,19 @@ impl From<EmbeddingConfig> for EmbeddingSettings { | |||||||
|                 path_to_embeddings: Setting::NotSet, |                 path_to_embeddings: Setting::NotSet, | ||||||
|                 embedding_object: Setting::NotSet, |                 embedding_object: Setting::NotSet, | ||||||
|                 input_type: Setting::NotSet, |                 input_type: Setting::NotSet, | ||||||
|                 distribution: options.distribution.map(Setting::Set).unwrap_or_default(), |                 distribution: distribution.map(Setting::Set).unwrap_or_default(), | ||||||
|             }, |             }, | ||||||
|             super::EmbedderOptions::OpenAi(options) => Self { |             super::EmbedderOptions::OpenAi(super::openai::EmbedderOptions { | ||||||
|  |                 api_key, | ||||||
|  |                 embedding_model, | ||||||
|  |                 dimensions, | ||||||
|  |                 distribution, | ||||||
|  |             }) => Self { | ||||||
|                 source: Setting::Set(EmbedderSource::OpenAi), |                 source: Setting::Set(EmbedderSource::OpenAi), | ||||||
|                 model: Setting::Set(options.embedding_model.name().to_owned()), |                 model: Setting::Set(embedding_model.name().to_owned()), | ||||||
|                 revision: Setting::NotSet, |                 revision: Setting::NotSet, | ||||||
|                 api_key: options.api_key.map(Setting::Set).unwrap_or_default(), |                 api_key: api_key.map(Setting::Set).unwrap_or_default(), | ||||||
|                 dimensions: options.dimensions.map(Setting::Set).unwrap_or_default(), |                 dimensions: dimensions.map(Setting::Set).unwrap_or_default(), | ||||||
|                 document_template: Setting::Set(prompt.template), |                 document_template: Setting::Set(prompt.template), | ||||||
|                 url: Setting::NotSet, |                 url: Setting::NotSet, | ||||||
|                 query: Setting::NotSet, |                 query: Setting::NotSet, | ||||||
| @@ -329,29 +338,37 @@ impl From<EmbeddingConfig> for EmbeddingSettings { | |||||||
|                 path_to_embeddings: Setting::NotSet, |                 path_to_embeddings: Setting::NotSet, | ||||||
|                 embedding_object: Setting::NotSet, |                 embedding_object: Setting::NotSet, | ||||||
|                 input_type: Setting::NotSet, |                 input_type: Setting::NotSet, | ||||||
|                 distribution: options.distribution.map(Setting::Set).unwrap_or_default(), |                 distribution: distribution.map(Setting::Set).unwrap_or_default(), | ||||||
|             }, |             }, | ||||||
|             super::EmbedderOptions::Ollama(options) => Self { |             super::EmbedderOptions::Ollama(super::ollama::EmbedderOptions { | ||||||
|  |                 embedding_model, | ||||||
|  |                 url, | ||||||
|  |                 api_key, | ||||||
|  |                 distribution, | ||||||
|  |             }) => Self { | ||||||
|                 source: Setting::Set(EmbedderSource::Ollama), |                 source: Setting::Set(EmbedderSource::Ollama), | ||||||
|                 model: Setting::Set(options.embedding_model.to_owned()), |                 model: Setting::Set(embedding_model), | ||||||
|                 revision: Setting::NotSet, |                 revision: Setting::NotSet, | ||||||
|                 api_key: Setting::NotSet, |                 api_key: api_key.map(Setting::Set).unwrap_or_default(), | ||||||
|                 dimensions: Setting::NotSet, |                 dimensions: Setting::NotSet, | ||||||
|                 document_template: Setting::Set(prompt.template), |                 document_template: Setting::Set(prompt.template), | ||||||
|                 url: Setting::NotSet, |                 url: url.map(Setting::Set).unwrap_or_default(), | ||||||
|                 query: Setting::NotSet, |                 query: Setting::NotSet, | ||||||
|                 input_field: Setting::NotSet, |                 input_field: Setting::NotSet, | ||||||
|                 path_to_embeddings: Setting::NotSet, |                 path_to_embeddings: Setting::NotSet, | ||||||
|                 embedding_object: Setting::NotSet, |                 embedding_object: Setting::NotSet, | ||||||
|                 input_type: Setting::NotSet, |                 input_type: Setting::NotSet, | ||||||
|                 distribution: options.distribution.map(Setting::Set).unwrap_or_default(), |                 distribution: distribution.map(Setting::Set).unwrap_or_default(), | ||||||
|             }, |             }, | ||||||
|             super::EmbedderOptions::UserProvided(options) => Self { |             super::EmbedderOptions::UserProvided(super::manual::EmbedderOptions { | ||||||
|  |                 dimensions, | ||||||
|  |                 distribution, | ||||||
|  |             }) => Self { | ||||||
|                 source: Setting::Set(EmbedderSource::UserProvided), |                 source: Setting::Set(EmbedderSource::UserProvided), | ||||||
|                 model: Setting::NotSet, |                 model: Setting::NotSet, | ||||||
|                 revision: Setting::NotSet, |                 revision: Setting::NotSet, | ||||||
|                 api_key: Setting::NotSet, |                 api_key: Setting::NotSet, | ||||||
|                 dimensions: Setting::Set(options.dimensions), |                 dimensions: Setting::Set(dimensions), | ||||||
|                 document_template: Setting::NotSet, |                 document_template: Setting::NotSet, | ||||||
|                 url: Setting::NotSet, |                 url: Setting::NotSet, | ||||||
|                 query: Setting::NotSet, |                 query: Setting::NotSet, | ||||||
| @@ -359,7 +376,7 @@ impl From<EmbeddingConfig> for EmbeddingSettings { | |||||||
|                 path_to_embeddings: Setting::NotSet, |                 path_to_embeddings: Setting::NotSet, | ||||||
|                 embedding_object: Setting::NotSet, |                 embedding_object: Setting::NotSet, | ||||||
|                 input_type: Setting::NotSet, |                 input_type: Setting::NotSet, | ||||||
|                 distribution: options.distribution.map(Setting::Set).unwrap_or_default(), |                 distribution: distribution.map(Setting::Set).unwrap_or_default(), | ||||||
|             }, |             }, | ||||||
|             super::EmbedderOptions::Rest(super::rest::EmbedderOptions { |             super::EmbedderOptions::Rest(super::rest::EmbedderOptions { | ||||||
|                 api_key, |                 api_key, | ||||||
|   | |||||||
| @@ -217,9 +217,7 @@ fn add_memory_samples( | |||||||
|     memory_counters: &mut Option<MemoryCounterHandles>, |     memory_counters: &mut Option<MemoryCounterHandles>, | ||||||
|     last_memory: &mut MemoryStats, |     last_memory: &mut MemoryStats, | ||||||
| ) -> Option<MemoryStats> { | ) -> Option<MemoryStats> { | ||||||
|     let Some(stats) = memory else { |     let stats = memory?; | ||||||
|         return None; |  | ||||||
|     }; |  | ||||||
|  |  | ||||||
|     let memory_counters = |     let memory_counters = | ||||||
|         memory_counters.get_or_insert_with(|| MemoryCounterHandles::new(profile, main)); |         memory_counters.get_or_insert_with(|| MemoryCounterHandles::new(profile, main)); | ||||||
|   | |||||||
							
								
								
									
workloads/movies-subset-hf-embeddings.json (new file, 68 lines)
							| @@ -0,0 +1,68 @@ | |||||||
|  | { | ||||||
|  |   "name": "movies-subset-hf-embeddings", | ||||||
|  |   "run_count": 5, | ||||||
|  |   "extra_cli_args": [ | ||||||
|  |     "--max-indexing-threads=4" | ||||||
|  |   ], | ||||||
|  |   "assets": { | ||||||
|  |     "movies-100.json": { | ||||||
|  |       "local_location": null, | ||||||
|  |       "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json", | ||||||
|  |       "sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6" | ||||||
|  |     } | ||||||
|  |   }, | ||||||
|  |   "commands": [ | ||||||
|  |     { | ||||||
|  |       "route": "experimental-features", | ||||||
|  |       "method": "PATCH", | ||||||
|  |       "body": { | ||||||
|  |         "inline": { | ||||||
|  |           "vectorStore": true | ||||||
|  |         } | ||||||
|  |       }, | ||||||
|  |       "synchronous": "DontWait" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "route": "indexes/movies/settings", | ||||||
|  |       "method": "PATCH", | ||||||
|  |       "body": { | ||||||
|  |         "inline": { | ||||||
|  |           "searchableAttributes": [ | ||||||
|  |             "title", | ||||||
|  |             "overview" | ||||||
|  |           ], | ||||||
|  |           "filterableAttributes": [ | ||||||
|  |             "genres", | ||||||
|  |             "release_date" | ||||||
|  |           ], | ||||||
|  |           "sortableAttributes": [ | ||||||
|  |             "release_date" | ||||||
|  |           ] | ||||||
|  |         } | ||||||
|  |       }, | ||||||
|  |       "synchronous": "WaitForTask" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "route": "indexes/movies/settings", | ||||||
|  |       "method": "PATCH", | ||||||
|  |       "body": { | ||||||
|  |         "inline": { | ||||||
|  |           "embedders": { | ||||||
|  |             "default": { | ||||||
|  |               "source": "huggingFace" | ||||||
|  |             } | ||||||
|  |           } | ||||||
|  |         } | ||||||
|  |       }, | ||||||
|  |       "synchronous": "WaitForTask" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "route": "indexes/movies/documents", | ||||||
|  |       "method": "POST", | ||||||
|  |       "body": { | ||||||
|  |         "asset": "movies-100.json" | ||||||
|  |       }, | ||||||
|  |       "synchronous": "WaitForTask" | ||||||
|  |     } | ||||||
|  |   ] | ||||||
|  | } | ||||||
							
								
								
									
workloads/settings-add-embeddings.json (new file, 72 lines)
							| @@ -0,0 +1,72 @@ | |||||||
|  | { | ||||||
|  |   "name": "settings-add-embeddings-hf", | ||||||
|  |   "run_count": 5, | ||||||
|  |   "extra_cli_args": [ | ||||||
|  |     "--max-indexing-threads=4" | ||||||
|  |   ], | ||||||
|  |   "assets": { | ||||||
|  |     "movies-100.json": { | ||||||
|  |       "local_location": null, | ||||||
|  |       "remote_location": "https://milli-benchmarks.fra1.digitaloceanspaces.com/bench/datasets/movies-100.json", | ||||||
|  |       "sha256": "d215e395e4240f12f03b8f1f68901eac82d9e7ded5b462cbf4a6b8efde76c6c6" | ||||||
|  |     } | ||||||
|  |   }, | ||||||
|  |   "commands": [ | ||||||
|  |     { | ||||||
|  |       "route": "experimental-features", | ||||||
|  |       "method": "PATCH", | ||||||
|  |       "body": { | ||||||
|  |         "inline": { | ||||||
|  |           "vectorStore": true | ||||||
|  |         } | ||||||
|  |       }, | ||||||
|  |       "synchronous": "DontWait" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "route": "indexes/movies/settings", | ||||||
|  |       "method": "PATCH", | ||||||
|  |       "body": { | ||||||
|  |         "inline": { | ||||||
|  |           "searchableAttributes": [ | ||||||
|  |             "title", | ||||||
|  |             "overview" | ||||||
|  |           ], | ||||||
|  |           "filterableAttributes": [ | ||||||
|  |             "genres", | ||||||
|  |             "release_date" | ||||||
|  |           ], | ||||||
|  |           "sortableAttributes": [ | ||||||
|  |             "release_date" | ||||||
|  |           ] | ||||||
|  |         } | ||||||
|  |       }, | ||||||
|  |       "synchronous": "DontWait" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "route": "indexes/movies/documents", | ||||||
|  |       "method": "POST", | ||||||
|  |       "body": { | ||||||
|  |         "asset": "movies-100.json" | ||||||
|  |       }, | ||||||
|  |       "synchronous": "WaitForTask" | ||||||
|  |     }, | ||||||
|  |     { | ||||||
|  |       "route": "indexes/movies/settings", | ||||||
|  |       "method": "PATCH", | ||||||
|  |       "body": { | ||||||
|  |         "inline": { | ||||||
|  |           "embedders": { | ||||||
|  |             "default": { | ||||||
|  |               "source": "huggingFace", | ||||||
|  |               "model": null, | ||||||
|  |               "revision": null, | ||||||
|  |               "documentTemplate": null, | ||||||
|  |               "distribution": null | ||||||
|  |             } | ||||||
|  |           } | ||||||
|  |         } | ||||||
|  |       }, | ||||||
|  |       "synchronous": "WaitForTask" | ||||||
|  |     } | ||||||
|  |   ] | ||||||
|  | } | ||||||
| @@ -1,6 +1,6 @@ | |||||||
| { | { | ||||||
|   "name": "settings-add-remove-filters.json", |   "name": "settings-add-remove-filters.json", | ||||||
|   "run_count": 2, |   "run_count": 5, | ||||||
|   "extra_cli_args": [ |   "extra_cli_args": [ | ||||||
|     "--max-indexing-threads=4" |     "--max-indexing-threads=4" | ||||||
|   ], |   ], | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| { | { | ||||||
|   "name": "settings-proximity-precision.json", |   "name": "settings-proximity-precision.json", | ||||||
|   "run_count": 2, |   "run_count": 5, | ||||||
|   "extra_cli_args": [ |   "extra_cli_args": [ | ||||||
|     "--max-indexing-threads=4" |     "--max-indexing-threads=4" | ||||||
|   ], |   ], | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| { | { | ||||||
|   "name": "settings-remove-add-swap-searchable.json", |   "name": "settings-remove-add-swap-searchable.json", | ||||||
|   "run_count": 2, |   "run_count": 5, | ||||||
|   "extra_cli_args": [ |   "extra_cli_args": [ | ||||||
|     "--max-indexing-threads=4" |     "--max-indexing-threads=4" | ||||||
|   ], |   ], | ||||||
|   | |||||||
| @@ -1,6 +1,6 @@ | |||||||
| { | { | ||||||
|   "name": "settings-typo.json", |   "name": "settings-typo.json", | ||||||
|   "run_count": 2, |   "run_count": 5, | ||||||
|   "extra_cli_args": [ |   "extra_cli_args": [ | ||||||
|     "--max-indexing-threads=4" |     "--max-indexing-threads=4" | ||||||
|   ], |   ], | ||||||
|   | |||||||