mirror of
				https://github.com/meilisearch/meilisearch.git
				synced 2025-10-26 05:26:27 +00:00 
			
		
		
		
	Merge pull request #38 from meilisearch/facet-queries
Introduce a facet filter system
This commit is contained in:
		
							
								
								
									
										143
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										143
									
								
								Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -45,6 +45,27 @@ version = "1.2.1" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" | checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "block-buffer" | ||||||
|  | version = "0.7.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "c0940dc441f31689269e10ac70eb1002a3a1d3ad1390e030043662eb7fe4688b" | ||||||
|  | dependencies = [ | ||||||
|  |  "block-padding", | ||||||
|  |  "byte-tools", | ||||||
|  |  "byteorder", | ||||||
|  |  "generic-array", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "block-padding" | ||||||
|  | version = "0.1.5" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "fa79dedbb091f449f1f39e53edf88d5dbe95f895dae6135a8d7b881fb5af73f5" | ||||||
|  | dependencies = [ | ||||||
|  |  "byte-tools", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "bstr" | name = "bstr" | ||||||
| version = "0.2.13" | version = "0.2.13" | ||||||
| @@ -63,6 +84,12 @@ version = "3.4.0" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820" | checksum = "2e8c087f005730276d1096a652e92a8bacee2e2472bcc9715a74d2bec38b5820" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "byte-tools" | ||||||
|  | version = "0.3.1" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "e3b5ca7a04898ad4bcd41c90c5285445ff5b791899bb1b0abdd2a2aa791211d7" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "byteorder" | name = "byteorder" | ||||||
| version = "1.3.4" | version = "1.3.4" | ||||||
| @@ -285,12 +312,27 @@ dependencies = [ | |||||||
|  "memchr", |  "memchr", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "digest" | ||||||
|  | version = "0.8.1" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "f3d0c8c8752312f9713efd397ff63acb9f85585afbf179282e720e7704954dd5" | ||||||
|  | dependencies = [ | ||||||
|  |  "generic-array", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "either" | name = "either" | ||||||
| version = "1.6.1" | version = "1.6.1" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" | checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "fake-simd" | ||||||
|  | version = "0.1.2" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "e88a8acf291dafb59c2d96e8f59828f3838bb1a70398823ade51a84de6a6deed" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "flate2" | name = "flate2" | ||||||
| version = "1.0.17" | version = "1.0.17" | ||||||
| @@ -324,6 +366,15 @@ dependencies = [ | |||||||
|  "byteorder", |  "byteorder", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "generic-array" | ||||||
|  | version = "0.12.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "c68f0274ae0e023facc3c97b2e00f076be70e254bc851d972503b328db79b2ec" | ||||||
|  | dependencies = [ | ||||||
|  |  "typenum", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "getrandom" | name = "getrandom" | ||||||
| version = "0.1.14" | version = "0.1.14" | ||||||
| @@ -378,9 +429,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "heed" | name = "heed" | ||||||
| version = "0.10.1" | version = "0.10.4" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797" | checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "byteorder", |  "byteorder", | ||||||
|  "heed-traits", |  "heed-traits", | ||||||
| @@ -617,9 +668,12 @@ dependencies = [ | |||||||
|  "maplit", |  "maplit", | ||||||
|  "memmap", |  "memmap", | ||||||
|  "near-proximity", |  "near-proximity", | ||||||
|  |  "num-traits", | ||||||
|  "obkv", |  "obkv", | ||||||
|  "once_cell", |  "once_cell", | ||||||
|  "ordered-float", |  "ordered-float", | ||||||
|  |  "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", | ||||||
|  |  "pest_derive", | ||||||
|  "rayon", |  "rayon", | ||||||
|  "ringtail", |  "ringtail", | ||||||
|  "roaring", |  "roaring", | ||||||
| @@ -675,9 +729,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "num-traits" | name = "num-traits" | ||||||
| version = "0.2.12" | version = "0.2.14" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "ac267bcc07f48ee5f8935ab0d24f316fb722d7a1292e2913f0cc196b29ffd611" | checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "autocfg", |  "autocfg", | ||||||
| ] | ] | ||||||
| @@ -716,6 +770,12 @@ version = "11.1.2" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c" | checksum = "a170cebd8021a008ea92e4db85a72f80b35df514ec664b296fdcbb654eac0b2c" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "opaque-debug" | ||||||
|  | version = "0.2.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "ordered-float" | name = "ordered-float" | ||||||
| version = "2.0.0" | version = "2.0.0" | ||||||
| @@ -741,6 +801,57 @@ version = "2.1.0" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" | checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pest" | ||||||
|  | version = "2.1.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" | ||||||
|  | dependencies = [ | ||||||
|  |  "ucd-trie", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pest" | ||||||
|  | version = "2.1.3" | ||||||
|  | source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" | ||||||
|  | dependencies = [ | ||||||
|  |  "ucd-trie", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pest_derive" | ||||||
|  | version = "2.1.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" | ||||||
|  | dependencies = [ | ||||||
|  |  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  |  "pest_generator", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pest_generator" | ||||||
|  | version = "2.1.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" | ||||||
|  | dependencies = [ | ||||||
|  |  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  |  "pest_meta", | ||||||
|  |  "proc-macro2", | ||||||
|  |  "quote", | ||||||
|  |  "syn", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pest_meta" | ||||||
|  | version = "2.1.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" | ||||||
|  | dependencies = [ | ||||||
|  |  "maplit", | ||||||
|  |  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  |  "sha-1", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "pkg-config" | name = "pkg-config" | ||||||
| version = "0.3.19" | version = "0.3.19" | ||||||
| @@ -1025,6 +1136,18 @@ dependencies = [ | |||||||
|  "serde", |  "serde", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "sha-1" | ||||||
|  | version = "0.8.2" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "f7d94d0bede923b3cea61f3f1ff57ff8cdfd77b400fb8f9998949e0cf04163df" | ||||||
|  | dependencies = [ | ||||||
|  |  "block-buffer", | ||||||
|  |  "digest", | ||||||
|  |  "fake-simd", | ||||||
|  |  "opaque-debug", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "slice-group-by" | name = "slice-group-by" | ||||||
| version = "0.2.6" | version = "0.2.6" | ||||||
| @@ -1233,6 +1356,18 @@ version = "0.1.0" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" | checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "typenum" | ||||||
|  | version = "1.12.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "ucd-trie" | ||||||
|  | version = "0.1.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "unicode-bidi" | name = "unicode-bidi" | ||||||
| version = "0.3.4" | version = "0.3.4" | ||||||
|   | |||||||
| @@ -14,13 +14,14 @@ flate2 = "1.0.17" | |||||||
| fst = "0.4.4" | fst = "0.4.4" | ||||||
| fxhash = "0.2.1" | fxhash = "0.2.1" | ||||||
| grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } | grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } | ||||||
| heed = { version = "0.10.1", default-features = false, features = ["lmdb", "sync-read-txn"] } | heed = { version = "0.10.4", default-features = false, features = ["lmdb", "sync-read-txn"] } | ||||||
| human_format = "1.0.3" | human_format = "1.0.3" | ||||||
| jemallocator = "0.3.2" | jemallocator = "0.3.2" | ||||||
| levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } | levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] } | ||||||
| linked-hash-map = "0.5.3" | linked-hash-map = "0.5.3" | ||||||
| memmap = "0.7.0" | memmap = "0.7.0" | ||||||
| near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } | near-proximity = { git = "https://github.com/Kerollmops/plane-sweep-proximity", rev = "6608205" } | ||||||
|  | num-traits = "0.2.14" | ||||||
| obkv = "0.1.0" | obkv = "0.1.0" | ||||||
| once_cell = "1.4.0" | once_cell = "1.4.0" | ||||||
| ordered-float = "2.0.0" | ordered-float = "2.0.0" | ||||||
| @@ -36,6 +37,10 @@ structopt = { version = "0.3.14", default-features = false, features = ["wrap_he | |||||||
| tempfile = "3.1.0" | tempfile = "3.1.0" | ||||||
| uuid = { version = "0.8.1", features = ["v4"] } | uuid = { version = "0.8.1", features = ["v4"] } | ||||||
|  |  | ||||||
|  | # facet filter parser | ||||||
|  | pest = { git = "https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } | ||||||
|  | pest_derive = "2.1.0" | ||||||
|  |  | ||||||
| # documents words self-join | # documents words self-join | ||||||
| itertools = "0.9.0" | itertools = "0.9.0" | ||||||
|  |  | ||||||
|   | |||||||
							
								
								
									
										70
									
								
								http-ui/Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										70
									
								
								http-ui/Cargo.lock
									
									
									
										generated
									
									
									
								
							| @@ -654,9 +654,9 @@ dependencies = [ | |||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "heed" | name = "heed" | ||||||
| version = "0.10.1" | version = "0.10.4" | ||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "8e25a69175d737e523d9e289b44e3588616b14a97ee3756abf0ae6bd3c832797" | checksum = "cddc0d0d20adfc803b3e57c2d84447e134cad636202e68e275c65e3cbe63c616" | ||||||
| dependencies = [ | dependencies = [ | ||||||
|  "byteorder", |  "byteorder", | ||||||
|  "heed-traits", |  "heed-traits", | ||||||
| @@ -934,6 +934,12 @@ dependencies = [ | |||||||
|  "cfg-if 0.1.10", |  "cfg-if 0.1.10", | ||||||
| ] | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "maplit" | ||||||
|  | version = "1.0.2" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "matches" | name = "matches" | ||||||
| version = "0.1.8" | version = "0.1.8" | ||||||
| @@ -987,9 +993,12 @@ dependencies = [ | |||||||
|  "log", |  "log", | ||||||
|  "memmap", |  "memmap", | ||||||
|  "near-proximity", |  "near-proximity", | ||||||
|  |  "num-traits", | ||||||
|  "obkv", |  "obkv", | ||||||
|  "once_cell", |  "once_cell", | ||||||
|  "ordered-float", |  "ordered-float", | ||||||
|  |  "pest 2.1.3 (git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67)", | ||||||
|  |  "pest_derive", | ||||||
|  "rayon", |  "rayon", | ||||||
|  "ringtail", |  "ringtail", | ||||||
|  "roaring", |  "roaring", | ||||||
| @@ -1231,6 +1240,57 @@ version = "2.1.0" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" | checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pest" | ||||||
|  | version = "2.1.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53" | ||||||
|  | dependencies = [ | ||||||
|  |  "ucd-trie", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pest" | ||||||
|  | version = "2.1.3" | ||||||
|  | source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67" | ||||||
|  | dependencies = [ | ||||||
|  |  "ucd-trie", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pest_derive" | ||||||
|  | version = "2.1.0" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "833d1ae558dc601e9a60366421196a8d94bc0ac980476d0b67e1d0988d72b2d0" | ||||||
|  | dependencies = [ | ||||||
|  |  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  |  "pest_generator", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pest_generator" | ||||||
|  | version = "2.1.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "99b8db626e31e5b81787b9783425769681b347011cc59471e33ea46d2ea0cf55" | ||||||
|  | dependencies = [ | ||||||
|  |  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  |  "pest_meta", | ||||||
|  |  "proc-macro2", | ||||||
|  |  "quote", | ||||||
|  |  "syn", | ||||||
|  | ] | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "pest_meta" | ||||||
|  | version = "2.1.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "54be6e404f5317079812fc8f9f5279de376d8856929e21c184ecf6bbd692a11d" | ||||||
|  | dependencies = [ | ||||||
|  |  "maplit", | ||||||
|  |  "pest 2.1.3 (registry+https://github.com/rust-lang/crates.io-index)", | ||||||
|  |  "sha-1 0.8.2", | ||||||
|  | ] | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "pin-project" | name = "pin-project" | ||||||
| version = "0.4.27" | version = "0.4.27" | ||||||
| @@ -2024,6 +2084,12 @@ version = "1.12.0" | |||||||
| source = "registry+https://github.com/rust-lang/crates.io-index" | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
| checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" | checksum = "373c8a200f9e67a0c95e62a4f52fbf80c23b4381c05a17845531982fa99e6b33" | ||||||
|  |  | ||||||
|  | [[package]] | ||||||
|  | name = "ucd-trie" | ||||||
|  | version = "0.1.3" | ||||||
|  | source = "registry+https://github.com/rust-lang/crates.io-index" | ||||||
|  | checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" | ||||||
|  |  | ||||||
| [[package]] | [[package]] | ||||||
| name = "unicase" | name = "unicase" | ||||||
| version = "2.6.0" | version = "2.6.0" | ||||||
|   | |||||||
| @@ -8,7 +8,7 @@ edition = "2018" | |||||||
| [dependencies] | [dependencies] | ||||||
| anyhow = "1.0.28" | anyhow = "1.0.28" | ||||||
| grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } | grenad = { git = "https://github.com/Kerollmops/grenad.git", rev = "3eb7ad9" } | ||||||
| heed = "0.10.1" | heed = "0.10.4" | ||||||
| memmap = "0.7.0" | memmap = "0.7.0" | ||||||
| milli = { path = ".." } | milli = { path = ".." } | ||||||
| once_cell = "1.4.1" | once_cell = "1.4.1" | ||||||
|   | |||||||
| @@ -1,8 +1,9 @@ | |||||||
| var request = null; | var request = null; | ||||||
| var timeoutID = null; | var timeoutID = null; | ||||||
|  |  | ||||||
| $('#search').on('input', function () { | $('#query, #facet').on('input', function () { | ||||||
|   var query = $(this).val(); |   var query = $('#query').val(); | ||||||
|  |   var facet = $('#facet').val(); | ||||||
|   var timeoutMs = 100; |   var timeoutMs = 100; | ||||||
|  |  | ||||||
|   if (timeoutID !== null) { |   if (timeoutID !== null) { | ||||||
| @@ -14,7 +15,7 @@ $('#search').on('input', function () { | |||||||
|       type: "POST", |       type: "POST", | ||||||
|       url: "query", |       url: "query", | ||||||
|       contentType: 'application/json', |       contentType: 'application/json', | ||||||
|       data: JSON.stringify({ 'query': query }), |       data: JSON.stringify({ 'query': query, 'facetCondition': facet }), | ||||||
|       contentType: 'application/json', |       contentType: 'application/json', | ||||||
|       success: function (data, textStatus, request) { |       success: function (data, textStatus, request) { | ||||||
|         results.innerHTML = ''; |         results.innerHTML = ''; | ||||||
| @@ -77,5 +78,5 @@ $('#db-size').text(function(index, text) { | |||||||
| // We trigger the input when we load the script, this way | // We trigger the input when we load the script, this way | ||||||
| // we execute a placeholder search when the input is empty. | // we execute a placeholder search when the input is empty. | ||||||
| $(window).on('load', function () { | $(window).on('load', function () { | ||||||
|   $('#search').trigger('input'); |   $('#query').trigger('input'); | ||||||
| }); | }); | ||||||
|   | |||||||
| @@ -2,6 +2,7 @@ use std::borrow::Cow; | |||||||
| use std::collections::{HashMap, HashSet}; | use std::collections::{HashMap, HashSet}; | ||||||
| use std::fs::{File, create_dir_all}; | use std::fs::{File, create_dir_all}; | ||||||
| use std::net::SocketAddr; | use std::net::SocketAddr; | ||||||
|  | use std::num::NonZeroUsize; | ||||||
| use std::path::PathBuf; | use std::path::PathBuf; | ||||||
| use std::str::FromStr; | use std::str::FromStr; | ||||||
| use std::sync::Arc; | use std::sync::Arc; | ||||||
| @@ -28,7 +29,7 @@ use warp::{Filter, http::Response}; | |||||||
| use milli::tokenizer::{simple_tokenizer, TokenType}; | use milli::tokenizer::{simple_tokenizer, TokenType}; | ||||||
| use milli::update::UpdateIndexingStep::*; | use milli::update::UpdateIndexingStep::*; | ||||||
| use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; | use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; | ||||||
| use milli::{obkv_to_json, Index, UpdateStore, SearchResult}; | use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; | ||||||
|  |  | ||||||
| static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new(); | static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new(); | ||||||
|  |  | ||||||
| @@ -196,6 +197,7 @@ enum UpdateMeta { | |||||||
|     DocumentsAddition { method: String, format: String }, |     DocumentsAddition { method: String, format: String }, | ||||||
|     ClearDocuments, |     ClearDocuments, | ||||||
|     Settings(Settings), |     Settings(Settings), | ||||||
|  |     Facets(Facets), | ||||||
| } | } | ||||||
|  |  | ||||||
| #[derive(Debug, Clone, Serialize, Deserialize)] | #[derive(Debug, Clone, Serialize, Deserialize)] | ||||||
| @@ -231,6 +233,14 @@ struct Settings { | |||||||
|     faceted_attributes: Option<HashMap<String, String>>, |     faceted_attributes: Option<HashMap<String, String>>, | ||||||
| } | } | ||||||
|  |  | ||||||
|  | #[derive(Debug, Clone, Serialize, Deserialize)] | ||||||
|  | #[serde(deny_unknown_fields)] | ||||||
|  | #[serde(rename_all = "camelCase")] | ||||||
|  | struct Facets { | ||||||
|  |     level_group_size: Option<NonZeroUsize>, | ||||||
|  |     min_level_size: Option<NonZeroUsize>, | ||||||
|  | } | ||||||
|  |  | ||||||
| // Any value that is present is considered Some value, including null. | // Any value that is present is considered Some value, including null. | ||||||
| fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error> | fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error> | ||||||
| where T: Deserialize<'de>, | where T: Deserialize<'de>, | ||||||
| @@ -399,6 +409,21 @@ async fn main() -> anyhow::Result<()> { | |||||||
|                         Ok(_count) => wtxn.commit().map_err(Into::into), |                         Ok(_count) => wtxn.commit().map_err(Into::into), | ||||||
|                         Err(e) => Err(e.into()) |                         Err(e) => Err(e.into()) | ||||||
|                     } |                     } | ||||||
|  |                 }, | ||||||
|  |                 UpdateMeta::Facets(levels) => { | ||||||
|  |                     // We must use the write transaction of the update here. | ||||||
|  |                     let mut wtxn = index_cloned.write_txn()?; | ||||||
|  |                     let mut builder = update_builder.facets(&mut wtxn, &index_cloned); | ||||||
|  |                     if let Some(value) = levels.level_group_size { | ||||||
|  |                         builder.level_group_size(value); | ||||||
|  |                     } | ||||||
|  |                     if let Some(value) = levels.min_level_size { | ||||||
|  |                         builder.min_level_size(value); | ||||||
|  |                     } | ||||||
|  |                     match builder.execute() { | ||||||
|  |                         Ok(()) => wtxn.commit().map_err(Into::into), | ||||||
|  |                         Err(e) => Err(e.into()) | ||||||
|  |                     } | ||||||
|                 } |                 } | ||||||
|             }; |             }; | ||||||
|  |  | ||||||
| @@ -550,9 +575,12 @@ async fn main() -> anyhow::Result<()> { | |||||||
|             .body(include_str!("../public/logo-black.svg")) |             .body(include_str!("../public/logo-black.svg")) | ||||||
|         ); |         ); | ||||||
|  |  | ||||||
|     #[derive(Deserialize)] |     #[derive(Debug, Deserialize)] | ||||||
|  |     #[serde(deny_unknown_fields)] | ||||||
|  |     #[serde(rename_all = "camelCase")] | ||||||
|     struct QueryBody { |     struct QueryBody { | ||||||
|         query: Option<String>, |         query: Option<String>, | ||||||
|  |         facet_condition: Option<String>, | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     let disable_highlighting = opt.disable_highlighting; |     let disable_highlighting = opt.disable_highlighting; | ||||||
| @@ -569,6 +597,12 @@ async fn main() -> anyhow::Result<()> { | |||||||
|             if let Some(query) = query.query { |             if let Some(query) = query.query { | ||||||
|                 search.query(query); |                 search.query(query); | ||||||
|             } |             } | ||||||
|  |             if let Some(condition) = query.facet_condition { | ||||||
|  |                 if !condition.trim().is_empty() { | ||||||
|  |                     let condition = FacetCondition::from_str(&rtxn, &index, &condition).unwrap(); | ||||||
|  |                     search.facet_condition(condition); | ||||||
|  |                 } | ||||||
|  |             } | ||||||
|  |  | ||||||
|             let SearchResult { found_words, documents_ids } = search.execute().unwrap(); |             let SearchResult { found_words, documents_ids } = search.execute().unwrap(); | ||||||
|  |  | ||||||
| @@ -751,6 +785,19 @@ async fn main() -> anyhow::Result<()> { | |||||||
|             Ok(warp::reply()) |             Ok(warp::reply()) | ||||||
|         }); |         }); | ||||||
|  |  | ||||||
|  |     let update_store_cloned = update_store.clone(); | ||||||
|  |     let update_status_sender_cloned = update_status_sender.clone(); | ||||||
|  |     let change_facet_levels_route = warp::filters::method::post() | ||||||
|  |         .and(warp::path!("facet-level-sizes")) | ||||||
|  |         .and(warp::body::json()) | ||||||
|  |         .map(move |levels: Facets| { | ||||||
|  |             let meta = UpdateMeta::Facets(levels); | ||||||
|  |             let update_id = update_store_cloned.register_update(&meta, &[]).unwrap(); | ||||||
|  |             let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta }); | ||||||
|  |             eprintln!("update {} registered", update_id); | ||||||
|  |             warp::reply() | ||||||
|  |         }); | ||||||
|  |  | ||||||
|     let update_ws_route = warp::ws() |     let update_ws_route = warp::ws() | ||||||
|         .and(warp::path!("updates" / "ws")) |         .and(warp::path!("updates" / "ws")) | ||||||
|         .map(move |ws: warp::ws::Ws| { |         .map(move |ws: warp::ws::Ws| { | ||||||
| @@ -799,6 +846,7 @@ async fn main() -> anyhow::Result<()> { | |||||||
|         .or(indexing_json_stream_route) |         .or(indexing_json_stream_route) | ||||||
|         .or(clearing_route) |         .or(clearing_route) | ||||||
|         .or(change_settings_route) |         .or(change_settings_route) | ||||||
|  |         .or(change_facet_levels_route) | ||||||
|         .or(update_ws_route); |         .or(update_ws_route); | ||||||
|  |  | ||||||
|     let addr = SocketAddr::from_str(&opt.http_listen_addr)?; |     let addr = SocketAddr::from_str(&opt.http_listen_addr)?; | ||||||
|   | |||||||
| @@ -55,7 +55,8 @@ | |||||||
|         <div class="level-left"> |         <div class="level-left"> | ||||||
|           <div class="level-item"> |           <div class="level-item"> | ||||||
|             <div class="field has-addons has-addons-right"> |             <div class="field has-addons has-addons-right"> | ||||||
|               <input id="search" class="input" type="text" autofocus placeholder="e.g. George Clooney"> |               <input id="query" class="input" type="text" autofocus placeholder="e.g. George Clooney"> | ||||||
|  |               <input id="facet" class="input" type="text" placeholder="facet filter like released >= 1577836800"> | ||||||
|             </div> |             </div> | ||||||
|           </div> |           </div> | ||||||
|           <div class="level-item"></div> |           <div class="level-item"></div> | ||||||
|   | |||||||
							
								
								
									
										86
									
								
								src/heed_codec/facet/facet_level_value_f64_codec.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										86
									
								
								src/heed_codec/facet/facet_level_value_f64_codec.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,86 @@ | |||||||
|  | use std::borrow::Cow; | ||||||
|  | use std::convert::TryInto; | ||||||
|  |  | ||||||
|  | use crate::facet::value_encoding::f64_into_bytes; | ||||||
|  |  | ||||||
|  | // TODO do not de/serialize right bound when level = 0 | ||||||
|  | pub struct FacetLevelValueF64Codec; | ||||||
|  |  | ||||||
|  | impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { | ||||||
|  |     type DItem = (u8, u8, f64, f64); | ||||||
|  |  | ||||||
|  |     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||||
|  |         let (field_id, bytes) = bytes.split_first()?; | ||||||
|  |         let (level, bytes) = bytes.split_first()?; | ||||||
|  |  | ||||||
|  |         let (left, right) = if *level != 0 { | ||||||
|  |             let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?; | ||||||
|  |             let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?; | ||||||
|  |             (left, right) | ||||||
|  |         } else { | ||||||
|  |             let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; | ||||||
|  |             (left, left) | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         Some((*field_id, *level, left, right)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { | ||||||
|  |     type EItem = (u8, u8, f64, f64); | ||||||
|  |  | ||||||
|  |     fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> { | ||||||
|  |         let mut buffer = [0u8; 32]; | ||||||
|  |  | ||||||
|  |         let len = if *level != 0 { | ||||||
|  |             // Write the globally ordered floats. | ||||||
|  |             let bytes = f64_into_bytes(*left)?; | ||||||
|  |             buffer[..8].copy_from_slice(&bytes[..]); | ||||||
|  |  | ||||||
|  |             let bytes = f64_into_bytes(*right)?; | ||||||
|  |             buffer[8..16].copy_from_slice(&bytes[..]); | ||||||
|  |  | ||||||
|  |             // Then the f64 values just to be able to read them back. | ||||||
|  |             let bytes = left.to_be_bytes(); | ||||||
|  |             buffer[16..24].copy_from_slice(&bytes[..]); | ||||||
|  |  | ||||||
|  |             let bytes = right.to_be_bytes(); | ||||||
|  |             buffer[24..].copy_from_slice(&bytes[..]); | ||||||
|  |  | ||||||
|  |             32 // length | ||||||
|  |         } else { | ||||||
|  |             // Write the globally ordered floats. | ||||||
|  |             let bytes = f64_into_bytes(*left)?; | ||||||
|  |             buffer[..8].copy_from_slice(&bytes[..]); | ||||||
|  |  | ||||||
|  |             // Then the f64 values just to be able to read them back. | ||||||
|  |             let bytes = left.to_be_bytes(); | ||||||
|  |             buffer[8..16].copy_from_slice(&bytes[..]); | ||||||
|  |  | ||||||
|  |             16 // length | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         let mut bytes = Vec::with_capacity(len + 2); | ||||||
|  |         bytes.push(*field_id); | ||||||
|  |         bytes.push(*level); | ||||||
|  |         bytes.extend_from_slice(&buffer[..len]); | ||||||
|  |         Some(Cow::Owned(bytes)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
#[cfg(test)]
mod tests {
    use heed::{BytesEncode, BytesDecode};
    use super::*;

    /// Round-trips one key of each layout through the codec.
    #[test]
    fn globally_ordered_f64() {
        // Level 0 does not store the right bound: it is decoded as the left one.
        let encoded = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap();
        let decoded = FacetLevelValueF64Codec::bytes_decode(&encoded).unwrap();
        assert_eq!(decoded, (3, 0, 32.0, 32.0));

        // Higher levels keep both bounds.
        let encoded = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap();
        let decoded = FacetLevelValueF64Codec::bytes_decode(&encoded).unwrap();
        assert_eq!(decoded, (3, 1, -32.0, 32.0));
    }
}
							
								
								
									
										43
									
								
								src/heed_codec/facet/facet_level_value_i64_codec.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										43
									
								
								src/heed_codec/facet/facet_level_value_i64_codec.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,43 @@ | |||||||
|  | use std::borrow::Cow; | ||||||
|  | use std::convert::TryInto; | ||||||
|  |  | ||||||
|  | use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; | ||||||
|  |  | ||||||
|  | pub struct FacetLevelValueI64Codec; | ||||||
|  |  | ||||||
|  | impl<'a> heed::BytesDecode<'a> for FacetLevelValueI64Codec { | ||||||
|  |     type DItem = (u8, u8, i64, i64); | ||||||
|  |  | ||||||
|  |     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { | ||||||
|  |         let (field_id, bytes) = bytes.split_first()?; | ||||||
|  |         let (level, bytes) = bytes.split_first()?; | ||||||
|  |  | ||||||
|  |         let left = bytes[..8].try_into().map(i64_from_bytes).ok()?; | ||||||
|  |         let right = if *level != 0 { | ||||||
|  |             bytes[8..].try_into().map(i64_from_bytes).ok()? | ||||||
|  |         } else { | ||||||
|  |             left | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         Some((*field_id, *level, left, right)) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl heed::BytesEncode<'_> for FacetLevelValueI64Codec { | ||||||
|  |     type EItem = (u8, u8, i64, i64); | ||||||
|  |  | ||||||
|  |     fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> { | ||||||
|  |         let left = i64_into_bytes(*left); | ||||||
|  |         let right = i64_into_bytes(*right); | ||||||
|  |  | ||||||
|  |         let mut bytes = Vec::with_capacity(2 + left.len() + right.len()); | ||||||
|  |         bytes.push(*field_id); | ||||||
|  |         bytes.push(*level); | ||||||
|  |         bytes.extend_from_slice(&left[..]); | ||||||
|  |         if *level != 0 { | ||||||
|  |             bytes.extend_from_slice(&right[..]); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Some(Cow::Owned(bytes)) | ||||||
|  |     } | ||||||
|  | } | ||||||
| @@ -1,50 +0,0 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::convert::TryInto; |  | ||||||
|  |  | ||||||
| use crate::facet::value_encoding::f64_into_bytes; |  | ||||||
|  |  | ||||||
| pub struct FacetValueF64Codec; |  | ||||||
|  |  | ||||||
| impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec { |  | ||||||
|     type DItem = (u8, f64); |  | ||||||
|  |  | ||||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { |  | ||||||
|         let (field_id, buffer) = bytes.split_first()?; |  | ||||||
|         let value = buffer[8..].try_into().ok().map(f64::from_be_bytes)?; |  | ||||||
|         Some((*field_id, value)) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl heed::BytesEncode<'_> for FacetValueF64Codec { |  | ||||||
|     type EItem = (u8, f64); |  | ||||||
|  |  | ||||||
|     fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> { |  | ||||||
|         let mut buffer = [0u8; 16]; |  | ||||||
|  |  | ||||||
|         // Write the globally ordered float. |  | ||||||
|         let bytes = f64_into_bytes(*value)?; |  | ||||||
|         buffer[..8].copy_from_slice(&bytes[..]); |  | ||||||
|  |  | ||||||
|         // Then the f64 value just to be able to read it back. |  | ||||||
|         let bytes = value.to_be_bytes(); |  | ||||||
|         buffer[8..].copy_from_slice(&bytes[..]); |  | ||||||
|  |  | ||||||
|         let mut bytes = Vec::with_capacity(buffer.len() + 1); |  | ||||||
|         bytes.push(*field_id); |  | ||||||
|         bytes.extend_from_slice(&buffer[..]); |  | ||||||
|         Some(Cow::Owned(bytes)) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| #[cfg(test)] |  | ||||||
| mod tests { |  | ||||||
|     use heed::{BytesEncode, BytesDecode}; |  | ||||||
|     use super::*; |  | ||||||
|  |  | ||||||
|     #[test] |  | ||||||
|     fn globally_ordered_f64() { |  | ||||||
|         let bytes = FacetValueF64Codec::bytes_encode(&(3, -32.0)).unwrap(); |  | ||||||
|         let (name, value) = FacetValueF64Codec::bytes_decode(&bytes).unwrap(); |  | ||||||
|         assert_eq!((name, value), (3, -32.0)); |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| @@ -1,28 +0,0 @@ | |||||||
| use std::borrow::Cow; |  | ||||||
| use std::convert::TryInto; |  | ||||||
|  |  | ||||||
| use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; |  | ||||||
|  |  | ||||||
| pub struct FacetValueI64Codec; |  | ||||||
|  |  | ||||||
| impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec { |  | ||||||
|     type DItem = (u8, i64); |  | ||||||
|  |  | ||||||
|     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> { |  | ||||||
|         let (field_id, buffer) = bytes.split_first()?; |  | ||||||
|         let value = buffer.try_into().map(i64_from_bytes).ok()?; |  | ||||||
|         Some((*field_id, value)) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
|  |  | ||||||
| impl heed::BytesEncode<'_> for FacetValueI64Codec { |  | ||||||
|     type EItem = (u8, i64); |  | ||||||
|  |  | ||||||
|     fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> { |  | ||||||
|         let value = i64_into_bytes(*value); |  | ||||||
|         let mut bytes = Vec::with_capacity(value.len() + 1); |  | ||||||
|         bytes.push(*field_id); |  | ||||||
|         bytes.extend_from_slice(&value[..]); |  | ||||||
|         Some(Cow::Owned(bytes)) |  | ||||||
|     } |  | ||||||
| } |  | ||||||
| @@ -1,7 +1,7 @@ | |||||||
| mod facet_value_f64_codec; | mod facet_level_value_f64_codec; | ||||||
| mod facet_value_i64_codec; | mod facet_level_value_i64_codec; | ||||||
| mod facet_value_string_codec; | mod facet_value_string_codec; | ||||||
|  |  | ||||||
| pub use self::facet_value_f64_codec::FacetValueF64Codec; | pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; | ||||||
| pub use self::facet_value_i64_codec::FacetValueI64Codec; | pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec; | ||||||
| pub use self::facet_value_string_codec::FacetValueStringCodec; | pub use self::facet_value_string_codec::FacetValueStringCodec; | ||||||
|   | |||||||
							
								
								
									
										22
									
								
								src/index.rs
									
									
									
									
									
								
							
							
						
						
									
										22
									
								
								src/index.rs
									
									
									
									
									
								
							| @@ -18,6 +18,7 @@ use crate::{ | |||||||
|  |  | ||||||
| pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; | pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields"; | ||||||
| pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; | pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; | ||||||
|  | pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids"; | ||||||
| pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; | pub const FACETED_FIELDS_KEY: &str = "faceted-fields"; | ||||||
| pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; | pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; | ||||||
| pub const PRIMARY_KEY_KEY: &str = "primary-key"; | pub const PRIMARY_KEY_KEY: &str = "primary-key"; | ||||||
| @@ -224,6 +225,27 @@ impl Index { | |||||||
|         Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) |         Ok(self.main.get::<_, Str, SerdeJson<_>>(wtxn, FACETED_FIELDS_KEY)?.unwrap_or_default()) | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     /* faceted documents ids */ | ||||||
|  |  | ||||||
|  |     /// Writes the documents ids that are faceted under this field id. | ||||||
|  |     pub fn put_faceted_documents_ids(&self, wtxn: &mut RwTxn, field_id: u8, docids: &RoaringBitmap) -> heed::Result<()> { | ||||||
|  |         let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||||
|  |         buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||||
|  |         *buffer.last_mut().unwrap() = field_id; | ||||||
|  |         self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Retrieve all the documents ids that faceted under this field id. | ||||||
|  |     pub fn faceted_documents_ids(&self, rtxn: &RoTxn, field_id: u8) -> heed::Result<RoaringBitmap> { | ||||||
|  |         let mut buffer = [0u8; FACETED_DOCUMENTS_IDS_PREFIX.len() + 1]; | ||||||
|  |         buffer[..FACETED_DOCUMENTS_IDS_PREFIX.len()].clone_from_slice(FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); | ||||||
|  |         *buffer.last_mut().unwrap() = field_id; | ||||||
|  |         match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { | ||||||
|  |             Some(docids) => Ok(docids), | ||||||
|  |             None => Ok(RoaringBitmap::new()), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|     /* words fst */ |     /* words fst */ | ||||||
|  |  | ||||||
|     /// Writes the FST which is the words dictionary of the engine. |     /// Writes the FST which is the words dictionary of the engine. | ||||||
|   | |||||||
| @@ -1,3 +1,5 @@ | |||||||
|  | #[macro_use] extern crate pest_derive; | ||||||
|  |  | ||||||
| mod criterion; | mod criterion; | ||||||
| mod external_documents_ids; | mod external_documents_ids; | ||||||
| mod fields_ids_map; | mod fields_ids_map; | ||||||
| @@ -24,7 +26,7 @@ pub use self::criterion::{Criterion, default_criteria}; | |||||||
| pub use self::external_documents_ids::ExternalDocumentsIds; | pub use self::external_documents_ids::ExternalDocumentsIds; | ||||||
| pub use self::fields_ids_map::FieldsIdsMap; | pub use self::fields_ids_map::FieldsIdsMap; | ||||||
| pub use self::index::Index; | pub use self::index::Index; | ||||||
| pub use self::search::{Search, SearchResult}; | pub use self::search::{Search, FacetCondition, SearchResult}; | ||||||
| pub use self::heed_codec::{ | pub use self::heed_codec::{ | ||||||
|     RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, |     RoaringBitmapCodec, BEU32StrCodec, StrStrU8Codec, | ||||||
|     ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, |     ObkvCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, | ||||||
|   | |||||||
							
								
								
									
										29
									
								
								src/search/facet/grammar.pest
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								src/search/facet/grammar.pest
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,29 @@ | |||||||
// Grammar of the facet filter expressions.

// A key or a value is either a quoted string or a bare word. These rules are
// silent, so only the inner `string` or `word` pair is produced.
key = _{quoted | word}
value = _{quoted | word}
// The opening quote is pushed on the stack so that the closing quote (POP)
// must be the same character.
quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP  }
string = {char*}
// Compound-atomic: no implicit whitespace is allowed inside a word.
word = ${(LETTER | NUMBER | "_" | "-" | ".")+}

// A character of a quoted string: anything but the current quote or a
// backslash, an escape sequence, or a `\u` followed by four hex digits.
char =  _{ !(PEEK | "\\") ~ ANY
    | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t")
    | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}

// Alternatives are tried in order and the first one that matches is kept.
condition = _{between | eq | greater | less | geq | leq | neq}
between = {key ~ value ~ "TO" ~ value}
geq = {key ~ ">=" ~ value}
leq = {key ~ "<=" ~ value}
neq = {key ~ "!=" ~ value}
eq = {key ~ "=" ~ value}
greater = {key ~ ">" ~ value}
less = {key ~ "<" ~ value}

// Entry point: a whole expression spanning the complete input.
prgm = {SOI ~ expr ~ EOI}
// A sequence of terms separated by AND/OR operations.
expr = _{ ( term ~ (operation ~ term)* ) }
term = { ("(" ~ expr ~ ")") | condition | not }
operation = _{ and | or }
and = {"AND"}
or = {"OR"}

not = {"NOT" ~ term}

// Only the space character is skipped between tokens.
WHITESPACE = _{ " " }
							
								
								
									
										655
									
								
								src/search/facet/mod.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										655
									
								
								src/search/facet/mod.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,655 @@ | |||||||
|  | use std::collections::HashMap; | ||||||
|  | use std::fmt::Debug; | ||||||
|  | use std::ops::Bound::{self, Unbounded, Included, Excluded}; | ||||||
|  | use std::str::FromStr; | ||||||
|  |  | ||||||
|  | use heed::types::{ByteSlice, DecodeIgnore}; | ||||||
|  | use log::debug; | ||||||
|  | use num_traits::Bounded; | ||||||
|  | use parser::{PREC_CLIMBER, FilterParser}; | ||||||
|  | use pest::error::{Error as PestError, ErrorVariant}; | ||||||
|  | use pest::iterators::{Pair, Pairs}; | ||||||
|  | use pest::Parser; | ||||||
|  | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  | use crate::facet::FacetType; | ||||||
|  | use crate::heed_codec::facet::FacetValueStringCodec; | ||||||
|  | use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; | ||||||
|  | use crate::{Index, FieldsIdsMap, CboRoaringBitmapCodec}; | ||||||
|  |  | ||||||
|  | use self::FacetCondition::*; | ||||||
|  | use self::FacetNumberOperator::*; | ||||||
|  | use self::parser::Rule; | ||||||
|  |  | ||||||
|  | mod parser; | ||||||
|  |  | ||||||
/// A comparison that can be applied to a numeric facet value of type `T`.
#[derive(Debug, Copy, Clone, PartialEq)]
pub enum FacetNumberOperator<T> {
    GreaterThan(T),
    GreaterThanOrEqual(T),
    Equal(T),
    NotEqual(T),
    LowerThan(T),
    LowerThanOrEqual(T),
    Between(T, T),
}

impl<T> FacetNumberOperator<T> {
    /// Returns the logical negation of this operator.
    ///
    /// Every operator negates to a single operator, except `Between` (i.e. `TO`)
    /// whose negation is the OR of two operators: both are returned, the second
    /// one in the `Option`.
    fn negate(self) -> (Self, Option<Self>) {
        let opposite = match self {
            // Outside of [low, high] means below low OR above high.
            Self::Between(low, high) => {
                return (Self::LowerThan(low), Some(Self::GreaterThan(high)));
            },
            Self::GreaterThan(value) => Self::LowerThanOrEqual(value),
            Self::GreaterThanOrEqual(value) => Self::LowerThan(value),
            Self::Equal(value) => Self::NotEqual(value),
            Self::NotEqual(value) => Self::Equal(value),
            Self::LowerThan(value) => Self::GreaterThanOrEqual(value),
            Self::LowerThanOrEqual(value) => Self::GreaterThan(value),
        };

        (opposite, None)
    }
}
|  |  | ||||||
/// A comparison that can be applied to a string facet value.
#[derive(Debug, Clone, PartialEq)]
pub enum FacetStringOperator {
    Equal(String),
    NotEqual(String),
}

impl FacetStringOperator {
    /// Returns the logical negation of this operator, keeping the same operand.
    fn negate(self) -> Self {
        match self {
            Self::Equal(value) => Self::NotEqual(value),
            Self::NotEqual(value) => Self::Equal(value),
        }
    }
}
|  |  | ||||||
/// A facet filter: either an operator applied to one faceted field,
/// or a boolean combination of two sub-conditions.
#[derive(Debug, Clone, PartialEq)]
pub enum FacetCondition {
    /// A numeric operator on the field with this id, with `i64` operands.
    OperatorI64(u8, FacetNumberOperator<i64>),
    /// A numeric operator on the field with this id, with `f64` operands.
    OperatorF64(u8, FacetNumberOperator<f64>),
    /// A string operator on the field with this id.
    OperatorString(u8, FacetStringOperator),
    /// Logical OR of the two sub-conditions.
    Or(Box<Self>, Box<Self>),
    /// Logical AND of the two sub-conditions.
    And(Box<Self>, Box<Self>),
}
|  |  | ||||||
|  | fn get_field_id_facet_type<'a>( | ||||||
|  |     fields_ids_map: &FieldsIdsMap, | ||||||
|  |     faceted_fields: &HashMap<u8, FacetType>, | ||||||
|  |     items: &mut Pairs<'a, Rule>, | ||||||
|  | ) -> Result<(u8, FacetType), PestError<Rule>> | ||||||
|  | { | ||||||
|  |     // lexing ensures that we at least have a key | ||||||
|  |     let key = items.next().unwrap(); | ||||||
|  |     let field_id = fields_ids_map | ||||||
|  |         .id(key.as_str()) | ||||||
|  |         .ok_or_else(|| { | ||||||
|  |             PestError::new_from_span( | ||||||
|  |                 ErrorVariant::CustomError { | ||||||
|  |                     message: format!( | ||||||
|  |                         "attribute `{}` not found, available attributes are: {}", | ||||||
|  |                         key.as_str(), | ||||||
|  |                         fields_ids_map.iter().map(|(_, n)| n).collect::<Vec<_>>().join(", ") | ||||||
|  |                     ), | ||||||
|  |                 }, | ||||||
|  |                 key.as_span(), | ||||||
|  |             ) | ||||||
|  |         })?; | ||||||
|  |  | ||||||
|  |     let facet_type = faceted_fields | ||||||
|  |         .get(&field_id) | ||||||
|  |         .copied() | ||||||
|  |         .ok_or_else(|| { | ||||||
|  |             PestError::new_from_span( | ||||||
|  |                 ErrorVariant::CustomError { | ||||||
|  |                     message: format!( | ||||||
|  |                         "attribute `{}` is not faceted, available faceted attributes are: {}", | ||||||
|  |                         key.as_str(), | ||||||
|  |                         faceted_fields.keys().flat_map(|id| fields_ids_map.name(*id)).collect::<Vec<_>>().join(", ") | ||||||
|  |                     ), | ||||||
|  |                 }, | ||||||
|  |                 key.as_span(), | ||||||
|  |             ) | ||||||
|  |         })?; | ||||||
|  |  | ||||||
|  |     Ok((field_id, facet_type)) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn pest_parse<T>(pair: Pair<Rule>) -> Result<T, pest::error::Error<Rule>> | ||||||
|  | where T: FromStr, | ||||||
|  |       T::Err: ToString, | ||||||
|  | { | ||||||
|  |     match pair.as_str().parse() { | ||||||
|  |         Ok(value) => Ok(value), | ||||||
|  |         Err(e) => { | ||||||
|  |             Err(PestError::<Rule>::new_from_span( | ||||||
|  |                 ErrorVariant::CustomError { message: e.to_string() }, | ||||||
|  |                 pair.as_span(), | ||||||
|  |             )) | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl FacetCondition { | ||||||
|  |     pub fn from_str( | ||||||
|  |         rtxn: &heed::RoTxn, | ||||||
|  |         index: &Index, | ||||||
|  |         expression: &str, | ||||||
|  |     ) -> anyhow::Result<FacetCondition> | ||||||
|  |     { | ||||||
|  |         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||||
|  |         let faceted_fields = index.faceted_fields(rtxn)?; | ||||||
|  |         let lexed = FilterParser::parse(Rule::prgm, expression)?; | ||||||
|  |         FacetCondition::from_pairs(&fields_ids_map, &faceted_fields, lexed) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn from_pairs( | ||||||
|  |         fim: &FieldsIdsMap, | ||||||
|  |         ff: &HashMap<u8, FacetType>, | ||||||
|  |         expression: Pairs<Rule>, | ||||||
|  |     ) -> anyhow::Result<Self> | ||||||
|  |     { | ||||||
|  |         PREC_CLIMBER.climb( | ||||||
|  |             expression, | ||||||
|  |             |pair: Pair<Rule>| match pair.as_rule() { | ||||||
|  |                 Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), | ||||||
|  |                 Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), | ||||||
|  |                 Rule::eq => Ok(Self::equal(fim, ff, pair)?), | ||||||
|  |                 Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), | ||||||
|  |                 Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), | ||||||
|  |                 Rule::less => Ok(Self::lower_than(fim, ff, pair)?), | ||||||
|  |                 Rule::between => Ok(Self::between(fim, ff, pair)?), | ||||||
|  |                 Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), | ||||||
|  |                 Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), | ||||||
|  |                 Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), | ||||||
|  |                 _ => unreachable!(), | ||||||
|  |             }, | ||||||
|  |             |lhs: anyhow::Result<Self>, op: Pair<Rule>, rhs: anyhow::Result<Self>| { | ||||||
|  |                 match op.as_rule() { | ||||||
|  |                     Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), | ||||||
|  |                     Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), | ||||||
|  |                     _ => unreachable!(), | ||||||
|  |                 } | ||||||
|  |             }, | ||||||
|  |         ) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn negate(self) -> FacetCondition { | ||||||
|  |         match self { | ||||||
|  |             OperatorI64(fid, op) => match op.negate() { | ||||||
|  |                 (op, None) => OperatorI64(fid, op), | ||||||
|  |                 (a, Some(b)) => Or(Box::new(OperatorI64(fid, a)), Box::new(OperatorI64(fid, b))), | ||||||
|  |             }, | ||||||
|  |             OperatorF64(fid, op) => match op.negate() { | ||||||
|  |                 (op, None) => OperatorF64(fid, op), | ||||||
|  |                 (a, Some(b)) => Or(Box::new(OperatorF64(fid, a)), Box::new(OperatorF64(fid, b))), | ||||||
|  |             }, | ||||||
|  |             OperatorString(fid, op) => OperatorString(fid, op.negate()), | ||||||
|  |             Or(a, b) => And(Box::new(a.negate()), Box::new(b.negate())), | ||||||
|  |             And(a, b) => Or(Box::new(a.negate()), Box::new(b.negate())), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn between( | ||||||
|  |         fields_ids_map: &FieldsIdsMap, | ||||||
|  |         faceted_fields: &HashMap<u8, FacetType>, | ||||||
|  |         item: Pair<Rule>, | ||||||
|  |     ) -> anyhow::Result<FacetCondition> | ||||||
|  |     { | ||||||
|  |         let item_span = item.as_span(); | ||||||
|  |         let mut items = item.into_inner(); | ||||||
|  |         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||||
|  |         let lvalue = items.next().unwrap(); | ||||||
|  |         let rvalue = items.next().unwrap(); | ||||||
|  |         match ftype { | ||||||
|  |             FacetType::Integer => { | ||||||
|  |                 let lvalue = pest_parse(lvalue)?; | ||||||
|  |                 let rvalue = pest_parse(rvalue)?; | ||||||
|  |                 Ok(OperatorI64(fid, Between(lvalue, rvalue))) | ||||||
|  |             }, | ||||||
|  |             FacetType::Float => { | ||||||
|  |                 let lvalue = pest_parse(lvalue)?; | ||||||
|  |                 let rvalue = pest_parse(rvalue)?; | ||||||
|  |                 Ok(OperatorF64(fid, Between(lvalue, rvalue))) | ||||||
|  |             }, | ||||||
|  |             FacetType::String => { | ||||||
|  |                 Err(PestError::<Rule>::new_from_span( | ||||||
|  |                     ErrorVariant::CustomError { | ||||||
|  |                         message: format!("invalid operator on a faceted string"), | ||||||
|  |                     }, | ||||||
|  |                     item_span, | ||||||
|  |                 ).into()) | ||||||
|  |             }, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn equal( | ||||||
|  |         fields_ids_map: &FieldsIdsMap, | ||||||
|  |         faceted_fields: &HashMap<u8, FacetType>, | ||||||
|  |         item: Pair<Rule>, | ||||||
|  |     ) -> anyhow::Result<FacetCondition> | ||||||
|  |     { | ||||||
|  |         let mut items = item.into_inner(); | ||||||
|  |         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||||
|  |         let value = items.next().unwrap(); | ||||||
|  |         match ftype { | ||||||
|  |             FacetType::Integer => Ok(OperatorI64(fid, Equal(pest_parse(value)?))), | ||||||
|  |             FacetType::Float => Ok(OperatorF64(fid, Equal(pest_parse(value)?))), | ||||||
|  |             FacetType::String => { | ||||||
|  |                 Ok(OperatorString(fid, FacetStringOperator::Equal(value.as_str().to_string()))) | ||||||
|  |             }, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn greater_than( | ||||||
|  |         fields_ids_map: &FieldsIdsMap, | ||||||
|  |         faceted_fields: &HashMap<u8, FacetType>, | ||||||
|  |         item: Pair<Rule>, | ||||||
|  |     ) -> anyhow::Result<FacetCondition> | ||||||
|  |     { | ||||||
|  |         let item_span = item.as_span(); | ||||||
|  |         let mut items = item.into_inner(); | ||||||
|  |         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||||
|  |         let value = items.next().unwrap(); | ||||||
|  |         match ftype { | ||||||
|  |             FacetType::Integer => Ok(OperatorI64(fid, GreaterThan(pest_parse(value)?))), | ||||||
|  |             FacetType::Float => Ok(OperatorF64(fid, GreaterThan(pest_parse(value)?))), | ||||||
|  |             FacetType::String => { | ||||||
|  |                 Err(PestError::<Rule>::new_from_span( | ||||||
|  |                     ErrorVariant::CustomError { | ||||||
|  |                         message: format!("invalid operator on a faceted string"), | ||||||
|  |                     }, | ||||||
|  |                     item_span, | ||||||
|  |                 ).into()) | ||||||
|  |             }, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn greater_than_or_equal( | ||||||
|  |         fields_ids_map: &FieldsIdsMap, | ||||||
|  |         faceted_fields: &HashMap<u8, FacetType>, | ||||||
|  |         item: Pair<Rule>, | ||||||
|  |     ) -> anyhow::Result<FacetCondition> | ||||||
|  |     { | ||||||
|  |         let item_span = item.as_span(); | ||||||
|  |         let mut items = item.into_inner(); | ||||||
|  |         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||||
|  |         let value = items.next().unwrap(); | ||||||
|  |         match ftype { | ||||||
|  |             FacetType::Integer => Ok(OperatorI64(fid, GreaterThanOrEqual(pest_parse(value)?))), | ||||||
|  |             FacetType::Float => Ok(OperatorF64(fid, GreaterThanOrEqual(pest_parse(value)?))), | ||||||
|  |             FacetType::String => { | ||||||
|  |                 Err(PestError::<Rule>::new_from_span( | ||||||
|  |                     ErrorVariant::CustomError { | ||||||
|  |                         message: format!("invalid operator on a faceted string"), | ||||||
|  |                     }, | ||||||
|  |                     item_span, | ||||||
|  |                 ).into()) | ||||||
|  |             }, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     fn lower_than( | ||||||
|  |         fields_ids_map: &FieldsIdsMap, | ||||||
|  |         faceted_fields: &HashMap<u8, FacetType>, | ||||||
|  |         item: Pair<Rule>, | ||||||
|  |     ) -> anyhow::Result<FacetCondition> | ||||||
|  |     { | ||||||
|  |         let item_span = item.as_span(); | ||||||
|  |         let mut items = item.into_inner(); | ||||||
|  |         let (fid, ftype) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut items)?; | ||||||
|  |         let value = items.next().unwrap(); | ||||||
|  |         match ftype { | ||||||
|  |             FacetType::Integer => Ok(OperatorI64(fid, LowerThan(pest_parse(value)?))), | ||||||
|  |             FacetType::Float => Ok(OperatorF64(fid, LowerThan(pest_parse(value)?))), | ||||||
|  |             FacetType::String => { | ||||||
|  |                 Err(PestError::<Rule>::new_from_span( | ||||||
|  |                     ErrorVariant::CustomError { | ||||||
|  |                         message: format!("invalid operator on a faceted string"), | ||||||
|  |                     }, | ||||||
|  |                     item_span, | ||||||
|  |                 ).into()) | ||||||
|  |             }, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Builds the "lower than or equal" condition described by `item`. | ||||||
|  |     /// | ||||||
|  |     /// The field id and its facet type are resolved by `get_field_id_facet_type` from | ||||||
|  |     /// the inner pairs of `item`; the next inner pair is parsed as the upper bound. | ||||||
|  |     /// | ||||||
|  |     /// # Errors | ||||||
|  |     /// | ||||||
|  |     /// Fails when the field cannot be resolved, when the value cannot be parsed, or | ||||||
|  |     /// with a pest custom error spanning `item` when the facet is a string. | ||||||
|  |     /// | ||||||
|  |     /// # Panics | ||||||
|  |     /// | ||||||
|  |     /// Panics if no inner pair is left once the field has been resolved. | ||||||
|  |     fn lower_than_or_equal( | ||||||
|  |         fields_ids_map: &FieldsIdsMap, | ||||||
|  |         faceted_fields: &HashMap<u8, FacetType>, | ||||||
|  |         item: Pair<Rule>, | ||||||
|  |     ) -> anyhow::Result<FacetCondition> | ||||||
|  |     { | ||||||
|  |         let span = item.as_span(); | ||||||
|  |         let mut pairs = item.into_inner(); | ||||||
|  |         let (field_id, facet_type) = get_field_id_facet_type(fields_ids_map, faceted_fields, &mut pairs)?; | ||||||
|  |         let raw_value = pairs.next().unwrap(); | ||||||
|  |  | ||||||
|  |         let condition = match facet_type { | ||||||
|  |             FacetType::Integer => OperatorI64(field_id, LowerThanOrEqual(pest_parse(raw_value)?)), | ||||||
|  |             FacetType::Float => OperatorF64(field_id, LowerThanOrEqual(pest_parse(raw_value)?)), | ||||||
|  |             FacetType::String => { | ||||||
|  |                 // Ordering operators are only accepted on numeric facets. | ||||||
|  |                 let variant = ErrorVariant::CustomError { | ||||||
|  |                     message: String::from("invalid operator on a faceted string"), | ||||||
|  |                 }; | ||||||
|  |                 return Err(PestError::<Rule>::new_from_span(variant, span).into()); | ||||||
|  |             }, | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         Ok(condition) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl FacetCondition { | ||||||
|  |     /// Aggregates the documents ids that are part of the specified range automatically | ||||||
|  |     /// going deeper through the levels. | ||||||
|  |     /// | ||||||
|  |     /// The entries of `field_id` stored at `level` whose group starts within `left` and | ||||||
|  |     /// ends within `right` are unioned into `output`. The parts of the requested range | ||||||
|  |     /// that were not covered by the groups found at this level are then explored | ||||||
|  |     /// recursively at `level - 1`, until level 0 has been visited. | ||||||
|  |     /// | ||||||
|  |     /// An empty range (lower bound above the upper one) leaves `output` untouched. | ||||||
|  |     /// | ||||||
|  |     /// # Errors | ||||||
|  |     /// | ||||||
|  |     /// Returns any error raised while iterating over or decoding the database entries. | ||||||
|  |     fn explore_facet_levels<'t, T: 't, KC>( | ||||||
|  |         rtxn: &'t heed::RoTxn, | ||||||
|  |         db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||||
|  |         field_id: u8, | ||||||
|  |         level: u8, | ||||||
|  |         left: Bound<T>, | ||||||
|  |         right: Bound<T>, | ||||||
|  |         output: &mut RoaringBitmap, | ||||||
|  |     ) -> anyhow::Result<()> | ||||||
|  |     where | ||||||
|  |         T: Copy + PartialEq + PartialOrd + Bounded + Debug, | ||||||
|  |         KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, | ||||||
|  |         KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, | ||||||
|  |     { | ||||||
|  |         match (left, right) { | ||||||
|  |             // If the request is an exact value we must go directly to the deepest level. | ||||||
|  |             (Included(l), Included(r)) if l == r && level > 0 => { | ||||||
|  |                 return Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, 0, left, right, output); | ||||||
|  |             }, | ||||||
|  |             // lower TO upper when lower > upper must return no result | ||||||
|  |             (Included(l), Included(r)) if l > r => return Ok(()), | ||||||
|  |             (Included(l), Excluded(r)) if l >= r => return Ok(()), | ||||||
|  |             (Excluded(l), Excluded(r)) if l >= r => return Ok(()), | ||||||
|  |             (Excluded(l), Included(r)) if l >= r => return Ok(()), | ||||||
|  |             (_, _) => (), | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let mut left_found = None; | ||||||
|  |         let mut right_found = None; | ||||||
|  |  | ||||||
|  |         // We must create a custom iterator to be able to iterate over the | ||||||
|  |         // requested range as the range iterator cannot express some conditions. | ||||||
|  |         let left_bound = match left { | ||||||
|  |             Included(left) => Included((field_id, level, left, T::min_value())), | ||||||
|  |             Excluded(left) => Excluded((field_id, level, left, T::min_value())), | ||||||
|  |             // All the facet values live in the same database, so an unbounded request | ||||||
|  |             // must still start at the first key of this field id and level, otherwise | ||||||
|  |             // the scan would begin with the entries of other fields or levels. | ||||||
|  |             Unbounded => Included((field_id, level, T::min_value(), T::min_value())), | ||||||
|  |         }; | ||||||
|  |         let right_bound = Included((field_id, level, T::max_value(), T::max_value())); | ||||||
|  |         // We also make sure that we don't decode the data before we are sure we must return it. | ||||||
|  |         let iter = db | ||||||
|  |             .remap_key_type::<KC>() | ||||||
|  |             .lazily_decode_data() | ||||||
|  |             .range(rtxn, &(left_bound, right_bound))? | ||||||
|  |             // Stop at the first group that ends after the requested right bound; | ||||||
|  |             // errors are let through so that the loop below can report them. | ||||||
|  |             .take_while(|r| r.as_ref().map_or(true, |((.., r), _)| { | ||||||
|  |                 match right { | ||||||
|  |                     Included(right) => *r <= right, | ||||||
|  |                     Excluded(right) => *r < right, | ||||||
|  |                     Unbounded => true, | ||||||
|  |                 } | ||||||
|  |             })) | ||||||
|  |             .map(|r| r.and_then(|(key, lazy)| lazy.decode().map(|data| (key, data)))); | ||||||
|  |  | ||||||
|  |         debug!("Iterating between {:?} and {:?} (level {})", left, right, level); | ||||||
|  |  | ||||||
|  |         for (i, result) in iter.enumerate() { | ||||||
|  |             let ((_fid, level, l, r), docids) = result?; | ||||||
|  |             debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); | ||||||
|  |             output.union_with(&docids); | ||||||
|  |             // We save the leftest and rightest bounds we actually found at this level. | ||||||
|  |             if i == 0 { left_found = Some(l); } | ||||||
|  |             right_found = Some(r); | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         // Can we go deeper? | ||||||
|  |         let deeper_level = match level.checked_sub(1) { | ||||||
|  |             Some(level) => level, | ||||||
|  |             None => return Ok(()), | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         // We must refine the left and right bounds of this range by retrieving the | ||||||
|  |         // missing part in a deeper level. | ||||||
|  |         match left_found.zip(right_found) { | ||||||
|  |             Some((left_found, right_found)) => { | ||||||
|  |                 // If the bound is satisfied we avoid calling this function again. | ||||||
|  |                 if !matches!(left, Included(l) if l == left_found) { | ||||||
|  |                     let sub_right = Excluded(left_found); | ||||||
|  |                     debug!("calling left with {:?} to {:?} (level {})",  left, sub_right, deeper_level); | ||||||
|  |                     Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, left, sub_right, output)?; | ||||||
|  |                 } | ||||||
|  |                 if !matches!(right, Included(r) if r == right_found) { | ||||||
|  |                     let sub_left = Excluded(right_found); | ||||||
|  |                     debug!("calling right with {:?} to {:?} (level {})", sub_left, right, deeper_level); | ||||||
|  |                     Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, sub_left, right, output)?; | ||||||
|  |                 } | ||||||
|  |             }, | ||||||
|  |             None => { | ||||||
|  |                 // If we found nothing at this level it means that we must find | ||||||
|  |                 // the same bounds but at a deeper, more precise level. | ||||||
|  |                 Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, deeper_level, left, right, output)?; | ||||||
|  |             }, | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Returns the ids of the documents whose `field_id` facet value satisfies `operator`. | ||||||
|  |     /// | ||||||
|  |     /// `NotEqual` is computed as the faceted documents of the field minus the `Equal` | ||||||
|  |     /// matches. Every other operator is translated into a pair of bounds which is | ||||||
|  |     /// resolved by `explore_facet_levels`, starting from the biggest level stored for | ||||||
|  |     /// the field. An empty bitmap is returned when nothing is stored for this field. | ||||||
|  |     fn evaluate_number_operator<'t, T: 't, KC>( | ||||||
|  |         rtxn: &'t heed::RoTxn, | ||||||
|  |         index: &Index, | ||||||
|  |         db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||||
|  |         field_id: u8, | ||||||
|  |         operator: FacetNumberOperator<T>, | ||||||
|  |     ) -> anyhow::Result<RoaringBitmap> | ||||||
|  |     where | ||||||
|  |         T: Copy + PartialEq + PartialOrd + Bounded + Debug, | ||||||
|  |         KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, | ||||||
|  |         KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, | ||||||
|  |     { | ||||||
|  |         // Make sure we always bound the ranges with the field id and the level, | ||||||
|  |         // as the facets values are all in the same database and prefixed by the | ||||||
|  |         // field id and the level. | ||||||
|  |         let (lower, upper) = match operator { | ||||||
|  |             NotEqual(val) => { | ||||||
|  |                 // The complement of the `Equal` matches among the faceted documents. | ||||||
|  |                 let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; | ||||||
|  |                 let equal_docids = Self::evaluate_number_operator::<T, KC>(rtxn, index, db, field_id, Equal(val))?; | ||||||
|  |                 return Ok(all_documents_ids - equal_docids); | ||||||
|  |             }, | ||||||
|  |             Equal(val) => (Included(val), Included(val)), | ||||||
|  |             Between(min, max) => (Included(min), Included(max)), | ||||||
|  |             GreaterThan(val) => (Excluded(val), Included(T::max_value())), | ||||||
|  |             GreaterThanOrEqual(val) => (Included(val), Included(T::max_value())), | ||||||
|  |             LowerThan(val) => (Included(T::min_value()), Excluded(val)), | ||||||
|  |             LowerThanOrEqual(val) => (Included(T::min_value()), Included(val)), | ||||||
|  |         }; | ||||||
|  |  | ||||||
|  |         // Ask for the biggest key that can exist for this specific field. If it exists | ||||||
|  |         // that's fine, if it doesn't, the key just before it is returned instead. | ||||||
|  |         let biggest_key = db | ||||||
|  |             .remap_types::<KC, DecodeIgnore>() | ||||||
|  |             .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, T::max_value(), T::max_value()))?; | ||||||
|  |  | ||||||
|  |         let mut docids = RoaringBitmap::new(); | ||||||
|  |         if let Some(((id, level, _, _), _)) = biggest_key { | ||||||
|  |             // The key found may belong to a previous field, in which case there is | ||||||
|  |             // nothing to explore for the requested one. | ||||||
|  |             if id == field_id { | ||||||
|  |                 Self::explore_facet_levels::<T, KC>(rtxn, db, field_id, level, lower, upper, &mut docids)?; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(docids) | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Returns the ids of the documents whose `field_id` string facet satisfies `operator`. | ||||||
|  |     /// | ||||||
|  |     /// `Equal` returns the documents stored under the `(field_id, string)` key, or an | ||||||
|  |     /// empty bitmap when the key is absent. `NotEqual` returns the faceted documents of | ||||||
|  |     /// the field minus those same documents. | ||||||
|  |     /// | ||||||
|  |     /// # Errors | ||||||
|  |     /// | ||||||
|  |     /// Returns any error raised while reading the database or the faceted documents ids. | ||||||
|  |     fn evaluate_string_operator( | ||||||
|  |         rtxn: &heed::RoTxn, | ||||||
|  |         index: &Index, | ||||||
|  |         db: heed::Database<FacetValueStringCodec, CboRoaringBitmapCodec>, | ||||||
|  |         field_id: u8, | ||||||
|  |         operator: &FacetStringOperator, | ||||||
|  |     ) -> anyhow::Result<RoaringBitmap> | ||||||
|  |     { | ||||||
|  |         match operator { | ||||||
|  |             FacetStringOperator::Equal(string) => { | ||||||
|  |                 let docids = db.get(rtxn, &(field_id, string))?; | ||||||
|  |                 Ok(docids.unwrap_or_else(RoaringBitmap::new)) | ||||||
|  |             }, | ||||||
|  |             FacetStringOperator::NotEqual(string) => { | ||||||
|  |                 let all_documents_ids = index.faceted_documents_ids(rtxn, field_id)?; | ||||||
|  |                 // Look the value up directly: no need to clone the string into a | ||||||
|  |                 // temporary `Equal` operator just to read the same entry. | ||||||
|  |                 let docids = db.get(rtxn, &(field_id, string))?.unwrap_or_else(RoaringBitmap::new); | ||||||
|  |                 Ok(all_documents_ids - docids) | ||||||
|  |             }, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Evaluates this condition and returns the ids of the documents that satisfy it. | ||||||
|  |     /// | ||||||
|  |     /// Leaf operators are resolved against the `facet_field_id_value_docids` database of | ||||||
|  |     /// `index`; `Or` and `And` evaluate their left side then their right side and return | ||||||
|  |     /// respectively the union and the intersection of the two results. | ||||||
|  |     /// | ||||||
|  |     /// # Errors | ||||||
|  |     /// | ||||||
|  |     /// Returns the first error raised while evaluating one of the operators. | ||||||
|  |     pub fn evaluate( | ||||||
|  |         &self, | ||||||
|  |         rtxn: &heed::RoTxn, | ||||||
|  |         index: &Index, | ||||||
|  |     ) -> anyhow::Result<RoaringBitmap> | ||||||
|  |     { | ||||||
|  |         let db = index.facet_field_id_value_docids; | ||||||
|  |         match self { | ||||||
|  |             OperatorI64(fid, op) => { | ||||||
|  |                 Self::evaluate_number_operator::<i64, FacetLevelValueI64Codec>(rtxn, index, db, *fid, *op) | ||||||
|  |             }, | ||||||
|  |             OperatorF64(fid, op) => { | ||||||
|  |                 Self::evaluate_number_operator::<f64, FacetLevelValueF64Codec>(rtxn, index, db, *fid, *op) | ||||||
|  |             }, | ||||||
|  |             OperatorString(fid, op) => { | ||||||
|  |                 // String facets share the database but use their own key codec. | ||||||
|  |                 let string_db = db.remap_key_type::<FacetValueStringCodec>(); | ||||||
|  |                 Self::evaluate_string_operator(rtxn, index, string_db, *fid, op) | ||||||
|  |             }, | ||||||
|  |             Or(lhs, rhs) => { | ||||||
|  |                 let mut docids = lhs.evaluate(rtxn, index)?; | ||||||
|  |                 let other = rhs.evaluate(rtxn, index)?; | ||||||
|  |                 docids.union_with(&other); | ||||||
|  |                 Ok(docids) | ||||||
|  |             }, | ||||||
|  |             And(lhs, rhs) => { | ||||||
|  |                 let mut docids = lhs.evaluate(rtxn, index)?; | ||||||
|  |                 let other = rhs.evaluate(rtxn, index)?; | ||||||
|  |                 docids.intersect_with(&other); | ||||||
|  |                 Ok(docids) | ||||||
|  |             }, | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | #[cfg(test)] | ||||||
|  | mod tests { | ||||||
|  |     use super::*; | ||||||
|  |     use crate::update::Settings; | ||||||
|  |     use heed::EnvOpenOptions; | ||||||
|  |     use maplit::hashmap; | ||||||
|  |  | ||||||
|  |     /// A string facet accepts `=`, `!=` and a negated `=`. | ||||||
|  |     #[test] | ||||||
|  |     fn string() { | ||||||
|  |         let path = tempfile::tempdir().unwrap(); | ||||||
|  |         let mut options = EnvOpenOptions::new(); | ||||||
|  |         options.map_size(10 * 1024 * 1024); // 10 MB | ||||||
|  |         let index = Index::new(options, &path).unwrap(); | ||||||
|  |  | ||||||
|  |         // Declare `channel` as a string facet. | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = Settings::new(&mut wtxn, &index); | ||||||
|  |         builder.set_faceted_fields(hashmap!{ "channel".into() => "string".into() }); | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         // Test that the facet condition is correctly generated. | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         let parse = |expression: &str| FacetCondition::from_str(&rtxn, &index, expression).unwrap(); | ||||||
|  |  | ||||||
|  |         let expected = OperatorString(1, FacetStringOperator::Equal("ponce".into())); | ||||||
|  |         assert_eq!(parse("channel = ponce"), expected); | ||||||
|  |  | ||||||
|  |         let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into())); | ||||||
|  |         assert_eq!(parse("channel != ponce"), expected); | ||||||
|  |  | ||||||
|  |         let expected = OperatorString(1, FacetStringOperator::NotEqual("ponce".into())); | ||||||
|  |         assert_eq!(parse("NOT channel = ponce"), expected); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// An integer facet accepts a `TO` range, whose negation is the two outer ranges. | ||||||
|  |     #[test] | ||||||
|  |     fn i64() { | ||||||
|  |         let path = tempfile::tempdir().unwrap(); | ||||||
|  |         let mut options = EnvOpenOptions::new(); | ||||||
|  |         options.map_size(10 * 1024 * 1024); // 10 MB | ||||||
|  |         let index = Index::new(options, &path).unwrap(); | ||||||
|  |  | ||||||
|  |         // Declare `timestamp` as an integer facet. | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = Settings::new(&mut wtxn, &index); | ||||||
|  |         builder.set_faceted_fields(hashmap!{ "timestamp".into() => "integer".into() }); | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         // Test that the facet condition is correctly generated. | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         let parse = |expression: &str| FacetCondition::from_str(&rtxn, &index, expression).unwrap(); | ||||||
|  |  | ||||||
|  |         let expected = OperatorI64(1, Between(22, 44)); | ||||||
|  |         assert_eq!(parse("timestamp 22 TO 44"), expected); | ||||||
|  |  | ||||||
|  |         let expected = Or( | ||||||
|  |             Box::new(OperatorI64(1, LowerThan(22))), | ||||||
|  |             Box::new(OperatorI64(1, GreaterThan(44))), | ||||||
|  |         ); | ||||||
|  |         assert_eq!(parse("NOT timestamp 22 TO 44"), expected); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     /// Parentheses group sub-conditions and `NOT` is pushed down into the group. | ||||||
|  |     #[test] | ||||||
|  |     fn parentheses() { | ||||||
|  |         let path = tempfile::tempdir().unwrap(); | ||||||
|  |         let mut options = EnvOpenOptions::new(); | ||||||
|  |         options.map_size(10 * 1024 * 1024); // 10 MB | ||||||
|  |         let index = Index::new(options, &path).unwrap(); | ||||||
|  |  | ||||||
|  |         // Declare `channel` as a string facet and `timestamp` as an integer facet. | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let mut builder = Settings::new(&mut wtxn, &index); | ||||||
|  |         builder.set_searchable_fields(vec!["channel".into(), "timestamp".into()]); // to keep the fields order | ||||||
|  |         builder.set_faceted_fields(hashmap!{ | ||||||
|  |             "channel".into() => "string".into(), | ||||||
|  |             "timestamp".into() => "integer".into(), | ||||||
|  |         }); | ||||||
|  |         builder.execute(|_| ()).unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         // Test that the facet condition is correctly generated. | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         let parse = |expression: &str| FacetCondition::from_str(&rtxn, &index, expression).unwrap(); | ||||||
|  |  | ||||||
|  |         let expected = Or( | ||||||
|  |             Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))), | ||||||
|  |             Box::new(And( | ||||||
|  |                 Box::new(OperatorI64(1, Between(22, 44))), | ||||||
|  |                 Box::new(OperatorString(0, FacetStringOperator::NotEqual("ponce".into()))), | ||||||
|  |             )) | ||||||
|  |         ); | ||||||
|  |         assert_eq!( | ||||||
|  |             parse("channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)"), | ||||||
|  |             expected, | ||||||
|  |         ); | ||||||
|  |  | ||||||
|  |         let expected = Or( | ||||||
|  |             Box::new(OperatorString(0, FacetStringOperator::Equal("gotaga".into()))), | ||||||
|  |             Box::new(Or( | ||||||
|  |                 Box::new(Or( | ||||||
|  |                     Box::new(OperatorI64(1, LowerThan(22))), | ||||||
|  |                     Box::new(OperatorI64(1, GreaterThan(44))), | ||||||
|  |                 )), | ||||||
|  |                 Box::new(OperatorString(0, FacetStringOperator::Equal("ponce".into()))), | ||||||
|  |             )), | ||||||
|  |         ); | ||||||
|  |         assert_eq!( | ||||||
|  |             parse("channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)"), | ||||||
|  |             expected, | ||||||
|  |         ); | ||||||
|  |     } | ||||||
|  | } | ||||||
							
								
								
									
										12
									
								
								src/search/facet/parser.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								src/search/facet/parser.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,12 @@ | |||||||
|  | use once_cell::sync::Lazy; | ||||||
|  | use pest::prec_climber::{Operator, Assoc, PrecClimber}; | ||||||
|  |  | ||||||
|  | /// Precedence climber used to combine the `or` and `and` rules of the filter grammar. | ||||||
|  | /// | ||||||
|  | /// Both operators are left associative. `or` is declared before `and`: pest's | ||||||
|  | /// `PrecClimber` gives the lowest precedence to the first operator of the list. | ||||||
|  | pub static PREC_CLIMBER: Lazy<PrecClimber<Rule>> = Lazy::new(|| { | ||||||
|  |     let operators = vec![ | ||||||
|  |         Operator::new(Rule::or, Assoc::Left), | ||||||
|  |         Operator::new(Rule::and, Assoc::Left), | ||||||
|  |     ]; | ||||||
|  |     PrecClimber::new(operators) | ||||||
|  | }); | ||||||
|  |  | ||||||
|  | /// Parser of the facet filter expressions, generated by the `Parser` derive | ||||||
|  | /// from the grammar stored in `search/facet/grammar.pest`. | ||||||
|  | #[derive(Parser)] | ||||||
|  | #[grammar = "search/facet/grammar.pest"] | ||||||
|  | pub struct FilterParser; | ||||||
| @@ -1,5 +1,6 @@ | |||||||
| use std::borrow::Cow; | use std::borrow::Cow; | ||||||
| use std::collections::{HashMap, HashSet}; | use std::collections::{HashMap, HashSet}; | ||||||
|  | use std::fmt; | ||||||
| 
 | 
 | ||||||
| use fst::{IntoStreamer, Streamer}; | use fst::{IntoStreamer, Streamer}; | ||||||
| use levenshtein_automata::DFA; | use levenshtein_automata::DFA; | ||||||
| @@ -8,17 +9,22 @@ use log::debug; | |||||||
| use once_cell::sync::Lazy; | use once_cell::sync::Lazy; | ||||||
| use roaring::bitmap::RoaringBitmap; | use roaring::bitmap::RoaringBitmap; | ||||||
| 
 | 
 | ||||||
| use crate::query_tokens::{QueryTokens, QueryToken}; |  | ||||||
| use crate::mdfs::Mdfs; | use crate::mdfs::Mdfs; | ||||||
|  | use crate::query_tokens::{QueryTokens, QueryToken}; | ||||||
| use crate::{Index, DocumentId}; | use crate::{Index, DocumentId}; | ||||||
| 
 | 
 | ||||||
|  | pub use self::facet::FacetCondition; | ||||||
|  | 
 | ||||||
| // Building these factories is not free.
 | // Building these factories is not free.
 | ||||||
| static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); | static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); | ||||||
| static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true)); | static LEVDIST1: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(1, true)); | ||||||
| static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true)); | static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true)); | ||||||
| 
 | 
 | ||||||
|  | mod facet; | ||||||
|  | 
 | ||||||
| pub struct Search<'a> { | pub struct Search<'a> { | ||||||
|     query: Option<String>, |     query: Option<String>, | ||||||
|  |     facet_condition: Option<FacetCondition>, | ||||||
|     offset: usize, |     offset: usize, | ||||||
|     limit: usize, |     limit: usize, | ||||||
|     rtxn: &'a heed::RoTxn<'a>, |     rtxn: &'a heed::RoTxn<'a>, | ||||||
| @@ -27,7 +33,7 @@ pub struct Search<'a> { | |||||||
| 
 | 
 | ||||||
| impl<'a> Search<'a> { | impl<'a> Search<'a> { | ||||||
|     pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { |     pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { | ||||||
|         Search { query: None, offset: 0, limit: 20, rtxn, index } |         Search { query: None, facet_condition: None, offset: 0, limit: 20, rtxn, index } | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     pub fn query(&mut self, query: impl Into<String>) -> &mut Search<'a> { |     pub fn query(&mut self, query: impl Into<String>) -> &mut Search<'a> { | ||||||
| @@ -45,6 +51,11 @@ impl<'a> Search<'a> { | |||||||
|         self |         self | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|  |     /// Sets the facet condition of this search, replacing any previously set one, | ||||||
|  |     /// and returns `self` so that the calls can be chained. | ||||||
|  |     pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { | ||||||
|  |         self.facet_condition = Some(condition); | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|     /// Extracts the query words from the query string and returns the DFAs accordingly.
 |     /// Extracts the query words from the query string and returns the DFAs accordingly.
 | ||||||
|     /// TODO introduce settings for the number of typos regarding the words lengths.
 |     /// TODO introduce settings for the number of typos regarding the words lengths.
 | ||||||
|     fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { |     fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> { | ||||||
| @@ -135,22 +146,44 @@ impl<'a> Search<'a> { | |||||||
| 
 | 
 | ||||||
|     pub fn execute(&self) -> anyhow::Result<SearchResult> { |     pub fn execute(&self) -> anyhow::Result<SearchResult> { | ||||||
|         let limit = self.limit; |         let limit = self.limit; | ||||||
| 
 |  | ||||||
|         let fst = self.index.words_fst(self.rtxn)?; |         let fst = self.index.words_fst(self.rtxn)?; | ||||||
| 
 | 
 | ||||||
|         // Construct the DFAs related to the query words.
 |         // Construct the DFAs related to the query words.
 | ||||||
|         let dfas = match self.query.as_deref().map(Self::generate_query_dfas) { |         let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) { | ||||||
|             Some(dfas) if !dfas.is_empty() => dfas, |             Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?), | ||||||
|             _ => { |             _otherwise => None, | ||||||
|  |         }; | ||||||
|  | 
 | ||||||
|  |         // We create the original candidates with the facet conditions results.
 | ||||||
|  |         let facet_candidates = match &self.facet_condition { | ||||||
|  |             Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?), | ||||||
|  |             None => None, | ||||||
|  |         }; | ||||||
|  | 
 | ||||||
|  |         debug!("facet candidates: {:?}", facet_candidates); | ||||||
|  | 
 | ||||||
|  |         let (candidates, derived_words) = match (facet_candidates, derived_words) { | ||||||
|  |             (Some(mut facet_candidates), Some(derived_words)) => { | ||||||
|  |                 let words_candidates = Self::compute_candidates(&derived_words); | ||||||
|  |                 facet_candidates.intersect_with(&words_candidates); | ||||||
|  |                 (facet_candidates, derived_words) | ||||||
|  |             }, | ||||||
|  |             (None, Some(derived_words)) => { | ||||||
|  |                 (Self::compute_candidates(&derived_words), derived_words) | ||||||
|  |             }, | ||||||
|  |             (Some(facet_candidates), None) => { | ||||||
|  |                 // If the query is not set or results in no DFAs but
 | ||||||
|  |                 // there is some facet conditions we return a placeholder.
 | ||||||
|  |                 let documents_ids = facet_candidates.iter().take(limit).collect(); | ||||||
|  |                 return Ok(SearchResult { documents_ids, ..Default::default() }) | ||||||
|  |             }, | ||||||
|  |             (None, None) => { | ||||||
|                 // If the query is not set or results in no DFAs we return a placeholder.
 |                 // If the query is not set or results in no DFAs we return a placeholder.
 | ||||||
|                 let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect(); |                 let documents_ids = self.index.documents_ids(self.rtxn)?.iter().take(limit).collect(); | ||||||
|                 return Ok(SearchResult { documents_ids, ..Default::default() }) |                 return Ok(SearchResult { documents_ids, ..Default::default() }) | ||||||
|             }, |             }, | ||||||
|         }; |         }; | ||||||
| 
 | 
 | ||||||
|         let derived_words = self.fetch_words_docids(&fst, dfas)?; |  | ||||||
|         let candidates = Self::compute_candidates(&derived_words); |  | ||||||
| 
 |  | ||||||
|         debug!("candidates: {:?}", candidates); |         debug!("candidates: {:?}", candidates); | ||||||
| 
 | 
 | ||||||
|         // The mana depth first search is a revised DFS that explore
 |         // The mana depth first search is a revised DFS that explore
 | ||||||
| @@ -175,6 +208,18 @@ impl<'a> Search<'a> { | |||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /// Formats the search parameters only: `query`, `facet_condition`, `offset` and `limit`. | ||||||
|  | impl fmt::Debug for Search<'_> { | ||||||
|  |     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||||||
|  |         // Every field is named in the pattern so that `rtxn` and `index` are | ||||||
|  |         // explicitly left out of the output. | ||||||
|  |         let Search { query, facet_condition, offset, limit, rtxn: _, index: _ } = self; | ||||||
|  |         f.debug_struct("Search") | ||||||
|  |             .field("query", query) | ||||||
|  |             .field("facet_condition", facet_condition) | ||||||
|  |             .field("offset", offset) | ||||||
|  |             .field("limit", limit) | ||||||
|  |             .finish() | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| #[derive(Default)] | #[derive(Default)] | ||||||
| pub struct SearchResult { | pub struct SearchResult { | ||||||
|     pub found_words: HashSet<String>, |     pub found_words: HashSet<String>, | ||||||
| @@ -1,10 +1,10 @@ | |||||||
| use std::path::PathBuf; | use std::path::PathBuf; | ||||||
| use std::{str, io}; | use std::{str, io, fmt}; | ||||||
|  |  | ||||||
| use anyhow::Context; | use anyhow::Context; | ||||||
| use crate::Index; |  | ||||||
| use heed::EnvOpenOptions; | use heed::EnvOpenOptions; | ||||||
| use structopt::StructOpt; | use structopt::StructOpt; | ||||||
|  | use crate::Index; | ||||||
|  |  | ||||||
| use Command::*; | use Command::*; | ||||||
|  |  | ||||||
| @@ -89,6 +89,12 @@ enum Command { | |||||||
|         field_name: String, |         field_name: String, | ||||||
|     }, |     }, | ||||||
|  |  | ||||||
|  |     /// Outputs some facets statistics for the given facet name. | ||||||
|  |     FacetStats { | ||||||
|  |         /// The field name in the document. | ||||||
|  |         field_name: String, | ||||||
|  |     }, | ||||||
|  |  | ||||||
|     /// Outputs the total size of all the docid-word-positions keys and values. |     /// Outputs the total size of all the docid-word-positions keys and values. | ||||||
|     TotalDocidWordPositionsSize, |     TotalDocidWordPositionsSize, | ||||||
|  |  | ||||||
| @@ -165,6 +171,7 @@ pub fn run(opt: Opt) -> anyhow::Result<()> { | |||||||
|         FacetValuesDocids { full_display, field_name } => { |         FacetValuesDocids { full_display, field_name } => { | ||||||
|             facet_values_docids(&index, &rtxn, !full_display, field_name) |             facet_values_docids(&index, &rtxn, !full_display, field_name) | ||||||
|         }, |         }, | ||||||
|  |         FacetStats { field_name } => facet_stats(&index, &rtxn, field_name), | ||||||
|         TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), |         TotalDocidWordPositionsSize => total_docid_word_positions_size(&index, &rtxn), | ||||||
|         AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), |         AverageNumberOfWordsByDoc => average_number_of_words_by_doc(&index, &rtxn), | ||||||
|         AverageNumberOfPositionsByWord => { |         AverageNumberOfPositionsByWord => { | ||||||
| @@ -225,46 +232,140 @@ fn most_common_words(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow: | |||||||
|     Ok(wtr.flush()?) |     Ok(wtr.flush()?) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | /// Iterates over the facet entries stored under `field_id`, converting every key | ||||||
|  | /// into a single type `T` that can be used for logging or display purposes. | ||||||
|  | /// | ||||||
|  | /// The entries are read with a prefix iteration on `[field_id]` and the keys are | ||||||
|  | /// decoded according to `facet_type`: string keys are given to `string_fn`, while | ||||||
|  | /// float and integer keys are given, along with their level, to `float_fn` and | ||||||
|  | /// `integer_fn` respectively. Each item pairs the converted key with the value | ||||||
|  | /// decoded by `DC`. | ||||||
|  | fn facet_values_iter<'txn, DC: 'txn, T>( | ||||||
|  |     rtxn: &'txn heed::RoTxn, | ||||||
|  |     db: heed::Database<heed::types::ByteSlice, DC>, | ||||||
|  |     field_id: u8, | ||||||
|  |     facet_type: crate::facet::FacetType, | ||||||
|  |     string_fn: impl Fn(&str) -> T + 'txn, | ||||||
|  |     float_fn: impl Fn(u8, f64, f64) -> T + 'txn, | ||||||
|  |     integer_fn: impl Fn(u8, i64, i64) -> T + 'txn, | ||||||
|  | ) -> heed::Result<Box<dyn Iterator<Item=heed::Result<(T, DC::DItem)>> + 'txn>> | ||||||
|  | where | ||||||
|  |     DC: heed::BytesDecode<'txn>, | ||||||
|  | { | ||||||
|  |     use crate::facet::FacetType; | ||||||
|  |     use crate::heed_codec::facet::{ | ||||||
|  |         FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec, | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     let entries = db.prefix_iter(rtxn, &[field_id])?; | ||||||
|  |  | ||||||
|  |     // Each arm yields a different concrete iterator, hence the boxed trait object. | ||||||
|  |     let converted: Box<dyn Iterator<Item=heed::Result<(T, DC::DItem)>> + 'txn> = match facet_type { | ||||||
|  |         FacetType::String => Box::new( | ||||||
|  |             entries | ||||||
|  |                 .remap_key_type::<FacetValueStringCodec>() | ||||||
|  |                 .map(move |r| r.map(|((_, key), value)| (string_fn(key), value))) | ||||||
|  |         ), | ||||||
|  |         FacetType::Float => Box::new( | ||||||
|  |             entries | ||||||
|  |                 .remap_key_type::<FacetLevelValueF64Codec>() | ||||||
|  |                 .map(move |r| r.map(|((_, level, left, right), value)| { | ||||||
|  |                     (float_fn(level, left, right), value) | ||||||
|  |                 })) | ||||||
|  |         ), | ||||||
|  |         FacetType::Integer => Box::new( | ||||||
|  |             entries | ||||||
|  |                 .remap_key_type::<FacetLevelValueI64Codec>() | ||||||
|  |                 .map(move |r| r.map(|((_, level, left, right), value)| { | ||||||
|  |                     (integer_fn(level, left, right), value) | ||||||
|  |                 })) | ||||||
|  |         ), | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     Ok(converted) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Renders a facet number key for display. | ||||||
|  | /// | ||||||
|  | /// At level 0 only `left` is printed; at any other level the key is printed | ||||||
|  | /// as the `left to right` range. The level is always appended. | ||||||
|  | fn facet_number_value_to_string<T: fmt::Debug>(level: u8, left: T, right: T) -> String { | ||||||
|  |     match level { | ||||||
|  |         0 => format!("{:?} (level {})", left, level), | ||||||
|  |         _ => format!("{:?} to {:?} (level {})", left, right, level), | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
| fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { | fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyhow::Result<()> { | ||||||
|     use std::cmp::Reverse; |     use std::cmp::Reverse; | ||||||
|     use std::collections::BinaryHeap; |     use std::collections::BinaryHeap; | ||||||
|     use heed::types::{Str, ByteSlice}; |     use heed::types::{Str, ByteSlice}; | ||||||
|     use crate::heed_codec::BEU32StrCodec; |  | ||||||
|  |     let Index { | ||||||
|  |         env: _env, | ||||||
|  |         main, | ||||||
|  |         word_docids, | ||||||
|  |         docid_word_positions, | ||||||
|  |         word_pair_proximity_docids, | ||||||
|  |         facet_field_id_value_docids, | ||||||
|  |         documents, | ||||||
|  |     } = index; | ||||||
|  |  | ||||||
|     let main_name = "main"; |     let main_name = "main"; | ||||||
|     let word_docids_name = "word_docids"; |     let word_docids_name = "word_docids"; | ||||||
|     let docid_word_positions_name = "docid_word_positions"; |     let docid_word_positions_name = "docid_word_positions"; | ||||||
|  |     let word_pair_proximity_docids_name = "word_pair_proximity_docids"; | ||||||
|  |     let facet_field_id_value_docids_name = "facet_field_id_value_docids"; | ||||||
|  |     let documents_name = "documents"; | ||||||
|  |  | ||||||
|     let mut heap = BinaryHeap::with_capacity(limit + 1); |     let mut heap = BinaryHeap::with_capacity(limit + 1); | ||||||
|  |  | ||||||
|     if limit > 0 { |     if limit > 0 { | ||||||
|         let words_fst = index.words_fst(rtxn)?; |         let words_fst = index.words_fst(rtxn)?; | ||||||
|  |  | ||||||
|         heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name))); |         heap.push(Reverse((words_fst.as_fst().as_bytes().len(), format!("words-fst"), main_name))); | ||||||
|         if heap.len() > limit { heap.pop(); } |         if heap.len() > limit { heap.pop(); } | ||||||
|  |  | ||||||
|         if let Some(documents) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents")? { |         if let Some(documents_ids) = main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { | ||||||
|             heap.push(Reverse((documents.len(), format!("documents"), main_name))); |  | ||||||
|             if heap.len() > limit { heap.pop(); } |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|         if let Some(documents_ids) = index.main.get::<_, Str, ByteSlice>(rtxn, "documents-ids")? { |  | ||||||
|             heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); |             heap.push(Reverse((documents_ids.len(), format!("documents-ids"), main_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { heap.pop(); } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in index.word_docids.as_polymorph().iter::<_, Str, ByteSlice>(rtxn)? { |         for result in word_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let (word, value) = result?; |             let (word, value) = result?; | ||||||
|             heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); |             heap.push(Reverse((value.len(), word.to_string(), word_docids_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { heap.pop(); } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         for result in index.docid_word_positions.as_polymorph().iter::<_, BEU32StrCodec, ByteSlice>(rtxn)? { |         for result in docid_word_positions.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|             let ((docid, word), value) = result?; |             let ((docid, word), value) = result?; | ||||||
|             let key = format!("{} {}", docid, word); |             let key = format!("{} {}", docid, word); | ||||||
|             heap.push(Reverse((value.len(), key, docid_word_positions_name))); |             heap.push(Reverse((value.len(), key, docid_word_positions_name))); | ||||||
|             if heap.len() > limit { heap.pop(); } |             if heap.len() > limit { heap.pop(); } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |         for result in word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|  |             let ((word1, word2, prox), value) = result?; | ||||||
|  |             let key = format!("{} {} {}", word1, word2, prox); | ||||||
|  |             heap.push(Reverse((value.len(), key, word_pair_proximity_docids_name))); | ||||||
|  |             if heap.len() > limit { heap.pop(); } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         let faceted_fields = index.faceted_fields(rtxn)?; | ||||||
|  |         let fields_ids_map = index.fields_ids_map(rtxn)?; | ||||||
|  |         for (field_id, field_type) in faceted_fields { | ||||||
|  |             let facet_name = fields_ids_map.name(field_id).unwrap(); | ||||||
|  |  | ||||||
|  |             let db = facet_field_id_value_docids.remap_data_type::<ByteSlice>(); | ||||||
|  |             let iter = facet_values_iter( | ||||||
|  |                 rtxn, | ||||||
|  |                 db, | ||||||
|  |                 field_id, | ||||||
|  |                 field_type, | ||||||
|  |                 |key| key.to_owned(), | ||||||
|  |                 facet_number_value_to_string, | ||||||
|  |                 facet_number_value_to_string, | ||||||
|  |             )?; | ||||||
|  |  | ||||||
|  |             for result in iter { | ||||||
|  |                 let (fvalue, value) = result?; | ||||||
|  |                 let key = format!("{} {}", facet_name, fvalue); | ||||||
|  |                 heap.push(Reverse((value.len(), key, facet_field_id_value_docids_name))); | ||||||
|  |                 if heap.len() > limit { heap.pop(); } | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         for result in documents.remap_data_type::<ByteSlice>().iter(rtxn)? { | ||||||
|  |             let (id, value) = result?; | ||||||
|  |             heap.push(Reverse((value.len(), id.to_string(), documents_name))); | ||||||
|  |             if heap.len() > limit { heap.pop(); } | ||||||
|  |         } | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     let stdout = io::stdout(); |     let stdout = io::stdout(); | ||||||
| @@ -298,10 +399,6 @@ fn words_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, words: Vec<Strin | |||||||
| } | } | ||||||
|  |  | ||||||
| fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> { | fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_name: String) -> anyhow::Result<()> { | ||||||
|     use crate::facet::FacetType; |  | ||||||
|     use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; |  | ||||||
|     use heed::{BytesDecode, Error::Decoding}; |  | ||||||
|  |  | ||||||
|     let fields_ids_map = index.fields_ids_map(&rtxn)?; |     let fields_ids_map = index.fields_ids_map(&rtxn)?; | ||||||
|     let faceted_fields = index.faceted_fields(&rtxn)?; |     let faceted_fields = index.faceted_fields(&rtxn)?; | ||||||
|  |  | ||||||
| @@ -310,51 +407,78 @@ fn facet_values_docids(index: &Index, rtxn: &heed::RoTxn, debug: bool, field_nam | |||||||
|     let field_type = faceted_fields.get(&field_id) |     let field_type = faceted_fields.get(&field_id) | ||||||
|         .with_context(|| format!("field {} is not faceted", field_name))?; |         .with_context(|| format!("field {} is not faceted", field_name))?; | ||||||
|  |  | ||||||
|     let iter = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[field_id])?; |  | ||||||
|     let iter = match field_type { |  | ||||||
|         FacetType::String => { |  | ||||||
|             let iter = iter |  | ||||||
|                 .map(|result| result.and_then(|(key, value)| { |  | ||||||
|                     let (_, key) = FacetValueStringCodec::bytes_decode(key).ok_or(Decoding)?; |  | ||||||
|                     Ok((key.to_string(), value)) |  | ||||||
|                 })); |  | ||||||
|             Box::new(iter) as Box<dyn Iterator<Item=_>> |  | ||||||
|         }, |  | ||||||
|         FacetType::Float => { |  | ||||||
|             let iter = iter |  | ||||||
|                 .map(|result| result.and_then(|(key, value)| { |  | ||||||
|                     let (_, key) = FacetValueF64Codec::bytes_decode(key).ok_or(Decoding)?; |  | ||||||
|                     Ok((key.to_string(), value)) |  | ||||||
|                 })); |  | ||||||
|             Box::new(iter) |  | ||||||
|         }, |  | ||||||
|         FacetType::Integer => { |  | ||||||
|             let iter = iter |  | ||||||
|                 .map(|result| result.and_then(|(key, value)| { |  | ||||||
|                     let (_, key) = FacetValueI64Codec::bytes_decode(key).ok_or(Decoding)?; |  | ||||||
|                     Ok((key.to_string(), value)) |  | ||||||
|                 })); |  | ||||||
|             Box::new(iter) |  | ||||||
|         }, |  | ||||||
|     }; |  | ||||||
|  |  | ||||||
|     let stdout = io::stdout(); |     let stdout = io::stdout(); | ||||||
|     let mut wtr = csv::Writer::from_writer(stdout.lock()); |     let mut wtr = csv::Writer::from_writer(stdout.lock()); | ||||||
|     wtr.write_record(&["facet_value", "documents_ids"])?; |     wtr.write_record(&["facet_value", "documents_count", "documents_ids"])?; | ||||||
|  |  | ||||||
|  |     let db = index.facet_field_id_value_docids; | ||||||
|  |     let iter = facet_values_iter( | ||||||
|  |         rtxn, | ||||||
|  |         db, | ||||||
|  |         field_id, | ||||||
|  |         *field_type, | ||||||
|  |         |key| key.to_owned(), | ||||||
|  |         facet_number_value_to_string, | ||||||
|  |         facet_number_value_to_string, | ||||||
|  |     )?; | ||||||
|  |  | ||||||
|     for result in iter { |     for result in iter { | ||||||
|         let (value, docids) = result?; |         let (value, docids) = result?; | ||||||
|  |         let count = docids.len(); | ||||||
|         let docids = if debug { |         let docids = if debug { | ||||||
|             format!("{:?}", docids) |             format!("{:?}", docids) | ||||||
|         } else { |         } else { | ||||||
|             format!("{:?}", docids.iter().collect::<Vec<_>>()) |             format!("{:?}", docids.iter().collect::<Vec<_>>()) | ||||||
|         }; |         }; | ||||||
|         wtr.write_record(&[value, docids])?; |         wtr.write_record(&[value, count.to_string(), docids])?; | ||||||
|     } |     } | ||||||
|  |  | ||||||
|     Ok(wtr.flush()?) |     Ok(wtr.flush()?) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | fn facet_stats(index: &Index, rtxn: &heed::RoTxn, field_name: String) -> anyhow::Result<()> { | ||||||
|  |     let fields_ids_map = index.fields_ids_map(&rtxn)?; | ||||||
|  |     let faceted_fields = index.faceted_fields(&rtxn)?; | ||||||
|  |  | ||||||
|  |     let field_id = fields_ids_map.id(&field_name) | ||||||
|  |         .with_context(|| format!("field {} not found", field_name))?; | ||||||
|  |     let field_type = faceted_fields.get(&field_id) | ||||||
|  |         .with_context(|| format!("field {} is not faceted", field_name))?; | ||||||
|  |  | ||||||
|  |     let db = index.facet_field_id_value_docids; | ||||||
|  |     let iter = facet_values_iter( | ||||||
|  |         rtxn, | ||||||
|  |         db, | ||||||
|  |         field_id, | ||||||
|  |         *field_type, | ||||||
|  |         |_key| 0u8, | ||||||
|  |         |level, _left, _right| level, | ||||||
|  |         |level, _left, _right| level, | ||||||
|  |     )?; | ||||||
|  |  | ||||||
|  |     println!("The database {:?} facet stats", field_name); | ||||||
|  |  | ||||||
|  |     let mut level_size = 0; | ||||||
|  |     let mut current_level = None; | ||||||
|  |     for result in iter { | ||||||
|  |         let (level, _) = result?; | ||||||
|  |         if let Some(current) = current_level { | ||||||
|  |             if current != level { | ||||||
|  |                 println!("\tnumber of groups at level {}: {}", current, level_size); | ||||||
|  |                 level_size = 0; | ||||||
|  |             } | ||||||
|  |         } | ||||||
|  |         current_level = Some(level); | ||||||
|  |         level_size += 1; | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     if let Some(current) = current_level { | ||||||
|  |         println!("\tnumber of groups at level {}: {}", current, level_size); | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     Ok(()) | ||||||
|  | } | ||||||
|  |  | ||||||
| fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> { | fn export_words_fst(index: &Index, rtxn: &heed::RoTxn, output: PathBuf) -> anyhow::Result<()> { | ||||||
|     use std::fs::File; |     use std::fs::File; | ||||||
|     use std::io::Write as _; |     use std::io::Write as _; | ||||||
|   | |||||||
| @@ -24,12 +24,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { | |||||||
|  |  | ||||||
|         // We retrieve the number of documents ids that we are deleting. |         // We retrieve the number of documents ids that we are deleting. | ||||||
|         let number_of_documents = self.index.number_of_documents(self.wtxn)?; |         let number_of_documents = self.index.number_of_documents(self.wtxn)?; | ||||||
|  |         let faceted_fields = self.index.faceted_fields(self.wtxn)?; | ||||||
|  |  | ||||||
|         // We clean some of the main engine datastructures. |         // We clean some of the main engine datastructures. | ||||||
|         self.index.put_words_fst(self.wtxn, &fst::Set::default())?; |         self.index.put_words_fst(self.wtxn, &fst::Set::default())?; | ||||||
|         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; |         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; | ||||||
|         self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; |         self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; | ||||||
|  |  | ||||||
|  |         // We clean all the faceted documents ids. | ||||||
|  |         for (field_id, _) in faceted_fields { | ||||||
|  |             self.index.put_faceted_documents_ids(self.wtxn, field_id, &RoaringBitmap::default())?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|         // Clear the other databases. |         // Clear the other databases. | ||||||
|         word_docids.clear(self.wtxn)?; |         word_docids.clear(self.wtxn)?; | ||||||
|         docid_word_positions.clear(self.wtxn)?; |         docid_word_positions.clear(self.wtxn)?; | ||||||
|   | |||||||
| @@ -1,4 +1,5 @@ | |||||||
| use fst::IntoStreamer; | use fst::IntoStreamer; | ||||||
|  | use heed::types::ByteSlice; | ||||||
| use roaring::RoaringBitmap; | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
| use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; | use crate::{Index, BEU32, SmallString32, ExternalDocumentsIds}; | ||||||
| @@ -132,11 +133,12 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|             let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; |             let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; | ||||||
|             if let Some((key, mut docids)) = iter.next().transpose()? { |             if let Some((key, mut docids)) = iter.next().transpose()? { | ||||||
|                 if key == word.as_ref() { |                 if key == word.as_ref() { | ||||||
|  |                     let previous_len = docids.len(); | ||||||
|                     docids.difference_with(&self.documents_ids); |                     docids.difference_with(&self.documents_ids); | ||||||
|                     if docids.is_empty() { |                     if docids.is_empty() { | ||||||
|                         iter.del_current()?; |                         iter.del_current()?; | ||||||
|                         *must_remove = true; |                         *must_remove = true; | ||||||
|                     } else { |                     } else if docids.len() != previous_len { | ||||||
|                         iter.put_current(key, &docids)?; |                         iter.put_current(key, &docids)?; | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
| @@ -168,27 +170,37 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { | |||||||
|         // We delete the documents ids that are under the pairs of words, |         // We delete the documents ids that are under the pairs of words, | ||||||
|         // it is faster and use no memory to iterate over all the words pairs than |         // it is faster and use no memory to iterate over all the words pairs than | ||||||
|         // to compute the cartesian product of every words of the deleted documents. |         // to compute the cartesian product of every words of the deleted documents. | ||||||
|         let mut iter = word_pair_proximity_docids.iter_mut(self.wtxn)?; |         let mut iter = word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?; | ||||||
|         while let Some(result) = iter.next() { |         while let Some(result) = iter.next() { | ||||||
|             let ((w1, w2, prox), mut docids) = result?; |             let (bytes, mut docids) = result?; | ||||||
|  |             let previous_len = docids.len(); | ||||||
|             docids.difference_with(&self.documents_ids); |             docids.difference_with(&self.documents_ids); | ||||||
|             if docids.is_empty() { |             if docids.is_empty() { | ||||||
|                 iter.del_current()?; |                 iter.del_current()?; | ||||||
|             } else { |             } else if docids.len() != previous_len { | ||||||
|                 iter.put_current(&(w1, w2, prox), &docids)?; |                 iter.put_current(bytes, &docids)?; | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         drop(iter); |         drop(iter); | ||||||
|  |  | ||||||
|  |         // Remove the documents ids from the faceted documents ids. | ||||||
|  |         let faceted_fields = self.index.faceted_fields(self.wtxn)?; | ||||||
|  |         for (field_id, _) in faceted_fields { | ||||||
|  |             let mut docids = self.index.faceted_documents_ids(self.wtxn, field_id)?; | ||||||
|  |             docids.difference_with(&self.documents_ids); | ||||||
|  |             self.index.put_faceted_documents_ids(self.wtxn, field_id, &docids)?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|         // We delete the documents ids that are under the facet field id values. |         // We delete the documents ids that are under the facet field id values. | ||||||
|         let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?; |         let mut iter = facet_field_id_value_docids.iter_mut(self.wtxn)?; | ||||||
|         while let Some(result) = iter.next() { |         while let Some(result) = iter.next() { | ||||||
|             let (bytes, mut docids) = result?; |             let (bytes, mut docids) = result?; | ||||||
|  |             let previous_len = docids.len(); | ||||||
|             docids.difference_with(&self.documents_ids); |             docids.difference_with(&self.documents_ids); | ||||||
|             if docids.is_empty() { |             if docids.is_empty() { | ||||||
|                 iter.del_current()?; |                 iter.del_current()?; | ||||||
|             } else { |             } else if docids.len() != previous_len { | ||||||
|                 iter.put_current(bytes, &docids)?; |                 iter.put_current(bytes, &docids)?; | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|   | |||||||
							
								
								
									
										256
									
								
								src/update/facets.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										256
									
								
								src/update/facets.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,256 @@ | |||||||
|  | use std::cmp; | ||||||
|  | use std::fs::File; | ||||||
|  | use std::num::NonZeroUsize; | ||||||
|  |  | ||||||
|  | use grenad::{CompressionType, Reader, Writer, FileFuse}; | ||||||
|  | use heed::types::{ByteSlice, DecodeIgnore}; | ||||||
|  | use heed::{BytesEncode, Error}; | ||||||
|  | use log::debug; | ||||||
|  | use num_traits::{Bounded, Zero}; | ||||||
|  | use roaring::RoaringBitmap; | ||||||
|  |  | ||||||
|  | use crate::facet::FacetType; | ||||||
|  | use crate::heed_codec::CboRoaringBitmapCodec; | ||||||
|  | use crate::heed_codec::facet::{FacetLevelValueI64Codec, FacetLevelValueF64Codec}; | ||||||
|  | use crate::Index; | ||||||
|  | use crate::update::index_documents::WriteMethod; | ||||||
|  | use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database}; | ||||||
|  |  | ||||||
|  | pub struct Facets<'t, 'u, 'i> { | ||||||
|  |     wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|  |     index: &'i Index, | ||||||
|  |     pub(crate) chunk_compression_type: CompressionType, | ||||||
|  |     pub(crate) chunk_compression_level: Option<u32>, | ||||||
|  |     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||||
|  |     level_group_size: NonZeroUsize, | ||||||
|  |     min_level_size: NonZeroUsize, | ||||||
|  | } | ||||||
|  |  | ||||||
|  | impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { | ||||||
|  |     pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> { | ||||||
|  |         Facets { | ||||||
|  |             wtxn, | ||||||
|  |             index, | ||||||
|  |             chunk_compression_type: CompressionType::None, | ||||||
|  |             chunk_compression_level: None, | ||||||
|  |             chunk_fusing_shrink_size: None, | ||||||
|  |             level_group_size: NonZeroUsize::new(4).unwrap(), | ||||||
|  |             min_level_size: NonZeroUsize::new(5).unwrap(), | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { | ||||||
|  |         self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { | ||||||
|  |         self.min_level_size = value; | ||||||
|  |         self | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     pub fn execute(self) -> anyhow::Result<()> { | ||||||
|  |         // We get the faceted fields to be able to create the facet levels. | ||||||
|  |         let faceted_fields = self.index.faceted_fields(self.wtxn)?; | ||||||
|  |  | ||||||
|  |         debug!("Computing and writing the facet values levels docids into LMDB on disk..."); | ||||||
|  |         for (field_id, facet_type) in faceted_fields { | ||||||
|  |             let (content, documents_ids) = match facet_type { | ||||||
|  |                 FacetType::Integer => { | ||||||
|  |                     clear_field_levels::<i64, FacetLevelValueI64Codec>( | ||||||
|  |                         self.wtxn, | ||||||
|  |                         self.index.facet_field_id_value_docids, | ||||||
|  |                         field_id, | ||||||
|  |                     )?; | ||||||
|  |  | ||||||
|  |                     let documents_ids = compute_faceted_documents_ids( | ||||||
|  |                         self.wtxn, | ||||||
|  |                         self.index.facet_field_id_value_docids, | ||||||
|  |                         field_id, | ||||||
|  |                     )?; | ||||||
|  |  | ||||||
|  |                     let content = compute_facet_levels::<i64, FacetLevelValueI64Codec>( | ||||||
|  |                         self.wtxn, | ||||||
|  |                         self.index.facet_field_id_value_docids, | ||||||
|  |                         self.chunk_compression_type, | ||||||
|  |                         self.chunk_compression_level, | ||||||
|  |                         self.chunk_fusing_shrink_size, | ||||||
|  |                         self.level_group_size, | ||||||
|  |                         self.min_level_size, | ||||||
|  |                         field_id, | ||||||
|  |                     )?; | ||||||
|  |  | ||||||
|  |                     (Some(content), documents_ids) | ||||||
|  |                 }, | ||||||
|  |                 FacetType::Float => { | ||||||
|  |                     clear_field_levels::<f64, FacetLevelValueF64Codec>( | ||||||
|  |                         self.wtxn, | ||||||
|  |                         self.index.facet_field_id_value_docids, | ||||||
|  |                         field_id, | ||||||
|  |                     )?; | ||||||
|  |  | ||||||
|  |                     let documents_ids = compute_faceted_documents_ids( | ||||||
|  |                         self.wtxn, | ||||||
|  |                         self.index.facet_field_id_value_docids, | ||||||
|  |                         field_id, | ||||||
|  |                     )?; | ||||||
|  |  | ||||||
|  |                     let content = compute_facet_levels::<f64, FacetLevelValueF64Codec>( | ||||||
|  |                         self.wtxn, | ||||||
|  |                         self.index.facet_field_id_value_docids, | ||||||
|  |                         self.chunk_compression_type, | ||||||
|  |                         self.chunk_compression_level, | ||||||
|  |                         self.chunk_fusing_shrink_size, | ||||||
|  |                         self.level_group_size, | ||||||
|  |                         self.min_level_size, | ||||||
|  |                         field_id, | ||||||
|  |                     )?; | ||||||
|  |  | ||||||
|  |                     (Some(content), documents_ids) | ||||||
|  |                 }, | ||||||
|  |                 FacetType::String => { | ||||||
|  |                     let documents_ids = compute_faceted_documents_ids( | ||||||
|  |                         self.wtxn, | ||||||
|  |                         self.index.facet_field_id_value_docids, | ||||||
|  |                         field_id, | ||||||
|  |                     )?; | ||||||
|  |  | ||||||
|  |                     (None, documents_ids) | ||||||
|  |                 }, | ||||||
|  |             }; | ||||||
|  |  | ||||||
|  |             if let Some(content) = content { | ||||||
|  |                 write_into_lmdb_database( | ||||||
|  |                     self.wtxn, | ||||||
|  |                     *self.index.facet_field_id_value_docids.as_polymorph(), | ||||||
|  |                     content, | ||||||
|  |                     |_, _| anyhow::bail!("invalid facet level merging"), | ||||||
|  |                     WriteMethod::GetMergePut, | ||||||
|  |                 )?; | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             self.index.put_faceted_documents_ids(self.wtxn, field_id, &documents_ids)?; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         Ok(()) | ||||||
|  |     } | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn clear_field_levels<'t, T: 't, KC>( | ||||||
|  |     wtxn: &'t mut heed::RwTxn, | ||||||
|  |     db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||||
|  |     field_id: u8, | ||||||
|  | ) -> heed::Result<()> | ||||||
|  | where | ||||||
|  |     T: Copy + Bounded, | ||||||
|  |     KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, | ||||||
|  |     KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, | ||||||
|  | { | ||||||
|  |     let left = (field_id, 1, T::min_value(), T::min_value()); | ||||||
|  |     let right = (field_id, u8::MAX, T::max_value(), T::max_value()); | ||||||
|  |     let range = left..=right; | ||||||
|  |     db.remap_key_type::<KC>().delete_range(wtxn, &range).map(drop) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | fn compute_facet_levels<'t, T: 't, KC>( | ||||||
|  |     rtxn: &'t heed::RoTxn, | ||||||
|  |     db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||||
|  |     compression_type: CompressionType, | ||||||
|  |     compression_level: Option<u32>, | ||||||
|  |     shrink_size: Option<u64>, | ||||||
|  |     level_group_size: NonZeroUsize, | ||||||
|  |     min_level_size: NonZeroUsize, | ||||||
|  |     field_id: u8, | ||||||
|  | ) -> anyhow::Result<Reader<FileFuse>> | ||||||
|  | where | ||||||
|  |     T: Copy + PartialEq + PartialOrd + Bounded + Zero, | ||||||
|  |     KC: heed::BytesDecode<'t, DItem = (u8, u8, T, T)>, | ||||||
|  |     KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, | ||||||
|  | { | ||||||
|  |     let first_level_size = db.prefix_iter(rtxn, &[field_id])? | ||||||
|  |         .remap_types::<DecodeIgnore, DecodeIgnore>() | ||||||
|  |         .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; | ||||||
|  |  | ||||||
|  |     // It is forbidden to keep a cursor and write in a database at the same time with LMDB | ||||||
|  |     // therefore we write the facet levels entries into a grenad file before transfering them. | ||||||
|  |     let mut writer = tempfile::tempfile().and_then(|file| { | ||||||
|  |         create_writer(compression_type, compression_level, file) | ||||||
|  |     })?; | ||||||
|  |  | ||||||
|  |     let level_0_range = { | ||||||
|  |         let left = (field_id, 0, T::min_value(), T::min_value()); | ||||||
|  |         let right = (field_id, 0, T::max_value(), T::max_value()); | ||||||
|  |         left..=right | ||||||
|  |     }; | ||||||
|  |  | ||||||
|  |     // Groups sizes are always a power of the original level_group_size and therefore a group | ||||||
|  |     // always maps groups of the previous level and never splits previous levels groups in half. | ||||||
|  |     let group_size_iter = (1u8..) | ||||||
|  |         .map(|l| (l, level_group_size.get().pow(l as u32))) | ||||||
|  |         .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); | ||||||
|  |  | ||||||
|  |     for (level, group_size) in group_size_iter { | ||||||
|  |         let mut left = T::zero(); | ||||||
|  |         let mut right = T::zero(); | ||||||
|  |         let mut group_docids = RoaringBitmap::new(); | ||||||
|  |  | ||||||
|  |         let db = db.remap_key_type::<KC>(); | ||||||
|  |         for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { | ||||||
|  |             let ((_field_id, _level, value, _right), docids) = result?; | ||||||
|  |  | ||||||
|  |             if i == 0 { | ||||||
|  |                 left = value; | ||||||
|  |             } else if i % group_size == 0 { | ||||||
|  |                 // we found the first bound of the next group, we must store the left | ||||||
|  |                 // and right bounds associated with the docids. | ||||||
|  |                 write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?; | ||||||
|  |  | ||||||
|  |                 // We save the left bound for the new group and also reset the docids. | ||||||
|  |                 group_docids = RoaringBitmap::new(); | ||||||
|  |                 left = value; | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |             // The right bound is always the bound we run through. | ||||||
|  |             group_docids.union_with(&docids); | ||||||
|  |             right = value; | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |         if !group_docids.is_empty() { | ||||||
|  |             write_entry::<T, KC>(&mut writer, field_id, level, left, right, &group_docids)?; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  |  | ||||||
|  |     writer_into_reader(writer, shrink_size) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Gathers the ids of every document stored under a key of `db` starting with `field_id`. | ||||||
|  | /// | ||||||
|  | /// All matching entries are read through `rtxn` and their bitmaps are unioned together. | ||||||
|  | /// The first iteration or decoding error encountered is returned to the caller. | ||||||
|  | fn compute_faceted_documents_ids( | ||||||
|  |     rtxn: &heed::RoTxn, | ||||||
|  |     db: heed::Database<ByteSlice, CboRoaringBitmapCodec>, | ||||||
|  |     field_id: u8, | ||||||
|  | ) -> anyhow::Result<RoaringBitmap> | ||||||
|  | { | ||||||
|  |     // Fold every entry sharing the `field_id` prefix into one accumulated bitmap. | ||||||
|  |     db.prefix_iter(rtxn, &[field_id])?.try_fold( | ||||||
|  |         RoaringBitmap::new(), | ||||||
|  |         |mut faceted, entry| -> anyhow::Result<RoaringBitmap> { | ||||||
|  |             let (_key, docids) = entry?; | ||||||
|  |             faceted.union_with(&docids); | ||||||
|  |             Ok(faceted) | ||||||
|  |         }, | ||||||
|  |     ) | ||||||
|  | } | ||||||
|  |  | ||||||
|  | /// Encodes one facet group and inserts it into `writer`. | ||||||
|  | /// | ||||||
|  | /// The key is the `(field_id, level, left, right)` tuple encoded with `KC`, the value is | ||||||
|  | /// `ids` encoded with `CboRoaringBitmapCodec`. An `Error::Encoding` is returned when either | ||||||
|  | /// encoding fails, and the error of the writer is forwarded when the insertion fails. | ||||||
|  | fn write_entry<T, KC>( | ||||||
|  |     writer: &mut Writer<File>, | ||||||
|  |     field_id: u8, | ||||||
|  |     level: u8, | ||||||
|  |     left: T, | ||||||
|  |     right: T, | ||||||
|  |     ids: &RoaringBitmap, | ||||||
|  | ) -> anyhow::Result<()> | ||||||
|  | where | ||||||
|  |     KC: for<'x> heed::BytesEncode<'x, EItem = (u8, u8, T, T)>, | ||||||
|  | { | ||||||
|  |     let bounds = (field_id, level, left, right); | ||||||
|  |     let encoded_key = KC::bytes_encode(&bounds).ok_or(Error::Encoding)?; | ||||||
|  |     let encoded_ids = CboRoaringBitmapCodec::bytes_encode(ids).ok_or(Error::Encoding)?; | ||||||
|  |  | ||||||
|  |     writer.insert(&encoded_key, &encoded_ids)?; | ||||||
|  |     Ok(()) | ||||||
|  | } | ||||||
| @@ -2,6 +2,7 @@ use std::borrow::Cow; | |||||||
| use std::collections::HashSet; | use std::collections::HashSet; | ||||||
| use std::fs::File; | use std::fs::File; | ||||||
| use std::io::{self, Seek, SeekFrom}; | use std::io::{self, Seek, SeekFrom}; | ||||||
|  | use std::num::NonZeroUsize; | ||||||
| use std::sync::mpsc::sync_channel; | use std::sync::mpsc::sync_channel; | ||||||
| use std::time::Instant; | use std::time::Instant; | ||||||
|  |  | ||||||
| @@ -15,7 +16,7 @@ use rayon::prelude::*; | |||||||
| use rayon::ThreadPool; | use rayon::ThreadPool; | ||||||
|  |  | ||||||
| use crate::index::Index; | use crate::index::Index; | ||||||
| use crate::update::UpdateIndexingStep; | use crate::update::{Facets, UpdateIndexingStep}; | ||||||
| use self::store::{Store, Readers}; | use self::store::{Store, Readers}; | ||||||
| use self::merge_function::{ | use self::merge_function::{ | ||||||
|     main_merge, word_docids_merge, words_pairs_proximities_docids_merge, |     main_merge, word_docids_merge, words_pairs_proximities_docids_merge, | ||||||
| @@ -31,12 +32,12 @@ mod store; | |||||||
| mod transform; | mod transform; | ||||||
|  |  | ||||||
| #[derive(Debug, Copy, Clone)] | #[derive(Debug, Copy, Clone)] | ||||||
| enum WriteMethod { | pub enum WriteMethod { | ||||||
|     Append, |     Append, | ||||||
|     GetMergePut, |     GetMergePut, | ||||||
| } | } | ||||||
|  |  | ||||||
| fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> { | pub fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Result<Writer<File>> { | ||||||
|     let mut builder = Writer::builder(); |     let mut builder = Writer::builder(); | ||||||
|     builder.compression_type(typ); |     builder.compression_type(typ); | ||||||
|     if let Some(level) = level { |     if let Some(level) = level { | ||||||
| @@ -45,7 +46,7 @@ fn create_writer(typ: CompressionType, level: Option<u32>, file: File) -> io::Re | |||||||
|     builder.build(file) |     builder.build(file) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn create_sorter( | pub fn create_sorter( | ||||||
|     merge: MergeFn, |     merge: MergeFn, | ||||||
|     chunk_compression_type: CompressionType, |     chunk_compression_type: CompressionType, | ||||||
|     chunk_compression_level: Option<u32>, |     chunk_compression_level: Option<u32>, | ||||||
| @@ -71,7 +72,7 @@ fn create_sorter( | |||||||
|     builder.build() |     builder.build() | ||||||
| } | } | ||||||
|  |  | ||||||
| fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> { | pub fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow::Result<Reader<FileFuse>> { | ||||||
|     let mut file = writer.into_inner()?; |     let mut file = writer.into_inner()?; | ||||||
|     file.seek(SeekFrom::Start(0))?; |     file.seek(SeekFrom::Start(0))?; | ||||||
|     let file = if let Some(shrink_size) = shrink_size { |     let file = if let Some(shrink_size) = shrink_size { | ||||||
| @@ -82,13 +83,13 @@ fn writer_into_reader(writer: Writer<File>, shrink_size: Option<u64>) -> anyhow: | |||||||
|     Reader::new(file).map_err(Into::into) |     Reader::new(file).map_err(Into::into) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn merge_readers(sources: Vec<Reader<FileFuse>>, merge: MergeFn) -> Merger<FileFuse, MergeFn> { | pub fn merge_readers(sources: Vec<Reader<FileFuse>>, merge: MergeFn) -> Merger<FileFuse, MergeFn> { | ||||||
|     let mut builder = Merger::builder(merge); |     let mut builder = Merger::builder(merge); | ||||||
|     builder.extend(sources); |     builder.extend(sources); | ||||||
|     builder.build() |     builder.build() | ||||||
| } | } | ||||||
|  |  | ||||||
| fn merge_into_lmdb_database( | pub fn merge_into_lmdb_database( | ||||||
|     wtxn: &mut heed::RwTxn, |     wtxn: &mut heed::RwTxn, | ||||||
|     database: heed::PolyDatabase, |     database: heed::PolyDatabase, | ||||||
|     sources: Vec<Reader<FileFuse>>, |     sources: Vec<Reader<FileFuse>>, | ||||||
| @@ -132,7 +133,7 @@ fn merge_into_lmdb_database( | |||||||
|     Ok(()) |     Ok(()) | ||||||
| } | } | ||||||
|  |  | ||||||
| fn write_into_lmdb_database( | pub fn write_into_lmdb_database( | ||||||
|     wtxn: &mut heed::RwTxn, |     wtxn: &mut heed::RwTxn, | ||||||
|     database: heed::PolyDatabase, |     database: heed::PolyDatabase, | ||||||
|     mut reader: Reader<FileFuse>, |     mut reader: Reader<FileFuse>, | ||||||
| @@ -157,7 +158,7 @@ fn write_into_lmdb_database( | |||||||
|                 match iter.next().transpose()? { |                 match iter.next().transpose()? { | ||||||
|                     Some((key, old_val)) if key == k => { |                     Some((key, old_val)) if key == k => { | ||||||
|                         let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; |                         let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; | ||||||
|                         let val = merge(k, &vals).expect("merge failed"); |                         let val = merge(k, &vals)?; | ||||||
|                         iter.put_current(k, &val)?; |                         iter.put_current(k, &val)?; | ||||||
|                     }, |                     }, | ||||||
|                     _ => { |                     _ => { | ||||||
| @@ -207,6 +208,8 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|     pub(crate) chunk_compression_level: Option<u32>, |     pub(crate) chunk_compression_level: Option<u32>, | ||||||
|     pub(crate) chunk_fusing_shrink_size: Option<u64>, |     pub(crate) chunk_fusing_shrink_size: Option<u64>, | ||||||
|     pub(crate) thread_pool: Option<&'a ThreadPool>, |     pub(crate) thread_pool: Option<&'a ThreadPool>, | ||||||
|  |     facet_level_group_size: Option<NonZeroUsize>, | ||||||
|  |     facet_min_level_size: Option<NonZeroUsize>, | ||||||
|     update_method: IndexDocumentsMethod, |     update_method: IndexDocumentsMethod, | ||||||
|     update_format: UpdateFormat, |     update_format: UpdateFormat, | ||||||
|     autogenerate_docids: bool, |     autogenerate_docids: bool, | ||||||
| @@ -225,6 +228,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|             chunk_compression_level: None, |             chunk_compression_level: None, | ||||||
|             chunk_fusing_shrink_size: None, |             chunk_fusing_shrink_size: None, | ||||||
|             thread_pool: None, |             thread_pool: None, | ||||||
|  |             facet_level_group_size: None, | ||||||
|  |             facet_min_level_size: None, | ||||||
|             update_method: IndexDocumentsMethod::ReplaceDocuments, |             update_method: IndexDocumentsMethod::ReplaceDocuments, | ||||||
|             update_format: UpdateFormat::Json, |             update_format: UpdateFormat::Json, | ||||||
|             autogenerate_docids: true, |             autogenerate_docids: true, | ||||||
| @@ -308,8 +313,10 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|                 thread_pool: self.thread_pool, |                 thread_pool: self.thread_pool, | ||||||
|             }; |             }; | ||||||
|             let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?; |             let mut deletion_builder = update_builder.delete_documents(self.wtxn, self.index)?; | ||||||
|  |             debug!("documents to delete {:?}", replaced_documents_ids); | ||||||
|             deletion_builder.delete_documents(&replaced_documents_ids); |             deletion_builder.delete_documents(&replaced_documents_ids); | ||||||
|             let _deleted_documents_count = deletion_builder.execute()?; |             let deleted_documents_count = deletion_builder.execute()?; | ||||||
|  |             debug!("{} documents actually deleted", deleted_documents_count); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let mmap; |         let mmap; | ||||||
| @@ -327,7 +334,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|         enum DatabaseType { |         enum DatabaseType { | ||||||
|             Main, |             Main, | ||||||
|             WordDocids, |             WordDocids, | ||||||
|             FacetValuesDocids, |             FacetLevel0ValuesDocids, | ||||||
|         } |         } | ||||||
|  |  | ||||||
|         let faceted_fields = self.index.faceted_fields(self.wtxn)?; |         let faceted_fields = self.index.faceted_fields(self.wtxn)?; | ||||||
| @@ -427,7 +434,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|                     (DatabaseType::Main, main_readers, main_merge as MergeFn), |                     (DatabaseType::Main, main_readers, main_merge as MergeFn), | ||||||
|                     (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), |                     (DatabaseType::WordDocids, word_docids_readers, word_docids_merge), | ||||||
|                     ( |                     ( | ||||||
|                         DatabaseType::FacetValuesDocids, |                         DatabaseType::FacetLevel0ValuesDocids, | ||||||
|                         facet_field_value_docids_readers, |                         facet_field_value_docids_readers, | ||||||
|                         facet_field_value_docids_merge, |                         facet_field_value_docids_merge, | ||||||
|                     ), |                     ), | ||||||
| @@ -557,7 +564,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|                         write_method, |                         write_method, | ||||||
|                     )?; |                     )?; | ||||||
|                 }, |                 }, | ||||||
|                 DatabaseType::FacetValuesDocids => { |                 DatabaseType::FacetLevel0ValuesDocids => { | ||||||
|                     debug!("Writing the facet values docids into LMDB on disk..."); |                     debug!("Writing the facet values docids into LMDB on disk..."); | ||||||
|                     let db = *self.index.facet_field_id_value_docids.as_polymorph(); |                     let db = *self.index.facet_field_id_value_docids.as_polymorph(); | ||||||
|                     write_into_lmdb_database( |                     write_into_lmdb_database( | ||||||
| @@ -577,6 +584,18 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { | |||||||
|             }); |             }); | ||||||
|         } |         } | ||||||
|  |  | ||||||
|  |         let mut builder = Facets::new(self.wtxn, self.index); | ||||||
|  |         builder.chunk_compression_type = self.chunk_compression_type; | ||||||
|  |         builder.chunk_compression_level = self.chunk_compression_level; | ||||||
|  |         builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||||
|  |         if let Some(value) = self.facet_level_group_size { | ||||||
|  |             builder.level_group_size(value); | ||||||
|  |         } | ||||||
|  |         if let Some(value) = self.facet_min_level_size { | ||||||
|  |             builder.min_level_size(value); | ||||||
|  |         } | ||||||
|  |         builder.execute()?; | ||||||
|  |  | ||||||
|         debug_assert_eq!(database_count, total_databases); |         debug_assert_eq!(database_count, total_databases); | ||||||
|  |  | ||||||
|         info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); |         info!("Transform output indexed in {:.02?}", before_indexing.elapsed()); | ||||||
|   | |||||||
| @@ -19,7 +19,7 @@ use tempfile::tempfile; | |||||||
|  |  | ||||||
| use crate::facet::FacetType; | use crate::facet::FacetType; | ||||||
| use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; | ||||||
| use crate::heed_codec::facet::{FacetValueStringCodec, FacetValueF64Codec, FacetValueI64Codec}; | use crate::heed_codec::facet::{FacetValueStringCodec, FacetLevelValueF64Codec, FacetLevelValueI64Codec}; | ||||||
| use crate::tokenizer::{simple_tokenizer, only_token}; | use crate::tokenizer::{simple_tokenizer, only_token}; | ||||||
| use crate::update::UpdateIndexingStep; | use crate::update::UpdateIndexingStep; | ||||||
| use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId}; | use crate::{json_to_string, SmallVec8, SmallVec32, SmallString32, Position, DocumentId}; | ||||||
| @@ -337,8 +337,8 @@ impl Store { | |||||||
|         for ((field_id, value), docids) in iter { |         for ((field_id, value), docids) in iter { | ||||||
|             let result = match value { |             let result = match value { | ||||||
|                 String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), |                 String(s) => FacetValueStringCodec::bytes_encode(&(field_id, &s)).map(Cow::into_owned), | ||||||
|                 Float(f) => FacetValueF64Codec::bytes_encode(&(field_id, *f)).map(Cow::into_owned), |                 Float(f) => FacetLevelValueF64Codec::bytes_encode(&(field_id, 0, *f, *f)).map(Cow::into_owned), | ||||||
|                 Integer(i) => FacetValueI64Codec::bytes_encode(&(field_id, i)).map(Cow::into_owned), |                 Integer(i) => FacetLevelValueI64Codec::bytes_encode(&(field_id, 0, i, i)).map(Cow::into_owned), | ||||||
|             }; |             }; | ||||||
|             let key = result.context("could not serialize facet key")?; |             let key = result.context("could not serialize facet key")?; | ||||||
|             let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) |             let bytes = CboRoaringBitmapCodec::bytes_encode(&docids) | ||||||
| @@ -399,7 +399,7 @@ impl Store { | |||||||
|             // We skip documents that must not be indexed by this thread. |             // We skip documents that must not be indexed by this thread. | ||||||
|             if count % num_threads == thread_index { |             if count % num_threads == thread_index { | ||||||
|                 // This is a log routine that we do every `log_every_n` documents. |                 // This is a log routine that we do every `log_every_n` documents. | ||||||
|                 if log_every_n.map_or(false, |len| count % len == 0) { |                 if thread_index == 0 && log_every_n.map_or(false, |len| count % len == 0) { | ||||||
|                     info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed()); |                     info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed()); | ||||||
|                     progress_callback(UpdateIndexingStep::IndexDocuments { |                     progress_callback(UpdateIndexingStep::IndexDocuments { | ||||||
|                         documents_seen: count, |                         documents_seen: count, | ||||||
| @@ -571,7 +571,10 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec | |||||||
|             Value::Null => Ok(()), |             Value::Null => Ok(()), | ||||||
|             Value::Bool(b) => Ok(output.push(Integer(*b as i64))), |             Value::Bool(b) => Ok(output.push(Integer(*b as i64))), | ||||||
|             Value::Number(number) => match ftype { |             Value::Number(number) => match ftype { | ||||||
|                 FacetType::String => bail!("invalid facet type, expecting {} found number", ftype), |                 FacetType::String => { | ||||||
|  |                     let string = SmallString32::from(number.to_string()); | ||||||
|  |                     Ok(output.push(String(string))) | ||||||
|  |                 }, | ||||||
|                 FacetType::Float => match number.as_f64() { |                 FacetType::Float => match number.as_f64() { | ||||||
|                     Some(float) => Ok(output.push(Float(OrderedFloat(float)))), |                     Some(float) => Ok(output.push(Float(OrderedFloat(float)))), | ||||||
|                     None => bail!("invalid facet type, expecting {} found integer", ftype), |                     None => bail!("invalid facet type, expecting {} found integer", ftype), | ||||||
| @@ -586,7 +589,7 @@ fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec | |||||||
|                 }, |                 }, | ||||||
|             }, |             }, | ||||||
|             Value::String(string) => { |             Value::String(string) => { | ||||||
|                 let string = string.trim(); |                 let string = string.trim().to_lowercase(); | ||||||
|                 if string.is_empty() { return Ok(()) } |                 if string.is_empty() { return Ok(()) } | ||||||
|                 match ftype { |                 match ftype { | ||||||
|                     FacetType::String => { |                     FacetType::String => { | ||||||
|   | |||||||
| @@ -1,6 +1,7 @@ | |||||||
| mod available_documents_ids; | mod available_documents_ids; | ||||||
| mod clear_documents; | mod clear_documents; | ||||||
| mod delete_documents; | mod delete_documents; | ||||||
|  | mod facets; | ||||||
| mod index_documents; | mod index_documents; | ||||||
| mod settings; | mod settings; | ||||||
| mod update_builder; | mod update_builder; | ||||||
| @@ -11,6 +12,7 @@ pub use self::available_documents_ids::AvailableDocumentsIds; | |||||||
| pub use self::clear_documents::ClearDocuments; | pub use self::clear_documents::ClearDocuments; | ||||||
| pub use self::delete_documents::DeleteDocuments; | pub use self::delete_documents::DeleteDocuments; | ||||||
| pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat}; | pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat}; | ||||||
|  | pub use self::facets::Facets; | ||||||
| pub use self::settings::Settings; | pub use self::settings::Settings; | ||||||
| pub use self::update_builder::UpdateBuilder; | pub use self::update_builder::UpdateBuilder; | ||||||
| pub use self::update_step::UpdateIndexingStep; | pub use self::update_step::UpdateIndexingStep; | ||||||
|   | |||||||
| @@ -412,6 +412,23 @@ mod tests { | |||||||
|         let rtxn = index.read_txn().unwrap(); |         let rtxn = index.read_txn().unwrap(); | ||||||
|         let fields_ids = index.faceted_fields(&rtxn).unwrap(); |         let fields_ids = index.faceted_fields(&rtxn).unwrap(); | ||||||
|         assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer }); |         assert_eq!(fields_ids, hashmap!{ 1 => FacetType::Integer }); | ||||||
|  |         // Only count the level 0 facet values of the field_id 1. | ||||||
|  |         let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count(); | ||||||
|  |         assert_eq!(count, 3); | ||||||
|  |         drop(rtxn); | ||||||
|  |  | ||||||
|  |         // Index a few more documents with both new and already indexed facet values. | ||||||
|  |         let mut wtxn = index.write_txn().unwrap(); | ||||||
|  |         let content = &b"name,age\nkevin2,23\nkevina2,21\nbenoit2,35\n"[..]; | ||||||
|  |         let mut builder = IndexDocuments::new(&mut wtxn, &index); | ||||||
|  |         builder.update_format(UpdateFormat::Csv); | ||||||
|  |         builder.execute(content, |_| ()).unwrap(); | ||||||
|  |         wtxn.commit().unwrap(); | ||||||
|  |  | ||||||
|  |         let rtxn = index.read_txn().unwrap(); | ||||||
|  |         // Only count the level 0 facet values of the field_id 1. | ||||||
|  |         let count = index.facet_field_id_value_docids.prefix_iter(&rtxn, &[1, 0]).unwrap().count(); | ||||||
|  |         assert_eq!(count, 4); | ||||||
|         drop(rtxn); |         drop(rtxn); | ||||||
|     } |     } | ||||||
| } | } | ||||||
|   | |||||||
| @@ -2,7 +2,7 @@ use grenad::CompressionType; | |||||||
| use rayon::ThreadPool; | use rayon::ThreadPool; | ||||||
|  |  | ||||||
| use crate::Index; | use crate::Index; | ||||||
| use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings}; | use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets}; | ||||||
|  |  | ||||||
| pub struct UpdateBuilder<'a> { | pub struct UpdateBuilder<'a> { | ||||||
|     pub(crate) log_every_n: Option<usize>, |     pub(crate) log_every_n: Option<usize>, | ||||||
| @@ -118,4 +118,19 @@ impl<'a> UpdateBuilder<'a> { | |||||||
|  |  | ||||||
|         builder |         builder | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  |     /// Consumes this builder and returns a `Facets` update bound to `wtxn` and `index`. | ||||||
|  |     /// | ||||||
|  |     /// The chunk compression type, compression level and fusing shrink size configured | ||||||
|  |     /// on this builder are copied onto the returned `Facets`. | ||||||
|  |     pub fn facets<'t, 'u, 'i>( | ||||||
|  |         self, | ||||||
|  |         wtxn: &'t mut heed::RwTxn<'i, 'u>, | ||||||
|  |         index: &'i Index, | ||||||
|  |     ) -> Facets<'t, 'u, 'i> | ||||||
|  |     { | ||||||
|  |         let mut facets = Facets::new(wtxn, index); | ||||||
|  |  | ||||||
|  |         // Forward the chunk related settings of this builder. | ||||||
|  |         facets.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size; | ||||||
|  |         facets.chunk_compression_level = self.chunk_compression_level; | ||||||
|  |         facets.chunk_compression_type = self.chunk_compression_type; | ||||||
|  |  | ||||||
|  |         facets | ||||||
|  |     } | ||||||
| } | } | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user