Mirror of https://github.com/meilisearch/meilisearch.git, synced 2025-07-18 04:11:07 +00:00

Compare commits: prototype- ... v0.8.5-bet (26 commits)
a8c3438887
03e06a80b2
a303b6552a
96a6f513ab
4b1127cd98
8db102b8ed
746ea3db67
9f2d430e96
daeb226a3f
2eeae7cfdc
ad958d38e8
96c3b98e68
2a7b34787b
60c4292172
00174d9165
e7654ffa1e
8a17a8d949
f21e0bffe2
7361dba079
5bc18fa704
3c8e4a3884
6e808e4b8f
ee3a3cedf3
c4320b8b14
2c5da9aa11
eeb01c749c
1 .gitignore vendored
@@ -1,4 +1,5 @@
/target
meilisearch-core/target
**/*.csv
**/*.json_lines
**/*.rs.bk
98 Cargo.lock generated
@@ -196,6 +196,14 @@ dependencies = [
 "ppv-lite86 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "cast"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "cc"
version = "1.0.47"
@@ -249,6 +257,11 @@ dependencies = [
 "bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "compact_arena"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
name = "const-random"
version = "0.1.6"
@@ -284,6 +297,39 @@ dependencies = [
 "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "criterion"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)",
 "cast 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
 "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "criterion-plot 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
 "rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "rand_os 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "rand_xoshiro 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde_derive 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
 "tinytemplate 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "walkdir 2.2.9 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "criterion-plot"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "cast 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)",
 "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "crossbeam-channel"
version = "0.4.0"
@@ -760,6 +806,14 @@ dependencies = [
 "libc 0.2.65 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "itertools"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "either 1.5.3 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "itoa"
version = "0.4.4"
@@ -887,6 +941,8 @@ dependencies = [
 "bincode 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)",
+ "compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "csv 1.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "deunicode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -895,6 +951,8 @@ dependencies = [
 "hashbrown 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)",
 "heed 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "levenshtein_automata 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
 "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
 "meilisearch-schema 0.8.2",
@@ -903,7 +961,7 @@ dependencies = [
 "once_cell 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
 "ordered-float 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)",
 "rustyline 5.0.4 (registry+https://github.com/rust-lang/crates.io-index)",
- "sdset 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)",
+ "sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
 "siphasher 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -1439,6 +1497,15 @@ dependencies = [
 "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "rand_os"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "getrandom 0.1.13 (registry+https://github.com/rust-lang/crates.io-index)",
 "rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "rand_pcg"
version = "0.1.2"
@@ -1456,6 +1523,14 @@ dependencies = [
 "rand_core 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "rand_xoshiro"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "rayon"
version = "1.2.0"
@@ -1616,7 +1691,7 @@ dependencies = [

[[package]]
name = "sdset"
-version = "0.3.3"
+version = "0.3.6"
source = "registry+https://github.com/rust-lang/crates.io-index"

[[package]]
@@ -2044,6 +2119,15 @@ dependencies = [
 "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "tinytemplate"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [
 "serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)",
 "serde_json 1.0.41 (registry+https://github.com/rust-lang/crates.io-index)",
]

[[package]]
name = "tokio"
version = "0.1.22"
@@ -2564,16 +2648,20 @@ dependencies = [
"checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5"
"checksum bytes 0.4.12 (registry+https://github.com/rust-lang/crates.io-index)" = "206fdffcfa2df7cbe15601ef46c813fce0965eb3286db6b56c583b814b51c81c"
"checksum c2-chacha 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "214238caa1bf3a496ec3392968969cab8549f96ff30652c9e56885329315f6bb"
+"checksum cast 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "4b9434b9a5aa1450faa3f9cb14ea0e8c53bb5d2b3c1bfd1ab4fc03e9f33fbfb0"
"checksum cc 1.0.47 (registry+https://github.com/rust-lang/crates.io-index)" = "aa87058dce70a3ff5621797f1506cb837edd02ac4c0ae642b4542dce802908b8"
"checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
"checksum chrono 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e8493056968583b0193c1bb04d6f7684586f3726992d6c573261941a895dbd68"
"checksum chunked_transfer 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "f98beb6554de08a14bd7b5c6014963c79d6a25a1c66b1d4ecb9e733ccba51d6c"
"checksum clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5067f5bb2d80ef5d68b4c87db81601f0b75bca627bc2ef76b141d7b846a3c6d9"
"checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
+"checksum compact_arena 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "4ab08c5bed92075075d5db5149887a477b2dc0318c40882a0dfbd34315ac6141"
"checksum const-random 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7b641a8c9867e341f3295564203b1c250eb8ce6cb6126e007941f78c4d2ed7fe"
"checksum const-random-macro 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c750ec12b83377637110d5a57f5ae08e895b06c4b16e2bdbf1a94ef717428c59"
"checksum cookie 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "888604f00b3db336d2af898ec3c1d5d0ddf5e6d462220f2ededc33a87ac4bbd5"
"checksum crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1"
+"checksum criterion 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "938703e165481c8d612ea3479ac8342e5615185db37765162e762ec3523e2fc6"
+"checksum criterion-plot 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "eccdc6ce8bbe352ca89025bee672aa6d24f4eb8c53e3a8b5d1bc58011da072a2"
"checksum crossbeam-channel 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "acec9a3b0b3559f15aee4f90746c4e5e293b701c0f7d3925d24e01645267b68c"
"checksum crossbeam-deque 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)" = "c3aa945d63861bfe624b55d153a39684da1e8c0bc8fba932f7ee3a3c16cea3ca"
"checksum crossbeam-epoch 0.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "5064ebdbf05ce3cb95e45c8b086f72263f4166b29b97f6baff7ef7fe047b55ac"
@@ -2627,6 +2715,7 @@ dependencies = [
"checksum idna 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "02e2673c30ee86b5b96a9cb52ad15718aa1f966f5ab9ad54a8b95d5ca33120a9"
"checksum indexmap 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "712d7b3ea5827fcb9d4fda14bf4da5f136f0db2ae9c8f4bd4e2d1c6fde4e6db2"
"checksum iovec 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "b2b3ea6ff95e175473f8ffe6a7eb7c00d054240321b84c57051175fe3c1e075e"
+"checksum itertools 0.8.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f56a2d0bc861f9165be4eb3442afd3c236d8a98afd426f65d92324ae1091a484"
"checksum itoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "501266b7edd0174f8530248f87f99c88fbe60ca4ef3dd486835b8d8d53136f7f"
"checksum jemalloc-sys 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "0d3b9f3f5c9b31aa0f5ed3260385ac205db665baa41d49bb8338008ae94ede45"
"checksum jemallocator 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "43ae63fcfc45e99ab3d1b29a46782ad679e98436c3169d15a167a1108a724b69"
@@ -2694,8 +2783,10 @@ dependencies = [
"checksum rand_isaac 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ded997c9d5f13925be2a6fd7e66bf1872597f759fd9dd93513dd7e92e5a5ee08"
"checksum rand_jitter 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "1166d5c91dc97b88d1decc3285bb0a99ed84b05cfd0bc2341bdf2d43fc41e39b"
"checksum rand_os 0.1.3 (registry+https://github.com/rust-lang/crates.io-index)" = "7b75f676a1e053fc562eafbb47838d67c84801e38fc1ba459e8f180deabd5071"
+"checksum rand_os 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a788ae3edb696cfcba1c19bfd388cc4b8c21f8a408432b199c072825084da58a"
"checksum rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "abf9b09b01790cfe0364f52bf32995ea3c39f4d2dd011eac241d2914146d0b44"
"checksum rand_xorshift 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cbf7e9e623549b0e21f6e97cf8ecf247c1a8fd2e8a992ae265314300b2455d5c"
+"checksum rand_xoshiro 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "0e18c91676f670f6f0312764c759405f13afb98d5d73819840cf72a518487bff"
"checksum rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "83a27732a533a1be0a0035a111fe76db89ad312f6f0347004c220c57f209a123"
"checksum rayon-core 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "98dcf634205083b17d0861252431eb2acbfb698ab7478a2d20de07954f47ec7b"
"checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
@@ -2715,7 +2806,7 @@ dependencies = [
"checksum same-file 1.0.5 (registry+https://github.com/rust-lang/crates.io-index)" = "585e8ddcedc187886a30fa705c47985c3fa88d06624095856b36ca0b82ff4421"
"checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d"
"checksum sct 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e3042af939fca8c3453b7af0f1c66e533a15a86169e39de2657310ade8f98d3c"
-"checksum sdset 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "b6d2447743d6c37b6d67af88d9c0f1fc92989e2d9745d9b2f3d305b906a90195"
+"checksum sdset 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "5bfd7aab2bcae693c563b40fbbaf87d60c9b6f2a60d55ed69a9c761e3d4c63c9"
"checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
"checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
"checksum serde 1.0.102 (registry+https://github.com/rust-lang/crates.io-index)" = "0c4b39bd9b0b087684013a792c59e3e07a46a01d2322518d8a1104641a0b1be0"
@@ -2760,6 +2851,7 @@ dependencies = [
"checksum tide-querystring 0.1.0 (git+https://github.com/rustasync/tide?rev=e77709370bb24cf776fe6da902467c35131535b1)" = "<none>"
"checksum tide-slog 0.1.0 (git+https://github.com/rustasync/tide?rev=e77709370bb24cf776fe6da902467c35131535b1)" = "<none>"
"checksum time 0.1.42 (registry+https://github.com/rust-lang/crates.io-index)" = "db8dcfca086c1143c9270ac42a2bbd8a7ee477b78ac8e45b19abfb0cbede4b6f"
+"checksum tinytemplate 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4574b75faccaacddb9b284faecdf0b544b80b6b294f3d062d325c5726a209c20"
"checksum tokio 0.1.22 (registry+https://github.com/rust-lang/crates.io-index)" = "5a09c0b5bb588872ab2f09afa13ee6e9dac11e10a0ec9e8e3ba39a5a5d530af6"
"checksum tokio-buf 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "8fb220f46c53859a4b7ec083e41dec9778ff0b1851c0942b211edb89e0ccdc46"
"checksum tokio-current-thread 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "d16217cad7f1b840c5a97dfb3c43b0c871fef423a6e8d2118c604e843662a443"
meilisearch-core/Cargo.toml
@@ -9,12 +9,14 @@ arc-swap = "0.4.3"
bincode = "1.1.4"
byteorder = "1.3.2"
chrono = { version = "0.4.9", features = ["serde"] }
+compact_arena = "0.4.0"
crossbeam-channel = "0.4.0"
deunicode = "1.0.0"
env_logger = "0.7.0"
fst = { version = "0.3.5", default-features = false }
hashbrown = { version = "0.6.0", features = ["serde"] }
heed = "0.6.0"
+itertools = "0.8.2" # kill me please
levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
log = "0.4.8"
meilisearch-schema = { path = "../meilisearch-schema", version = "0.8.2" }
@@ -22,7 +24,7 @@ meilisearch-tokenizer = { path = "../meilisearch-tokenizer", version = "0.8.2" }
meilisearch-types = { path = "../meilisearch-types", version = "0.8.2" }
once_cell = "1.2.0"
ordered-float = { version = "1.0.2", features = ["serde"] }
-sdset = "0.3.3"
+sdset = "0.3.6"
serde = { version = "1.0.101", features = ["derive"] }
serde_json = "1.0.41"
siphasher = "0.3.1"
@@ -31,10 +33,16 @@ zerocopy = "0.2.8"

[dev-dependencies]
assert_matches = "1.3"
+criterion = "0.3"
csv = "1.0.7"
indexmap = { version = "1.2.0", features = ["serde-1"] }
+jemallocator = "0.3.2"
rustyline = { version = "5.0.0", default-features = false }
structopt = "0.3.2"
tempfile = "3.1.0"
termcolor = "1.0.4"
toml = "0.5.3"
+
+[[bench]]
+name = "search_benchmark"
+harness = false
95 meilisearch-core/benches/search_benchmark.rs Normal file
@@ -0,0 +1,95 @@
#[cfg(test)]
#[macro_use]
extern crate assert_matches;

use std::sync::mpsc;
use std::path::Path;
use std::fs;
use std::iter;

use meilisearch_core::Database;
use meilisearch_core::{ProcessedUpdateResult, UpdateStatus};
use serde_json::Value;

use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};

fn prepare_database(path: &Path) -> Database {
    let database = Database::open_or_create(path).unwrap();
    let db = &database;

    let (sender, receiver) = mpsc::sync_channel(100);
    let update_fn = move |_name: &str, update: ProcessedUpdateResult| {
        sender.send(update.update_id).unwrap()
    };
    let index = database.create_index("bench").unwrap();

    database.set_update_callback(Box::new(update_fn));

    let schema = {
        let path = concat!(env!("CARGO_MANIFEST_DIR"), "/../datasets/movies/schema.toml");
        let string = fs::read_to_string(path).expect("find schema");
        toml::from_str(&string).unwrap()
    };

    let mut update_writer = db.update_write_txn().unwrap();
    let _update_id = index.schema_update(&mut update_writer, schema).unwrap();
    update_writer.commit().unwrap();

    let mut additions = index.documents_addition();

    let json: Value = {
        let path = concat!(env!("CARGO_MANIFEST_DIR"), "/../datasets/movies/movies.json");
        let movies_file = fs::File::open(path).expect("find movies");
        serde_json::from_reader(movies_file).unwrap()
    };

    let documents = json.as_array().unwrap();

    for document in documents {
        additions.update_document(document);
    }

    let mut update_writer = db.update_write_txn().unwrap();
    let update_id = additions.finalize(&mut update_writer).unwrap();
    update_writer.commit().unwrap();

    // block until the transaction is processed
    let _ = receiver.into_iter().find(|id| *id == update_id);

    let update_reader = db.update_read_txn().unwrap();
    let result = index.update_status(&update_reader, update_id).unwrap();
    assert_matches!(result, Some(UpdateStatus::Processed { content }) if content.error.is_none());

    database
}

pub fn criterion_benchmark(c: &mut Criterion) {
    let dir = tempfile::tempdir().unwrap();
    let database = prepare_database(dir.path());

    let reader = database.main_read_txn().unwrap();
    let index = database.open_index("bench").unwrap();

    let mut count = 0;
    let query = "I love paris ";

    let iter = iter::from_fn(|| {
        count += 1;
        query.get(0..count)
    });

    let mut group = c.benchmark_group("searching in movies (19654 docs)");
    group.sample_size(10);

    for query in iter {
        let bench_name = BenchmarkId::from_parameter(format!("{:?}", query));
        group.bench_with_input(bench_name, &query, |b, query| b.iter(|| {
            let builder = index.query_builder();
            builder.query(&reader, query, 0..20).unwrap();
        }));
    }
    group.finish();
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
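Because the manifest above declares this bench target with `harness = false`, Criterion drives the run itself. A minimal sketch of a local invocation (assuming the `datasets/movies` schema and documents the benchmark reads are present):

    cargo bench --bench search_benchmark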
@@ -1,7 +1,7 @@
+use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::HashSet;
-use std::collections::btree_map::{BTreeMap, Entry};
use std::error::Error;
-use std::io::Write;
+use std::io::{Read, Write};
use std::iter::FromIterator;
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};
@@ -15,19 +15,23 @@ use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use meilisearch_core::{Database, Highlight, ProcessedUpdateResult};
use meilisearch_schema::SchemaAttr;

+// #[cfg(target_os = "linux")]
+#[global_allocator]
+static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
+
#[derive(Debug, StructOpt)]
struct IndexCommand {
    /// The destination where the database must be created.
    #[structopt(parse(from_os_str))]
    database_path: PathBuf,

+    #[structopt(long, default_value = "default")]
+    index_uid: String,
+
    /// The csv file to index.
    #[structopt(parse(from_os_str))]
    csv_data_path: PathBuf,

-    #[structopt(long, default_value = "default")]
-    index_uid: String,
-
    /// The path to the schema.
    #[structopt(long, parse(from_os_str))]
    schema: PathBuf,
@@ -135,7 +139,13 @@ fn index_command(command: IndexCommand, database: Database) -> Result<(), Box<dy
        }
    }

-    let mut rdr = csv::Reader::from_path(command.csv_data_path)?;
+    let mut rdr = if command.csv_data_path.as_os_str() == "-" {
+        csv::Reader::from_reader(Box::new(io::stdin()) as Box<dyn Read>)
+    } else {
+        let file = std::fs::File::open(command.csv_data_path)?;
+        csv::Reader::from_reader(Box::new(file) as Box<dyn Read>)
+    };

    let mut raw_record = csv::StringRecord::new();
    let headers = rdr.headers()?.clone();
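With the change above, the indexing example treats a CSV data path of `-` as stdin instead of a file. A hypothetical invocation (the binary name and paths are placeholders for illustration, not part of the diff):

    cat movies.csv | indexer /tmp/bench.mdb - --schema schema.toml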
@@ -46,3 +46,8 @@ pub fn build_prefix_dfa(query: &str) -> DFA {
pub fn build_dfa(query: &str) -> DFA {
    build_dfa_with_setting(query, PrefixSetting::NoPrefix)
}
+
+pub fn build_exact_dfa(query: &str) -> DFA {
+    let builder = LEVDIST0.get_or_init(|| LevBuilder::new(0, true));
+    builder.build_dfa(query)
+}
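`build_exact_dfa` complements the fuzzy and prefix builders with a distance-0 automaton; the bucket-sort code later in this diff selects it for phrase-query words. A small usage sketch (hypothetical word, relying only on the `levenshtein_automata` `eval`/`to_u8` calls that also appear below):

    // an exact DFA accepts only words at edit distance 0 from the query
    let dfa = build_exact_dfa("paris");
    assert_eq!(dfa.eval("paris").to_u8(), 0);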
@@ -1,125 +1,13 @@
mod dfa;
mod query_enhancer;

-use std::cmp::Reverse;
-use std::{cmp, vec};
+use meilisearch_tokenizer::is_cjk;

-use fst::{IntoStreamer, Streamer};
-use levenshtein_automata::DFA;
-use meilisearch_tokenizer::{is_cjk, split_query_string};
-
-use crate::database::MainT;
-use crate::error::MResult;
-use crate::store;
-
-use self::dfa::{build_dfa, build_prefix_dfa};
+pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
pub use self::query_enhancer::QueryEnhancer;
-use self::query_enhancer::QueryEnhancerBuilder;
+pub use self::query_enhancer::QueryEnhancerBuilder;

-const NGRAMS: usize = 3;
-
-pub struct AutomatonProducer {
-    automatons: Vec<AutomatonGroup>,
-}
-
-impl AutomatonProducer {
-    pub fn new(
-        reader: &heed::RoTxn<MainT>,
-        query: &str,
-        main_store: store::Main,
-        postings_list_store: store::PostingsLists,
-        synonyms_store: store::Synonyms,
-    ) -> MResult<(AutomatonProducer, QueryEnhancer)> {
-        let (automatons, query_enhancer) = generate_automatons(
-            reader,
-            query,
-            main_store,
-            postings_list_store,
-            synonyms_store,
-        )?;
-
-        Ok((AutomatonProducer { automatons }, query_enhancer))
-    }
-
-    pub fn into_iter(self) -> vec::IntoIter<AutomatonGroup> {
-        self.automatons.into_iter()
-    }
-}
-
-#[derive(Debug)]
-pub struct AutomatonGroup {
-    pub is_phrase_query: bool,
-    pub automatons: Vec<Automaton>,
-}
-
-impl AutomatonGroup {
-    fn normal(automatons: Vec<Automaton>) -> AutomatonGroup {
-        AutomatonGroup {
-            is_phrase_query: false,
-            automatons,
-        }
-    }
-
-    fn phrase_query(automatons: Vec<Automaton>) -> AutomatonGroup {
-        AutomatonGroup {
-            is_phrase_query: true,
-            automatons,
-        }
-    }
-}
-
-#[derive(Debug)]
-pub struct Automaton {
-    pub index: usize,
-    pub ngram: usize,
-    pub query_len: usize,
-    pub is_exact: bool,
-    pub is_prefix: bool,
-    pub query: String,
-}
-
-impl Automaton {
-    pub fn dfa(&self) -> DFA {
-        if self.is_prefix {
-            build_prefix_dfa(&self.query)
-        } else {
-            build_dfa(&self.query)
-        }
-    }
-
-    fn exact(index: usize, ngram: usize, query: &str) -> Automaton {
-        Automaton {
-            index,
-            ngram,
-            query_len: query.len(),
-            is_exact: true,
-            is_prefix: false,
-            query: query.to_string(),
-        }
-    }
-
-    fn prefix_exact(index: usize, ngram: usize, query: &str) -> Automaton {
-        Automaton {
-            index,
-            ngram,
-            query_len: query.len(),
-            is_exact: true,
-            is_prefix: true,
-            query: query.to_string(),
-        }
-    }
-
-    fn non_exact(index: usize, ngram: usize, query: &str) -> Automaton {
-        Automaton {
-            index,
-            ngram,
-            query_len: query.len(),
-            is_exact: false,
-            is_prefix: false,
-            query: query.to_string(),
-        }
-    }
-}
+pub const NGRAMS: usize = 3;

pub fn normalize_str(string: &str) -> String {
    let mut string = string.to_lowercase();
@@ -130,167 +18,3 @@ pub fn normalize_str(string: &str) -> String {

    string
}
-
-fn split_best_frequency<'a>(
-    reader: &heed::RoTxn<MainT>,
-    word: &'a str,
-    postings_lists_store: store::PostingsLists,
-) -> MResult<Option<(&'a str, &'a str)>> {
-    let chars = word.char_indices().skip(1);
-    let mut best = None;
-
-    for (i, _) in chars {
-        let (left, right) = word.split_at(i);
-
-        let left_freq = postings_lists_store
-            .postings_list(reader, left.as_ref())?
-            .map_or(0, |i| i.len());
-
-        let right_freq = postings_lists_store
-            .postings_list(reader, right.as_ref())?
-            .map_or(0, |i| i.len());
-
-        let min_freq = cmp::min(left_freq, right_freq);
-        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
-            best = Some((min_freq, left, right));
-        }
-    }
-
-    Ok(best.map(|(_, l, r)| (l, r)))
-}
-
-fn generate_automatons(
-    reader: &heed::RoTxn<MainT>,
-    query: &str,
-    main_store: store::Main,
-    postings_lists_store: store::PostingsLists,
-    synonym_store: store::Synonyms,
-) -> MResult<(Vec<AutomatonGroup>, QueryEnhancer)> {
-    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
-    let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
-    let synonyms = match main_store.synonyms_fst(reader)? {
-        Some(synonym) => synonym,
-        None => fst::Set::default(),
-    };
-
-    let mut automaton_index = 0;
-    let mut automatons = Vec::new();
-    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);
-
-    // We must not declare the original words to the query enhancer
-    // *but* we need to push them in the automatons list first
-    let mut original_automatons = Vec::new();
-    let mut original_words = query_words.iter().peekable();
-    while let Some(word) = original_words.next() {
-        let has_following_word = original_words.peek().is_some();
-        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);
-
-        let automaton = if not_prefix_dfa {
-            Automaton::exact(automaton_index, 1, word)
-        } else {
-            Automaton::prefix_exact(automaton_index, 1, word)
-        };
-        automaton_index += 1;
-        original_automatons.push(automaton);
-    }
-
-    automatons.push(AutomatonGroup::normal(original_automatons));
-
-    for n in 1..=NGRAMS {
-        let mut ngrams = query_words.windows(n).enumerate().peekable();
-        while let Some((query_index, ngram_slice)) = ngrams.next() {
-            let query_range = query_index..query_index + n;
-            let ngram_nb_words = ngram_slice.len();
-            let ngram = ngram_slice.join(" ");
-
-            let has_following_word = ngrams.peek().is_some();
-            let not_prefix_dfa =
-                has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);
-
-            // automaton of synonyms of the ngrams
-            let normalized = normalize_str(&ngram);
-            let lev = if not_prefix_dfa {
-                build_dfa(&normalized)
-            } else {
-                build_prefix_dfa(&normalized)
-            };
-
-            let mut stream = synonyms.search(&lev).into_stream();
-            while let Some(base) = stream.next() {
-                // only trigger alternatives when the last word has been typed,
-                // i.e. "new " does not trigger but "new yo" triggers alternatives to "new york"
-                let base = std::str::from_utf8(base).unwrap();
-                let base_nb_words = split_query_string(base).count();
-                if ngram_nb_words != base_nb_words {
-                    continue;
-                }
-
-                if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
-                    let mut stream = synonyms.into_stream();
-                    while let Some(synonyms) = stream.next() {
-                        let synonyms = std::str::from_utf8(synonyms).unwrap();
-                        let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
-                        let nb_synonym_words = synonyms_words.len();
-
-                        let real_query_index = automaton_index;
-                        enhancer_builder.declare(
-                            query_range.clone(),
-                            real_query_index,
-                            &synonyms_words,
-                        );
-
-                        for synonym in synonyms_words {
-                            let automaton = if nb_synonym_words == 1 {
-                                Automaton::exact(automaton_index, n, synonym)
-                            } else {
-                                Automaton::non_exact(automaton_index, n, synonym)
-                            };
-                            automaton_index += 1;
-                            automatons.push(AutomatonGroup::normal(vec![automaton]));
-                        }
-                    }
-                }
-            }
-
-            if n == 1 {
-                if let Some((left, right)) =
-                    split_best_frequency(reader, &normalized, postings_lists_store)?
-                {
-                    let a = Automaton::exact(automaton_index, 1, left);
-                    enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
-                    automaton_index += 1;
-
-                    let b = Automaton::exact(automaton_index, 1, right);
-                    enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
-                    automaton_index += 1;
-
-                    automatons.push(AutomatonGroup::phrase_query(vec![a, b]));
-                }
-            } else {
-                // automaton of concatenation of query words
-                let concat = ngram_slice.concat();
-                let normalized = normalize_str(&concat);
-
-                let real_query_index = automaton_index;
-                enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);
-
-                let automaton = Automaton::exact(automaton_index, n, &normalized);
-                automaton_index += 1;
-                automatons.push(AutomatonGroup::normal(vec![automaton]));
-            }
-        }
-    }
-
-    // order the automatons, the most important first;
-    // we keep the original automatons at the front.
-    automatons[1..].sort_by_key(|group| {
-        let a = group.automatons.first().unwrap();
-        (
-            Reverse(a.is_exact),
-            a.ngram,
-            Reverse(group.automatons.len()),
-        )
-    });
-
-    Ok((automatons, enhancer_builder.build()))
-}
@@ -58,6 +58,7 @@ where
type Origin = usize;
type RealLength = usize;

+#[derive(Debug)]
struct FakeIntervalTree {
    intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}
@@ -142,67 +143,80 @@ impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
        // we need to pad real query indices
        let real_range = real..real + replacement.len().max(range.len());
        let real_length = replacement.len();
-        self.real_to_origin
-            .push((real_range, (range.start, real_length)));
+        self.real_to_origin.push((real_range, (range.start, real_length)));
    }

    pub fn build(self) -> QueryEnhancer {
-        QueryEnhancer {
-            origins: self.origins,
-            real_to_origin: FakeIntervalTree::new(self.real_to_origin),
+        let interval_tree = FakeIntervalTree::new(self.real_to_origin);
+        let mut table = Vec::new();
+
+        for real in 0.. {
+            match replacement(&self.origins, &interval_tree, real) {
+                Some(range) => table.push(range),
+                None => break,
+            }
        }
+
+        QueryEnhancer { table }
    }
}

+/// Returns the query indices that represent this real query index.
+fn replacement(
+    origins: &[usize],
+    real_to_origin: &FakeIntervalTree,
+    real: u32,
+) -> Option<Range<u32>>
+{
+    let real = real as usize;
+
+    // query the fake interval tree with the real query index
+    let (range, (origin, real_length)) = real_to_origin.query(real)?;
+
+    // if `real` is the end bound of the range
+    if (range.start + real_length - 1) == real {
+        let mut count = range.len();
+        let mut new_origin = origin;
+        for (i, slice) in origins[new_origin..].windows(2).enumerate() {
+            let len = slice[1] - slice[0];
+            count = count.saturating_sub(len);
+            if count == 0 {
+                new_origin = origin + i;
+                break;
+            }
+        }
+
+        let n = real - range.start;
+        let start = origins[origin];
+        let end = origins.get(new_origin + 1)?;
+        let remaining = (end - start) - n;
+
+        Some(Range {
+            start: (start + n) as u32,
+            end: (start + n + remaining) as u32,
+        })
+    } else {
+        // just return the origin along with
+        // the real position of the word
+        let n = real as usize - range.start;
+        let origin = origins[origin];
+
+        Some(Range {
+            start: (origin + n) as u32,
+            end: (origin + n + 1) as u32,
+        })
+    }
+}
+
+#[derive(Debug)]
pub struct QueryEnhancer {
-    origins: Vec<usize>,
-    real_to_origin: FakeIntervalTree,
+    table: Vec<Range<u32>>,
}

impl QueryEnhancer {
-    /// Returns the query indices to use to replace this real query index.
+    /// Returns the query indices that represent this real query index.
    pub fn replacement(&self, real: u32) -> Range<u32> {
-        let real = real as usize;
-
-        // query the fake interval tree with the real query index
-        let (range, (origin, real_length)) = self
-            .real_to_origin
-            .query(real)
-            .expect("real has never been declared");
-
-        // if `real` is the end bound of the range
-        if (range.start + real_length - 1) == real {
-            let mut count = range.len();
-            let mut new_origin = origin;
-            for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
-                let len = slice[1] - slice[0];
-                count = count.saturating_sub(len);
-                if count == 0 {
-                    new_origin = origin + i;
-                    break;
-                }
-            }
-
-            let n = real - range.start;
-            let start = self.origins[origin];
-            let end = self.origins[new_origin + 1];
-            let remaining = (end - start) - n;
-
-            Range {
-                start: (start + n) as u32,
-                end: (start + n + remaining) as u32,
-            }
-        } else {
-            // just return the origin along with
-            // the real position of the word
-            let n = real as usize - range.start;
-            let origin = self.origins[origin];
-
-            Range {
-                start: (origin + n) as u32,
-                end: (origin + n + 1) as u32,
-            }
-        }
+        self.table[real as usize].clone()
    }
}
711 meilisearch-core/src/bucket_sort.rs Normal file
@@ -0,0 +1,711 @@
use std::ops::Deref;
use std::{cmp, fmt};
use std::borrow::Cow;
use std::mem;
use std::ops::Range;
use std::rc::Rc;
use std::time::{Duration, Instant};

use compact_arena::{SmallArena, Idx32, mk_arena};
use fst::{IntoStreamer, Streamer};
use hashbrown::HashMap;
use levenshtein_automata::DFA;
use log::debug;
use meilisearch_tokenizer::{is_cjk, split_query_string};
use meilisearch_types::DocIndex;
use sdset::{Set, SetBuf};
use slice_group_by::{GroupBy, GroupByMut};

use crate::automaton::NGRAMS;
use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
use crate::automaton::normalize_str;
use crate::automaton::{QueryEnhancer, QueryEnhancerBuilder};

use crate::criterion::{Criteria, Context, ContextMut};
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
use crate::raw_document::RawDocument;
use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
use crate::{store, Document, DocumentId, MResult};
pub fn bucket_sort<'c, FI>(
    reader: &heed::RoTxn<MainT>,
    query: &str,
    range: Range<usize>,
    filter: Option<FI>,
    criteria: Criteria<'c>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
    FI: Fn(DocumentId) -> bool,
{
    // We delegate the filter work to the distinct query builder,
    // specifying a distinct rule that has no effect.
    if filter.is_some() {
        let distinct = |_| None;
        let distinct_size = 1;
        return bucket_sort_with_distinct(
            reader,
            query,
            range,
            filter,
            distinct,
            distinct_size,
            criteria,
            main_store,
            postings_lists_store,
            documents_fields_counts_store,
            synonyms_store,
        );
    }

    let (mut automatons, mut query_enhancer) =
        construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;

    debug!("{:?}", query_enhancer);

    let before_postings_lists_fetching = Instant::now();
    mk_arena!(arena);
    let mut bare_matches =
        fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
    debug!("bare matches ({}) retrieved in {:.02?}",
        bare_matches.len(),
        before_postings_lists_fetching.elapsed(),
    );

    let before_raw_documents_presort = Instant::now();
    bare_matches.sort_unstable_by_key(|sm| sm.document_id);
    debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());

    let before_raw_documents_building = Instant::now();
    let mut prefiltered_documents = 0;
    let mut raw_documents = Vec::new();
    for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
        prefiltered_documents += 1;
        if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) {
            raw_documents.push(raw_document);
        }
    }
    debug!("creating {} (original {}) candidates documents took {:.02?}",
        raw_documents.len(),
        prefiltered_documents,
        before_raw_documents_building.elapsed(),
    );

    let mut groups = vec![raw_documents.as_mut_slice()];

    'criteria: for criterion in criteria.as_ref() {
        let tmp_groups = mem::replace(&mut groups, Vec::new());
        let mut documents_seen = 0;

        for mut group in tmp_groups {
            let before_criterion_preparation = Instant::now();

            let ctx = ContextMut {
                postings_lists: &mut arena,
                query_enhancer: &mut query_enhancer,
                automatons: &mut automatons,
            };

            criterion.prepare(ctx, &mut group);
            debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());

            let ctx = Context {
                postings_lists: &arena,
                query_enhancer: &query_enhancer,
                automatons: &automatons,
            };

            let before_criterion_sort = Instant::now();
            group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
            debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());

            for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
                debug!("{:?} produced a group of size {}", criterion.name(), group.len());

                documents_seen += group.len();
                groups.push(group);

                // we have sorted enough documents if the last document sorted is after
                // the end of the requested range; we can continue to the next criterion
                if documents_seen >= range.end {
                    continue 'criteria;
                }
            }
        }
    }

    let iter = raw_documents.into_iter().skip(range.start).take(range.len());
    let iter = iter.map(|rd| Document::from_raw(rd, &automatons, &arena));

    Ok(iter.collect())
}
pub fn bucket_sort_with_distinct<'c, FI, FD>(
    reader: &heed::RoTxn<MainT>,
    query: &str,
    range: Range<usize>,
    filter: Option<FI>,
    distinct: FD,
    distinct_size: usize,
    criteria: Criteria<'c>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
    FI: Fn(DocumentId) -> bool,
    FD: Fn(DocumentId) -> Option<u64>,
{
    let (mut automatons, mut query_enhancer) =
        construct_automatons(reader, query, main_store, postings_lists_store, synonyms_store)?;

    let before_postings_lists_fetching = Instant::now();
    mk_arena!(arena);
    let mut bare_matches = fetch_matches(reader, &automatons, &mut arena, main_store, postings_lists_store)?;
    debug!("bare matches ({}) retrieved in {:.02?}",
        bare_matches.len(),
        before_postings_lists_fetching.elapsed(),
    );

    let before_raw_documents_presort = Instant::now();
    bare_matches.sort_unstable_by_key(|sm| sm.document_id);
    debug!("sort by documents ids took {:.02?}", before_raw_documents_presort.elapsed());

    let before_raw_documents_building = Instant::now();
    let mut prefiltered_documents = 0;
    let mut raw_documents = Vec::new();
    for raw_matches in bare_matches.linear_group_by_key_mut(|sm| sm.document_id) {
        prefiltered_documents += 1;
        if let Some(raw_document) = RawDocument::new(raw_matches, &automatons, &mut arena) {
            raw_documents.push(raw_document);
        }
    }
    debug!("creating {} (original {}) candidates documents took {:.02?}",
        raw_documents.len(),
        prefiltered_documents,
        before_raw_documents_building.elapsed(),
    );

    let mut groups = vec![raw_documents.as_mut_slice()];
    let mut key_cache = HashMap::new();

    let mut filter_map = HashMap::new();
    // these two variables inform on the current distinct map and
    // on the raw offset of the start of the group where the
    // range.start bound is located according to the distinct function
    let mut distinct_map = DistinctMap::new(distinct_size);
    let mut distinct_raw_offset = 0;

    'criteria: for criterion in criteria.as_ref() {
        let tmp_groups = mem::replace(&mut groups, Vec::new());
        let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
        let mut documents_seen = 0;

        for mut group in tmp_groups {
            // if this group does not overlap with the requested range,
            // push it without sorting and splitting it
            if documents_seen + group.len() < distinct_raw_offset {
                documents_seen += group.len();
                groups.push(group);
                continue;
            }

            let ctx = ContextMut {
                postings_lists: &mut arena,
                query_enhancer: &mut query_enhancer,
                automatons: &mut automatons,
            };

            let before_criterion_preparation = Instant::now();
            criterion.prepare(ctx, &mut group);
            debug!("{:?} preparation took {:.02?}", criterion.name(), before_criterion_preparation.elapsed());

            let ctx = Context {
                postings_lists: &arena,
                query_enhancer: &query_enhancer,
                automatons: &automatons,
            };

            let before_criterion_sort = Instant::now();
            group.sort_unstable_by(|a, b| criterion.evaluate(&ctx, a, b));
            debug!("{:?} evaluation took {:.02?}", criterion.name(), before_criterion_sort.elapsed());

            for group in group.binary_group_by_mut(|a, b| criterion.eq(&ctx, a, b)) {
                // we must compute the real distinct length of this sub-group
                for document in group.iter() {
                    let filter_accepted = match &filter {
                        Some(filter) => {
                            let entry = filter_map.entry(document.id);
                            *entry.or_insert_with(|| (filter)(document.id))
                        }
                        None => true,
                    };

                    if filter_accepted {
                        let entry = key_cache.entry(document.id);
                        let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));

                        match key.clone() {
                            Some(key) => buf_distinct.register(key),
                            None => buf_distinct.register_without_key(),
                        };
                    }

                    // the requested range end is reached: stop computing distinct
                    if buf_distinct.len() >= range.end {
                        break;
                    }
                }

                documents_seen += group.len();
                groups.push(group);

                // if this sub-group does not overlap with the requested range
                // we must update the distinct map and its start index
                if buf_distinct.len() < range.start {
                    buf_distinct.transfert_to_internal();
                    distinct_raw_offset = documents_seen;
                }

                // we have sorted enough documents if the last document sorted is after
                // the end of the requested range; we can continue to the next criterion
                if buf_distinct.len() >= range.end {
                    continue 'criteria;
                }
            }
        }
    }

    // once we have classified the documents related to the current
    // automatons, we save that as the next valid result
    let mut seen = BufferedDistinctMap::new(&mut distinct_map);

    let mut documents = Vec::with_capacity(range.len());
    for raw_document in raw_documents.into_iter().skip(distinct_raw_offset) {
        let filter_accepted = match &filter {
            Some(_) => filter_map.remove(&raw_document.id).unwrap(),
            None => true,
        };

        if filter_accepted {
            let key = key_cache.remove(&raw_document.id).unwrap();
            let distinct_accepted = match key {
                Some(key) => seen.register(key),
                None => seen.register_without_key(),
            };

            if distinct_accepted && seen.len() > range.start {
                documents.push(Document::from_raw(raw_document, &automatons, &arena));
                if documents.len() == range.len() {
                    break;
                }
            }
        }
    }

    Ok(documents)
}
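// An illustration of the distinct rule used above (hypothetical helpers, not
// part of the commit): the distinct function maps a document id to an
// optional u64 key; a key that was already registered is rejected, while a
// `None` key is registered without deduplication.
//
//     let distinct = |id: DocumentId| color_of(id).map(hash_color);
//     // doc1 -> Some(0xB1)  kept
//     // doc2 -> Some(0xB1)  collapsed with doc1
//     // doc3 -> None        kept, never deduplicated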
pub struct BareMatch<'tag> {
    pub document_id: DocumentId,
    pub query_index: u16,
    pub distance: u8,
    pub is_exact: bool,
    pub postings_list: Idx32<'tag>,
}

impl fmt::Debug for BareMatch<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("BareMatch")
            .field("document_id", &self.document_id)
            .field("query_index", &self.query_index)
            .field("distance", &self.distance)
            .field("is_exact", &self.is_exact)
            .finish()
    }
}

// TODO remove that
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct SimpleMatch {
    pub query_index: u16,
    pub distance: u8,
    pub attribute: u16,
    pub word_index: u16,
    pub is_exact: bool,
}

#[derive(Clone)]
pub enum PostingsListView<'txn> {
    Original {
        input: Rc<[u8]>,
        postings_list: Rc<Cow<'txn, Set<DocIndex>>>,
        offset: usize,
        len: usize,
    },
    Rewritten {
        input: Rc<[u8]>,
        postings_list: SetBuf<DocIndex>,
    },
}

impl fmt::Debug for PostingsListView<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("PostingsListView")
            .field("input", &std::str::from_utf8(&self.input()).unwrap())
            .field("postings_list", &self.as_ref())
            .finish()
    }
}

impl<'txn> PostingsListView<'txn> {
    pub fn original(input: Rc<[u8]>, postings_list: Rc<Cow<'txn, Set<DocIndex>>>) -> PostingsListView<'txn> {
        let len = postings_list.len();
        PostingsListView::Original { input, postings_list, offset: 0, len }
    }

    pub fn rewritten(input: Rc<[u8]>, postings_list: SetBuf<DocIndex>) -> PostingsListView<'txn> {
        PostingsListView::Rewritten { input, postings_list }
    }

    pub fn rewrite_with(&mut self, postings_list: SetBuf<DocIndex>) {
        let input = match self {
            PostingsListView::Original { input, .. } => input.clone(),
            PostingsListView::Rewritten { input, .. } => input.clone(),
        };
        *self = PostingsListView::rewritten(input, postings_list);
    }

    pub fn len(&self) -> usize {
        match self {
            PostingsListView::Original { len, .. } => *len,
            PostingsListView::Rewritten { postings_list, .. } => postings_list.len(),
        }
    }

    pub fn input(&self) -> &[u8] {
        match self {
            PostingsListView::Original { ref input, .. } => input,
            PostingsListView::Rewritten { ref input, .. } => input,
        }
    }

    pub fn range(&self, range_offset: usize, range_len: usize) -> PostingsListView<'txn> {
        match self {
            PostingsListView::Original { input, postings_list, offset, len } => {
                assert!(range_offset + range_len <= *len);
                PostingsListView::Original {
                    input: input.clone(),
                    postings_list: postings_list.clone(),
                    offset: offset + range_offset,
                    len: range_len,
                }
            },
            PostingsListView::Rewritten { .. } => {
                panic!("Cannot create a range on a rewritten postings list view");
            }
        }
    }
}

impl AsRef<Set<DocIndex>> for PostingsListView<'_> {
    fn as_ref(&self) -> &Set<DocIndex> {
        self
    }
}

impl Deref for PostingsListView<'_> {
    type Target = Set<DocIndex>;

    fn deref(&self) -> &Set<DocIndex> {
        match *self {
            PostingsListView::Original { ref postings_list, offset, len, .. } => {
                Set::new_unchecked(&postings_list[offset..offset + len])
            },
            PostingsListView::Rewritten { ref postings_list, .. } => postings_list,
        }
    }
}
fn fetch_matches<'txn, 'tag>(
    reader: &'txn heed::RoTxn<MainT>,
    automatons: &[QueryWordAutomaton],
    arena: &mut SmallArena<'tag, PostingsListView<'txn>>,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
) -> MResult<Vec<BareMatch<'tag>>>
{
    let before_words_fst = Instant::now();
    let words = match main_store.words_fst(reader)? {
        Some(words) => words,
        None => return Ok(Vec::new()),
    };
    debug!("words fst took {:.02?}", before_words_fst.elapsed());

    let mut total_postings_lists = Vec::new();

    let mut dfa_time = Duration::default();
    let mut stream_next_time = Duration::default();
    let mut postings_lists_fetching_time = Duration::default();

    for (query_index, automaton) in automatons.iter().enumerate() {
        let before_dfa = Instant::now();
        let dfa = automaton.dfa();
        let QueryWordAutomaton { query, is_exact, .. } = automaton;
        dfa_time += before_dfa.elapsed();

        let mut number_of_words = 0;
        let mut stream = words.search(&dfa).into_stream();

        // while let Some(input) = stream.next() {
        loop {
            let before_stream_next = Instant::now();
            let input = match stream.next() {
                Some(input) => input,
                None => break,
            };
            stream_next_time += before_stream_next.elapsed();

            number_of_words += 1;

            let distance = dfa.eval(input).to_u8();
            let is_exact = *is_exact && distance == 0 && input.len() == query.len();

            let before_postings_lists_fetching = Instant::now();
            if let Some(postings_list) = postings_lists_store.postings_list(reader, input)? {

                let input = Rc::from(input);
                let postings_list = Rc::new(postings_list);
                let postings_list_view = PostingsListView::original(input, postings_list);

                let mut offset = 0;
                for group in postings_list_view.linear_group_by_key(|di| di.document_id) {

                    let posting_list_index = arena.add(postings_list_view.range(offset, group.len()));
                    let document_id = group[0].document_id;
                    let bare_match = BareMatch {
                        document_id,
                        query_index: query_index as u16,
                        distance,
                        is_exact,
                        postings_list: posting_list_index,
                    };

                    total_postings_lists.push(bare_match);
                    offset += group.len();
                }
            }
            postings_lists_fetching_time += before_postings_lists_fetching.elapsed();
        }

        debug!("{:?} gives {} words", query, number_of_words);
    }

    debug!("stream next took {:.02?}", stream_next_time);
    debug!("postings lists fetching took {:.02?}", postings_lists_fetching_time);
    debug!("dfa creation took {:.02?}", dfa_time);

    Ok(total_postings_lists)
}
#[derive(Debug)]
pub struct QueryWordAutomaton {
    pub query: String,
    /// Is it a word that must be considered exact
    /// or is it some derived word (i.e. a synonym)
    pub is_exact: bool,
    pub is_prefix: bool,
    /// If it is a phrase query, the index
    /// and the length of the phrase
    pub phrase_query: Option<(u16, u16)>,
}

impl QueryWordAutomaton {
    pub fn exact(query: &str) -> QueryWordAutomaton {
        QueryWordAutomaton {
            query: query.to_string(),
            is_exact: true,
            is_prefix: false,
            phrase_query: None,
        }
    }

    pub fn exact_prefix(query: &str) -> QueryWordAutomaton {
        QueryWordAutomaton {
            query: query.to_string(),
            is_exact: true,
            is_prefix: true,
            phrase_query: None,
        }
    }

    pub fn non_exact(query: &str) -> QueryWordAutomaton {
        QueryWordAutomaton {
            query: query.to_string(),
            is_exact: false,
            is_prefix: false,
            phrase_query: None,
        }
    }

    pub fn dfa(&self) -> DFA {
        if self.phrase_query.is_some() {
            build_exact_dfa(&self.query)
        } else if self.is_prefix {
            build_prefix_dfa(&self.query)
        } else {
            build_dfa(&self.query)
        }
    }
}
|
||||
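/// Tries to split `word` in two at every character boundary and returns the
/// split whose less frequent half appears most often in the postings lists;
/// this recovers queries typed without a space (e.g. "newyork").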
fn split_best_frequency<'a>(
    reader: &heed::RoTxn<MainT>,
    word: &'a str,
    postings_lists_store: store::PostingsLists,
) -> MResult<Option<(&'a str, &'a str)>> {
    let chars = word.char_indices().skip(1);
    let mut best = None;

    for (i, _) in chars {
        let (left, right) = word.split_at(i);

        let left_freq = postings_lists_store
            .postings_list(reader, left.as_ref())?
            .map_or(0, |i| i.len());

        let right_freq = postings_lists_store
            .postings_list(reader, right.as_ref())?
            .map_or(0, |i| i.len());

        let min_freq = cmp::min(left_freq, right_freq);
        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
            best = Some((min_freq, left, right));
        }
    }

    Ok(best.map(|(_, l, r)| (l, r)))
}

fn construct_automatons(
    reader: &heed::RoTxn<MainT>,
    query: &str,
    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
    synonym_store: store::Synonyms,
) -> MResult<(Vec<QueryWordAutomaton>, QueryEnhancer)> {
    let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
    let query_words: Vec<_> = split_query_string(query).map(str::to_lowercase).collect();
    let synonyms = match main_store.synonyms_fst(reader)? {
        Some(synonym) => synonym,
        None => fst::Set::default(),
    };

    let mut automaton_index = 0;
    let mut automatons = Vec::new();
    let mut enhancer_builder = QueryEnhancerBuilder::new(&query_words);

    // We must not declare the original words to the query enhancer
    // *but* we need to push them in the automatons list first
    let mut original_words = query_words.iter().peekable();
    while let Some(word) = original_words.next() {
        let has_following_word = original_words.peek().is_some();
        let not_prefix_dfa = has_following_word || has_end_whitespace || word.chars().all(is_cjk);

        let automaton = if not_prefix_dfa {
            QueryWordAutomaton::exact(word)
        } else {
            QueryWordAutomaton::exact_prefix(word)
        };
        automaton_index += 1;
        automatons.push(automaton);
    }

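    // For every ngram of the query (1 to NGRAMS words), derive alternative
    // automatons: synonyms found in the synonyms FST, a best-frequency split
    // of single words, and the concatenation of multi-word ngrams; every
    // alternative is declared to the query enhancer for later index remapping.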
    for n in 1..=NGRAMS {
        let mut ngrams = query_words.windows(n).enumerate().peekable();
        while let Some((query_index, ngram_slice)) = ngrams.next() {
            let query_range = query_index..query_index + n;
            let ngram_nb_words = ngram_slice.len();
            let ngram = ngram_slice.join(" ");

            let has_following_word = ngrams.peek().is_some();
            let not_prefix_dfa =
                has_following_word || has_end_whitespace || ngram.chars().all(is_cjk);

            // automaton of synonyms of the ngrams
            let normalized = normalize_str(&ngram);
            let lev = if not_prefix_dfa {
                build_dfa(&normalized)
            } else {
                build_prefix_dfa(&normalized)
            };

            let mut stream = synonyms.search(&lev).into_stream();
            while let Some(base) = stream.next() {
                // only trigger alternatives when the last word has been typed,
                // i.e. "new " does not trigger them but "new yo" triggers alternatives to "new york"
                let base = std::str::from_utf8(base).unwrap();
                let base_nb_words = split_query_string(base).count();
                if ngram_nb_words != base_nb_words {
                    continue;
                }

                if let Some(synonyms) = synonym_store.synonyms(reader, base.as_bytes())? {
                    let mut stream = synonyms.into_stream();
                    while let Some(synonyms) = stream.next() {
                        let synonyms = std::str::from_utf8(synonyms).unwrap();
                        let synonyms_words: Vec<_> = split_query_string(synonyms).collect();
                        let nb_synonym_words = synonyms_words.len();

                        let real_query_index = automaton_index;
                        enhancer_builder.declare(query_range.clone(), real_query_index, &synonyms_words);

                        for synonym in synonyms_words {
                            let automaton = if nb_synonym_words == 1 {
                                QueryWordAutomaton::exact(synonym)
                            } else {
                                QueryWordAutomaton::non_exact(synonym)
                            };
                            automaton_index += 1;
                            automatons.push(automaton);
                        }
                    }
                }
            }

            if n == 1 {
                // automatons for split words
                if let Some((left, right)) = split_best_frequency(reader, &normalized, postings_lists_store)? {
                    let mut left_automaton = QueryWordAutomaton::exact(left);
                    left_automaton.phrase_query = Some((0, 2));
                    enhancer_builder.declare(query_range.clone(), automaton_index, &[left]);
                    automaton_index += 1;
                    automatons.push(left_automaton);

                    let mut right_automaton = QueryWordAutomaton::exact(right);
                    right_automaton.phrase_query = Some((1, 2));
                    enhancer_builder.declare(query_range.clone(), automaton_index, &[right]);
                    automaton_index += 1;
                    automatons.push(right_automaton);
                }
            } else {
                // automaton of concatenation of query words
                let concat = ngram_slice.concat();
                let normalized = normalize_str(&concat);

                let real_query_index = automaton_index;
                enhancer_builder.declare(query_range.clone(), real_query_index, &[&normalized]);

                let automaton = QueryWordAutomaton::exact(&normalized);
                automaton_index += 1;
                automatons.push(automaton);
            }
        }
    }

    Ok((automatons, enhancer_builder.build()))
}
35
meilisearch-core/src/criterion/attribute.rs
Normal file
@ -0,0 +1,35 @@
use std::cmp::Ordering;
use slice_group_by::GroupBy;
use crate::RawDocument;
use crate::bucket_sort::SimpleMatch;
use super::{Criterion, Context, ContextMut, prepare_raw_matches};

pub struct Attribute;

impl Criterion for Attribute {
    fn name(&self) -> &str { "attribute" }

    fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
        &self,
        ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
        documents: &mut [RawDocument<'r, 'tag>],
    ) {
        prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons);
    }

    fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
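        // Sum the attribute of the first match of each query word group:
        // the lower the sum, the more important the matched attributes are.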
        #[inline]
        fn sum_of_attribute(matches: &[SimpleMatch]) -> usize {
            let mut sum_of_attribute = 0;
            for group in matches.linear_group_by_key(|bm| bm.query_index) {
                sum_of_attribute += group[0].attribute as usize;
            }
            sum_of_attribute
        }

        let lhs = sum_of_attribute(&lhs.processed_matches);
        let rhs = sum_of_attribute(&rhs.processed_matches);

        lhs.cmp(&rhs)
    }
}
@ -1,16 +1,17 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use std::cmp::Ordering;
use compact_arena::SmallArena;
use crate::RawDocument;
use super::{Criterion, Context};

#[derive(Debug, Clone, Copy)]
pub struct DocumentId;

impl Criterion for DocumentId {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        lhs.id.cmp(&rhs.id)
    }
    fn name(&self) -> &str { "stable document id" }

    fn name(&self) -> &str {
        "DocumentId"
    fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = &lhs.id;
        let rhs = &rhs.id;

        lhs.cmp(rhs)
    }
}

@ -1,132 +1,35 @@
use std::cmp::Ordering;

use meilisearch_schema::SchemaAttr;
use sdset::Set;
use std::cmp::{Ordering, Reverse};
use slice_group_by::GroupBy;

use crate::criterion::Criterion;
use crate::RawDocument;
use crate::bucket_sort::BareMatch;
use super::{Criterion, Context, ContextMut};

#[inline]
fn number_exact_matches(
    query_index: &[u32],
    attribute: &[u16],
    is_exact: &[bool],
    fields_counts: &Set<(SchemaAttr, u64)>,
) -> usize {
    let mut count = 0;
    let mut index = 0;

    for group in query_index.linear_group() {
        let len = group.len();

        let mut found_exact = false;
        for (pos, is_exact) in is_exact[index..index + len].iter().enumerate() {
            if *is_exact {
                found_exact = true;
                let attr = &attribute[index + pos];
                if let Ok(pos) = fields_counts.binary_search_by_key(attr, |(a, _)| a.0) {
                    let (_, count) = fields_counts[pos];
                    if count == 1 {
                        return usize::max_value();
                    }
                }
            }
        }

        count += found_exact as usize;
        index += len;
    }

    count
}

#[derive(Debug, Clone, Copy)]
pub struct Exact;

impl Criterion for Exact {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let is_exact = lhs.is_exact();
            let attribute = lhs.attribute();
            let fields_counts = &lhs.fields_counts;
    fn name(&self) -> &str { "exact" }

            number_exact_matches(query_index, attribute, is_exact, fields_counts)
        };
    fn prepare(&self, _ctx: ContextMut, documents: &mut [RawDocument]) {
        for document in documents {
            document.raw_matches.sort_unstable_by_key(|bm| (bm.query_index, Reverse(bm.is_exact)));
        }
    }

        let rhs = {
            let query_index = rhs.query_index();
            let is_exact = rhs.is_exact();
            let attribute = rhs.attribute();
            let fields_counts = &rhs.fields_counts;
    fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        #[inline]
        fn sum_exact_query_words(matches: &[BareMatch]) -> usize {
            let mut sum_exact_query_words = 0;

            number_exact_matches(query_index, attribute, is_exact, fields_counts)
        };
            for group in matches.linear_group_by_key(|bm| bm.query_index) {
                sum_exact_query_words += group[0].is_exact as usize;
            }

            sum_exact_query_words
        }

        let lhs = sum_exact_query_words(&lhs.raw_matches);
        let rhs = sum_exact_query_words(&rhs.raw_matches);

        lhs.cmp(&rhs).reverse()
    }

    fn name(&self) -> &str {
        "Exact"
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // typing: "soulier"
    //
    // doc0: "Soulier bleu"
    // doc1: "souliereres rouge"
    #[test]
    fn easy_case() {
        let doc0 = {
            let query_index = &[0];
            let attribute = &[0];
            let is_exact = &[true];
            let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();

            number_exact_matches(query_index, attribute, is_exact, fields_counts)
        };

        let doc1 = {
            let query_index = &[0];
            let attribute = &[0];
            let is_exact = &[false];
            let fields_counts = Set::new(&[(SchemaAttr(0), 2)]).unwrap();

            number_exact_matches(query_index, attribute, is_exact, fields_counts)
        };

        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }

    // typing: "soulier"
    //
    // doc0: { 0. "soulier" }
    // doc1: { 0. "soulier bleu et blanc" }
    #[test]
    fn basic() {
        let doc0 = {
            let query_index = &[0];
            let attribute = &[0];
            let is_exact = &[true];
            let fields_counts = Set::new(&[(SchemaAttr(0), 1)]).unwrap();

            number_exact_matches(query_index, attribute, is_exact, fields_counts)
        };

        let doc1 = {
            let query_index = &[0];
            let attribute = &[0];
            let is_exact = &[true];
            let fields_counts = Set::new(&[(SchemaAttr(0), 4)]).unwrap();

            number_exact_matches(query_index, attribute, is_exact, fields_counts)
        };

        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }
}

@ -1,59 +1,71 @@
mod document_id;
mod exact;
mod number_of_words;
mod sort_by_attr;
mod sum_of_typos;
mod sum_of_words_attribute;
mod sum_of_words_position;
mod words_proximity;
use std::cmp::{self, Ordering};

use compact_arena::SmallArena;
use sdset::SetBuf;
use slice_group_by::GroupBy;

use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{SimpleMatch, PostingsListView, QueryWordAutomaton};
use crate::RawDocument;
use std::cmp::Ordering;

pub use self::{
    document_id::DocumentId, exact::Exact, number_of_words::NumberOfWords,
    sort_by_attr::SortByAttr, sum_of_typos::SumOfTypos,
    sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition,
    words_proximity::WordsProximity,
};
mod typo;
mod words;
mod proximity;
mod attribute;
mod words_position;
mod exact;
mod document_id;
mod sort_by_attr;

pub trait Criterion: Send + Sync {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering;
pub use self::typo::Typo;
pub use self::words::Words;
pub use self::proximity::Proximity;
pub use self::attribute::Attribute;
pub use self::words_position::WordsPosition;
pub use self::exact::Exact;
pub use self::document_id::DocumentId;
pub use self::sort_by_attr::SortByAttr;

pub trait Criterion {
    fn name(&self) -> &str;

    fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
        &self,
        ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
        documents: &mut [RawDocument<'r, 'tag>],
    ) {
        /* ... */
    }

    fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>(
        &self,
        ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
        lhs: &RawDocument<'r, 'tag>,
        rhs: &RawDocument<'r, 'tag>,
    ) -> Ordering;

    #[inline]
    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
        self.evaluate(lhs, rhs) == Ordering::Equal
    fn eq<'p, 'tag, 'txn, 'q, 'a, 'r>(
        &self,
        ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
        lhs: &RawDocument<'r, 'tag>,
        rhs: &RawDocument<'r, 'tag>,
    ) -> bool
    {
        self.evaluate(ctx, lhs, rhs) == Ordering::Equal
    }
}

impl<'a, T: Criterion + ?Sized + Send + Sync> Criterion for &'a T {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        (**self).evaluate(lhs, rhs)
    }

    fn name(&self) -> &str {
        (**self).name()
    }

    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
        (**self).eq(lhs, rhs)
    }
pub struct ContextMut<'p, 'tag, 'txn, 'q, 'a> {
    pub postings_lists: &'p mut SmallArena<'tag, PostingsListView<'txn>>,
    pub query_enhancer: &'q mut QueryEnhancer,
    pub automatons: &'a mut [QueryWordAutomaton],
}

impl<T: Criterion + ?Sized> Criterion for Box<T> {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        (**self).evaluate(lhs, rhs)
    }

    fn name(&self) -> &str {
        (**self).name()
    }

    fn eq(&self, lhs: &RawDocument, rhs: &RawDocument) -> bool {
        (**self).eq(lhs, rhs)
    }
pub struct Context<'p, 'tag, 'txn, 'q, 'a> {
    pub postings_lists: &'p SmallArena<'tag, PostingsListView<'txn>>,
    pub query_enhancer: &'q QueryEnhancer,
    pub automatons: &'a [QueryWordAutomaton],
}

#[derive(Default)]
@ -103,11 +115,11 @@ pub struct Criteria<'a> {
impl<'a> Default for Criteria<'a> {
    fn default() -> Self {
        CriteriaBuilder::with_capacity(7)
            .add(SumOfTypos)
            .add(NumberOfWords)
            .add(WordsProximity)
            .add(SumOfWordsAttribute)
            .add(SumOfWordsPosition)
            .add(Typo)
            .add(Words)
            .add(Proximity)
            .add(Attribute)
            .add(WordsPosition)
            .add(Exact)
            .add(DocumentId)
            .build()
@ -119,3 +131,165 @@ impl<'a> AsRef<[Box<dyn Criterion + 'a>]> for Criteria<'a> {
        &self.inner
    }
}

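// Computes, for each document, the best (smallest) typo distance of every
// query word, expanding query indices through the query enhancer; the Typo
// and Words criteria then read these processed distances.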
fn prepare_query_distances<'a, 'tag, 'txn>(
    documents: &mut [RawDocument<'a, 'tag>],
    query_enhancer: &QueryEnhancer,
    automatons: &[QueryWordAutomaton],
    postings_lists: &SmallArena<'tag, PostingsListView<'txn>>,
) {
    for document in documents {
        if !document.processed_distances.is_empty() { continue }

        let mut processed = Vec::new();
        for m in document.raw_matches.iter() {
            if postings_lists[m.postings_list].is_empty() { continue }

            let range = query_enhancer.replacement(m.query_index as u32);
            let new_len = cmp::max(range.end as usize, processed.len());
            processed.resize(new_len, None);

            for index in range {
                let index = index as usize;
                processed[index] = match processed[index] {
                    Some(distance) if distance > m.distance => Some(m.distance),
                    Some(distance) => Some(distance),
                    None => Some(m.distance),
                };
            }
        }

        document.processed_distances = processed;
    }
}

fn prepare_raw_matches<'a, 'tag, 'txn>(
    documents: &mut [RawDocument<'a, 'tag>],
    postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
    query_enhancer: &QueryEnhancer,
    automatons: &[QueryWordAutomaton],
) {
    for document in documents {
        if !document.processed_matches.is_empty() { continue }

        let mut processed = Vec::new();
        for m in document.raw_matches.iter() {
            let postings_list = &postings_lists[m.postings_list];
            processed.reserve(postings_list.len());
            for di in postings_list.as_ref() {
                let simple_match = SimpleMatch {
                    query_index: m.query_index,
                    distance: m.distance,
                    attribute: di.attribute,
                    word_index: di.word_index,
                    is_exact: m.is_exact,
                };
                processed.push(simple_match);
            }
        }

        let processed = multiword_rewrite_matches(&mut processed, query_enhancer, automatons);
        document.processed_matches = processed.into_vec();
    }
}

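// Pads word indices so that every word of a multi-word replacement (synonym,
// split or concatenation) gets its own position, keeping the proximity and
// words-position criteria meaningful after query enhancement.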
fn multiword_rewrite_matches(
    matches: &mut [SimpleMatch],
    query_enhancer: &QueryEnhancer,
    automatons: &[QueryWordAutomaton],
) -> SetBuf<SimpleMatch>
{
    matches.sort_unstable_by_key(|m| (m.attribute, m.word_index));

    let mut padded_matches = Vec::with_capacity(matches.len());

    // let before_padding = Instant::now();
    // for each attribute of each document
    for same_document_attribute in matches.linear_group_by_key(|m| m.attribute) {
        // padding will only be applied
        // to word indices in the same attribute
        let mut padding = 0;
        let mut iter = same_document_attribute.linear_group_by_key(|m| m.word_index);

        // for each match at the same position
        // in this document attribute
        while let Some(same_word_index) = iter.next() {
            // find the biggest padding
            let mut biggest = 0;
            for match_ in same_word_index {
                let mut replacement = query_enhancer.replacement(match_.query_index as u32);
                let replacement_len = replacement.len();
                let nexts = iter.remainder().linear_group_by_key(|m| m.word_index);

                if let Some(query_index) = replacement.next() {
                    let word_index = match_.word_index + padding as u16;
                    let query_index = query_index as u16;
                    let match_ = SimpleMatch { query_index, word_index, ..*match_ };
                    padded_matches.push(match_);
                }

                let mut found = false;

                // look ahead and if there already is a match
                // corresponding to this padding word, abort the padding
                'padding: for (x, next_group) in nexts.enumerate() {
                    for (i, query_index) in replacement.clone().enumerate().skip(x) {
                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                        let query_index = query_index as u16;
                        let padmatch = SimpleMatch { query_index, word_index, ..*match_ };

                        for nmatch_ in next_group {
                            let mut rep = query_enhancer.replacement(nmatch_.query_index as u32);
                            let query_index = rep.next().unwrap() as u16;
                            if query_index == padmatch.query_index {
                                if !found {
                                    // if we find a corresponding padding for the
                                    // first time we must push preceding paddings
                                    for (i, query_index) in replacement.clone().enumerate().take(i)
                                    {
                                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                                        let query_index = query_index as u16;
                                        let match_ = SimpleMatch { query_index, word_index, ..*match_ };
                                        padded_matches.push(match_);
                                        biggest = biggest.max(i + 1);
                                    }
                                }

                                padded_matches.push(padmatch);
                                found = true;
                                continue 'padding;
                            }
                        }
                    }

                    // if we do not find a corresponding padding in the
                    // next groups, stop here and pad what was found
                    break;
                }

                if !found {
                    // if no padding was found in the following matches
                    // we must insert the entire padding
                    for (i, query_index) in replacement.enumerate() {
                        let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
                        let query_index = query_index as u16;
                        let match_ = SimpleMatch { query_index, word_index, ..*match_ };
                        padded_matches.push(match_);
                    }

                    biggest = biggest.max(replacement_len - 1);
                }
            }

            padding += biggest;
        }
    }

    // debug!("padding matches took {:.02?}", before_padding.elapsed());

    // With this check we can see that the loop above takes something
    // like 43% of the search time even when no rewrite is needed.
    // assert_eq!(before_matches, padded_matches);

    SetBuf::from_dirty(padded_matches)
}

@ -1,31 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;

#[inline]
fn number_of_query_words(query_index: &[u32]) -> usize {
    query_index.linear_group().count()
}

#[derive(Debug, Clone, Copy)]
pub struct NumberOfWords;

impl Criterion for NumberOfWords {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            number_of_query_words(query_index)
        };
        let rhs = {
            let query_index = rhs.query_index();
            number_of_query_words(query_index)
        };

        lhs.cmp(&rhs).reverse()
    }

    fn name(&self) -> &str {
        "NumberOfWords"
    }
}
66
meilisearch-core/src/criterion/proximity.rs
Normal file
@ -0,0 +1,66 @@
use std::cmp::{self, Ordering};
use slice_group_by::GroupBy;
use crate::bucket_sort::SimpleMatch;
use crate::RawDocument;
use super::{Criterion, Context, ContextMut, prepare_raw_matches};

const MAX_DISTANCE: u16 = 8;

pub struct Proximity;

impl Criterion for Proximity {
    fn name(&self) -> &str { "proximity" }

    fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
        &self,
        ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
        documents: &mut [RawDocument<'r, 'tag>],
    ) {
        prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons);
    }

    fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
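        // Cost between two word indices, capped at MAX_DISTANCE; a reversed
        // order costs one more so that in-order matches are preferred.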
        fn index_proximity(lhs: u16, rhs: u16) -> u16 {
            if lhs < rhs {
                cmp::min(rhs - lhs, MAX_DISTANCE)
            } else {
                cmp::min(lhs - rhs, MAX_DISTANCE) + 1
            }
        }

        fn attribute_proximity(lhs: SimpleMatch, rhs: SimpleMatch) -> u16 {
            if lhs.attribute != rhs.attribute { MAX_DISTANCE }
            else { index_proximity(lhs.word_index, rhs.word_index) }
        }

        fn min_proximity(lhs: &[SimpleMatch], rhs: &[SimpleMatch]) -> u16 {
            let mut min_prox = u16::max_value();
            for a in lhs {
                for b in rhs {
                    let prox = attribute_proximity(*a, *b);
                    min_prox = cmp::min(min_prox, prox);
                }
            }
            min_prox
        }

        fn matches_proximity(matches: &[SimpleMatch]) -> u16 {
            let mut proximity = 0;
            let mut iter = matches.linear_group_by_key(|m| m.query_index);

            // iterate over groups by windows of size 2
            let mut last = iter.next();
            while let (Some(lhs), Some(rhs)) = (last, iter.next()) {
                proximity += min_proximity(lhs, rhs);
                last = Some(rhs);
            }

            proximity
        }

        let lhs = matches_proximity(&lhs.processed_matches);
        let rhs = matches_proximity(&rhs.processed_matches);

        lhs.cmp(&rhs)
    }
}
@ -1,10 +1,9 @@
use std::cmp::Ordering;
use std::error::Error;
use std::fmt;

use crate::criterion::Criterion;
use crate::{RankedMap, RawDocument};
use meilisearch_schema::{Schema, SchemaAttr};
use crate::{RankedMap, RawDocument};
use super::{Criterion, Context};

/// A helper struct that permits sorting documents by
/// some of their stored attributes.
@ -28,11 +27,11 @@ use meilisearch_schema::{Schema, SchemaAttr};
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
///
/// let builder = CriteriaBuilder::with_capacity(8)
///     .add(SumOfTypos)
///     .add(NumberOfWords)
///     .add(WordsProximity)
///     .add(SumOfWordsAttribute)
///     .add(SumOfWordsPosition)
///     .add(Typo)
///     .add(Words)
///     .add(Proximity)
///     .add(Attribute)
///     .add(WordsPosition)
///     .add(Exact)
///     .add(custom_ranking)
///     .add(DocumentId);
@ -86,8 +85,12 @@ impl<'a> SortByAttr<'a> {
    }
}

impl<'a> Criterion for SortByAttr<'a> {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
impl Criterion for SortByAttr<'_> {
    fn name(&self) -> &str {
        "sort by attribute"
    }

    fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = self.ranked_map.get(lhs.id, self.attr);
        let rhs = self.ranked_map.get(rhs.id, self.attr);

@ -105,10 +108,6 @@ impl<'a> Criterion for SortByAttr<'a> {
            (None, None) => Ordering::Equal,
        }
    }

    fn name(&self) -> &str {
        "SortByAttr"
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]

@ -1,116 +0,0 @@
use std::cmp::Ordering;

use slice_group_by::GroupBy;

use crate::criterion::Criterion;
use crate::RawDocument;

// This function is a rough approximation of the log10 function.
// It is safe to panic on input numbers higher than 3,
// as the number of typos is never bigger than that.
#[inline]
fn custom_log10(n: u8) -> f32 {
    match n {
        0 => 0.0, // log(1)
        1 => 0.30102, // log(2)
        2 => 0.47712, // log(3)
        3 => 0.60205, // log(4)
        _ => panic!("invalid number"),
    }
}

#[inline]
fn sum_matches_typos(query_index: &[u32], distance: &[u8]) -> usize {
    let mut number_words: usize = 0;
    let mut sum_typos = 0.0;
    let mut index = 0;

    for group in query_index.linear_group() {
        sum_typos += custom_log10(distance[index]);
        number_words += 1;
        index += group.len();
    }

    (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfTypos;

impl Criterion for SumOfTypos {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let distance = lhs.distance();
            sum_matches_typos(query_index, distance)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let distance = rhs.distance();
            sum_matches_typos(query_index, distance)
        };

        lhs.cmp(&rhs).reverse()
    }

    fn name(&self) -> &str {
        "SumOfTypos"
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // typing: "Geox CEO"
    //
    // doc0: "Geox SpA: CEO and Executive"
    // doc1: "Mt. Gox CEO Resigns From Bitcoin Foundation"
    #[test]
    fn one_typo_reference() {
        let query_index0 = &[0, 1];
        let distance0 = &[0, 0];

        let query_index1 = &[0, 1];
        let distance1 = &[1, 0];

        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }

    // typing: "bouton manchette"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn no_typo() {
        let query_index0 = &[0, 1];
        let distance0 = &[0, 0];

        let query_index1 = &[0];
        let distance1 = &[0];

        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }

    // typing: "bouton manchztte"
    //
    // doc0: "bouton manchette"
    // doc1: "bouton"
    #[test]
    fn one_typo() {
        let query_index0 = &[0, 1];
        let distance0 = &[0, 1];

        let query_index1 = &[0];
        let distance1 = &[0];

        let doc0 = sum_matches_typos(query_index0, distance0);
        let doc1 = sum_matches_typos(query_index1, distance1);
        assert_eq!(doc0.cmp(&doc1).reverse(), Ordering::Less);
    }
}
@ -1,64 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;

#[inline]
fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
    let mut sum_attributes = 0;
    let mut index = 0;

    for group in query_index.linear_group() {
        sum_attributes += attribute[index] as usize;
        index += group.len();
    }

    sum_attributes
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsAttribute;

impl Criterion for SumOfWordsAttribute {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let attribute = lhs.attribute();
            sum_matches_attributes(query_index, attribute)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let attribute = rhs.attribute();
            sum_matches_attributes(query_index, attribute)
        };

        lhs.cmp(&rhs)
    }

    fn name(&self) -> &str {
        "SumOfWordsAttribute"
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // typing: "soulier"
    //
    // doc0: { 0. "Soulier bleu", 1. "bla bla bla" }
    // doc1: { 0. "Botte rouge", 1. "Soulier en cuir" }
    #[test]
    fn title_vs_description() {
        let query_index0 = &[0];
        let attribute0 = &[0];

        let query_index1 = &[0];
        let attribute1 = &[1];

        let doc0 = sum_matches_attributes(query_index0, attribute0);
        let doc1 = sum_matches_attributes(query_index1, attribute1);
        assert_eq!(doc0.cmp(&doc1), Ordering::Less);
    }
}
@ -1,64 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::Ordering;

#[inline]
fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
    let mut sum_word_index = 0;
    let mut index = 0;

    for group in query_index.linear_group() {
        sum_word_index += word_index[index] as usize;
        index += group.len();
    }

    sum_word_index
}

#[derive(Debug, Clone, Copy)]
pub struct SumOfWordsPosition;

impl Criterion for SumOfWordsPosition {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let word_index = lhs.word_index();
            sum_matches_attribute_index(query_index, word_index)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let word_index = rhs.word_index();
            sum_matches_attribute_index(query_index, word_index)
        };

        lhs.cmp(&rhs)
    }

    fn name(&self) -> &str {
        "SumOfWordsPosition"
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // typing: "soulier"
    //
    // doc0: "Soulier bleu"
    // doc1: "Botte rouge et soulier noir"
    #[test]
    fn easy_case() {
        let query_index0 = &[0];
        let word_index0 = &[0];

        let query_index1 = &[0];
        let word_index1 = &[3];

        let doc0 = sum_matches_attribute_index(query_index0, word_index0);
        let doc1 = sum_matches_attribute_index(query_index1, word_index1);
        assert_eq!(doc0.cmp(&doc1), Ordering::Less);
    }
}
59
meilisearch-core/src/criterion/typo.rs
Normal file
@ -0,0 +1,59 @@
use std::cmp::Ordering;

use compact_arena::SmallArena;

use crate::automaton::QueryEnhancer;
use crate::bucket_sort::{PostingsListView, QueryWordAutomaton};
use crate::RawDocument;

use super::{Criterion, Context, ContextMut, prepare_query_distances};

pub struct Typo;

impl Criterion for Typo {
    fn name(&self) -> &str { "typo" }

    fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
        &self,
        ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
        documents: &mut [RawDocument<'r, 'tag>],
    ) {
        prepare_query_distances(documents, ctx.query_enhancer, ctx.automatons, ctx.postings_lists);
    }

    fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        // This function is a rough approximation of the log10 function.
        // It is safe to panic on input numbers higher than 3,
        // as the number of typos is never bigger than that.
        #[inline]
        fn custom_log10(n: u8) -> f32 {
            match n {
                0 => 0.0, // log(1)
                1 => 0.30102, // log(2)
                2 => 0.47712, // log(3)
                3 => 0.60205, // log(4)
                _ => panic!("invalid number"),
            }
        }

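        // The score grows with the number of matched query words and shrinks
        // with the accumulated (log-scaled) typo distances; it is scaled by
        // 1000 and truncated so that close scores compare as equal.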
        #[inline]
        fn compute_typos(distances: &[Option<u8>]) -> usize {
            let mut number_words: usize = 0;
            let mut sum_typos = 0.0;

            for distance in distances {
                if let Some(distance) = distance {
                    sum_typos += custom_log10(*distance);
                    number_words += 1;
                }
            }

            (number_words as f32 / (sum_typos + 1.0) * 1000.0) as usize
        }

        let lhs = compute_typos(&lhs.processed_distances);
        let rhs = compute_typos(&rhs.processed_distances);

        lhs.cmp(&rhs).reverse()
    }
}
29
meilisearch-core/src/criterion/words.rs
Normal file
@ -0,0 +1,29 @@
use std::cmp::Ordering;
use crate::RawDocument;
use super::{Criterion, Context, ContextMut, prepare_query_distances};

pub struct Words;

impl Criterion for Words {
    fn name(&self) -> &str { "words" }

    fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
        &self,
        ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
        documents: &mut [RawDocument<'r, 'tag>],
    ) {
        prepare_query_distances(documents, ctx.query_enhancer, ctx.automatons, ctx.postings_lists);
    }

    fn evaluate(&self, _ctx: &Context, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        #[inline]
        fn number_of_query_words(distances: &[Option<u8>]) -> usize {
            distances.iter().cloned().filter(Option::is_some).count()
        }

        let lhs = number_of_query_words(&lhs.processed_distances);
        let rhs = number_of_query_words(&rhs.processed_distances);

        lhs.cmp(&rhs).reverse()
    }
}
43
meilisearch-core/src/criterion/words_position.rs
Normal file
@ -0,0 +1,43 @@
use std::cmp::Ordering;

use slice_group_by::GroupBy;

use crate::RawDocument;
use crate::bucket_sort::SimpleMatch;
use super::{Criterion, Context, ContextMut, prepare_raw_matches};

pub struct WordsPosition;

impl Criterion for WordsPosition {
    fn name(&self) -> &str { "words position" }

    fn prepare<'p, 'tag, 'txn, 'q, 'a, 'r>(
        &self,
        ctx: ContextMut<'p, 'tag, 'txn, 'q, 'a>,
        documents: &mut [RawDocument<'r, 'tag>],
    ) {
        prepare_raw_matches(documents, ctx.postings_lists, ctx.query_enhancer, ctx.automatons);
    }

    fn evaluate<'p, 'tag, 'txn, 'q, 'a, 'r>(
        &self,
        ctx: &Context<'p, 'tag, 'txn, 'q, 'a>,
        lhs: &RawDocument<'r, 'tag>,
        rhs: &RawDocument<'r, 'tag>,
    ) -> Ordering
    {
        #[inline]
        fn sum_words_position(matches: &[SimpleMatch]) -> usize {
            let mut sum_words_position = 0;
            for group in matches.linear_group_by_key(|bm| bm.query_index) {
                sum_words_position += group[0].word_index as usize;
            }
            sum_words_position
        }

        let lhs = sum_words_position(&lhs.processed_matches);
        let rhs = sum_words_position(&rhs.processed_matches);

        lhs.cmp(&rhs)
    }
}
@ -1,164 +0,0 @@
use crate::criterion::Criterion;
use crate::RawDocument;
use slice_group_by::GroupBy;
use std::cmp::{self, Ordering};

const MAX_DISTANCE: u16 = 8;

#[inline]
fn clone_tuple<T: Clone, U: Clone>((a, b): (&T, &U)) -> (T, U) {
    (a.clone(), b.clone())
}

fn index_proximity(lhs: u16, rhs: u16) -> u16 {
    if lhs < rhs {
        cmp::min(rhs - lhs, MAX_DISTANCE)
    } else {
        cmp::min(lhs - rhs, MAX_DISTANCE) + 1
    }
}

fn attribute_proximity((lattr, lwi): (u16, u16), (rattr, rwi): (u16, u16)) -> u16 {
    if lattr != rattr {
        return MAX_DISTANCE;
    }
    index_proximity(lwi, rwi)
}

fn min_proximity((lattr, lwi): (&[u16], &[u16]), (rattr, rwi): (&[u16], &[u16])) -> u16 {
    let mut min_prox = u16::max_value();

    for a in lattr.iter().zip(lwi) {
        for b in rattr.iter().zip(rwi) {
            let a = clone_tuple(a);
            let b = clone_tuple(b);
            min_prox = cmp::min(min_prox, attribute_proximity(a, b));
        }
    }

    min_prox
}

fn matches_proximity(
    query_index: &[u32],
    distance: &[u8],
    attribute: &[u16],
    word_index: &[u16],
) -> u16 {
    let mut query_index_groups = query_index.linear_group();
    let mut proximity = 0;
    let mut index = 0;

    let get_attr_wi = |index: usize, group_len: usize| {
        // retrieve the first distance group (with the lowest values)
        let len = distance[index..index + group_len]
            .linear_group()
            .next()
            .unwrap()
            .len();

        let rattr = &attribute[index..index + len];
        let rwi = &word_index[index..index + len];

        (rattr, rwi)
    };

    let mut last = query_index_groups.next().map(|group| {
        let attr_wi = get_attr_wi(index, group.len());
        index += group.len();
        attr_wi
    });

    // iter by windows of size 2
    while let (Some(lhs), Some(rhs)) = (last, query_index_groups.next()) {
        let attr_wi = get_attr_wi(index, rhs.len());
        proximity += min_proximity(lhs, attr_wi);
        last = Some(attr_wi);
        index += rhs.len();
    }

    proximity
}

#[derive(Debug, Clone, Copy)]
pub struct WordsProximity;

impl Criterion for WordsProximity {
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let lhs = {
            let query_index = lhs.query_index();
            let distance = lhs.distance();
            let attribute = lhs.attribute();
            let word_index = lhs.word_index();
            matches_proximity(query_index, distance, attribute, word_index)
        };

        let rhs = {
            let query_index = rhs.query_index();
            let distance = rhs.distance();
            let attribute = rhs.attribute();
            let word_index = rhs.word_index();
            matches_proximity(query_index, distance, attribute, word_index)
        };

        lhs.cmp(&rhs)
    }

    fn name(&self) -> &str {
        "WordsProximity"
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn three_different_attributes() {
        // "soup" "of the" "the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 0 }
        // { id: 2, attr: 1, attr_index: 1 }
        // { id: 2, attr: 2, attr_index: 0 }
        // { id: 3, attr: 3, attr_index: 1 }

        let query_index = &[0, 1, 2, 2, 3];
        let distance = &[0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 2, 3];
        let word_index = &[0, 0, 1, 0, 1];

        // soup -> of = 8
        // + of -> the = 1
        // + the -> day = 8 (not 1)
        assert_eq!(
            matches_proximity(query_index, distance, attribute, word_index),
            17
        );
    }

    #[test]
    fn two_different_attributes() {
        // "soup day" "soup of the day"
        //
        // { id: 0, attr: 0, attr_index: 0 }
        // { id: 0, attr: 1, attr_index: 0 }
        // { id: 1, attr: 1, attr_index: 1 }
        // { id: 2, attr: 1, attr_index: 2 }
        // { id: 3, attr: 0, attr_index: 1 }
        // { id: 3, attr: 1, attr_index: 3 }

        let query_index = &[0, 0, 1, 2, 3, 3];
        let distance = &[0, 0, 0, 0, 0, 0];
        let attribute = &[0, 1, 1, 1, 0, 1];
        let word_index = &[0, 0, 1, 2, 1, 3];

        // soup -> of = 1
        // + of -> the = 1
        // + the -> day = 1
        assert_eq!(
            matches_proximity(query_index, distance, attribute, word_index),
            3
        );
    }
}
@ -18,6 +18,9 @@ pub mod serde;
pub mod store;
mod update;

// TODO replace
mod bucket_sort;

pub use self::database::{BoxUpdateFn, Database, MainT, UpdateT};
pub use self::error::{Error, MResult};
pub use self::number::{Number, ParseNumberError};
@ -25,63 +28,48 @@ pub use self::ranked_map::RankedMap;
pub use self::raw_document::RawDocument;
pub use self::store::Index;
pub use self::update::{EnqueuedUpdateResult, ProcessedUpdateResult, UpdateStatus, UpdateType};
pub use meilisearch_types::{DocIndex, DocumentId, Highlight};
pub use meilisearch_types::{DocIndex, DocumentId, Highlight, AttrCount};

#[doc(hidden)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct TmpMatch {
    pub query_index: u32,
    pub distance: u8,
    pub attribute: u16,
    pub word_index: u16,
    pub is_exact: bool,
}
use compact_arena::SmallArena;
use crate::bucket_sort::{QueryWordAutomaton, PostingsListView};
use crate::levenshtein::prefix_damerau_levenshtein;

#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Document {
    pub id: DocumentId,
    pub highlights: Vec<Highlight>,

    #[cfg(test)]
    pub matches: Vec<TmpMatch>,
    // #[cfg(test)]
    // pub matches: Vec<TmpMatch>,
}

impl Document {
    #[cfg(not(test))]
    fn from_raw(raw: RawDocument) -> Document {
        Document {
            id: raw.id,
            highlights: raw.highlights,
        }
    }
    pub fn from_raw<'a, 'tag, 'txn>(
        raw_document: RawDocument<'a, 'tag>,
        automatons: &[QueryWordAutomaton],
        arena: &SmallArena<'tag, PostingsListView<'txn>>,
    ) -> Document
    {
        let highlights = raw_document.raw_matches.iter().flat_map(|sm| {
            let postings_list = &arena[sm.postings_list];
            let input = postings_list.input();
            let query = &automatons[sm.query_index as usize].query;
            postings_list.iter().map(move |m| {
                let covered_area = if query.len() > input.len() {
                    input.len()
                } else {
                    prefix_damerau_levenshtein(query.as_bytes(), input).1
                };

    #[cfg(test)]
    fn from_raw(raw: RawDocument) -> Document {
        let len = raw.query_index().len();
        let mut matches = Vec::with_capacity(len);
                Highlight {
                    attribute: m.attribute,
                    char_index: m.char_index,
                    char_length: covered_area as u16,
                }
            })
        }).collect();

        let query_index = raw.query_index();
        let distance = raw.distance();
        let attribute = raw.attribute();
        let word_index = raw.word_index();
        let is_exact = raw.is_exact();

        for i in 0..len {
            let match_ = TmpMatch {
                query_index: query_index[i],
                distance: distance[i],
                attribute: attribute[i],
                word_index: word_index[i],
                is_exact: is_exact[i],
            };
            matches.push(match_);
        }

        Document {
            id: raw.id,
            matches,
            highlights: raw.highlights,
        }
        Document { id: raw_document.id, highlights }
    }
}

@ -1,20 +1,9 @@
|
||||
use hashbrown::HashMap;
|
||||
use std::convert::TryFrom;
|
||||
use std::ops::Range;
|
||||
use std::rc::Rc;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{cmp, mem};
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use sdset::SetBuf;
|
||||
use slice_group_by::{GroupBy, GroupByMut};
|
||||
use std::time::Duration;
|
||||
|
||||
use crate::database::MainT;
|
||||
use crate::automaton::{Automaton, AutomatonGroup, AutomatonProducer, QueryEnhancer};
|
||||
use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
|
||||
use crate::levenshtein::prefix_damerau_levenshtein;
|
||||
use crate::raw_document::{raw_documents_from, RawDocument};
|
||||
use crate::{criterion::Criteria, Document, DocumentId, Highlight, TmpMatch};
|
||||
use crate::bucket_sort::{bucket_sort, bucket_sort_with_distinct};
|
||||
use crate::{criterion::Criteria, Document, DocumentId};
|
||||
use crate::{reordered_attrs::ReorderedAttrs, store, MResult};
|
||||
|
||||
pub struct QueryBuilder<'c, 'f, 'd> {
|
||||
@ -29,249 +18,6 @@ pub struct QueryBuilder<'c, 'f, 'd> {
|
||||
synonyms_store: store::Synonyms,
|
||||
}
|
||||
|
||||
fn multiword_rewrite_matches(
|
||||
mut matches: Vec<(DocumentId, TmpMatch)>,
|
||||
query_enhancer: &QueryEnhancer,
|
||||
) -> SetBuf<(DocumentId, TmpMatch)> {
|
||||
let mut padded_matches = Vec::with_capacity(matches.len());
|
||||
|
||||
// we sort the matches by word index to make them rewritable
|
||||
matches.sort_unstable_by_key(|(id, match_)| (*id, match_.attribute, match_.word_index));
|
||||
|
||||
// for each attribute of each document
|
||||
for same_document_attribute in matches.linear_group_by_key(|(id, m)| (*id, m.attribute)) {
|
||||
// padding will only be applied
|
||||
// to word indices in the same attribute
|
||||
let mut padding = 0;
|
||||
let mut iter = same_document_attribute.linear_group_by_key(|(_, m)| m.word_index);
|
||||
|
||||
// for each match at the same position
|
||||
// in this document attribute
|
||||
while let Some(same_word_index) = iter.next() {
|
||||
// find the biggest padding
|
||||
let mut biggest = 0;
|
||||
for (id, match_) in same_word_index {
|
||||
let mut replacement = query_enhancer.replacement(match_.query_index);
|
||||
let replacement_len = replacement.len();
|
||||
let nexts = iter.remainder().linear_group_by_key(|(_, m)| m.word_index);
|
||||
|
||||
if let Some(query_index) = replacement.next() {
|
||||
let word_index = match_.word_index + padding as u16;
|
||||
let match_ = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..*match_
|
||||
};
|
||||
padded_matches.push((*id, match_));
|
||||
}
|
||||
|
||||
let mut found = false;
|
||||
|
||||
// look ahead and if there already is a match
|
||||
// corresponding to this padding word, abort the padding
|
||||
'padding: for (x, next_group) in nexts.enumerate() {
|
||||
for (i, query_index) in replacement.clone().enumerate().skip(x) {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let padmatch = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..*match_
|
||||
};
|
||||
|
||||
for (_, nmatch_) in next_group {
|
||||
let mut rep = query_enhancer.replacement(nmatch_.query_index);
|
||||
let query_index = rep.next().unwrap();
|
||||
if query_index == padmatch.query_index {
|
||||
if !found {
|
||||
// if we find a corresponding padding for the
|
||||
// first time we must push preceding paddings
|
||||
for (i, query_index) in replacement.clone().enumerate().take(i)
|
||||
{
|
||||
let word_index =
|
||||
match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let match_ = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..*match_
|
||||
};
|
||||
padded_matches.push((*id, match_));
|
||||
biggest = biggest.max(i + 1);
|
||||
}
|
||||
}
|
||||
|
||||
padded_matches.push((*id, padmatch));
|
||||
found = true;
|
||||
continue 'padding;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// if we do not find a corresponding padding in the
|
||||
// next groups so stop here and pad what was found
|
||||
break;
|
||||
}
|
||||
|
||||
if !found {
|
||||
// if no padding was found in the following matches
|
||||
// we must insert the entire padding
|
||||
for (i, query_index) in replacement.enumerate() {
|
||||
let word_index = match_.word_index + padding as u16 + (i + 1) as u16;
|
||||
let match_ = TmpMatch {
|
||||
query_index,
|
||||
word_index,
|
||||
..*match_
|
||||
};
|
||||
padded_matches.push((*id, match_));
|
||||
}
|
||||
|
||||
biggest = biggest.max(replacement_len - 1);
|
||||
}
|
||||
}
|
||||
|
||||
padding += biggest;
|
||||
}
|
||||
}
|
||||
|
||||
for document_matches in padded_matches.linear_group_by_key_mut(|(id, _)| *id) {
|
||||
document_matches.sort_unstable();
|
||||
}
|
||||
|
||||
SetBuf::new_unchecked(padded_matches)
|
||||
}
|
||||
|
||||
fn fetch_raw_documents(
|
||||
reader: &heed::RoTxn<MainT>,
|
||||
automatons_groups: &[AutomatonGroup],
|
||||
query_enhancer: &QueryEnhancer,
|
||||
searchables: Option<&ReorderedAttrs>,
|
||||
main_store: store::Main,
|
||||
postings_lists_store: store::PostingsLists,
|
||||
documents_fields_counts_store: store::DocumentsFieldsCounts,
|
||||
) -> MResult<Vec<RawDocument>> {
|
||||
let mut matches = Vec::new();
|
||||
let mut highlights = Vec::new();
|
||||
|
||||
for group in automatons_groups {
|
||||
let AutomatonGroup {
|
||||
is_phrase_query,
|
||||
automatons,
|
||||
} = group;
|
||||
let phrase_query_len = automatons.len();
|
||||
|
||||
let mut tmp_matches = Vec::new();
|
||||
for (id, automaton) in automatons.into_iter().enumerate() {
|
||||
let Automaton {
|
||||
index,
|
||||
is_exact,
|
||||
query_len,
|
||||
query,
|
||||
..
|
||||
} = automaton;
|
||||
let dfa = automaton.dfa();
|
||||
|
||||
let words = match main_store.words_fst(reader)? {
|
||||
Some(words) => words,
|
||||
None => return Ok(Vec::new()),
|
||||
};
|
||||
|
||||
let mut stream = words.search(&dfa).into_stream();
|
||||
while let Some(input) = stream.next() {
|
||||
let distance = dfa.eval(input).to_u8();
|
||||
let is_exact = *is_exact && distance == 0 && input.len() == *query_len;
|
||||
|
||||
let covered_area = if *query_len > input.len() {
|
||||
input.len()
|
||||
} else {
|
||||
prefix_damerau_levenshtein(query.as_bytes(), input).1
|
||||
};
|
||||
|
||||
let doc_indexes = match postings_lists_store.postings_list(reader, input)? {
|
||||
Some(doc_indexes) => doc_indexes,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
tmp_matches.reserve(doc_indexes.len());
|
||||
|
||||
for di in doc_indexes.as_ref() {
|
||||
let attribute = searchables.map_or(Some(di.attribute), |r| r.get(di.attribute));
|
||||
if let Some(attribute) = attribute {
|
||||
let match_ = TmpMatch {
|
||||
query_index: *index as u32,
|
||||
distance,
|
||||
attribute,
|
||||
word_index: di.word_index,
|
||||
is_exact,
|
||||
};
|
||||
|
||||
let covered_area = u16::try_from(covered_area).unwrap_or(u16::max_value());
|
||||
let covered_area = cmp::min(covered_area, di.char_length);
|
||||
|
||||
let highlight = Highlight {
|
||||
attribute: di.attribute,
|
||||
char_index: di.char_index,
|
||||
char_length: covered_area,
|
||||
};
|
||||
|
||||
tmp_matches.push((di.document_id, id, match_, highlight));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if *is_phrase_query {
|
||||
tmp_matches.sort_unstable_by_key(|(id, _, m, _)| (*id, m.attribute, m.word_index));
|
||||
for group in tmp_matches.linear_group_by_key(|(id, _, m, _)| (*id, m.attribute)) {
|
||||
for window in group.windows(2) {
|
||||
let (ida, ia, ma, ha) = window[0];
|
||||
let (idb, ib, mb, hb) = window[1];
|
||||
|
||||
debug_assert_eq!(ida, idb);
|
||||
|
||||
// if matches must follow and actually follows themselves
|
||||
if ia + 1 == ib && ma.word_index + 1 == mb.word_index {
|
||||
// TODO we must make it work for phrase query longer than 2
|
||||
// if the second match is the last phrase query word
|
||||
if ib + 1 == phrase_query_len {
|
||||
// insert first match
|
||||
matches.push((ida, ma));
|
||||
highlights.push((ida, ha));
|
||||
|
||||
// insert second match
|
||||
matches.push((idb, mb));
|
||||
highlights.push((idb, hb));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (id, _, match_, highlight) in tmp_matches {
|
||||
matches.push((id, match_));
|
||||
highlights.push((id, highlight));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let matches = multiword_rewrite_matches(matches, &query_enhancer);
|
||||
let highlights = {
|
||||
highlights.sort_unstable_by_key(|(id, _)| *id);
|
||||
SetBuf::new_unchecked(highlights)
|
||||
};
|
||||
|
||||
let fields_counts = {
|
||||
let mut fields_counts = Vec::new();
|
||||
for group in matches.linear_group_by_key(|(id, ..)| *id) {
|
||||
let id = group[0].0;
|
||||
for result in documents_fields_counts_store.document_fields_counts(reader, id)? {
|
||||
let (attr, count) = result?;
|
||||
fields_counts.push((id, attr, count));
|
||||
}
|
||||
}
|
||||
SetBuf::new(fields_counts).unwrap()
|
||||
};
|
||||
|
||||
Ok(raw_documents_from(matches, highlights, fields_counts))
|
||||
}

impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
    pub fn new(
        main: store::Main,
@ -307,9 +53,7 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
            synonyms_store: synonyms,
        }
    }
}

impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
    pub fn with_filter<F>(&mut self, function: F)
    where
        F: Fn(DocumentId) -> bool + 'f,
@ -342,29 +86,25 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
        range: Range<usize>,
    ) -> MResult<Vec<Document>> {
        match self.distinct {
            Some((distinct, distinct_size)) => raw_query_with_distinct(
            Some((distinct, distinct_size)) => bucket_sort_with_distinct(
                reader,
                query,
                range,
                self.filter,
                distinct,
                distinct_size,
                self.timeout,
                self.criteria,
                self.searchable_attrs,
                self.main_store,
                self.postings_lists_store,
                self.documents_fields_counts_store,
                self.synonyms_store,
            ),
            None => raw_query(
            None => bucket_sort(
                reader,
                query,
                range,
                self.filter,
                self.timeout,
                self.criteria,
                self.searchable_attrs,
                self.main_store,
                self.postings_lists_store,
                self.documents_fields_counts_store,
@ -374,320 +114,6 @@ impl<'c, 'f, 'd> QueryBuilder<'c, 'f, 'd> {
        }
    }
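A hedged usage sketch of the builder above; the `query_builder()` constructor and the `query` method name are assumptions based on this hunk's visible signature, which is not shown in full in this diff:

// assumed API: QueryBuilder construction and a query(reader, text, range) method
let mut builder = index.query_builder();
builder.with_filter(|id| id.0 % 2 == 0); // keep only even document ids
let documents = builder.query(&reader, "new york subway", 0..20)?;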

fn raw_query<'c, FI>(
    reader: &heed::RoTxn<MainT>,

    query: &str,
    range: Range<usize>,

    filter: Option<FI>,
    timeout: Option<Duration>,

    criteria: Criteria<'c>,
    searchable_attrs: Option<ReorderedAttrs>,

    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
    FI: Fn(DocumentId) -> bool,
{
    // We delegate the filter work to the distinct query builder,
    // specifying a distinct rule that has no effect.
    if filter.is_some() {
        let distinct = |_| None;
        let distinct_size = 1;
        return raw_query_with_distinct(
            reader,
            query,
            range,
            filter,
            distinct,
            distinct_size,
            timeout,
            criteria,
            searchable_attrs,
            main_store,
            postings_lists_store,
            documents_fields_counts_store,
            synonyms_store,
        );
    }

    let start_processing = Instant::now();
    let mut raw_documents_processed = Vec::with_capacity(range.len());

    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
        reader,
        query,
        main_store,
        postings_lists_store,
        synonyms_store,
    )?;

    let automaton_producer = automaton_producer.into_iter();
    let mut automatons = Vec::new();

    // aggregate the automaton groups one by one, stopping when time runs out
    for auts in automaton_producer {
        automatons.push(auts);

        // we must retrieve the documents associated
        // with the current automatons
        let mut raw_documents = fetch_raw_documents(
            reader,
            &automatons,
            &query_enhancer,
            searchable_attrs.as_ref(),
            main_store,
            postings_lists_store,
            documents_fields_counts_store,
        )?;

        // stop processing when time is running out
        if let Some(timeout) = timeout {
            if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
                break;
            }
        }

        let mut groups = vec![raw_documents.as_mut_slice()];

        'criteria: for criterion in criteria.as_ref() {
            let tmp_groups = mem::replace(&mut groups, Vec::new());
            let mut documents_seen = 0;

            for group in tmp_groups {
                // if this group does not overlap with the requested range,
                // push it without sorting or splitting it
                if documents_seen + group.len() < range.start {
                    documents_seen += group.len();
                    groups.push(group);
                    continue;
                }

                group.sort_unstable_by(|a, b| criterion.evaluate(a, b));

                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
                    documents_seen += group.len();
                    groups.push(group);

                    // we have sorted enough documents: if the last sorted document
                    // is past the end of the requested range, we can continue
                    // to the next criterion
                    if documents_seen >= range.end {
                        continue 'criteria;
                    }
                }
            }
        }

        // once we have classified the documents related to the current
        // automatons, we save them as the next valid result
        let iter = raw_documents
            .into_iter()
            .skip(range.start)
            .take(range.len());
        raw_documents_processed.clear();
        raw_documents_processed.extend(iter);

        // stop processing when time is running out
        if let Some(timeout) = timeout {
            if start_processing.elapsed() > timeout {
                break;
            }
        }
    }

    // build real documents now that we know
    // which ones must be returned
    let documents = raw_documents_processed
        .into_iter()
        .map(Document::from_raw)
        .collect();

    Ok(documents)
}
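The early return above funnels every filtered query through the distinct code path by passing a rule that never produces a key. A self-contained sketch of the same pattern, with hypothetical names (`search_with_distinct` is not the real API):

use std::collections::HashSet;

// Hypothetical, simplified stand-in for the distinct machinery above.
fn search_with_distinct<F>(candidates: &[u64], distinct: F) -> Vec<u64>
where
    F: Fn(u64) -> Option<u64>,
{
    let mut seen = HashSet::new();
    candidates
        .iter()
        .copied()
        .filter(|&id| match distinct(id) {
            Some(key) => seen.insert(key), // keep the first document per key
            None => true,                  // no key: always keep
        })
        .collect()
}

fn search(candidates: &[u64]) -> Vec<u64> {
    // a distinct rule that has no effect, as in raw_query above
    search_with_distinct(candidates, |_| None)
}

fn main() {
    assert_eq!(search(&[1, 1, 2]), vec![1, 1, 2]); // nothing deduplicated
    assert_eq!(search_with_distinct(&[1, 1, 2], |id| Some(id)), vec![1, 2]);
}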

fn raw_query_with_distinct<'c, FI, FD>(
    reader: &heed::RoTxn<MainT>,

    query: &str,
    range: Range<usize>,

    filter: Option<FI>,

    distinct: FD,
    distinct_size: usize,
    timeout: Option<Duration>,

    criteria: Criteria<'c>,
    searchable_attrs: Option<ReorderedAttrs>,

    main_store: store::Main,
    postings_lists_store: store::PostingsLists,
    documents_fields_counts_store: store::DocumentsFieldsCounts,
    synonyms_store: store::Synonyms,
) -> MResult<Vec<Document>>
where
    FI: Fn(DocumentId) -> bool,
    FD: Fn(DocumentId) -> Option<u64>,
{
    let start_processing = Instant::now();
    let mut raw_documents_processed = Vec::new();

    let (automaton_producer, query_enhancer) = AutomatonProducer::new(
        reader,
        query,
        main_store,
        postings_lists_store,
        synonyms_store,
    )?;

    let automaton_producer = automaton_producer.into_iter();
    let mut automatons = Vec::new();

    // aggregate the automaton groups one by one, stopping when time runs out
    for auts in automaton_producer {
        automatons.push(auts);

        // we must retrieve the documents associated
        // with the current automatons
        let mut raw_documents = fetch_raw_documents(
            reader,
            &automatons,
            &query_enhancer,
            searchable_attrs.as_ref(),
            main_store,
            postings_lists_store,
            documents_fields_counts_store,
        )?;

        // stop processing when time is running out
        if let Some(timeout) = timeout {
            if !raw_documents_processed.is_empty() && start_processing.elapsed() > timeout {
                break;
            }
        }

        let mut groups = vec![raw_documents.as_mut_slice()];
        let mut key_cache = HashMap::new();

        let mut filter_map = HashMap::new();
        // these two variables inform on the current distinct map and
        // on the raw offset of the start of the group where the
        // range.start bound is located according to the distinct function
        let mut distinct_map = DistinctMap::new(distinct_size);
        let mut distinct_raw_offset = 0;

        'criteria: for criterion in criteria.as_ref() {
            let tmp_groups = mem::replace(&mut groups, Vec::new());
            let mut buf_distinct = BufferedDistinctMap::new(&mut distinct_map);
            let mut documents_seen = 0;

            for group in tmp_groups {
                // if this group does not overlap with the requested range,
                // push it without sorting or splitting it
                if documents_seen + group.len() < distinct_raw_offset {
                    documents_seen += group.len();
                    groups.push(group);
                    continue;
                }

                group.sort_unstable_by(|a, b| criterion.evaluate(a, b));

                for group in group.binary_group_by_mut(|a, b| criterion.eq(a, b)) {
                    // we must compute the real distinct length of this sub-group
                    for document in group.iter() {
                        let filter_accepted = match &filter {
                            Some(filter) => {
                                let entry = filter_map.entry(document.id);
                                *entry.or_insert_with(|| (filter)(document.id))
                            }
                            None => true,
                        };

                        if filter_accepted {
                            let entry = key_cache.entry(document.id);
                            let key = entry.or_insert_with(|| (distinct)(document.id).map(Rc::new));

                            match key.clone() {
                                Some(key) => buf_distinct.register(key),
                                None => buf_distinct.register_without_key(),
                            };
                        }

                        // the requested range end is reached: stop computing distinct
                        if buf_distinct.len() >= range.end {
                            break;
                        }
                    }

                    documents_seen += group.len();
                    groups.push(group);

                    // if this sub-group does not overlap with the requested range,
                    // we must update the distinct map and its start index
                    if buf_distinct.len() < range.start {
                        buf_distinct.transfert_to_internal();
                        distinct_raw_offset = documents_seen;
                    }

                    // we have sorted enough documents: if the last sorted document
                    // is past the end of the requested range, we can continue
                    // to the next criterion
                    if buf_distinct.len() >= range.end {
                        continue 'criteria;
                    }
                }
            }
        }

        // once we have classified the documents related to the current
        // automatons, we save them as the next valid result
        let mut seen = BufferedDistinctMap::new(&mut distinct_map);
        raw_documents_processed.clear();

        for document in raw_documents.into_iter().skip(distinct_raw_offset) {
            let filter_accepted = match &filter {
                Some(_) => filter_map.remove(&document.id).unwrap(),
                None => true,
            };

            if filter_accepted {
                let key = key_cache.remove(&document.id).unwrap();
                let distinct_accepted = match key {
                    Some(key) => seen.register(key),
                    None => seen.register_without_key(),
                };

                if distinct_accepted && seen.len() > range.start {
                    raw_documents_processed.push(document);
                    if raw_documents_processed.len() == range.len() {
                        break;
                    }
                }
            }
        }

        // stop processing when time is running out
        if let Some(timeout) = timeout {
            if start_processing.elapsed() > timeout {
                break;
            }
        }
    }

    // build real documents now that we know
    // which ones must be returned
    let documents = raw_documents_processed
        .into_iter()
        .map(Document::from_raw)
        .collect();

    Ok(documents)
}
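The buffered map lets each criterion pass tentatively register keys and only commit them (`transfert_to_internal`) once a sub-group is known to precede the requested range. A minimal self-contained sketch of that two-layer idea (hypothetical types, not the actual `DistinctMap` API):

use std::collections::HashMap;

// Hypothetical two-layer distinct map: committed counts plus a buffer.
struct DistinctSketch {
    committed: HashMap<u64, usize>,
    buffered: HashMap<u64, usize>,
    limit: usize, // how many documents may share one key
    len: usize,
}

impl DistinctSketch {
    fn new(limit: usize) -> Self {
        DistinctSketch { committed: HashMap::new(), buffered: HashMap::new(), limit, len: 0 }
    }

    /// Tentatively register a key; returns whether the document is accepted.
    fn register(&mut self, key: u64) -> bool {
        let seen = self.committed.get(&key).copied().unwrap_or(0)
            + self.buffered.get(&key).copied().unwrap_or(0);
        if seen < self.limit {
            *self.buffered.entry(key).or_insert(0) += 1;
            self.len += 1;
            true
        } else {
            false
        }
    }

    /// Commit the buffered registrations, as transfert_to_internal does above.
    fn commit(&mut self) {
        for (key, count) in self.buffered.drain() {
            *self.committed.entry(key).or_insert(0) += count;
        }
    }
}

fn main() {
    let mut map = DistinctSketch::new(1);
    assert!(map.register(7));  // first document with key 7 is kept
    assert!(!map.register(7)); // duplicate rejected within the same pass
    map.commit();
    assert!(!map.register(7)); // still rejected after the commit
}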

#[cfg(test)]
mod tests {
    use super::*;
@ -815,7 +241,7 @@ mod tests {

        let mut words_fst = BTreeSet::new();
        let mut postings_lists = HashMap::new();
        let mut fields_counts = HashMap::<_, u64>::new();
        let mut fields_counts = HashMap::<_, u16>::new();

        for (word, indexes) in iter {
            let word = word.to_lowercase().into_bytes();
@ -1,398 +0,0 @@
use std::ops::Range;
use std::cmp::Ordering::{Less, Greater, Equal};

/// Return `true` if the specified range can accept the given replacement words.
/// Returns `false` if the replacement words are already present in the original query
/// or if there are fewer replacement words than the range to replace.
//
//
// ## Ignored because already present in original
//
//     new york city subway
//     --------      ^^^^
//            /           \
//       [new york city]
//
//
// ## Ignored because smaller than the original
//
//     new york city subway
//     -------------
//       \          /
//        [new york]
//
//
// ## Accepted because bigger than the original
//
//     NYC subway
//     ---
//        /      \
//       /        \
//      /          \
//     /            \
//    /              \
// [new york city]
//
fn rewrite_range_with<S, T>(query: &[S], range: Range<usize>, words: &[T]) -> bool
where S: AsRef<str>,
      T: AsRef<str>,
{
    if words.len() <= range.len() {
        // there are fewer or the same number of replacement words
        // as are already in the replaced range
        return false
    }

    // retrieve the part to rewrite but with the length
    // of the replacement part
    let original = query.iter().skip(range.start).take(words.len());

    // check if the original query doesn't already contain
    // the replacement words
    !original.map(AsRef::as_ref).eq(words.iter().map(AsRef::as_ref))
}
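A quick check of the three documented cases, as an illustrative sketch (the function is private to this deleted module, so this is not a public API):

let query = ["new", "york", "city", "subway"];
// already present in the original: ignored
assert!(!rewrite_range_with(&query, 0..2, &["new", "york", "city"]));
// smaller than the replaced range: ignored
assert!(!rewrite_range_with(&query, 0..3, &["new", "york"]));
// bigger than the original: accepted
assert!(rewrite_range_with(&["NYC", "subway"], 0..1, &["new", "york", "city"]));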

type Origin = usize;
type RealLength = usize;

struct FakeIntervalTree {
    intervals: Vec<(Range<usize>, (Origin, RealLength))>,
}

impl FakeIntervalTree {
    fn new(mut intervals: Vec<(Range<usize>, (Origin, RealLength))>) -> FakeIntervalTree {
        intervals.sort_unstable_by_key(|(r, _)| (r.start, r.end));
        FakeIntervalTree { intervals }
    }

    fn query(&self, point: usize) -> Option<(Range<usize>, (Origin, RealLength))> {
        let element = self.intervals.binary_search_by(|(r, _)| {
            if point >= r.start {
                if point < r.end { Equal } else { Less }
            } else { Greater }
        });

        let n = match element { Ok(n) => n, Err(n) => n };

        match self.intervals.get(n) {
            Some((range, value)) if range.contains(&point) => Some((range.clone(), *value)),
            _otherwise => None,
        }
    }
}
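An illustrative query against the sorted, non-overlapping intervals this structure assumes (the values here are made up):

let tree = FakeIntervalTree::new(vec![
    (0..2, (0, 2)), // real indices 0 and 1 come from origin 0
    (2..5, (2, 3)), // real indices 2, 3 and 4 come from origin 2
]);
assert_eq!(tree.query(3), Some((2..5, (2, 3))));
assert_eq!(tree.query(7), None); // past every interval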

pub struct QueryEnhancerBuilder<'a, S> {
    query: &'a [S],
    origins: Vec<usize>,
    real_to_origin: Vec<(Range<usize>, (Origin, RealLength))>,
}

impl<S: AsRef<str>> QueryEnhancerBuilder<'_, S> {
    pub fn new(query: &[S]) -> QueryEnhancerBuilder<S> {
        // we initialize origin query indices based on their positions
        let origins: Vec<_> = (0..query.len() + 1).collect();
        let real_to_origin = origins.iter().map(|&o| (o..o+1, (o, 1))).collect();

        QueryEnhancerBuilder { query, origins, real_to_origin }
    }

    /// Update the final real-to-origin query indices mapping.
    ///
    /// `range` is the original word range that these `replacement` words replace,
    /// and `real` is the first real query index of these replacement words.
    pub fn declare<T>(&mut self, range: Range<usize>, real: usize, replacement: &[T])
    where T: AsRef<str>,
    {
        // check if the range of original words
        // can be rewritten with the replacement words
        if rewrite_range_with(self.query, range.clone(), replacement) {

            // this range can be replaced so we need to
            // modify the origins accordingly
            let offset = replacement.len() - range.len();

            let previous_padding = self.origins[range.end - 1];
            let current_offset = (self.origins[range.end] - 1) - previous_padding;
            let diff = offset.saturating_sub(current_offset);
            self.origins[range.end] += diff;

            for r in &mut self.origins[range.end + 1..] {
                *r += diff;
            }
        }

        // we need to store the relation between real and origin indices;
        // this way it will be possible to know by how much
        // we need to pad real query indices
        let real_range = real..real + replacement.len().max(range.len());
        let real_length = replacement.len();
        self.real_to_origin.push((real_range, (range.start, real_length)));
    }

    pub fn build(self) -> QueryEnhancer {
        QueryEnhancer {
            origins: self.origins,
            real_to_origin: FakeIntervalTree::new(self.real_to_origin),
        }
    }
}

pub struct QueryEnhancer {
    origins: Vec<usize>,
    real_to_origin: FakeIntervalTree,
}

impl QueryEnhancer {
    /// Returns the query indices to use to replace this real query index.
    pub fn replacement(&self, real: u32) -> Range<u32> {
        let real = real as usize;

        // query the fake interval tree with the real query index
        let (range, (origin, real_length)) =
            self.real_to_origin
                .query(real)
                .expect("real has never been declared");

        // if `real` is the end bound of the range
        if (range.start + real_length - 1) == real {
            let mut count = range.len();
            let mut new_origin = origin;
            for (i, slice) in self.origins[new_origin..].windows(2).enumerate() {
                let len = slice[1] - slice[0];
                count = count.saturating_sub(len);
                if count == 0 { new_origin = origin + i; break }
            }

            let n = real - range.start;
            let start = self.origins[origin];
            let end = self.origins[new_origin + 1];
            let remaining = (end - start) - n;

            Range { start: (start + n) as u32, end: (start + n + remaining) as u32 }

        } else {
            // just return the origin along with
            // the real position of the word
            let n = real as usize - range.start;
            let origin = self.origins[origin];

            Range { start: (origin + n) as u32, end: (origin + n + 1) as u32 }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn original_unmodified() {
        let query = ["new", "york", "city", "subway"];
        //            0       1       2        3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // new york = new york city
        builder.declare(0..2, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1); // new
        assert_eq!(enhancer.replacement(1), 1..2); // york
        assert_eq!(enhancer.replacement(2), 2..3); // city
        assert_eq!(enhancer.replacement(3), 3..4); // subway
        assert_eq!(enhancer.replacement(4), 0..1); // new
        assert_eq!(enhancer.replacement(5), 1..2); // york
        assert_eq!(enhancer.replacement(6), 2..3); // city
    }

    #[test]
    fn simple_growing() {
        let query = ["new", "york", "subway"];
        //            0       1        2
        let mut builder = QueryEnhancerBuilder::new(&query);

        // new york = new york city
        builder.declare(0..2, 3, &["new", "york", "city"]);
        //                    ^      3       4       5

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1); // new
        assert_eq!(enhancer.replacement(1), 1..3); // york
        assert_eq!(enhancer.replacement(2), 3..4); // subway
        assert_eq!(enhancer.replacement(3), 0..1); // new
        assert_eq!(enhancer.replacement(4), 1..2); // york
        assert_eq!(enhancer.replacement(5), 2..3); // city
    }

    #[test]
    fn same_place_growings() {
        let query = ["NY", "subway"];
        //            0       1
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NY = new york
        builder.declare(0..1, 2, &["new", "york"]);
        //                    ^      2       3

        // NY = new york city
        builder.declare(0..1, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        // NY = NYC
        builder.declare(0..1, 7, &["NYC"]);
        //                    ^      7

        // NY = new york city
        builder.declare(0..1, 8, &["new", "york", "city"]);
        //                    ^      8       9      10

        // subway = underground train
        builder.declare(1..2, 11, &["underground", "train"]);
        //                    ^          11          12

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..3);  // NY
        assert_eq!(enhancer.replacement(1), 3..5);  // subway
        assert_eq!(enhancer.replacement(2), 0..1);  // new
        assert_eq!(enhancer.replacement(3), 1..3);  // york
        assert_eq!(enhancer.replacement(4), 0..1);  // new
        assert_eq!(enhancer.replacement(5), 1..2);  // york
        assert_eq!(enhancer.replacement(6), 2..3);  // city
        assert_eq!(enhancer.replacement(7), 0..3);  // NYC
        assert_eq!(enhancer.replacement(8), 0..1);  // new
        assert_eq!(enhancer.replacement(9), 1..2);  // york
        assert_eq!(enhancer.replacement(10), 2..3); // city
        assert_eq!(enhancer.replacement(11), 3..4); // underground
        assert_eq!(enhancer.replacement(12), 4..5); // train
    }

    #[test]
    fn bigger_growing() {
        let query = ["NYC", "subway"];
        //             0       1
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city
        builder.declare(0..1, 2, &["new", "york", "city"]);
        //                    ^      2       3       4

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..3); // NYC
        assert_eq!(enhancer.replacement(1), 3..4); // subway
        assert_eq!(enhancer.replacement(2), 0..1); // new
        assert_eq!(enhancer.replacement(3), 1..2); // york
        assert_eq!(enhancer.replacement(4), 2..3); // city
    }

    #[test]
    fn middle_query_growing() {
        let query = ["great", "awesome", "NYC", "subway"];
        //              0         1        2       3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city
        builder.declare(2..3, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1); // great
        assert_eq!(enhancer.replacement(1), 1..2); // awesome
        assert_eq!(enhancer.replacement(2), 2..5); // NYC
        assert_eq!(enhancer.replacement(3), 5..6); // subway
        assert_eq!(enhancer.replacement(4), 2..3); // new
        assert_eq!(enhancer.replacement(5), 3..4); // york
        assert_eq!(enhancer.replacement(6), 4..5); // city
    }

    #[test]
    fn end_query_growing() {
        let query = ["NYC", "subway"];
        //             0       1
        let mut builder = QueryEnhancerBuilder::new(&query);

        // subway = underground train
        builder.declare(1..2, 2, &["underground", "train"]);
        //                    ^          2           3

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1); // NYC
        assert_eq!(enhancer.replacement(1), 1..3); // subway
        assert_eq!(enhancer.replacement(2), 1..2); // underground
        assert_eq!(enhancer.replacement(3), 2..3); // train
    }

    #[test]
    fn multiple_growings() {
        let query = ["great", "awesome", "NYC", "subway"];
        //              0         1        2       3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city
        builder.declare(2..3, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        // subway = underground train
        builder.declare(3..4, 7, &["underground", "train"]);
        //                    ^          7           8

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1); // great
        assert_eq!(enhancer.replacement(1), 1..2); // awesome
        assert_eq!(enhancer.replacement(2), 2..5); // NYC
        assert_eq!(enhancer.replacement(3), 5..7); // subway
        assert_eq!(enhancer.replacement(4), 2..3); // new
        assert_eq!(enhancer.replacement(5), 3..4); // york
        assert_eq!(enhancer.replacement(6), 4..5); // city
        assert_eq!(enhancer.replacement(7), 5..6); // underground
        assert_eq!(enhancer.replacement(8), 6..7); // train
    }

    #[test]
    fn multiple_probable_growings() {
        let query = ["great", "awesome", "NYC", "subway"];
        //              0         1        2       3
        let mut builder = QueryEnhancerBuilder::new(&query);

        // NYC = new york city
        builder.declare(2..3, 4, &["new", "york", "city"]);
        //                    ^      4       5       6

        // subway = underground train
        builder.declare(3..4, 7, &["underground", "train"]);
        //                    ^          7           8

        // great awesome = good
        builder.declare(0..2, 9, &["good"]);
        //                    ^     9

        // awesome NYC = NY
        builder.declare(1..3, 10, &["NY"]);
        //                    ^^     10

        // NYC subway = metro
        builder.declare(2..4, 11, &["metro"]);
        //                    ^^      11

        let enhancer = builder.build();

        assert_eq!(enhancer.replacement(0), 0..1);  // great
        assert_eq!(enhancer.replacement(1), 1..2);  // awesome
        assert_eq!(enhancer.replacement(2), 2..5);  // NYC
        assert_eq!(enhancer.replacement(3), 5..7);  // subway
        assert_eq!(enhancer.replacement(4), 2..3);  // new
        assert_eq!(enhancer.replacement(5), 3..4);  // york
        assert_eq!(enhancer.replacement(6), 4..5);  // city
        assert_eq!(enhancer.replacement(7), 5..6);  // underground
        assert_eq!(enhancer.replacement(8), 6..7);  // train
        assert_eq!(enhancer.replacement(9), 0..2);  // good
        assert_eq!(enhancer.replacement(10), 1..5); // NY
        assert_eq!(enhancer.replacement(11), 2..5); // metro
    }
}
@ -1,186 +1,89 @@
use std::fmt;
use std::sync::Arc;

use meilisearch_schema::SchemaAttr;
use sdset::SetBuf;
use slice_group_by::GroupBy;

use crate::{DocumentId, Highlight, TmpMatch};

#[derive(Clone)]
pub struct RawDocument {
    pub id: DocumentId,
    pub matches: SharedMatches,
    pub highlights: Vec<Highlight>,
    pub fields_counts: SetBuf<(SchemaAttr, u64)>,
}

impl RawDocument {
    pub fn query_index(&self) -> &[u32] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe {
            &self
                .matches
                .matches
                .query_index
                .get_unchecked(r.start..r.end)
        }
    }

    pub fn distance(&self) -> &[u8] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.distance.get_unchecked(r.start..r.end) }
    }

    pub fn attribute(&self) -> &[u16] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.attribute.get_unchecked(r.start..r.end) }
    }

    pub fn word_index(&self) -> &[u16] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe {
            &self
                .matches
                .matches
                .word_index
                .get_unchecked(r.start..r.end)
        }
    }

    pub fn is_exact(&self) -> &[bool] {
        let r = self.matches.range;
        // it is safe because construction/modifications
        // can only be done in this module
        unsafe { &self.matches.matches.is_exact.get_unchecked(r.start..r.end) }
    }
}

impl fmt::Debug for RawDocument {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("RawDocument {\r\n")?;
        f.write_fmt(format_args!("{:>15}: {:?},\r\n", "id", self.id))?;
        f.write_fmt(format_args!(
            "{:>15}: {:^5?},\r\n",
            "query_index",
            self.query_index()
        ))?;
        f.write_fmt(format_args!(
            "{:>15}: {:^5?},\r\n",
            "distance",
            self.distance()
        ))?;
        f.write_fmt(format_args!(
            "{:>15}: {:^5?},\r\n",
            "attribute",
            self.attribute()
        ))?;
        f.write_fmt(format_args!(
            "{:>15}: {:^5?},\r\n",
            "word_index",
            self.word_index()
        ))?;
        f.write_fmt(format_args!(
            "{:>15}: {:^5?},\r\n",
            "is_exact",
            self.is_exact()
        ))?;
        f.write_str("}")?;
        Ok(())
    }
}

pub fn raw_documents_from(
    matches: SetBuf<(DocumentId, TmpMatch)>,
    highlights: SetBuf<(DocumentId, Highlight)>,
    fields_counts: SetBuf<(DocumentId, SchemaAttr, u64)>,
) -> Vec<RawDocument> {
    let mut docs_ranges: Vec<(_, Range, _, _)> = Vec::new();
    let mut matches2 = Matches::with_capacity(matches.len());

    let matches = matches.linear_group_by_key(|(id, _)| *id);
    let highlights = highlights.linear_group_by_key(|(id, _)| *id);
    let fields_counts = fields_counts.linear_group_by_key(|(id, _, _)| *id);

    for ((mgroup, hgroup), fgroup) in matches.zip(highlights).zip(fields_counts) {
        debug_assert_eq!(mgroup[0].0, hgroup[0].0);
        debug_assert_eq!(mgroup[0].0, fgroup[0].0);

        let document_id = mgroup[0].0;
        let start = docs_ranges.last().map(|(_, r, _, _)| r.end).unwrap_or(0);
        let end = start + mgroup.len();
        let highlights = hgroup.iter().map(|(_, h)| *h).collect();
        let fields_counts = SetBuf::new(fgroup.iter().map(|(_, a, c)| (*a, *c)).collect()).unwrap();

        docs_ranges.push((document_id, Range { start, end }, highlights, fields_counts));
        matches2.extend_from_slice(mgroup);
    }

    let matches = Arc::new(matches2);
    docs_ranges
        .into_iter()
        .map(|(id, range, highlights, fields_counts)| {
            let matches = SharedMatches {
                range,
                matches: matches.clone(),
            };
            RawDocument {
                id,
                matches,
                highlights,
                fields_counts,
            }
        })
        .collect()
}

use compact_arena::SmallArena;
use itertools::EitherOrBoth;
use sdset::SetBuf;

use crate::bucket_sort::{SimpleMatch, BareMatch, QueryWordAutomaton, PostingsListView};

pub struct RawDocument<'a, 'tag> {
    pub id: crate::DocumentId,
    pub raw_matches: &'a mut [BareMatch<'tag>],
    pub processed_matches: Vec<SimpleMatch>,
    /// The list of minimum `distance` found
    pub processed_distances: Vec<Option<u8>>,
}

impl<'a, 'tag> RawDocument<'a, 'tag> {
    pub fn new<'txn>(
        raw_matches: &'a mut [BareMatch<'tag>],
        automatons: &[QueryWordAutomaton],
        postings_lists: &mut SmallArena<'tag, PostingsListView<'txn>>,
    ) -> Option<RawDocument<'a, 'tag>>
    {
        raw_matches.sort_unstable_by_key(|m| m.query_index);

        let mut previous_word = None;
        for i in 0..raw_matches.len() {
            let a = &raw_matches[i];
            let auta = &automatons[a.query_index as usize];

            match auta.phrase_query {
                Some((0, _)) => {
                    let b = match raw_matches.get(i + 1) {
                        Some(b) => b,
                        None => {
                            postings_lists[a.postings_list].rewrite_with(SetBuf::default());
                            continue;
                        }
                    };

                    if a.query_index + 1 != b.query_index {
                        postings_lists[a.postings_list].rewrite_with(SetBuf::default());
                        continue
                    }

                    let pla = &postings_lists[a.postings_list];
                    let plb = &postings_lists[b.postings_list];

                    let iter = itertools::merge_join_by(pla.iter(), plb.iter(), |a, b| {
                        a.attribute.cmp(&b.attribute).then((a.word_index + 1).cmp(&b.word_index))
                    });

                    let mut newa = Vec::new();
                    let mut newb = Vec::new();

                    for eb in iter {
                        if let EitherOrBoth::Both(a, b) = eb {
                            newa.push(*a);
                            newb.push(*b);
                        }
                    }

                    if !newa.is_empty() {
                        previous_word = Some(a.query_index);
                    }

                    postings_lists[a.postings_list].rewrite_with(SetBuf::new_unchecked(newa));
                    postings_lists[b.postings_list].rewrite_with(SetBuf::new_unchecked(newb));
                },
                Some((1, _)) => {
                    if previous_word.take() != Some(a.query_index - 1) {
                        postings_lists[a.postings_list].rewrite_with(SetBuf::default());
                    }
                },
                Some((_, _)) => unreachable!(),
                None => (),
            }
        }

        if raw_matches.iter().all(|rm| postings_lists[rm.postings_list].is_empty()) {
            return None
        }

        Some(RawDocument {
            id: raw_matches[0].document_id,
            raw_matches,
            processed_matches: Vec::new(),
            processed_distances: Vec::new(),
        })
    }
}

#[derive(Debug, Copy, Clone)]
struct Range {
    start: usize,
    end: usize,
}

#[derive(Clone)]
pub struct SharedMatches {
    range: Range,
    matches: Arc<Matches>,
}

#[derive(Clone)]
struct Matches {
    query_index: Vec<u32>,
    distance: Vec<u8>,
    attribute: Vec<u16>,
    word_index: Vec<u16>,
    is_exact: Vec<bool>,
}

impl Matches {
    fn with_capacity(cap: usize) -> Matches {
        Matches {
            query_index: Vec::with_capacity(cap),
            distance: Vec::with_capacity(cap),
            attribute: Vec::with_capacity(cap),
            word_index: Vec::with_capacity(cap),
            is_exact: Vec::with_capacity(cap),
        }
    }

    fn extend_from_slice(&mut self, matches: &[(DocumentId, TmpMatch)]) {
        for (_, match_) in matches {
            self.query_index.push(match_.query_index);
            self.distance.push(match_.distance);
            self.attribute.push(match_.attribute);
            self.word_index.push(match_.word_index);
            self.is_exact.push(match_.is_exact);
        }
    }
}
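The removed `Matches` type above is a struct-of-arrays: one shared, immutable allocation that every `RawDocument` slices into through its `Range`, instead of one `Vec` of match structs per document. A minimal sketch of that sharing pattern, with hypothetical simplified types:

use std::sync::Arc;

struct AllMatches { query_index: Vec<u32>, distance: Vec<u8> }

struct DocMatches { start: usize, end: usize, all: Arc<AllMatches> }

impl DocMatches {
    // every document borrows its own window of the shared columns
    fn query_index(&self) -> &[u32] { &self.all.query_index[self.start..self.end] }
}

fn main() {
    let all = Arc::new(AllMatches { query_index: vec![0, 1, 0], distance: vec![0, 1, 2] });
    let doc_a = DocMatches { start: 0, end: 2, all: all.clone() };
    let doc_b = DocMatches { start: 2, end: 3, all };
    assert_eq!(doc_a.query_index(), &[0, 1]);
    assert_eq!(doc_b.query_index(), &[0]);
}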

@ -325,7 +325,7 @@ where
                txn,
                document_id,
                attribute,
                number_of_words as u64,
                number_of_words as u16,
            )?;
        }
    }
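This hunk narrows the stored per-attribute word count from u64 to u16 with a plain `as` cast, which silently wraps above 65,535 words. If that ever mattered, a saturating conversion (the same pattern the highlight code earlier in this diff uses for `covered_area`) would be safer; a hedged sketch:

use std::convert::TryFrom;

// assumption: clamping oversized counts is acceptable for ranking purposes
fn to_u16_count(number_of_words: usize) -> u16 {
    u16::try_from(number_of_words).unwrap_or(u16::max_value())
}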

@ -7,7 +7,7 @@ use meilisearch_schema::SchemaAttr;

#[derive(Copy, Clone)]
pub struct DocumentsFieldsCounts {
    pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u64>>,
    pub(crate) documents_fields_counts: heed::Database<OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}

impl DocumentsFieldsCounts {
@ -16,7 +16,7 @@ impl DocumentsFieldsCounts {
        writer: &mut heed::RwTxn<MainT>,
        document_id: DocumentId,
        attribute: SchemaAttr,
        value: u64,
        value: u16,
    ) -> ZResult<()> {
        let key = DocumentAttrKey::new(document_id, attribute);
        self.documents_fields_counts.put(writer, &key, &value)
@ -42,7 +42,7 @@ impl DocumentsFieldsCounts {
        reader: &heed::RoTxn<MainT>,
        document_id: DocumentId,
        attribute: SchemaAttr,
    ) -> ZResult<Option<u64>> {
    ) -> ZResult<Option<u16>> {
        let key = DocumentAttrKey::new(document_id, attribute);
        match self.documents_fields_counts.get(reader, &key)? {
            Some(count) => Ok(Some(count)),
@ -79,11 +79,11 @@ impl DocumentsFieldsCounts {
}

pub struct DocumentFieldsCountsIter<'txn> {
    iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
    iter: heed::RoRange<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}

impl Iterator for DocumentFieldsCountsIter<'_> {
    type Item = ZResult<(SchemaAttr, u64)>;
    type Item = ZResult<(SchemaAttr, u16)>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.iter.next() {
@ -99,7 +99,7 @@ impl Iterator for DocumentFieldsCountsIter<'_> {

pub struct DocumentsIdsIter<'txn> {
    last_seen_id: Option<DocumentId>,
    iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
    iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}

impl Iterator for DocumentsIdsIter<'_> {
@ -123,11 +123,11 @@ impl Iterator for DocumentsIdsIter<'_> {
}

pub struct AllDocumentsFieldsCountsIter<'txn> {
    iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u64>>,
    iter: heed::RoIter<'txn, OwnedType<DocumentAttrKey>, OwnedType<u16>>,
}

impl Iterator for AllDocumentsFieldsCountsIter<'_> {
    type Item = ZResult<(DocumentId, SchemaAttr, u64)>;
    type Item = ZResult<(DocumentId, SchemaAttr, u16)>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.iter.next() {
@ -303,11 +303,11 @@ impl<'a> SearchBuilder<'a> {
        if let Some(ranking_rules_order) = ranking_order {
            for rule in ranking_rules_order {
                match rule.as_str() {
                    "_sum_of_typos" => builder.push(SumOfTypos),
                    "_number_of_words" => builder.push(NumberOfWords),
                    "_word_proximity" => builder.push(WordsProximity),
                    "_sum_of_words_attribute" => builder.push(SumOfWordsAttribute),
                    "_sum_of_words_position" => builder.push(SumOfWordsPosition),
                    "_typo" => builder.push(Typo),
                    "_words" => builder.push(Words),
                    "_proximity" => builder.push(Proximity),
                    "_attribute" => builder.push(Attribute),
                    "_words_position" => builder.push(WordsPosition),
                    "_exact" => builder.push(Exact),
                    _ => {
                        let order = match ranking_rules.get(rule.as_str()) {
@ -333,11 +333,11 @@ impl<'a> SearchBuilder<'a> {
            builder.push(DocumentId);
            return Ok(Some(builder.build()));
        } else {
            builder.push(SumOfTypos);
            builder.push(NumberOfWords);
            builder.push(WordsProximity);
            builder.push(SumOfWordsAttribute);
            builder.push(SumOfWordsPosition);
            builder.push(Typo);
            builder.push(Words);
            builder.push(Proximity);
            builder.push(Attribute);
            builder.push(WordsPosition);
            builder.push(Exact);
            for (rule, order) in ranking_rules.iter() {
                let custom_ranking = match order {
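For reference, the renamed built-in rules handled above could be requested with a ranking order like the following (an assumed configuration snippet; only the rule names are taken from the match arms, and their order mirrors the defaults pushed in the else branch):

let ranking_order = vec![
    "_typo", "_words", "_proximity", "_attribute", "_words_position", "_exact",
];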
@ -63,3 +63,11 @@ pub struct Highlight {
    /// without needing to run the tokenizer again.
    pub char_length: u16,
}

#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "zerocopy", derive(AsBytes, FromBytes))]
#[repr(C)]
pub struct AttrCount {
    pub attr: u16,
    pub count: u16,
}