mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-12-04 03:35:43 +00:00
Compare commits
102 Commits
prototype-
...
no-cache-n
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
bdd363dd94 | ||
|
|
d6b3aae8a6 | ||
|
|
ac2d54b27c | ||
|
|
7d61697f19 | ||
|
|
97d2860998 | ||
|
|
15bf556291 | ||
|
|
3d244451df | ||
|
|
5f53935c8a | ||
|
|
29a7623c3f | ||
|
|
e97041f7d0 | ||
|
|
52d7f3ed1c | ||
|
|
86d5e6d9ff | ||
|
|
759b9b1546 | ||
|
|
3f7a500f3b | ||
|
|
974272f2e9 | ||
|
|
7ad037841f | ||
|
|
e0c7067355 | ||
|
|
6e87332410 | ||
|
|
2d1caf27df | ||
|
|
92678383d6 | ||
|
|
7f148c127c | ||
|
|
4ce5d3d66d | ||
|
|
ff931edb55 | ||
|
|
42b093687d | ||
|
|
835c5f98f9 | ||
|
|
f00664247d | ||
|
|
3c63d4a1e5 | ||
|
|
4551abf6d4 | ||
|
|
193d7f5d34 | ||
|
|
013acb3d93 | ||
|
|
f4ab1f168e | ||
|
|
1a0e962299 | ||
|
|
f13e076b8a | ||
|
|
7ba49b849e | ||
|
|
f7652186e1 | ||
|
|
b2f4e67c9a | ||
|
|
ff5d3b59f5 | ||
|
|
aa69308e45 | ||
|
|
eb9a20ff0b | ||
|
|
0d868f36d7 | ||
|
|
e7d9db078f | ||
|
|
3e9198ebaa | ||
|
|
2a0ad0982f | ||
|
|
2b317c681b | ||
|
|
39b5990f64 | ||
|
|
3848adf5a2 | ||
|
|
b4de06259e | ||
|
|
8287c2644f | ||
|
|
c1c44a0b81 | ||
|
|
04596f3616 | ||
|
|
24cb5839ad | ||
|
|
8d97b7b28c | ||
|
|
f69688e8f7 | ||
|
|
8fd0afaaaa | ||
|
|
72c6a21a30 | ||
|
|
8412be4a7d | ||
|
|
10f09c531f | ||
|
|
8fd99b111b | ||
|
|
f6b3d1f9a5 | ||
|
|
73ce67862d | ||
|
|
0fc02f7351 | ||
|
|
34f11e3380 | ||
|
|
27308eaab1 | ||
|
|
b33ec9ba3f | ||
|
|
9c0a1cd9fd | ||
|
|
0b061f1e70 | ||
|
|
19d937ab21 | ||
|
|
1d59c19cd2 | ||
|
|
98e48371c3 | ||
|
|
6d74fb0229 | ||
|
|
1eb75a1040 | ||
|
|
3b82d8b5b9 | ||
|
|
781a186f75 | ||
|
|
6a399556b5 | ||
|
|
27b4cab857 | ||
|
|
52d32b4ee9 | ||
|
|
da61408e52 | ||
|
|
fe69385bd7 | ||
|
|
c1557734dc | ||
|
|
c50d3edc4a | ||
|
|
5369bf4a62 | ||
|
|
bcb1aa3d22 | ||
|
|
9b7858fb90 | ||
|
|
ab01679a8f | ||
|
|
521775f788 | ||
|
|
72e7b7846e | ||
|
|
6526ce1208 | ||
|
|
e639ec79d1 | ||
|
|
bb885a5810 | ||
|
|
b625d31c7d | ||
|
|
6487a67f2b | ||
|
|
271ce91b3b | ||
|
|
54f2eb4507 | ||
|
|
794ebcd582 | ||
|
|
b7c77c7a39 | ||
|
|
0c57cf7565 | ||
|
|
27df9e6c73 | ||
|
|
45c060831e | ||
|
|
874c1ac538 | ||
|
|
e6ffa4d454 | ||
|
|
637a9c8bdd | ||
|
|
c683fa98e6 |
108
Cargo.lock
generated
108
Cargo.lock
generated
@@ -386,9 +386,8 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
|
||||
|
||||
[[package]]
|
||||
name = "arroy"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dfc5f272f38fa063bbff0a7ab5219404e221493de005e2b4078c62d626ef567e"
|
||||
version = "0.4.0"
|
||||
source = "git+https://github.com/meilisearch/arroy/?rev=2386594dfb009ce08821a925ccc89fb8e30bf73d#2386594dfb009ce08821a925ccc89fb8e30bf73d"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
@@ -472,7 +471,7 @@ checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6"
|
||||
|
||||
[[package]]
|
||||
name = "benchmarks"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"bytes",
|
||||
@@ -528,7 +527,7 @@ dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"regex",
|
||||
"rustc-hash 1.1.0",
|
||||
"rustc-hash",
|
||||
"shlex",
|
||||
"syn 2.0.60",
|
||||
]
|
||||
@@ -653,7 +652,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "build-info"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"time",
|
||||
@@ -935,8 +934,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "charabia"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "55ff52497324e7d168505a16949ae836c14595606fab94687238d2f6c8d4c798"
|
||||
source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71"
|
||||
dependencies = [
|
||||
"aho-corasick",
|
||||
"csv",
|
||||
@@ -1623,7 +1621,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "dump"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"big_s",
|
||||
@@ -1835,7 +1833,7 @@ checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a"
|
||||
|
||||
[[package]]
|
||||
name = "file-store"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"tempfile",
|
||||
"thiserror",
|
||||
@@ -1857,7 +1855,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "filter-parser"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"insta",
|
||||
"nom",
|
||||
@@ -1877,7 +1875,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "flatten-serde-json"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"criterion",
|
||||
"serde_json",
|
||||
@@ -2001,7 +1999,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "fuzzers"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"arbitrary",
|
||||
"clap",
|
||||
@@ -2222,11 +2220,11 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
|
||||
[[package]]
|
||||
name = "grenad"
|
||||
version = "0.4.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "350d89047298d3b1b40050acd11ab76e487b854a104b760ebc5a7f375093de77"
|
||||
source = "git+https://github.com/meilisearch/grenad?branch=various-improvements#58ac87d852413571102f44c5e55ca13509a3f1a0"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
"either",
|
||||
"rayon",
|
||||
"tempfile",
|
||||
]
|
||||
@@ -2309,9 +2307,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.14.3"
|
||||
version = "0.14.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "290f1a1d9242c78d09ce40a5e87e7554ee637af1351968159f4952f028f75604"
|
||||
checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1"
|
||||
dependencies = [
|
||||
"ahash 0.8.11",
|
||||
"allocator-api2",
|
||||
@@ -2553,7 +2551,7 @@ checksum = "206ca75c9c03ba3d4ace2460e57b189f39f43de612c2f85836e65c929701bb2d"
|
||||
|
||||
[[package]]
|
||||
name = "index-scheduler"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arroy",
|
||||
@@ -2571,6 +2569,7 @@ dependencies = [
|
||||
"meili-snap",
|
||||
"meilisearch-auth",
|
||||
"meilisearch-types",
|
||||
"memmap2",
|
||||
"page_size",
|
||||
"rayon",
|
||||
"roaring",
|
||||
@@ -2592,7 +2591,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown 0.14.3",
|
||||
"hashbrown 0.14.5",
|
||||
"serde",
|
||||
]
|
||||
|
||||
@@ -2651,8 +2650,7 @@ checksum = "28b29a3cd74f0f4598934efe3aeba42bae0eb4680554128851ebbecb02af14e6"
|
||||
[[package]]
|
||||
name = "irg-kvariants"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ef2af7c331f2536964a32b78a7d2e0963d78b42f4a76323b16cc7d94b1ddce26"
|
||||
source = "git+https://github.com/meilisearch/charabia?branch=mutualize-char-normalizer#f8d8308cdb8db80819be7eeed5652cc4a995cc71"
|
||||
dependencies = [
|
||||
"csv",
|
||||
"once_cell",
|
||||
@@ -2747,7 +2745,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "json-depth-checker"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"criterion",
|
||||
"serde_json",
|
||||
@@ -3366,7 +3364,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
||||
|
||||
[[package]]
|
||||
name = "meili-snap"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"insta",
|
||||
"md5",
|
||||
@@ -3375,7 +3373,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"actix-cors",
|
||||
"actix-http",
|
||||
@@ -3415,7 +3413,6 @@ dependencies = [
|
||||
"meilisearch-types",
|
||||
"mimalloc",
|
||||
"mime",
|
||||
"mopa-maintained",
|
||||
"num_cpus",
|
||||
"obkv",
|
||||
"once_cell",
|
||||
@@ -3465,7 +3462,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch-auth"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"base64 0.22.1",
|
||||
"enum-iterator",
|
||||
@@ -3484,7 +3481,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilisearch-types"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"actix-web",
|
||||
"anyhow",
|
||||
@@ -3514,7 +3511,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "meilitool"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"clap",
|
||||
@@ -3545,7 +3542,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "milli"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"arroy",
|
||||
"big_s",
|
||||
@@ -3569,6 +3566,7 @@ dependencies = [
|
||||
"fxhash",
|
||||
"geoutils",
|
||||
"grenad",
|
||||
"hashbrown 0.14.5",
|
||||
"heed",
|
||||
"hf-hub",
|
||||
"indexmap",
|
||||
@@ -3682,12 +3680,6 @@ dependencies = [
|
||||
"syn 2.0.60",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mopa-maintained"
|
||||
version = "0.2.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "79b7f3e22167862cc7c95b21a6f326c22e4bf40da59cbf000b368a310173ba11"
|
||||
|
||||
[[package]]
|
||||
name = "mutually_exclusive_features"
|
||||
version = "0.0.3"
|
||||
@@ -3844,9 +3836,8 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "obkv"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2e27bcfe835a379d32352112f6b8dbae2d99d16a5fff42abe6e5ba5386c1e5a"
|
||||
version = "0.3.0"
|
||||
source = "git+https://github.com/kerollmops/obkv?branch=unsized-kvreader#ce535874008ecac554f02e0c670e6caf62134d6b"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
@@ -3991,7 +3982,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||
|
||||
[[package]]
|
||||
name = "permissive-json-pointer"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"big_s",
|
||||
"serde_json",
|
||||
@@ -4322,7 +4313,7 @@ dependencies = [
|
||||
"pin-project-lite",
|
||||
"quinn-proto",
|
||||
"quinn-udp",
|
||||
"rustc-hash 1.1.0",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"thiserror",
|
||||
"tokio",
|
||||
@@ -4331,14 +4322,14 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "quinn-proto"
|
||||
version = "0.11.8"
|
||||
version = "0.11.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6"
|
||||
checksum = "ddf517c03a109db8100448a4be38d498df8a210a99fe0e1b9eaf39e78c640efe"
|
||||
dependencies = [
|
||||
"bytes",
|
||||
"rand",
|
||||
"ring",
|
||||
"rustc-hash 2.0.0",
|
||||
"rustc-hash",
|
||||
"rustls",
|
||||
"slab",
|
||||
"thiserror",
|
||||
@@ -4589,8 +4580,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "rhai"
|
||||
version = "1.20.0"
|
||||
source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4"
|
||||
version = "1.19.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "61797318be89b1a268a018a92a7657096d83f3ecb31418b9e9c16dcbb043b702"
|
||||
dependencies = [
|
||||
"ahash 0.8.11",
|
||||
"bitflags 2.6.0",
|
||||
@@ -4607,7 +4599,8 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "rhai_codegen"
|
||||
version = "2.2.0"
|
||||
source = "git+https://github.com/rhaiscript/rhai?rev=ef3df63121d27aacd838f366f2b83fd65f20a1e4#ef3df63121d27aacd838f366f2b83fd65f20a1e4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5a11a05ee1ce44058fa3d5961d05194fdbe3ad6b40f904af764d81b86450e6b"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -4661,8 +4654,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "roaring"
|
||||
version = "0.10.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f4b84ba6e838ceb47b41de5194a60244fac43d9fe03b71dbe8c5a201081d6d1"
|
||||
source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=clone-iter-slice#6bba84b1a47da1d6e52d5c4dc0ce8593ae4646a5"
|
||||
dependencies = [
|
||||
"bytemuck",
|
||||
"byteorder",
|
||||
@@ -4709,12 +4701,6 @@ version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
||||
|
||||
[[package]]
|
||||
name = "rustc-hash"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152"
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.4.0"
|
||||
@@ -4853,9 +4839,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.209"
|
||||
version = "1.0.210"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
|
||||
checksum = "c8e3592472072e6e22e0a54d5904d9febf8508f65fb8552499a1abc7d1078c3a"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
@@ -4871,9 +4857,9 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.209"
|
||||
version = "1.0.210"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
|
||||
checksum = "243902eda00fad750862fc144cea25caca5e20d615af0a81bee94ca738f1df1f"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
@@ -5366,7 +5352,7 @@ dependencies = [
|
||||
"fancy-regex 0.12.0",
|
||||
"lazy_static",
|
||||
"parking_lot",
|
||||
"rustc-hash 1.1.0",
|
||||
"rustc-hash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -6054,7 +6040,7 @@ version = "0.16.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "471d1c1645d361eb782a1650b1786a8fb58dd625e681a04c09f5ff7c8764a7b0"
|
||||
dependencies = [
|
||||
"hashbrown 0.14.3",
|
||||
"hashbrown 0.14.5",
|
||||
"once_cell",
|
||||
]
|
||||
|
||||
@@ -6380,7 +6366,7 @@ dependencies = [
|
||||
|
||||
[[package]]
|
||||
name = "xtask"
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"build-info",
|
||||
|
||||
24
Cargo.toml
24
Cargo.toml
@@ -22,7 +22,7 @@ members = [
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
version = "1.11.3"
|
||||
version = "1.11.0"
|
||||
authors = [
|
||||
"Quentin de Quelen <quentin@dequelen.me>",
|
||||
"Clément Renault <clement@meilisearch.com>",
|
||||
@@ -44,23 +44,5 @@ opt-level = 3
|
||||
[profile.dev.package.roaring]
|
||||
opt-level = 3
|
||||
|
||||
[profile.dev.package.lindera-ipadic-builder]
|
||||
opt-level = 3
|
||||
[profile.dev.package.encoding]
|
||||
opt-level = 3
|
||||
[profile.dev.package.yada]
|
||||
opt-level = 3
|
||||
|
||||
[profile.release.package.lindera-ipadic-builder]
|
||||
opt-level = 3
|
||||
[profile.release.package.encoding]
|
||||
opt-level = 3
|
||||
[profile.release.package.yada]
|
||||
opt-level = 3
|
||||
|
||||
[profile.bench.package.lindera-ipadic-builder]
|
||||
opt-level = 3
|
||||
[profile.bench.package.encoding]
|
||||
opt-level = 3
|
||||
[profile.bench.package.yada]
|
||||
opt-level = 3
|
||||
[patch.crates-io]
|
||||
roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "clone-iter-slice" }
|
||||
|
||||
@@ -29,6 +29,7 @@ serde_json = { version = "1.0.120", features = ["preserve_order"] }
|
||||
synchronoise = "1.0.1"
|
||||
tempfile = "3.10.1"
|
||||
thiserror = "1.0.61"
|
||||
memmap2 = "0.9.4"
|
||||
time = { version = "0.3.36", features = [
|
||||
"serde-well-known",
|
||||
"formatting",
|
||||
@@ -40,7 +41,7 @@ ureq = "2.10.0"
|
||||
uuid = { version = "1.10.0", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
arroy = "0.5.0"
|
||||
arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
|
||||
big_s = "1.0.2"
|
||||
crossbeam = "0.8.4"
|
||||
insta = { version = "1.39.0", features = ["json", "redactions"] }
|
||||
|
||||
@@ -28,6 +28,9 @@ use meilisearch_types::error::Code;
|
||||
use meilisearch_types::heed::{RoTxn, RwTxn};
|
||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use meilisearch_types::milli::heed::CompactionOption;
|
||||
use meilisearch_types::milli::update::new::indexer::{
|
||||
self, retrieve_or_guess_primary_key, DocumentChanges,
|
||||
};
|
||||
use meilisearch_types::milli::update::{
|
||||
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
|
||||
};
|
||||
@@ -875,10 +878,8 @@ impl IndexScheduler {
|
||||
while let Some(doc) =
|
||||
cursor.next_document().map_err(milli::Error::from)?
|
||||
{
|
||||
dump_content_file.push_document(&obkv_to_object(
|
||||
&doc,
|
||||
&documents_batch_index,
|
||||
)?)?;
|
||||
dump_content_file
|
||||
.push_document(&obkv_to_object(doc, &documents_batch_index)?)?;
|
||||
}
|
||||
dump_content_file.flush()?;
|
||||
}
|
||||
@@ -1252,58 +1253,52 @@ impl IndexScheduler {
|
||||
let must_stop_processing = self.must_stop_processing.clone();
|
||||
let indexer_config = self.index_mapper.indexer_config();
|
||||
|
||||
if let Some(primary_key) = primary_key {
|
||||
match index.primary_key(index_wtxn)? {
|
||||
// if a primary key was set AND had already been defined in the index
|
||||
// but to a different value, we can make the whole batch fail.
|
||||
Some(pk) => {
|
||||
if primary_key != pk {
|
||||
return Err(milli::Error::from(
|
||||
milli::UserError::PrimaryKeyCannotBeChanged(pk.to_string()),
|
||||
)
|
||||
.into());
|
||||
}
|
||||
}
|
||||
// if the primary key was set and there was no primary key set for this index
|
||||
// we set it to the received value before starting the indexing process.
|
||||
None => {
|
||||
let mut builder =
|
||||
milli::update::Settings::new(index_wtxn, index, indexer_config);
|
||||
builder.set_primary_key(primary_key);
|
||||
builder.execute(
|
||||
|indexing_step| tracing::debug!(update = ?indexing_step),
|
||||
|| must_stop_processing.clone().get(),
|
||||
)?;
|
||||
primary_key_has_been_set = true;
|
||||
/// TODO manage errors correctly
|
||||
let rtxn = index.read_txn()?;
|
||||
let first_addition_uuid = operations
|
||||
.iter()
|
||||
.find_map(|op| match op {
|
||||
DocumentOperation::Add(content_uuid) => Some(content_uuid),
|
||||
_ => None,
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
let mut content_files = Vec::new();
|
||||
for operation in &operations {
|
||||
if let DocumentOperation::Add(content_uuid) = operation {
|
||||
let content_file = self.file_store.get_update(*content_uuid)?;
|
||||
let mmap = unsafe { memmap2::Mmap::map(&content_file)? };
|
||||
if !mmap.is_empty() {
|
||||
content_files.push(mmap);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let config = IndexDocumentsConfig { update_method: method, ..Default::default() };
|
||||
let mut fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||
let first_document = match content_files.first() {
|
||||
Some(mmap) => {
|
||||
let mut iter = serde_json::Deserializer::from_slice(mmap).into_iter();
|
||||
iter.next().transpose().map_err(|e| e.into()).map_err(Error::IoError)?
|
||||
}
|
||||
None => None,
|
||||
};
|
||||
|
||||
let embedder_configs = index.embedding_configs(index_wtxn)?;
|
||||
// TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense)
|
||||
let embedders = self.embedders(embedder_configs)?;
|
||||
|
||||
let mut builder = milli::update::IndexDocuments::new(
|
||||
index_wtxn,
|
||||
let primary_key = retrieve_or_guess_primary_key(
|
||||
&rtxn,
|
||||
index,
|
||||
indexer_config,
|
||||
config,
|
||||
|indexing_step| tracing::trace!(?indexing_step, "Update"),
|
||||
|| must_stop_processing.get(),
|
||||
)?;
|
||||
&mut fields_ids_map,
|
||||
first_document.as_ref(),
|
||||
)?
|
||||
.unwrap();
|
||||
|
||||
let mut content_files_iter = content_files.iter();
|
||||
let mut indexer = indexer::DocumentOperation::new(method);
|
||||
for (operation, task) in operations.into_iter().zip(tasks.iter_mut()) {
|
||||
match operation {
|
||||
DocumentOperation::Add(content_uuid) => {
|
||||
let content_file = self.file_store.get_update(content_uuid)?;
|
||||
let reader = DocumentsBatchReader::from_reader(content_file)
|
||||
.map_err(milli::Error::from)?;
|
||||
let (new_builder, user_result) = builder.add_documents(reader)?;
|
||||
builder = new_builder;
|
||||
|
||||
builder = builder.with_embedders(embedders.clone());
|
||||
DocumentOperation::Add(_content_uuid) => {
|
||||
let mmap = content_files_iter.next().unwrap();
|
||||
let stats = indexer.add_documents(mmap)?;
|
||||
// builder = builder.with_embedders(embedders.clone());
|
||||
|
||||
let received_documents =
|
||||
if let Some(Details::DocumentAdditionOrUpdate {
|
||||
@@ -1317,30 +1312,17 @@ impl IndexScheduler {
|
||||
unreachable!();
|
||||
};
|
||||
|
||||
match user_result {
|
||||
Ok(count) => {
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentAdditionOrUpdate {
|
||||
received_documents,
|
||||
indexed_documents: Some(count),
|
||||
})
|
||||
}
|
||||
Err(e) => {
|
||||
task.status = Status::Failed;
|
||||
task.details = Some(Details::DocumentAdditionOrUpdate {
|
||||
received_documents,
|
||||
indexed_documents: Some(0),
|
||||
});
|
||||
task.error = Some(milli::Error::from(e).into());
|
||||
}
|
||||
}
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentAdditionOrUpdate {
|
||||
received_documents,
|
||||
indexed_documents: Some(stats.document_count as u64),
|
||||
})
|
||||
}
|
||||
DocumentOperation::Delete(document_ids) => {
|
||||
let (new_builder, user_result) =
|
||||
builder.remove_documents(document_ids)?;
|
||||
builder = new_builder;
|
||||
let count = document_ids.len();
|
||||
indexer.delete_documents(document_ids);
|
||||
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
||||
let count = user_result.unwrap();
|
||||
// let count = user_result.unwrap();
|
||||
let provided_ids =
|
||||
if let Some(Details::DocumentDeletion { provided_ids, .. }) =
|
||||
task.details
|
||||
@@ -1354,26 +1336,35 @@ impl IndexScheduler {
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentDeletion {
|
||||
provided_ids,
|
||||
deleted_documents: Some(count),
|
||||
deleted_documents: Some(count as u64),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !tasks.iter().all(|res| res.error.is_some()) {
|
||||
let addition = builder.execute()?;
|
||||
tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||
} else if primary_key_has_been_set {
|
||||
// Everything failed but we've set a primary key.
|
||||
// We need to remove it.
|
||||
let mut builder =
|
||||
milli::update::Settings::new(index_wtxn, index, indexer_config);
|
||||
builder.reset_primary_key();
|
||||
builder.execute(
|
||||
|indexing_step| tracing::trace!(update = ?indexing_step),
|
||||
|| must_stop_processing.clone().get(),
|
||||
)?;
|
||||
/// TODO create a pool if needed
|
||||
// let pool = indexer_config.thread_pool.unwrap();
|
||||
let pool = rayon::ThreadPoolBuilder::new().build().unwrap();
|
||||
|
||||
let param = (index, &rtxn, &primary_key);
|
||||
let document_changes = indexer.document_changes(&mut fields_ids_map, param)?;
|
||||
/// TODO pass/write the FieldsIdsMap
|
||||
indexer::index(index_wtxn, index, fields_ids_map, &pool, document_changes)?;
|
||||
|
||||
// tracing::info!(indexing_result = ?addition, processed_in = ?started_processing_at.elapsed(), "document indexing done");
|
||||
}
|
||||
// else if primary_key_has_been_set {
|
||||
// // Everything failed but we've set a primary key.
|
||||
// // We need to remove it.
|
||||
// let mut builder =
|
||||
// milli::update::Settings::new(index_wtxn, index, indexer_config);
|
||||
// builder.reset_primary_key();
|
||||
// builder.execute(
|
||||
// |indexing_step| tracing::trace!(update = ?indexing_step),
|
||||
// || must_stop_processing.clone().get(),
|
||||
// )?;
|
||||
// }
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
|
||||
@@ -1263,7 +1263,7 @@ impl IndexScheduler {
|
||||
#[cfg(test)]
|
||||
self.maybe_fail(tests::FailureLocation::UpdatingTaskAfterProcessBatchFailure)?;
|
||||
|
||||
tracing::error!("Batch failed {}", error);
|
||||
tracing::info!("Batch failed {}", error);
|
||||
|
||||
self.update_task(&mut wtxn, &task)
|
||||
.map_err(|e| Error::TaskDatabaseUpdate(Box::new(e)))?;
|
||||
@@ -5201,10 +5201,9 @@ mod tests {
|
||||
|
||||
let configs = index_scheduler.embedders(configs).unwrap();
|
||||
let (hf_embedder, _, _) = configs.get(&simple_hf_name).unwrap();
|
||||
let beagle_embed =
|
||||
hf_embedder.embed_one(S("Intel the beagle best doggo"), None).unwrap();
|
||||
let lab_embed = hf_embedder.embed_one(S("Max the lab best doggo"), None).unwrap();
|
||||
let patou_embed = hf_embedder.embed_one(S("kefir the patou best doggo"), None).unwrap();
|
||||
let beagle_embed = hf_embedder.embed_one(S("Intel the beagle best doggo")).unwrap();
|
||||
let lab_embed = hf_embedder.embed_one(S("Max the lab best doggo")).unwrap();
|
||||
let patou_embed = hf_embedder.embed_one(S("kefir the patou best doggo")).unwrap();
|
||||
(fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed)
|
||||
};
|
||||
|
||||
|
||||
@@ -66,8 +66,5 @@ khmer = ["milli/khmer"]
|
||||
vietnamese = ["milli/vietnamese"]
|
||||
# force swedish character recomposition
|
||||
swedish-recomposition = ["milli/swedish-recomposition"]
|
||||
# allow german tokenization
|
||||
# force german character recomposition
|
||||
german = ["milli/german"]
|
||||
# allow turkish normalization
|
||||
turkish = ["milli/turkish"]
|
||||
|
||||
|
||||
@@ -1,20 +1,22 @@
|
||||
use std::fmt::{self, Debug, Display};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufWriter, Write};
|
||||
use std::io::{self, BufWriter};
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use memmap2::MmapOptions;
|
||||
use milli::documents::{DocumentsBatchBuilder, Error};
|
||||
use memmap2::Mmap;
|
||||
use milli::documents::Error;
|
||||
use milli::update::new::TopLevelMap;
|
||||
use milli::Object;
|
||||
use serde::de::{SeqAccess, Visitor};
|
||||
use serde::{Deserialize, Deserializer};
|
||||
use serde_json::error::Category;
|
||||
use serde_json::{to_writer, Map, Value};
|
||||
|
||||
use crate::error::{Code, ErrorCode};
|
||||
|
||||
type Result<T> = std::result::Result<T, DocumentFormatError>;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum PayloadType {
|
||||
Ndjson,
|
||||
Json,
|
||||
@@ -88,6 +90,26 @@ impl From<(PayloadType, Error)> for DocumentFormatError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<(PayloadType, serde_json::Error)> for DocumentFormatError {
|
||||
fn from((ty, error): (PayloadType, serde_json::Error)) -> Self {
|
||||
if error.classify() == Category::Data {
|
||||
Self::Io(error.into())
|
||||
} else {
|
||||
Self::MalformedPayload(Error::Json(error), ty)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<(PayloadType, csv::Error)> for DocumentFormatError {
|
||||
fn from((ty, error): (PayloadType, csv::Error)) -> Self {
|
||||
if error.is_io_error() {
|
||||
Self::Io(error.into())
|
||||
} else {
|
||||
Self::MalformedPayload(Error::Csv(error), ty)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<io::Error> for DocumentFormatError {
|
||||
fn from(error: io::Error) -> Self {
|
||||
Self::Io(error)
|
||||
@@ -103,67 +125,140 @@ impl ErrorCode for DocumentFormatError {
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads CSV from input and write an obkv batch to writer.
|
||||
pub fn read_csv(file: &File, writer: impl Write, delimiter: u8) -> Result<u64> {
|
||||
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
|
||||
let mmap = unsafe { MmapOptions::new().map(file)? };
|
||||
let csv = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(mmap.as_ref());
|
||||
builder.append_csv(csv).map_err(|e| (PayloadType::Csv { delimiter }, e))?;
|
||||
|
||||
let count = builder.documents_count();
|
||||
let _ = builder.into_inner().map_err(DocumentFormatError::Io)?;
|
||||
|
||||
Ok(count as u64)
|
||||
// TODO remove that from the place I've borrowed it
|
||||
#[derive(Debug)]
|
||||
enum AllowedType {
|
||||
String,
|
||||
Boolean,
|
||||
Number,
|
||||
}
|
||||
|
||||
/// Reads JSON from temporary file and write an obkv batch to writer.
|
||||
pub fn read_json(file: &File, writer: impl Write) -> Result<u64> {
|
||||
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
|
||||
let mmap = unsafe { MmapOptions::new().map(file)? };
|
||||
let mut deserializer = serde_json::Deserializer::from_slice(&mmap);
|
||||
fn parse_csv_header(header: &str) -> (&str, AllowedType) {
|
||||
// if there are several separators we only split on the last one.
|
||||
match header.rsplit_once(':') {
|
||||
Some((field_name, field_type)) => match field_type {
|
||||
"string" => (field_name, AllowedType::String),
|
||||
"boolean" => (field_name, AllowedType::Boolean),
|
||||
"number" => (field_name, AllowedType::Number),
|
||||
// if the pattern isn't recognized, we keep the whole field.
|
||||
_otherwise => (header, AllowedType::String),
|
||||
},
|
||||
None => (header, AllowedType::String),
|
||||
}
|
||||
}
|
||||
|
||||
match array_each(&mut deserializer, |obj| builder.append_json_object(&obj)) {
|
||||
/// Reads CSV from file and write it in NDJSON in a file checking it along the way.
|
||||
pub fn read_csv(input: &File, output: impl io::Write, delimiter: u8) -> Result<u64> {
|
||||
let ptype = PayloadType::Csv { delimiter };
|
||||
let mut output = BufWriter::new(output);
|
||||
let mut reader = csv::ReaderBuilder::new().delimiter(delimiter).from_reader(input);
|
||||
|
||||
let headers = reader.headers().map_err(|e| DocumentFormatError::from((ptype, e)))?.clone();
|
||||
let typed_fields: Vec<_> = headers.iter().map(parse_csv_header).collect();
|
||||
let mut object: Map<_, _> =
|
||||
typed_fields.iter().map(|(k, _)| (k.to_string(), Value::Null)).collect();
|
||||
|
||||
let mut line = 0;
|
||||
let mut record = csv::StringRecord::new();
|
||||
while reader.read_record(&mut record).map_err(|e| DocumentFormatError::from((ptype, e)))? {
|
||||
// We increment here and not at the end of the loop
|
||||
// to take the header offset into account.
|
||||
line += 1;
|
||||
|
||||
// Reset the document values
|
||||
object.iter_mut().for_each(|(_, v)| *v = Value::Null);
|
||||
|
||||
for (i, (name, atype)) in typed_fields.iter().enumerate() {
|
||||
let value = &record[i];
|
||||
let trimmed_value = value.trim();
|
||||
let value = match atype {
|
||||
AllowedType::Number if trimmed_value.is_empty() => Value::Null,
|
||||
AllowedType::Number => match trimmed_value.parse::<i64>() {
|
||||
Ok(integer) => Value::from(integer),
|
||||
Err(_) => match trimmed_value.parse::<f64>() {
|
||||
Ok(float) => Value::from(float),
|
||||
Err(error) => {
|
||||
return Err(DocumentFormatError::MalformedPayload(
|
||||
Error::ParseFloat { error, line, value: value.to_string() },
|
||||
ptype,
|
||||
))
|
||||
}
|
||||
},
|
||||
},
|
||||
AllowedType::Boolean if trimmed_value.is_empty() => Value::Null,
|
||||
AllowedType::Boolean => match trimmed_value.parse::<bool>() {
|
||||
Ok(bool) => Value::from(bool),
|
||||
Err(error) => {
|
||||
return Err(DocumentFormatError::MalformedPayload(
|
||||
Error::ParseBool { error, line, value: value.to_string() },
|
||||
ptype,
|
||||
))
|
||||
}
|
||||
},
|
||||
AllowedType::String if value.is_empty() => Value::Null,
|
||||
AllowedType::String => Value::from(value),
|
||||
};
|
||||
|
||||
*object.get_mut(*name).expect("encountered an unknown field") = value;
|
||||
}
|
||||
|
||||
to_writer(&mut output, &object).map_err(|e| DocumentFormatError::from((ptype, e)))?;
|
||||
}
|
||||
|
||||
Ok(line as u64)
|
||||
}
|
||||
|
||||
/// Reads JSON from file and write it in NDJSON in a file checking it along the way.
|
||||
pub fn read_json(input: &File, output: impl io::Write) -> Result<u64> {
|
||||
// We memory map to be able to deserailize into a TopLevelMap<'pl> that
|
||||
// does not allocate when possible and only materialize the first/top level.
|
||||
let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
|
||||
|
||||
let mut out = BufWriter::new(output);
|
||||
let mut deserializer = serde_json::Deserializer::from_slice(&input);
|
||||
let count = match array_each(&mut deserializer, |obj: TopLevelMap| to_writer(&mut out, &obj)) {
|
||||
// The json data has been deserialized and does not need to be processed again.
|
||||
// The data has been transferred to the writer during the deserialization process.
|
||||
Ok(Ok(_)) => (),
|
||||
Ok(Err(e)) => return Err(DocumentFormatError::Io(e)),
|
||||
Ok(Ok(count)) => count,
|
||||
Ok(Err(e)) => return Err(DocumentFormatError::from((PayloadType::Json, e))),
|
||||
Err(e) => {
|
||||
// Attempt to deserialize a single json string when the cause of the exception is not Category.data
|
||||
// Other types of deserialisation exceptions are returned directly to the front-end
|
||||
if e.classify() != serde_json::error::Category::Data {
|
||||
return Err(DocumentFormatError::MalformedPayload(
|
||||
Error::Json(e),
|
||||
PayloadType::Json,
|
||||
));
|
||||
if e.classify() != Category::Data {
|
||||
return Err(DocumentFormatError::from((PayloadType::Json, e)));
|
||||
}
|
||||
|
||||
let content: Object = serde_json::from_slice(&mmap)
|
||||
let content: Object = serde_json::from_slice(&input)
|
||||
.map_err(Error::Json)
|
||||
.map_err(|e| (PayloadType::Json, e))?;
|
||||
builder.append_json_object(&content).map_err(DocumentFormatError::Io)?;
|
||||
to_writer(&mut out, &content)
|
||||
.map(|_| 1)
|
||||
.map_err(|e| DocumentFormatError::from((PayloadType::Json, e)))?
|
||||
}
|
||||
};
|
||||
|
||||
match out.into_inner() {
|
||||
Ok(_) => Ok(count),
|
||||
Err(ie) => Err(DocumentFormatError::Io(ie.into_error())),
|
||||
}
|
||||
|
||||
let count = builder.documents_count();
|
||||
let _ = builder.into_inner().map_err(DocumentFormatError::Io)?;
|
||||
|
||||
Ok(count as u64)
|
||||
}
|
||||
|
||||
/// Reads JSON from temporary file and write an obkv batch to writer.
|
||||
pub fn read_ndjson(file: &File, writer: impl Write) -> Result<u64> {
|
||||
let mut builder = DocumentsBatchBuilder::new(BufWriter::new(writer));
|
||||
let mmap = unsafe { MmapOptions::new().map(file)? };
|
||||
/// Reads NDJSON from file and write it in NDJSON in a file checking it along the way.
|
||||
pub fn read_ndjson(input: &File, output: impl io::Write) -> Result<u64> {
|
||||
// We memory map to be able to deserailize into a TopLevelMap<'pl> that
|
||||
// does not allocate when possible and only materialize the first/top level.
|
||||
let input = unsafe { Mmap::map(input).map_err(DocumentFormatError::Io)? };
|
||||
let mut output = BufWriter::new(output);
|
||||
|
||||
for result in serde_json::Deserializer::from_slice(&mmap).into_iter() {
|
||||
let object = result.map_err(Error::Json).map_err(|e| (PayloadType::Ndjson, e))?;
|
||||
builder.append_json_object(&object).map_err(Into::into).map_err(DocumentFormatError::Io)?;
|
||||
let mut count = 0;
|
||||
for result in serde_json::Deserializer::from_slice(&input).into_iter() {
|
||||
count += 1;
|
||||
result
|
||||
.and_then(|map: TopLevelMap| to_writer(&mut output, &map))
|
||||
.map_err(|e| DocumentFormatError::from((PayloadType::Ndjson, e)))?;
|
||||
}
|
||||
|
||||
let count = builder.documents_count();
|
||||
let _ = builder.into_inner().map_err(Into::into).map_err(DocumentFormatError::Io)?;
|
||||
|
||||
Ok(count as u64)
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
/// The actual handling of the deserialization process in serde
|
||||
@@ -172,20 +267,23 @@ pub fn read_ndjson(file: &File, writer: impl Write) -> Result<u64> {
|
||||
/// ## References
|
||||
/// <https://serde.rs/stream-array.html>
|
||||
/// <https://github.com/serde-rs/json/issues/160>
|
||||
fn array_each<'de, D, T, F>(deserializer: D, f: F) -> std::result::Result<io::Result<u64>, D::Error>
|
||||
fn array_each<'de, D, T, F>(
|
||||
deserializer: D,
|
||||
f: F,
|
||||
) -> std::result::Result<serde_json::Result<u64>, D::Error>
|
||||
where
|
||||
D: Deserializer<'de>,
|
||||
T: Deserialize<'de>,
|
||||
F: FnMut(T) -> io::Result<()>,
|
||||
F: FnMut(T) -> serde_json::Result<()>,
|
||||
{
|
||||
struct SeqVisitor<T, F>(F, PhantomData<T>);
|
||||
|
||||
impl<'de, T, F> Visitor<'de> for SeqVisitor<T, F>
|
||||
where
|
||||
T: Deserialize<'de>,
|
||||
F: FnMut(T) -> io::Result<()>,
|
||||
F: FnMut(T) -> serde_json::Result<()>,
|
||||
{
|
||||
type Value = io::Result<u64>;
|
||||
type Value = serde_json::Result<u64>;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str("a nonempty sequence")
|
||||
@@ -194,7 +292,7 @@ where
|
||||
fn visit_seq<A>(
|
||||
mut self,
|
||||
mut seq: A,
|
||||
) -> std::result::Result<io::Result<u64>, <A as SeqAccess<'de>>::Error>
|
||||
) -> std::result::Result<serde_json::Result<u64>, <A as SeqAccess<'de>>::Error>
|
||||
where
|
||||
A: SeqAccess<'de>,
|
||||
{
|
||||
@@ -203,7 +301,7 @@ where
|
||||
match self.0(value) {
|
||||
Ok(()) => max += 1,
|
||||
Err(e) => return Ok(Err(e)),
|
||||
};
|
||||
}
|
||||
}
|
||||
Ok(Ok(max))
|
||||
}
|
||||
|
||||
@@ -57,7 +57,7 @@ meilisearch-types = { path = "../meilisearch-types" }
|
||||
mimalloc = { version = "0.1.43", default-features = false }
|
||||
mime = "0.3.17"
|
||||
num_cpus = "1.16.0"
|
||||
obkv = "0.2.2"
|
||||
obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" }
|
||||
once_cell = "1.19.0"
|
||||
ordered-float = "4.2.1"
|
||||
parking_lot = "0.12.3"
|
||||
@@ -75,7 +75,7 @@ reqwest = { version = "0.12.5", features = [
|
||||
rustls = { version = "0.23.11", features = ["ring"], default-features = false }
|
||||
rustls-pki-types = { version = "1.7.0", features = ["alloc"] }
|
||||
rustls-pemfile = "2.1.2"
|
||||
segment = { version = "0.2.4" }
|
||||
segment = { version = "0.2.4", optional = true }
|
||||
serde = { version = "1.0.204", features = ["derive"] }
|
||||
serde_json = { version = "1.0.120", features = ["preserve_order"] }
|
||||
sha2 = "0.10.8"
|
||||
@@ -104,7 +104,6 @@ tracing-trace = { version = "0.1.0", path = "../tracing-trace" }
|
||||
tracing-actix-web = "0.7.11"
|
||||
build-info = { version = "1.7.0", path = "../build-info" }
|
||||
roaring = "0.10.2"
|
||||
mopa-maintained = "0.2.3"
|
||||
|
||||
[dev-dependencies]
|
||||
actix-rt = "2.10.0"
|
||||
@@ -132,7 +131,8 @@ tempfile = { version = "3.10.1", optional = true }
|
||||
zip = { version = "2.1.3", optional = true }
|
||||
|
||||
[features]
|
||||
default = ["meilisearch-types/all-tokenizations", "mini-dashboard"]
|
||||
default = ["analytics", "meilisearch-types/all-tokenizations", "mini-dashboard"]
|
||||
analytics = ["segment"]
|
||||
mini-dashboard = [
|
||||
"static-files",
|
||||
"anyhow",
|
||||
@@ -154,8 +154,7 @@ khmer = ["meilisearch-types/khmer"]
|
||||
vietnamese = ["meilisearch-types/vietnamese"]
|
||||
swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
|
||||
german = ["meilisearch-types/german"]
|
||||
turkish = ["meilisearch-types/turkish"]
|
||||
|
||||
[package.metadata.mini-dashboard]
|
||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.15/build.zip"
|
||||
sha1 = "d057600b4a839a2e0c0be7a372cd1b2683f3ca7e"
|
||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip"
|
||||
sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe"
|
||||
|
||||
109
meilisearch/src/analytics/mock_analytics.rs
Normal file
109
meilisearch/src/analytics/mock_analytics.rs
Normal file
@@ -0,0 +1,109 @@
|
||||
use std::any::Any;
|
||||
use std::sync::Arc;
|
||||
|
||||
use actix_web::HttpRequest;
|
||||
use meilisearch_types::InstanceUid;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::{find_user_id, Analytics, DocumentDeletionKind, DocumentFetchKind};
|
||||
use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery};
|
||||
use crate::Opt;
|
||||
|
||||
pub struct MockAnalytics {
|
||||
instance_uid: Option<InstanceUid>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SearchAggregator;
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl SearchAggregator {
|
||||
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self, _: &dyn Any) {}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SimilarAggregator;
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl SimilarAggregator {
|
||||
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self, _: &dyn Any) {}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct MultiSearchAggregator;
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl MultiSearchAggregator {
|
||||
pub fn from_federated_search(_: &dyn Any, _: &dyn Any) -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self) {}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct FacetSearchAggregator;
|
||||
|
||||
#[allow(dead_code)]
|
||||
impl FacetSearchAggregator {
|
||||
pub fn from_query(_: &dyn Any, _: &dyn Any) -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self, _: &dyn Any) {}
|
||||
}
|
||||
|
||||
impl MockAnalytics {
|
||||
#[allow(clippy::new_ret_no_self)]
|
||||
pub fn new(opt: &Opt) -> Arc<dyn Analytics> {
|
||||
let instance_uid = find_user_id(&opt.db_path);
|
||||
Arc::new(Self { instance_uid })
|
||||
}
|
||||
}
|
||||
|
||||
impl Analytics for MockAnalytics {
|
||||
fn instance_uid(&self) -> Option<&meilisearch_types::InstanceUid> {
|
||||
self.instance_uid.as_ref()
|
||||
}
|
||||
|
||||
// These methods are noop and should be optimized out
|
||||
fn publish(&self, _event_name: String, _send: Value, _request: Option<&HttpRequest>) {}
|
||||
fn get_search(&self, _aggregate: super::SearchAggregator) {}
|
||||
fn post_search(&self, _aggregate: super::SearchAggregator) {}
|
||||
fn get_similar(&self, _aggregate: super::SimilarAggregator) {}
|
||||
fn post_similar(&self, _aggregate: super::SimilarAggregator) {}
|
||||
fn post_multi_search(&self, _aggregate: super::MultiSearchAggregator) {}
|
||||
fn post_facet_search(&self, _aggregate: super::FacetSearchAggregator) {}
|
||||
fn add_documents(
|
||||
&self,
|
||||
_documents_query: &UpdateDocumentsQuery,
|
||||
_index_creation: bool,
|
||||
_request: &HttpRequest,
|
||||
) {
|
||||
}
|
||||
fn delete_documents(&self, _kind: DocumentDeletionKind, _request: &HttpRequest) {}
|
||||
fn update_documents(
|
||||
&self,
|
||||
_documents_query: &UpdateDocumentsQuery,
|
||||
_index_creation: bool,
|
||||
_request: &HttpRequest,
|
||||
) {
|
||||
}
|
||||
fn update_documents_by_function(
|
||||
&self,
|
||||
_documents_query: &DocumentEditionByFunction,
|
||||
_index_creation: bool,
|
||||
_request: &HttpRequest,
|
||||
) {
|
||||
}
|
||||
fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
|
||||
fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {}
|
||||
}
|
||||
@@ -1,45 +1,44 @@
|
||||
pub mod segment_analytics;
|
||||
mod mock_analytics;
|
||||
#[cfg(feature = "analytics")]
|
||||
mod segment_analytics;
|
||||
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use actix_web::HttpRequest;
|
||||
use index_scheduler::IndexScheduler;
|
||||
use meilisearch_auth::AuthController;
|
||||
use meilisearch_types::InstanceUid;
|
||||
use mopa::mopafy;
|
||||
pub use mock_analytics::MockAnalytics;
|
||||
use once_cell::sync::Lazy;
|
||||
use platform_dirs::AppDirs;
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::routes::indexes::documents::{DocumentEditionByFunction, UpdateDocumentsQuery};
|
||||
|
||||
// if the analytics feature is disabled
|
||||
// the `SegmentAnalytics` point to the mock instead of the real analytics
|
||||
#[cfg(not(feature = "analytics"))]
|
||||
pub type SegmentAnalytics = mock_analytics::MockAnalytics;
|
||||
#[cfg(not(feature = "analytics"))]
|
||||
pub type SearchAggregator = mock_analytics::SearchAggregator;
|
||||
#[cfg(not(feature = "analytics"))]
|
||||
pub type SimilarAggregator = mock_analytics::SimilarAggregator;
|
||||
#[cfg(not(feature = "analytics"))]
|
||||
pub type MultiSearchAggregator = mock_analytics::MultiSearchAggregator;
|
||||
#[cfg(not(feature = "analytics"))]
|
||||
pub type FacetSearchAggregator = mock_analytics::FacetSearchAggregator;
|
||||
|
||||
// if the feature analytics is enabled we use the real analytics
|
||||
#[cfg(feature = "analytics")]
|
||||
pub type SegmentAnalytics = segment_analytics::SegmentAnalytics;
|
||||
|
||||
use crate::Opt;
|
||||
|
||||
/// A macro used to quickly define events that don't aggregate or send anything besides an empty event with its name.
|
||||
#[macro_export]
|
||||
macro_rules! empty_analytics {
|
||||
($struct_name:ident, $event_name:literal) => {
|
||||
#[derive(Default)]
|
||||
struct $struct_name {}
|
||||
|
||||
impl $crate::analytics::Aggregate for $struct_name {
|
||||
fn event_name(&self) -> &'static str {
|
||||
$event_name
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, _other: Box<Self>) -> Box<Self> {
|
||||
self
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::json!({})
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
#[cfg(feature = "analytics")]
|
||||
pub type SearchAggregator = segment_analytics::SearchAggregator;
|
||||
#[cfg(feature = "analytics")]
|
||||
pub type SimilarAggregator = segment_analytics::SimilarAggregator;
|
||||
#[cfg(feature = "analytics")]
|
||||
pub type MultiSearchAggregator = segment_analytics::MultiSearchAggregator;
|
||||
#[cfg(feature = "analytics")]
|
||||
pub type FacetSearchAggregator = segment_analytics::FacetSearchAggregator;
|
||||
|
||||
/// The Meilisearch config dir:
|
||||
/// `~/.config/Meilisearch` on *NIX or *BSD.
|
||||
@@ -79,88 +78,60 @@ pub enum DocumentFetchKind {
|
||||
Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
|
||||
}
|
||||
|
||||
/// To send an event to segment, your event must be able to aggregate itself with another event of the same type.
|
||||
pub trait Aggregate: 'static + mopa::Any + Send {
|
||||
/// The name of the event that will be sent to segment.
|
||||
fn event_name(&self) -> &'static str;
|
||||
|
||||
/// Will be called every time an event has been used twice before segment flushed its buffer.
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self>
|
||||
where
|
||||
Self: Sized;
|
||||
|
||||
/// Converts your structure to the final event that'll be sent to segment.
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value;
|
||||
}
|
||||
|
||||
mopafy!(Aggregate);
|
||||
|
||||
/// Helper trait to define multiple aggregates with the same content but a different name.
|
||||
/// Commonly used when you must aggregate a search with POST or with GET, for example.
|
||||
pub trait AggregateMethod: 'static + Default + Send {
|
||||
fn event_name() -> &'static str;
|
||||
}
|
||||
|
||||
/// A macro used to quickly define multiple aggregate method with their name
|
||||
/// Usage:
|
||||
/// ```rust
|
||||
/// use meilisearch::aggregate_methods;
|
||||
///
|
||||
/// aggregate_methods!(
|
||||
/// SearchGET => "Documents Searched GET",
|
||||
/// SearchPOST => "Documents Searched POST",
|
||||
/// );
|
||||
/// ```
|
||||
#[macro_export]
|
||||
macro_rules! aggregate_methods {
|
||||
($method:ident => $event_name:literal) => {
|
||||
#[derive(Default)]
|
||||
pub struct $method {}
|
||||
|
||||
impl $crate::analytics::AggregateMethod for $method {
|
||||
fn event_name() -> &'static str {
|
||||
$event_name
|
||||
}
|
||||
}
|
||||
};
|
||||
($($method:ident => $event_name:literal,)+) => {
|
||||
$(
|
||||
aggregate_methods!($method => $event_name);
|
||||
)+
|
||||
|
||||
};
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct Analytics {
|
||||
segment: Option<Arc<SegmentAnalytics>>,
|
||||
}
|
||||
|
||||
impl Analytics {
|
||||
pub async fn new(
|
||||
opt: &Opt,
|
||||
index_scheduler: Arc<IndexScheduler>,
|
||||
auth_controller: Arc<AuthController>,
|
||||
) -> Self {
|
||||
if opt.no_analytics {
|
||||
Self { segment: None }
|
||||
} else {
|
||||
Self { segment: SegmentAnalytics::new(opt, index_scheduler, auth_controller).await }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn no_analytics() -> Self {
|
||||
Self { segment: None }
|
||||
}
|
||||
|
||||
pub fn instance_uid(&self) -> Option<&InstanceUid> {
|
||||
self.segment.as_ref().map(|segment| segment.instance_uid.as_ref())
|
||||
}
|
||||
pub trait Analytics: Sync + Send {
|
||||
fn instance_uid(&self) -> Option<&InstanceUid>;
|
||||
|
||||
/// The method used to publish most analytics that do not need to be batched every hours
|
||||
pub fn publish<T: Aggregate>(&self, event: T, request: &HttpRequest) {
|
||||
if let Some(ref segment) = self.segment {
|
||||
let _ = segment.sender.try_send(segment_analytics::Message::new(event, request));
|
||||
}
|
||||
}
|
||||
fn publish(&self, event_name: String, send: Value, request: Option<&HttpRequest>);
|
||||
|
||||
/// This method should be called to aggregate a get search
|
||||
fn get_search(&self, aggregate: SearchAggregator);
|
||||
|
||||
/// This method should be called to aggregate a post search
|
||||
fn post_search(&self, aggregate: SearchAggregator);
|
||||
|
||||
/// This method should be called to aggregate a get similar request
|
||||
fn get_similar(&self, aggregate: SimilarAggregator);
|
||||
|
||||
/// This method should be called to aggregate a post similar request
|
||||
fn post_similar(&self, aggregate: SimilarAggregator);
|
||||
|
||||
/// This method should be called to aggregate a post array of searches
|
||||
fn post_multi_search(&self, aggregate: MultiSearchAggregator);
|
||||
|
||||
/// This method should be called to aggregate post facet values searches
|
||||
fn post_facet_search(&self, aggregate: FacetSearchAggregator);
|
||||
|
||||
// this method should be called to aggregate an add documents request
|
||||
fn add_documents(
|
||||
&self,
|
||||
documents_query: &UpdateDocumentsQuery,
|
||||
index_creation: bool,
|
||||
request: &HttpRequest,
|
||||
);
|
||||
|
||||
// this method should be called to aggregate a fetch documents request
|
||||
fn get_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest);
|
||||
|
||||
// this method should be called to aggregate a fetch documents request
|
||||
fn post_fetch_documents(&self, documents_query: &DocumentFetchKind, request: &HttpRequest);
|
||||
|
||||
// this method should be called to aggregate a add documents request
|
||||
fn delete_documents(&self, kind: DocumentDeletionKind, request: &HttpRequest);
|
||||
|
||||
// this method should be called to batch an update documents request
|
||||
fn update_documents(
|
||||
&self,
|
||||
documents_query: &UpdateDocumentsQuery,
|
||||
index_creation: bool,
|
||||
request: &HttpRequest,
|
||||
);
|
||||
|
||||
// this method should be called to batch an update documents by function request
|
||||
fn update_documents_by_function(
|
||||
&self,
|
||||
documents_query: &DocumentEditionByFunction,
|
||||
index_creation: bool,
|
||||
request: &HttpRequest,
|
||||
);
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -120,7 +120,7 @@ pub fn create_app(
|
||||
search_queue: Data<SearchQueue>,
|
||||
opt: Opt,
|
||||
logs: (LogRouteHandle, LogStderrHandle),
|
||||
analytics: Data<Analytics>,
|
||||
analytics: Arc<dyn Analytics>,
|
||||
enable_dashboard: bool,
|
||||
) -> actix_web::App<
|
||||
impl ServiceFactory<
|
||||
@@ -473,14 +473,14 @@ pub fn configure_data(
|
||||
search_queue: Data<SearchQueue>,
|
||||
opt: &Opt,
|
||||
(logs_route, logs_stderr): (LogRouteHandle, LogStderrHandle),
|
||||
analytics: Data<Analytics>,
|
||||
analytics: Arc<dyn Analytics>,
|
||||
) {
|
||||
let http_payload_size_limit = opt.http_payload_size_limit.as_u64() as usize;
|
||||
config
|
||||
.app_data(index_scheduler)
|
||||
.app_data(auth)
|
||||
.app_data(search_queue)
|
||||
.app_data(analytics)
|
||||
.app_data(web::Data::from(analytics))
|
||||
.app_data(web::Data::new(logs_route))
|
||||
.app_data(web::Data::new(logs_stderr))
|
||||
.app_data(web::Data::new(opt.clone()))
|
||||
|
||||
@@ -5,7 +5,6 @@ use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
use std::thread::available_parallelism;
|
||||
use std::time::Duration;
|
||||
|
||||
use actix_web::http::KeepAlive;
|
||||
use actix_web::web::Data;
|
||||
@@ -124,12 +123,19 @@ async fn try_main() -> anyhow::Result<()> {
|
||||
|
||||
let (index_scheduler, auth_controller) = setup_meilisearch(&opt)?;
|
||||
|
||||
let analytics =
|
||||
analytics::Analytics::new(&opt, index_scheduler.clone(), auth_controller.clone()).await;
|
||||
#[cfg(all(not(debug_assertions), feature = "analytics"))]
|
||||
let analytics = if !opt.no_analytics {
|
||||
analytics::SegmentAnalytics::new(&opt, index_scheduler.clone(), auth_controller.clone())
|
||||
.await
|
||||
} else {
|
||||
analytics::MockAnalytics::new(&opt)
|
||||
};
|
||||
#[cfg(any(debug_assertions, not(feature = "analytics")))]
|
||||
let analytics = analytics::MockAnalytics::new(&opt);
|
||||
|
||||
print_launch_resume(&opt, analytics.clone(), config_read_from);
|
||||
|
||||
run_http(index_scheduler, auth_controller, opt, log_handle, Arc::new(analytics)).await?;
|
||||
run_http(index_scheduler, auth_controller, opt, log_handle, analytics).await?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -139,23 +145,16 @@ async fn run_http(
|
||||
auth_controller: Arc<AuthController>,
|
||||
opt: Opt,
|
||||
logs: (LogRouteHandle, LogStderrHandle),
|
||||
analytics: Arc<Analytics>,
|
||||
analytics: Arc<dyn Analytics>,
|
||||
) -> anyhow::Result<()> {
|
||||
let enable_dashboard = &opt.env == "development";
|
||||
let opt_clone = opt.clone();
|
||||
let index_scheduler = Data::from(index_scheduler);
|
||||
let auth_controller = Data::from(auth_controller);
|
||||
let analytics = Data::from(analytics);
|
||||
let search_queue = SearchQueue::new(
|
||||
opt.experimental_search_queue_size,
|
||||
available_parallelism()
|
||||
.unwrap_or(NonZeroUsize::new(2).unwrap())
|
||||
.checked_mul(opt.experimental_nb_searches_per_core)
|
||||
.unwrap_or(NonZeroUsize::MAX),
|
||||
)
|
||||
.with_time_to_abort(Duration::from_secs(
|
||||
usize::from(opt.experimental_drop_search_after) as u64
|
||||
));
|
||||
available_parallelism().unwrap_or(NonZeroUsize::new(2).unwrap()),
|
||||
);
|
||||
let search_queue = Data::new(search_queue);
|
||||
|
||||
let http_server = HttpServer::new(move || {
|
||||
@@ -181,7 +180,11 @@ async fn run_http(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn print_launch_resume(opt: &Opt, analytics: Analytics, config_read_from: Option<PathBuf>) {
|
||||
pub fn print_launch_resume(
|
||||
opt: &Opt,
|
||||
analytics: Arc<dyn Analytics>,
|
||||
config_read_from: Option<PathBuf>,
|
||||
) {
|
||||
let build_info = build_info::BuildInfo::from_build();
|
||||
|
||||
let protocol =
|
||||
@@ -223,6 +226,7 @@ pub fn print_launch_resume(opt: &Opt, analytics: Analytics, config_read_from: Op
|
||||
eprintln!("Prototype:\t\t{:?}", prototype);
|
||||
}
|
||||
|
||||
#[cfg(all(not(debug_assertions), feature = "analytics"))]
|
||||
{
|
||||
if !opt.no_analytics {
|
||||
eprintln!(
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::env::VarError;
|
||||
use std::ffi::OsStr;
|
||||
use std::fmt::Display;
|
||||
use std::io::{BufReader, Read};
|
||||
use std::num::{NonZeroUsize, ParseIntError};
|
||||
use std::num::ParseIntError;
|
||||
use std::ops::Deref;
|
||||
use std::path::PathBuf;
|
||||
use std::str::FromStr;
|
||||
@@ -29,6 +29,7 @@ const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY";
|
||||
const MEILI_ENV: &str = "MEILI_ENV";
|
||||
const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL";
|
||||
const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER";
|
||||
#[cfg(feature = "analytics")]
|
||||
const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS";
|
||||
const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT";
|
||||
const MEILI_SSL_CERT_PATH: &str = "MEILI_SSL_CERT_PATH";
|
||||
@@ -54,8 +55,6 @@ const MEILI_EXPERIMENTAL_ENABLE_LOGS_ROUTE: &str = "MEILI_EXPERIMENTAL_ENABLE_LO
|
||||
const MEILI_EXPERIMENTAL_CONTAINS_FILTER: &str = "MEILI_EXPERIMENTAL_CONTAINS_FILTER";
|
||||
const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
|
||||
const MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE: &str = "MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE";
|
||||
const MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER: &str = "MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER";
|
||||
const MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE: &str = "MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE";
|
||||
const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
|
||||
"MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
|
||||
const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str =
|
||||
@@ -209,6 +208,7 @@ pub struct Opt {
|
||||
/// Meilisearch automatically collects data from all instances that do not opt out using this flag.
|
||||
/// All gathered data is used solely for the purpose of improving Meilisearch, and can be deleted
|
||||
/// at any time.
|
||||
#[cfg(feature = "analytics")]
|
||||
#[serde(default)] // we can't send true
|
||||
#[clap(long, env = MEILI_NO_ANALYTICS)]
|
||||
pub no_analytics: bool,
|
||||
@@ -357,26 +357,10 @@ pub struct Opt {
|
||||
/// Lets you customize the size of the search queue. Meilisearch processes your search requests as fast as possible but once the
|
||||
/// queue is full it starts returning HTTP 503, Service Unavailable.
|
||||
/// The default value is 1000.
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = default_experimental_search_queue_size())]
|
||||
#[serde(default = "default_experimental_search_queue_size")]
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE, default_value_t = 1000)]
|
||||
#[serde(default)]
|
||||
pub experimental_search_queue_size: usize,
|
||||
|
||||
/// Experimental drop search after. For more information, see: <https://github.com/orgs/meilisearch/discussions/783>
|
||||
///
|
||||
/// Let you customize after how many seconds Meilisearch should consider a search request irrelevant and drop it.
|
||||
/// The default value is 60.
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER, default_value_t = default_drop_search_after())]
|
||||
#[serde(default = "default_drop_search_after")]
|
||||
pub experimental_drop_search_after: NonZeroUsize,
|
||||
|
||||
/// Experimental number of searches per core. For more information, see: <https://github.com/orgs/meilisearch/discussions/784>
|
||||
///
|
||||
/// Lets you customize how many search requests can run on each core concurrently.
|
||||
/// The default value is 4.
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE, default_value_t = default_nb_searches_per_core())]
|
||||
#[serde(default = "default_nb_searches_per_core")]
|
||||
pub experimental_nb_searches_per_core: NonZeroUsize,
|
||||
|
||||
/// Experimental logs mode feature. For more information, see: <https://github.com/orgs/meilisearch/discussions/723>
|
||||
///
|
||||
/// Change the mode of the logs on the console.
|
||||
@@ -423,6 +407,7 @@ pub struct Opt {
|
||||
|
||||
impl Opt {
|
||||
/// Whether analytics should be enabled or not.
|
||||
#[cfg(all(not(debug_assertions), feature = "analytics"))]
|
||||
pub fn analytics(&self) -> bool {
|
||||
!self.no_analytics
|
||||
}
|
||||
@@ -502,12 +487,11 @@ impl Opt {
|
||||
ignore_missing_dump: _,
|
||||
ignore_dump_if_db_exists: _,
|
||||
config_file_path: _,
|
||||
#[cfg(feature = "analytics")]
|
||||
no_analytics,
|
||||
experimental_contains_filter,
|
||||
experimental_enable_metrics,
|
||||
experimental_search_queue_size,
|
||||
experimental_drop_search_after,
|
||||
experimental_nb_searches_per_core,
|
||||
experimental_logs_mode,
|
||||
experimental_enable_logs_route,
|
||||
experimental_replication_parameters,
|
||||
@@ -529,7 +513,10 @@ impl Opt {
|
||||
);
|
||||
}
|
||||
|
||||
export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string());
|
||||
#[cfg(feature = "analytics")]
|
||||
{
|
||||
export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string());
|
||||
}
|
||||
export_to_env_if_not_present(
|
||||
MEILI_HTTP_PAYLOAD_SIZE_LIMIT,
|
||||
http_payload_size_limit.to_string(),
|
||||
@@ -572,14 +559,6 @@ impl Opt {
|
||||
MEILI_EXPERIMENTAL_SEARCH_QUEUE_SIZE,
|
||||
experimental_search_queue_size.to_string(),
|
||||
);
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_DROP_SEARCH_AFTER,
|
||||
experimental_drop_search_after.to_string(),
|
||||
);
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_NB_SEARCHES_PER_CORE,
|
||||
experimental_nb_searches_per_core.to_string(),
|
||||
);
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_LOGS_MODE,
|
||||
experimental_logs_mode.to_string(),
|
||||
@@ -911,18 +890,6 @@ fn default_dump_dir() -> PathBuf {
|
||||
PathBuf::from(DEFAULT_DUMP_DIR)
|
||||
}
|
||||
|
||||
fn default_experimental_search_queue_size() -> usize {
|
||||
1000
|
||||
}
|
||||
|
||||
fn default_drop_search_after() -> NonZeroUsize {
|
||||
NonZeroUsize::new(60).unwrap()
|
||||
}
|
||||
|
||||
fn default_nb_searches_per_core() -> NonZeroUsize {
|
||||
NonZeroUsize::new(4).unwrap()
|
||||
}
|
||||
|
||||
/// Indicates if a snapshot was scheduled, and if yes with which interval.
|
||||
#[derive(Debug, Default, Copy, Clone, Deserialize, Serialize)]
|
||||
pub enum ScheduleSnapshot {
|
||||
|
||||
@@ -4,6 +4,7 @@ use index_scheduler::IndexScheduler;
|
||||
use meilisearch_auth::AuthController;
|
||||
use meilisearch_types::error::ResponseError;
|
||||
use meilisearch_types::tasks::KindWithContent;
|
||||
use serde_json::json;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::analytics::Analytics;
|
||||
@@ -17,16 +18,14 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_dump))));
|
||||
}
|
||||
|
||||
crate::empty_analytics!(DumpAnalytics, "Dump Created");
|
||||
|
||||
pub async fn create_dump(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<IndexScheduler>>,
|
||||
auth_controller: GuardedData<ActionPolicy<{ actions::DUMPS_CREATE }>, Data<AuthController>>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
analytics.publish(DumpAnalytics::default(), &req);
|
||||
analytics.publish("Dump Created".to_string(), json!({}), Some(&req));
|
||||
|
||||
let task = KindWithContent::DumpCreation {
|
||||
keys: auth_controller.list_keys()?,
|
||||
|
||||
@@ -6,10 +6,10 @@ use index_scheduler::IndexScheduler;
|
||||
use meilisearch_types::deserr::DeserrJsonError;
|
||||
use meilisearch_types::error::ResponseError;
|
||||
use meilisearch_types::keys::actions;
|
||||
use serde::Serialize;
|
||||
use serde_json::json;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::analytics::{Aggregate, Analytics};
|
||||
use crate::analytics::Analytics;
|
||||
use crate::extractors::authentication::policies::ActionPolicy;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
@@ -17,7 +17,7 @@ use crate::extractors::sequential_extractor::SeqHandler;
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
cfg.service(
|
||||
web::resource("")
|
||||
.route(web::get().to(get_features))
|
||||
.route(web::get().to(SeqHandler(get_features)))
|
||||
.route(web::patch().to(SeqHandler(patch_features))),
|
||||
);
|
||||
}
|
||||
@@ -27,9 +27,12 @@ async fn get_features(
|
||||
ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_GET }>,
|
||||
Data<IndexScheduler>,
|
||||
>,
|
||||
req: HttpRequest,
|
||||
analytics: Data<dyn Analytics>,
|
||||
) -> HttpResponse {
|
||||
let features = index_scheduler.features();
|
||||
|
||||
analytics.publish("Experimental features Seen".to_string(), json!(null), Some(&req));
|
||||
let features = features.runtime_features();
|
||||
debug!(returns = ?features, "Get features");
|
||||
HttpResponse::Ok().json(features)
|
||||
@@ -50,35 +53,6 @@ pub struct RuntimeTogglableFeatures {
|
||||
pub contains_filter: Option<bool>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct PatchExperimentalFeatureAnalytics {
|
||||
vector_store: bool,
|
||||
metrics: bool,
|
||||
logs_route: bool,
|
||||
edit_documents_by_function: bool,
|
||||
contains_filter: bool,
|
||||
}
|
||||
|
||||
impl Aggregate for PatchExperimentalFeatureAnalytics {
|
||||
fn event_name(&self) -> &'static str {
|
||||
"Experimental features Updated"
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
Box::new(Self {
|
||||
vector_store: new.vector_store,
|
||||
metrics: new.metrics,
|
||||
logs_route: new.logs_route,
|
||||
edit_documents_by_function: new.edit_documents_by_function,
|
||||
contains_filter: new.contains_filter,
|
||||
})
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::to_value(*self).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
async fn patch_features(
|
||||
index_scheduler: GuardedData<
|
||||
ActionPolicy<{ actions::EXPERIMENTAL_FEATURES_UPDATE }>,
|
||||
@@ -86,7 +60,7 @@ async fn patch_features(
|
||||
>,
|
||||
new_features: AwebJson<RuntimeTogglableFeatures, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
analytics: Data<Analytics>,
|
||||
analytics: Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let features = index_scheduler.features();
|
||||
debug!(parameters = ?new_features, "Patch features");
|
||||
@@ -115,14 +89,15 @@ async fn patch_features(
|
||||
} = new_features;
|
||||
|
||||
analytics.publish(
|
||||
PatchExperimentalFeatureAnalytics {
|
||||
vector_store,
|
||||
metrics,
|
||||
logs_route,
|
||||
edit_documents_by_function,
|
||||
contains_filter,
|
||||
},
|
||||
&req,
|
||||
"Experimental features Updated".to_string(),
|
||||
json!({
|
||||
"vector_store": vector_store,
|
||||
"metrics": metrics,
|
||||
"logs_route": logs_route,
|
||||
"edit_documents_by_function": edit_documents_by_function,
|
||||
"contains_filter": contains_filter,
|
||||
}),
|
||||
Some(&req),
|
||||
);
|
||||
index_scheduler.put_runtime_features(new_features)?;
|
||||
debug!(returns = ?new_features, "Patch features");
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
use std::collections::HashSet;
|
||||
use std::io::ErrorKind;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use actix_web::http::header::CONTENT_TYPE;
|
||||
use actix_web::web::Data;
|
||||
@@ -25,14 +23,14 @@ use meilisearch_types::tasks::KindWithContent;
|
||||
use meilisearch_types::{milli, Document, Index};
|
||||
use mime::Mime;
|
||||
use once_cell::sync::Lazy;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde::Deserialize;
|
||||
use serde_json::Value;
|
||||
use tempfile::tempfile;
|
||||
use tokio::fs::File;
|
||||
use tokio::io::{AsyncSeekExt, AsyncWriteExt, BufWriter};
|
||||
use tracing::debug;
|
||||
|
||||
use crate::analytics::{Aggregate, AggregateMethod, Analytics};
|
||||
use crate::analytics::{Analytics, DocumentDeletionKind, DocumentFetchKind};
|
||||
use crate::error::MeilisearchHttpError;
|
||||
use crate::error::PayloadError::ReceivePayload;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
@@ -43,7 +41,7 @@ use crate::routes::{
|
||||
get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT,
|
||||
};
|
||||
use crate::search::{parse_filter, RetrieveVectors};
|
||||
use crate::{aggregate_methods, Opt};
|
||||
use crate::Opt;
|
||||
|
||||
static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| {
|
||||
vec!["application/json".to_string(), "application/x-ndjson".to_string(), "text/csv".to_string()]
|
||||
@@ -102,84 +100,12 @@ pub struct GetDocument {
|
||||
retrieve_vectors: Param<bool>,
|
||||
}
|
||||
|
||||
aggregate_methods!(
|
||||
DocumentsGET => "Documents Fetched GET",
|
||||
DocumentsPOST => "Documents Fetched POST",
|
||||
);
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct DocumentsFetchAggregator<Method: AggregateMethod> {
|
||||
// a call on ../documents/:doc_id
|
||||
per_document_id: bool,
|
||||
// if a filter was used
|
||||
per_filter: bool,
|
||||
|
||||
#[serde(rename = "vector.retrieve_vectors")]
|
||||
retrieve_vectors: bool,
|
||||
|
||||
// pagination
|
||||
#[serde(rename = "pagination.max_limit")]
|
||||
max_limit: usize,
|
||||
#[serde(rename = "pagination.max_offset")]
|
||||
max_offset: usize,
|
||||
|
||||
marker: std::marker::PhantomData<Method>,
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
|
||||
pub enum DocumentFetchKind {
|
||||
PerDocumentId { retrieve_vectors: bool },
|
||||
Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool },
|
||||
}
|
||||
|
||||
impl<Method: AggregateMethod> DocumentsFetchAggregator<Method> {
|
||||
pub fn from_query(query: &DocumentFetchKind) -> Self {
|
||||
let (limit, offset, retrieve_vectors) = match query {
|
||||
DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors),
|
||||
DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => {
|
||||
(*limit, *offset, *retrieve_vectors)
|
||||
}
|
||||
};
|
||||
|
||||
Self {
|
||||
per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }),
|
||||
per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter),
|
||||
max_limit: limit,
|
||||
max_offset: offset,
|
||||
retrieve_vectors,
|
||||
|
||||
marker: PhantomData,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<Method: AggregateMethod> Aggregate for DocumentsFetchAggregator<Method> {
|
||||
fn event_name(&self) -> &'static str {
|
||||
Method::event_name()
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
Box::new(Self {
|
||||
per_document_id: self.per_document_id | new.per_document_id,
|
||||
per_filter: self.per_filter | new.per_filter,
|
||||
retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors,
|
||||
max_limit: self.max_limit.max(new.max_limit),
|
||||
max_offset: self.max_offset.max(new.max_offset),
|
||||
marker: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::to_value(*self).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn get_document(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_GET }>, Data<IndexScheduler>>,
|
||||
document_param: web::Path<DocumentParam>,
|
||||
params: AwebQueryParameter<GetDocument, DeserrQueryParamError>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let DocumentParam { index_uid, document_id } = document_param.into_inner();
|
||||
debug!(parameters = ?params, "Get document");
|
||||
@@ -191,15 +117,8 @@ pub async fn get_document(
|
||||
let features = index_scheduler.features();
|
||||
let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?;
|
||||
|
||||
analytics.publish(
|
||||
DocumentsFetchAggregator::<DocumentsGET> {
|
||||
retrieve_vectors: param_retrieve_vectors.0,
|
||||
per_document_id: true,
|
||||
per_filter: false,
|
||||
max_limit: 0,
|
||||
max_offset: 0,
|
||||
marker: PhantomData,
|
||||
},
|
||||
analytics.get_fetch_documents(
|
||||
&DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 },
|
||||
&req,
|
||||
);
|
||||
|
||||
@@ -210,52 +129,17 @@ pub async fn get_document(
|
||||
Ok(HttpResponse::Ok().json(document))
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct DocumentsDeletionAggregator {
|
||||
per_document_id: bool,
|
||||
clear_all: bool,
|
||||
per_batch: bool,
|
||||
per_filter: bool,
|
||||
}
|
||||
|
||||
impl Aggregate for DocumentsDeletionAggregator {
|
||||
fn event_name(&self) -> &'static str {
|
||||
"Documents Deleted"
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
Box::new(Self {
|
||||
per_document_id: self.per_document_id | new.per_document_id,
|
||||
clear_all: self.clear_all | new.clear_all,
|
||||
per_batch: self.per_batch | new.per_batch,
|
||||
per_filter: self.per_filter | new.per_filter,
|
||||
})
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::to_value(*self).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn delete_document(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_DELETE }>, Data<IndexScheduler>>,
|
||||
path: web::Path<DocumentParam>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let DocumentParam { index_uid, document_id } = path.into_inner();
|
||||
let index_uid = IndexUid::try_from(index_uid)?;
|
||||
|
||||
analytics.publish(
|
||||
DocumentsDeletionAggregator {
|
||||
per_document_id: true,
|
||||
clear_all: false,
|
||||
per_batch: false,
|
||||
per_filter: false,
|
||||
},
|
||||
&req,
|
||||
);
|
||||
analytics.delete_documents(DocumentDeletionKind::PerDocumentId, &req);
|
||||
|
||||
let task = KindWithContent::DocumentDeletion {
|
||||
index_uid: index_uid.to_string(),
|
||||
@@ -306,19 +190,17 @@ pub async fn documents_by_query_post(
|
||||
index_uid: web::Path<String>,
|
||||
body: AwebJson<BrowseQuery, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let body = body.into_inner();
|
||||
debug!(parameters = ?body, "Get documents POST");
|
||||
|
||||
analytics.publish(
|
||||
DocumentsFetchAggregator::<DocumentsPOST> {
|
||||
per_filter: body.filter.is_some(),
|
||||
analytics.post_fetch_documents(
|
||||
&DocumentFetchKind::Normal {
|
||||
with_filter: body.filter.is_some(),
|
||||
limit: body.limit,
|
||||
offset: body.offset,
|
||||
retrieve_vectors: body.retrieve_vectors,
|
||||
max_limit: body.limit,
|
||||
max_offset: body.offset,
|
||||
per_document_id: false,
|
||||
marker: PhantomData,
|
||||
},
|
||||
&req,
|
||||
);
|
||||
@@ -331,7 +213,7 @@ pub async fn get_documents(
|
||||
index_uid: web::Path<String>,
|
||||
params: AwebQueryParameter<BrowseQueryGet, DeserrQueryParamError>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?params, "Get documents GET");
|
||||
|
||||
@@ -353,14 +235,12 @@ pub async fn get_documents(
|
||||
filter,
|
||||
};
|
||||
|
||||
analytics.publish(
|
||||
DocumentsFetchAggregator::<DocumentsGET> {
|
||||
per_filter: query.filter.is_some(),
|
||||
analytics.get_fetch_documents(
|
||||
&DocumentFetchKind::Normal {
|
||||
with_filter: query.filter.is_some(),
|
||||
limit: query.limit,
|
||||
offset: query.offset,
|
||||
retrieve_vectors: query.retrieve_vectors,
|
||||
max_limit: query.limit,
|
||||
max_offset: query.offset,
|
||||
per_document_id: false,
|
||||
marker: PhantomData,
|
||||
},
|
||||
&req,
|
||||
);
|
||||
@@ -418,39 +298,6 @@ fn from_char_csv_delimiter(
|
||||
}
|
||||
}
|
||||
|
||||
aggregate_methods!(
|
||||
Replaced => "Documents Added",
|
||||
Updated => "Documents Updated",
|
||||
);
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub struct DocumentsAggregator<T: AggregateMethod> {
|
||||
payload_types: HashSet<String>,
|
||||
primary_key: HashSet<String>,
|
||||
index_creation: bool,
|
||||
#[serde(skip)]
|
||||
method: PhantomData<T>,
|
||||
}
|
||||
|
||||
impl<Method: AggregateMethod> Aggregate for DocumentsAggregator<Method> {
|
||||
fn event_name(&self) -> &'static str {
|
||||
Method::event_name()
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
Box::new(Self {
|
||||
payload_types: self.payload_types.union(&new.payload_types).cloned().collect(),
|
||||
primary_key: self.primary_key.union(&new.primary_key).cloned().collect(),
|
||||
index_creation: self.index_creation | new.index_creation,
|
||||
method: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::to_value(self).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn replace_documents(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ADD }>, Data<IndexScheduler>>,
|
||||
index_uid: web::Path<String>,
|
||||
@@ -458,32 +305,16 @@ pub async fn replace_documents(
|
||||
body: Payload,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
debug!(parameters = ?params, "Replace documents");
|
||||
let params = params.into_inner();
|
||||
|
||||
let mut content_types = HashSet::new();
|
||||
let content_type = req
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|s| s.to_str().ok())
|
||||
.unwrap_or("unknown")
|
||||
.to_string();
|
||||
content_types.insert(content_type);
|
||||
let mut primary_keys = HashSet::new();
|
||||
if let Some(primary_key) = params.primary_key.clone() {
|
||||
primary_keys.insert(primary_key);
|
||||
}
|
||||
analytics.publish(
|
||||
DocumentsAggregator::<Replaced> {
|
||||
payload_types: content_types,
|
||||
primary_key: primary_keys,
|
||||
index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
|
||||
method: PhantomData,
|
||||
},
|
||||
analytics.add_documents(
|
||||
¶ms,
|
||||
index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
|
||||
&req,
|
||||
);
|
||||
|
||||
@@ -515,32 +346,16 @@ pub async fn update_documents(
|
||||
body: Payload,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
let params = params.into_inner();
|
||||
debug!(parameters = ?params, "Update documents");
|
||||
|
||||
let mut content_types = HashSet::new();
|
||||
let content_type = req
|
||||
.headers()
|
||||
.get(CONTENT_TYPE)
|
||||
.and_then(|s| s.to_str().ok())
|
||||
.unwrap_or("unknown")
|
||||
.to_string();
|
||||
content_types.insert(content_type);
|
||||
let mut primary_keys = HashSet::new();
|
||||
if let Some(primary_key) = params.primary_key.clone() {
|
||||
primary_keys.insert(primary_key);
|
||||
}
|
||||
analytics.publish(
|
||||
DocumentsAggregator::<Updated> {
|
||||
payload_types: content_types,
|
||||
primary_key: primary_keys,
|
||||
index_creation: index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
|
||||
method: PhantomData,
|
||||
},
|
||||
analytics.add_documents(
|
||||
¶ms,
|
||||
index_scheduler.index_exists(&index_uid).map_or(true, |x| !x),
|
||||
&req,
|
||||
);
|
||||
|
||||
@@ -709,20 +524,12 @@ pub async fn delete_documents_batch(
|
||||
body: web::Json<Vec<Value>>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?body, "Delete documents by batch");
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
analytics.publish(
|
||||
DocumentsDeletionAggregator {
|
||||
per_batch: true,
|
||||
per_document_id: false,
|
||||
clear_all: false,
|
||||
per_filter: false,
|
||||
},
|
||||
&req,
|
||||
);
|
||||
analytics.delete_documents(DocumentDeletionKind::PerBatch, &req);
|
||||
|
||||
let ids = body
|
||||
.iter()
|
||||
@@ -755,22 +562,14 @@ pub async fn delete_documents_by_filter(
|
||||
body: AwebJson<DocumentDeletionByFilter, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?body, "Delete documents by filter");
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
let index_uid = index_uid.into_inner();
|
||||
let filter = body.into_inner().filter;
|
||||
|
||||
analytics.publish(
|
||||
DocumentsDeletionAggregator {
|
||||
per_filter: true,
|
||||
per_document_id: false,
|
||||
clear_all: false,
|
||||
per_batch: false,
|
||||
},
|
||||
&req,
|
||||
);
|
||||
analytics.delete_documents(DocumentDeletionKind::PerFilter, &req);
|
||||
|
||||
// we ensure the filter is well formed before enqueuing it
|
||||
crate::search::parse_filter(&filter, Code::InvalidDocumentFilter, index_scheduler.features())?
|
||||
@@ -800,41 +599,13 @@ pub struct DocumentEditionByFunction {
|
||||
pub function: String,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct EditDocumentsByFunctionAggregator {
|
||||
// Set to true if at least one request was filtered
|
||||
filtered: bool,
|
||||
// Set to true if at least one request contained a context
|
||||
with_context: bool,
|
||||
|
||||
index_creation: bool,
|
||||
}
|
||||
|
||||
impl Aggregate for EditDocumentsByFunctionAggregator {
|
||||
fn event_name(&self) -> &'static str {
|
||||
"Documents Edited By Function"
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
Box::new(Self {
|
||||
filtered: self.filtered | new.filtered,
|
||||
with_context: self.with_context | new.with_context,
|
||||
index_creation: self.index_creation | new.index_creation,
|
||||
})
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::to_value(*self).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn edit_documents_by_function(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::DOCUMENTS_ALL }>, Data<IndexScheduler>>,
|
||||
index_uid: web::Path<String>,
|
||||
params: AwebJson<DocumentEditionByFunction, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?params, "Edit documents by function");
|
||||
|
||||
@@ -846,12 +617,9 @@ pub async fn edit_documents_by_function(
|
||||
let index_uid = index_uid.into_inner();
|
||||
let params = params.into_inner();
|
||||
|
||||
analytics.publish(
|
||||
EditDocumentsByFunctionAggregator {
|
||||
filtered: params.filter.is_some(),
|
||||
with_context: params.context.is_some(),
|
||||
index_creation: index_scheduler.index(&index_uid).is_err(),
|
||||
},
|
||||
analytics.update_documents_by_function(
|
||||
¶ms,
|
||||
index_scheduler.index(&index_uid).is_err(),
|
||||
&req,
|
||||
);
|
||||
|
||||
@@ -902,18 +670,10 @@ pub async fn clear_all_documents(
|
||||
index_uid: web::Path<String>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
analytics.publish(
|
||||
DocumentsDeletionAggregator {
|
||||
clear_all: true,
|
||||
per_document_id: false,
|
||||
per_batch: false,
|
||||
per_filter: false,
|
||||
},
|
||||
&req,
|
||||
);
|
||||
analytics.delete_documents(DocumentDeletionKind::ClearAll, &req);
|
||||
|
||||
let task = KindWithContent::DocumentClear { index_uid: index_uid.to_string() };
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
use std::collections::{BinaryHeap, HashSet};
|
||||
|
||||
use actix_web::web::Data;
|
||||
use actix_web::{web, HttpRequest, HttpResponse};
|
||||
use deserr::actix_web::AwebJson;
|
||||
@@ -12,15 +10,14 @@ use meilisearch_types::locales::Locale;
|
||||
use serde_json::Value;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::analytics::{Aggregate, Analytics};
|
||||
use crate::analytics::{Analytics, FacetSearchAggregator};
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::routes::indexes::search::search_kind;
|
||||
use crate::search::{
|
||||
add_search_rules, perform_facet_search, FacetSearchResult, HybridQuery, MatchingStrategy,
|
||||
RankingScoreThreshold, SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
|
||||
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
|
||||
DEFAULT_SEARCH_OFFSET,
|
||||
add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
|
||||
SearchQuery, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
|
||||
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
|
||||
};
|
||||
use crate::search_queue::SearchQueue;
|
||||
|
||||
@@ -56,122 +53,20 @@ pub struct FacetSearchQuery {
|
||||
pub locales: Option<Vec<Locale>>,
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct FacetSearchAggregator {
|
||||
// requests
|
||||
total_received: usize,
|
||||
total_succeeded: usize,
|
||||
time_spent: BinaryHeap<usize>,
|
||||
|
||||
// The set of all facetNames that were used
|
||||
facet_names: HashSet<String>,
|
||||
|
||||
// As there been any other parameter than the facetName or facetQuery ones?
|
||||
additional_search_parameters_provided: bool,
|
||||
}
|
||||
|
||||
impl FacetSearchAggregator {
|
||||
#[allow(clippy::field_reassign_with_default)]
|
||||
pub fn from_query(query: &FacetSearchQuery) -> Self {
|
||||
let FacetSearchQuery {
|
||||
facet_query: _,
|
||||
facet_name,
|
||||
vector,
|
||||
q,
|
||||
filter,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
hybrid,
|
||||
ranking_score_threshold,
|
||||
locales,
|
||||
} = query;
|
||||
|
||||
Self {
|
||||
total_received: 1,
|
||||
facet_names: Some(facet_name.clone()).into_iter().collect(),
|
||||
additional_search_parameters_provided: q.is_some()
|
||||
|| vector.is_some()
|
||||
|| filter.is_some()
|
||||
|| *matching_strategy != MatchingStrategy::default()
|
||||
|| attributes_to_search_on.is_some()
|
||||
|| hybrid.is_some()
|
||||
|| ranking_score_threshold.is_some()
|
||||
|| locales.is_some(),
|
||||
..Default::default()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self, result: &FacetSearchResult) {
|
||||
let FacetSearchResult { facet_hits: _, facet_query: _, processing_time_ms } = result;
|
||||
self.total_succeeded = 1;
|
||||
self.time_spent.push(*processing_time_ms as usize);
|
||||
}
|
||||
}
|
||||
|
||||
impl Aggregate for FacetSearchAggregator {
|
||||
fn event_name(&self) -> &'static str {
|
||||
"Facet Searched POST"
|
||||
}
|
||||
|
||||
fn aggregate(mut self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
for time in new.time_spent {
|
||||
self.time_spent.push(time);
|
||||
}
|
||||
|
||||
Box::new(Self {
|
||||
total_received: self.total_received.saturating_add(new.total_received),
|
||||
total_succeeded: self.total_succeeded.saturating_add(new.total_succeeded),
|
||||
time_spent: self.time_spent,
|
||||
facet_names: self.facet_names.union(&new.facet_names).cloned().collect(),
|
||||
additional_search_parameters_provided: self.additional_search_parameters_provided
|
||||
| new.additional_search_parameters_provided,
|
||||
})
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
let Self {
|
||||
total_received,
|
||||
total_succeeded,
|
||||
time_spent,
|
||||
facet_names,
|
||||
additional_search_parameters_provided,
|
||||
} = *self;
|
||||
// the index of the 99th percentage of value
|
||||
let percentile_99th = 0.99 * (total_succeeded as f64 - 1.) + 1.;
|
||||
// we get all the values in a sorted manner
|
||||
let time_spent = time_spent.into_sorted_vec();
|
||||
// We are only interested by the slowest value of the 99th fastest results
|
||||
let time_spent = time_spent.get(percentile_99th as usize);
|
||||
|
||||
serde_json::json!({
|
||||
"requests": {
|
||||
"99th_response_time": time_spent.map(|t| format!("{:.2}", t)),
|
||||
"total_succeeded": total_succeeded,
|
||||
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
|
||||
"total_received": total_received,
|
||||
},
|
||||
"facets": {
|
||||
"total_distinct_facet_count": facet_names.len(),
|
||||
"additional_search_parameters_provided": additional_search_parameters_provided,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn search(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::SEARCH }>, Data<IndexScheduler>>,
|
||||
search_queue: Data<SearchQueue>,
|
||||
index_uid: web::Path<String>,
|
||||
params: AwebJson<FacetSearchQuery, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
let query = params.into_inner();
|
||||
debug!(parameters = ?query, "Facet search");
|
||||
|
||||
let mut aggregate = FacetSearchAggregator::from_query(&query);
|
||||
let mut aggregate = FacetSearchAggregator::from_query(&query, &req);
|
||||
|
||||
let facet_query = query.facet_query.clone();
|
||||
let facet_name = query.facet_name.clone();
|
||||
@@ -205,7 +100,7 @@ pub async fn search(
|
||||
if let Ok(ref search_result) = search_result {
|
||||
aggregate.succeed(search_result);
|
||||
}
|
||||
analytics.publish(aggregate, &req);
|
||||
analytics.post_facet_search(aggregate);
|
||||
|
||||
let search_result = search_result?;
|
||||
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::convert::Infallible;
|
||||
|
||||
use actix_web::web::Data;
|
||||
@@ -14,11 +13,12 @@ use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::milli::{self, FieldDistribution, Index};
|
||||
use meilisearch_types::tasks::KindWithContent;
|
||||
use serde::Serialize;
|
||||
use serde_json::json;
|
||||
use time::OffsetDateTime;
|
||||
use tracing::debug;
|
||||
|
||||
use super::{get_task_id, Pagination, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT};
|
||||
use crate::analytics::{Aggregate, Analytics};
|
||||
use crate::analytics::Analytics;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
@@ -28,11 +28,8 @@ use crate::Opt;
|
||||
pub mod documents;
|
||||
pub mod facet_search;
|
||||
pub mod search;
|
||||
mod search_analytics;
|
||||
pub mod settings;
|
||||
mod settings_analytics;
|
||||
pub mod similar;
|
||||
mod similar_analytics;
|
||||
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
cfg.service(
|
||||
@@ -126,31 +123,12 @@ pub struct IndexCreateRequest {
|
||||
primary_key: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct IndexCreatedAggregate {
|
||||
primary_key: BTreeSet<String>,
|
||||
}
|
||||
|
||||
impl Aggregate for IndexCreatedAggregate {
|
||||
fn event_name(&self) -> &'static str {
|
||||
"Index Created"
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() })
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::to_value(*self).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn create_index(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_CREATE }>, Data<IndexScheduler>>,
|
||||
body: AwebJson<IndexCreateRequest, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?body, "Create index");
|
||||
let IndexCreateRequest { primary_key, uid } = body.into_inner();
|
||||
@@ -158,8 +136,9 @@ pub async fn create_index(
|
||||
let allow_index_creation = index_scheduler.filters().allow_index_creation(&uid);
|
||||
if allow_index_creation {
|
||||
analytics.publish(
|
||||
IndexCreatedAggregate { primary_key: primary_key.iter().cloned().collect() },
|
||||
&req,
|
||||
"Index Created".to_string(),
|
||||
json!({ "primary_key": primary_key }),
|
||||
Some(&req),
|
||||
);
|
||||
|
||||
let task = KindWithContent::IndexCreation { index_uid: uid.to_string(), primary_key };
|
||||
@@ -215,38 +194,21 @@ pub async fn get_index(
|
||||
Ok(HttpResponse::Ok().json(index_view))
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct IndexUpdatedAggregate {
|
||||
primary_key: BTreeSet<String>,
|
||||
}
|
||||
|
||||
impl Aggregate for IndexUpdatedAggregate {
|
||||
fn event_name(&self) -> &'static str {
|
||||
"Index Updated"
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
Box::new(Self { primary_key: self.primary_key.union(&new.primary_key).cloned().collect() })
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::to_value(*self).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
pub async fn update_index(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_UPDATE }>, Data<IndexScheduler>>,
|
||||
index_uid: web::Path<String>,
|
||||
body: AwebJson<UpdateIndexRequest, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?body, "Update index");
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
let body = body.into_inner();
|
||||
analytics.publish(
|
||||
IndexUpdatedAggregate { primary_key: body.primary_key.iter().cloned().collect() },
|
||||
&req,
|
||||
"Index Updated".to_string(),
|
||||
json!({ "primary_key": body.primary_key }),
|
||||
Some(&req),
|
||||
);
|
||||
|
||||
let task = KindWithContent::IndexUpdate {
|
||||
|
||||
@@ -13,13 +13,12 @@ use meilisearch_types::serde_cs::vec::CS;
|
||||
use serde_json::Value;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::analytics::Analytics;
|
||||
use crate::analytics::{Analytics, SearchAggregator};
|
||||
use crate::error::MeilisearchHttpError;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
|
||||
use crate::routes::indexes::search_analytics::{SearchAggregator, SearchGET, SearchPOST};
|
||||
use crate::search::{
|
||||
add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold,
|
||||
RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH,
|
||||
@@ -226,7 +225,7 @@ pub async fn search_with_url_query(
|
||||
index_uid: web::Path<String>,
|
||||
params: AwebQueryParameter<SearchQueryGet, DeserrQueryParamError>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
debug!(parameters = ?params, "Search get");
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
@@ -238,7 +237,7 @@ pub async fn search_with_url_query(
|
||||
add_search_rules(&mut query.filter, search_rules);
|
||||
}
|
||||
|
||||
let mut aggregate = SearchAggregator::<SearchGET>::from_query(&query);
|
||||
let mut aggregate = SearchAggregator::from_query(&query, &req);
|
||||
|
||||
let index = index_scheduler.index(&index_uid)?;
|
||||
let features = index_scheduler.features();
|
||||
@@ -255,7 +254,7 @@ pub async fn search_with_url_query(
|
||||
if let Ok(ref search_result) = search_result {
|
||||
aggregate.succeed(search_result);
|
||||
}
|
||||
analytics.publish(aggregate, &req);
|
||||
analytics.get_search(aggregate);
|
||||
|
||||
let search_result = search_result?;
|
||||
|
||||
@@ -269,7 +268,7 @@ pub async fn search_with_post(
|
||||
index_uid: web::Path<String>,
|
||||
params: AwebJson<SearchQuery, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
@@ -281,7 +280,7 @@ pub async fn search_with_post(
|
||||
add_search_rules(&mut query.filter, search_rules);
|
||||
}
|
||||
|
||||
let mut aggregate = SearchAggregator::<SearchPOST>::from_query(&query);
|
||||
let mut aggregate = SearchAggregator::from_query(&query, &req);
|
||||
|
||||
let index = index_scheduler.index(&index_uid)?;
|
||||
|
||||
@@ -303,7 +302,7 @@ pub async fn search_with_post(
|
||||
MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc();
|
||||
}
|
||||
}
|
||||
analytics.publish(aggregate, &req);
|
||||
analytics.post_search(aggregate);
|
||||
|
||||
let search_result = search_result?;
|
||||
|
||||
|
||||
@@ -1,485 +0,0 @@
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use serde_json::{json, Value};
|
||||
use std::collections::{BTreeSet, BinaryHeap, HashMap};
|
||||
|
||||
use meilisearch_types::locales::Locale;
|
||||
|
||||
use crate::{
|
||||
aggregate_methods,
|
||||
analytics::{Aggregate, AggregateMethod},
|
||||
search::{
|
||||
SearchQuery, SearchResult, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER,
|
||||
DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
|
||||
DEFAULT_SEMANTIC_RATIO,
|
||||
},
|
||||
};
|
||||
|
||||
aggregate_methods!(
|
||||
SearchGET => "Documents Searched GET",
|
||||
SearchPOST => "Documents Searched POST",
|
||||
);
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SearchAggregator<Method: AggregateMethod> {
|
||||
// requests
|
||||
total_received: usize,
|
||||
total_succeeded: usize,
|
||||
total_degraded: usize,
|
||||
total_used_negative_operator: usize,
|
||||
time_spent: BinaryHeap<usize>,
|
||||
|
||||
// sort
|
||||
sort_with_geo_point: bool,
|
||||
// every time a request has a filter, this field must be incremented by the number of terms it contains
|
||||
sort_sum_of_criteria_terms: usize,
|
||||
// every time a request has a filter, this field must be incremented by one
|
||||
sort_total_number_of_criteria: usize,
|
||||
|
||||
// distinct
|
||||
distinct: bool,
|
||||
|
||||
// filter
|
||||
filter_with_geo_radius: bool,
|
||||
filter_with_geo_bounding_box: bool,
|
||||
// every time a request has a filter, this field must be incremented by the number of terms it contains
|
||||
filter_sum_of_criteria_terms: usize,
|
||||
// every time a request has a filter, this field must be incremented by one
|
||||
filter_total_number_of_criteria: usize,
|
||||
used_syntax: HashMap<String, usize>,
|
||||
|
||||
// attributes_to_search_on
|
||||
// every time a search is done using attributes_to_search_on
|
||||
attributes_to_search_on_total_number_of_uses: usize,
|
||||
|
||||
// q
|
||||
// The maximum number of terms in a q request
|
||||
max_terms_number: usize,
|
||||
|
||||
// vector
|
||||
// The maximum number of floats in a vector request
|
||||
max_vector_size: usize,
|
||||
// Whether the semantic ratio passed to a hybrid search equals the default ratio.
|
||||
semantic_ratio: bool,
|
||||
hybrid: bool,
|
||||
retrieve_vectors: bool,
|
||||
|
||||
// every time a search is done, we increment the counter linked to the used settings
|
||||
matching_strategy: HashMap<String, usize>,
|
||||
|
||||
// List of the unique Locales passed as parameter
|
||||
locales: BTreeSet<Locale>,
|
||||
|
||||
// pagination
|
||||
max_limit: usize,
|
||||
max_offset: usize,
|
||||
finite_pagination: usize,
|
||||
|
||||
// formatting
|
||||
max_attributes_to_retrieve: usize,
|
||||
max_attributes_to_highlight: usize,
|
||||
highlight_pre_tag: bool,
|
||||
highlight_post_tag: bool,
|
||||
max_attributes_to_crop: usize,
|
||||
crop_marker: bool,
|
||||
show_matches_position: bool,
|
||||
crop_length: bool,
|
||||
|
||||
// facets
|
||||
facets_sum_of_terms: usize,
|
||||
facets_total_number_of_facets: usize,
|
||||
|
||||
// scoring
|
||||
show_ranking_score: bool,
|
||||
show_ranking_score_details: bool,
|
||||
ranking_score_threshold: bool,
|
||||
|
||||
marker: std::marker::PhantomData<Method>,
|
||||
}
|
||||
|
||||
impl<Method: AggregateMethod> SearchAggregator<Method> {
|
||||
#[allow(clippy::field_reassign_with_default)]
|
||||
pub fn from_query(query: &SearchQuery) -> Self {
|
||||
let SearchQuery {
|
||||
q,
|
||||
vector,
|
||||
offset,
|
||||
limit,
|
||||
page,
|
||||
hits_per_page,
|
||||
attributes_to_retrieve: _,
|
||||
retrieve_vectors,
|
||||
attributes_to_crop: _,
|
||||
crop_length,
|
||||
attributes_to_highlight: _,
|
||||
show_matches_position,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
filter,
|
||||
sort,
|
||||
distinct,
|
||||
facets: _,
|
||||
highlight_pre_tag,
|
||||
highlight_post_tag,
|
||||
crop_marker,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
hybrid,
|
||||
ranking_score_threshold,
|
||||
locales,
|
||||
} = query;
|
||||
|
||||
let mut ret = Self::default();
|
||||
|
||||
ret.total_received = 1;
|
||||
|
||||
if let Some(ref sort) = sort {
|
||||
ret.sort_total_number_of_criteria = 1;
|
||||
ret.sort_with_geo_point = sort.iter().any(|s| s.contains("_geoPoint("));
|
||||
ret.sort_sum_of_criteria_terms = sort.len();
|
||||
}
|
||||
|
||||
ret.distinct = distinct.is_some();
|
||||
|
||||
if let Some(ref filter) = filter {
|
||||
static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap());
|
||||
ret.filter_total_number_of_criteria = 1;
|
||||
|
||||
let syntax = match filter {
|
||||
Value::String(_) => "string".to_string(),
|
||||
Value::Array(values) => {
|
||||
if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) {
|
||||
"mixed".to_string()
|
||||
} else {
|
||||
"array".to_string()
|
||||
}
|
||||
}
|
||||
_ => "none".to_string(),
|
||||
};
|
||||
// convert the string to a HashMap
|
||||
ret.used_syntax.insert(syntax, 1);
|
||||
|
||||
let stringified_filters = filter.to_string();
|
||||
ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius(");
|
||||
ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox(");
|
||||
ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count();
|
||||
}
|
||||
|
||||
// attributes_to_search_on
|
||||
if attributes_to_search_on.is_some() {
|
||||
ret.attributes_to_search_on_total_number_of_uses = 1;
|
||||
}
|
||||
|
||||
if let Some(ref q) = q {
|
||||
ret.max_terms_number = q.split_whitespace().count();
|
||||
}
|
||||
|
||||
if let Some(ref vector) = vector {
|
||||
ret.max_vector_size = vector.len();
|
||||
}
|
||||
ret.retrieve_vectors |= retrieve_vectors;
|
||||
|
||||
if query.is_finite_pagination() {
|
||||
let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT);
|
||||
ret.max_limit = limit;
|
||||
ret.max_offset = page.unwrap_or(1).saturating_sub(1) * limit;
|
||||
ret.finite_pagination = 1;
|
||||
} else {
|
||||
ret.max_limit = *limit;
|
||||
ret.max_offset = *offset;
|
||||
ret.finite_pagination = 0;
|
||||
}
|
||||
|
||||
ret.matching_strategy.insert(format!("{:?}", matching_strategy), 1);
|
||||
|
||||
if let Some(locales) = locales {
|
||||
ret.locales = locales.iter().copied().collect();
|
||||
}
|
||||
|
||||
ret.highlight_pre_tag = *highlight_pre_tag != DEFAULT_HIGHLIGHT_PRE_TAG();
|
||||
ret.highlight_post_tag = *highlight_post_tag != DEFAULT_HIGHLIGHT_POST_TAG();
|
||||
ret.crop_marker = *crop_marker != DEFAULT_CROP_MARKER();
|
||||
ret.crop_length = *crop_length != DEFAULT_CROP_LENGTH();
|
||||
ret.show_matches_position = *show_matches_position;
|
||||
|
||||
ret.show_ranking_score = *show_ranking_score;
|
||||
ret.show_ranking_score_details = *show_ranking_score_details;
|
||||
ret.ranking_score_threshold = ranking_score_threshold.is_some();
|
||||
|
||||
if let Some(hybrid) = hybrid {
|
||||
ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO();
|
||||
ret.hybrid = true;
|
||||
}
|
||||
|
||||
ret
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self, result: &SearchResult) {
|
||||
let SearchResult {
|
||||
hits: _,
|
||||
query: _,
|
||||
processing_time_ms,
|
||||
hits_info: _,
|
||||
semantic_hit_count: _,
|
||||
facet_distribution: _,
|
||||
facet_stats: _,
|
||||
degraded,
|
||||
used_negative_operator,
|
||||
} = result;
|
||||
|
||||
self.total_succeeded = self.total_succeeded.saturating_add(1);
|
||||
if *degraded {
|
||||
self.total_degraded = self.total_degraded.saturating_add(1);
|
||||
}
|
||||
if *used_negative_operator {
|
||||
self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1);
|
||||
}
|
||||
self.time_spent.push(*processing_time_ms as usize);
|
||||
}
|
||||
}
|
||||
|
||||
impl<Method: AggregateMethod> Aggregate for SearchAggregator<Method> {
|
||||
fn event_name(&self) -> &'static str {
|
||||
Method::event_name()
|
||||
}
|
||||
|
||||
fn aggregate(mut self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
let Self {
|
||||
total_received,
|
||||
total_succeeded,
|
||||
mut time_spent,
|
||||
sort_with_geo_point,
|
||||
sort_sum_of_criteria_terms,
|
||||
sort_total_number_of_criteria,
|
||||
distinct,
|
||||
filter_with_geo_radius,
|
||||
filter_with_geo_bounding_box,
|
||||
filter_sum_of_criteria_terms,
|
||||
filter_total_number_of_criteria,
|
||||
used_syntax,
|
||||
attributes_to_search_on_total_number_of_uses,
|
||||
max_terms_number,
|
||||
max_vector_size,
|
||||
retrieve_vectors,
|
||||
matching_strategy,
|
||||
max_limit,
|
||||
max_offset,
|
||||
finite_pagination,
|
||||
max_attributes_to_retrieve,
|
||||
max_attributes_to_highlight,
|
||||
highlight_pre_tag,
|
||||
highlight_post_tag,
|
||||
max_attributes_to_crop,
|
||||
crop_marker,
|
||||
show_matches_position,
|
||||
crop_length,
|
||||
facets_sum_of_terms,
|
||||
facets_total_number_of_facets,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
semantic_ratio,
|
||||
hybrid,
|
||||
total_degraded,
|
||||
total_used_negative_operator,
|
||||
ranking_score_threshold,
|
||||
mut locales,
|
||||
marker: _,
|
||||
} = *new;
|
||||
|
||||
// request
|
||||
self.total_received = self.total_received.saturating_add(total_received);
|
||||
self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded);
|
||||
self.total_degraded = self.total_degraded.saturating_add(total_degraded);
|
||||
self.total_used_negative_operator =
|
||||
self.total_used_negative_operator.saturating_add(total_used_negative_operator);
|
||||
self.time_spent.append(&mut time_spent);
|
||||
|
||||
// sort
|
||||
self.sort_with_geo_point |= sort_with_geo_point;
|
||||
self.sort_sum_of_criteria_terms =
|
||||
self.sort_sum_of_criteria_terms.saturating_add(sort_sum_of_criteria_terms);
|
||||
self.sort_total_number_of_criteria =
|
||||
self.sort_total_number_of_criteria.saturating_add(sort_total_number_of_criteria);
|
||||
|
||||
// distinct
|
||||
self.distinct |= distinct;
|
||||
|
||||
// filter
|
||||
self.filter_with_geo_radius |= filter_with_geo_radius;
|
||||
self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box;
|
||||
self.filter_sum_of_criteria_terms =
|
||||
self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms);
|
||||
self.filter_total_number_of_criteria =
|
||||
self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria);
|
||||
for (key, value) in used_syntax.into_iter() {
|
||||
let used_syntax = self.used_syntax.entry(key).or_insert(0);
|
||||
*used_syntax = used_syntax.saturating_add(value);
|
||||
}
|
||||
|
||||
// attributes_to_search_on
|
||||
self.attributes_to_search_on_total_number_of_uses = self
|
||||
.attributes_to_search_on_total_number_of_uses
|
||||
.saturating_add(attributes_to_search_on_total_number_of_uses);
|
||||
|
||||
// q
|
||||
self.max_terms_number = self.max_terms_number.max(max_terms_number);
|
||||
|
||||
// vector
|
||||
self.max_vector_size = self.max_vector_size.max(max_vector_size);
|
||||
self.retrieve_vectors |= retrieve_vectors;
|
||||
self.semantic_ratio |= semantic_ratio;
|
||||
self.hybrid |= hybrid;
|
||||
|
||||
// pagination
|
||||
self.max_limit = self.max_limit.max(max_limit);
|
||||
self.max_offset = self.max_offset.max(max_offset);
|
||||
self.finite_pagination += finite_pagination;
|
||||
|
||||
// formatting
|
||||
self.max_attributes_to_retrieve =
|
||||
self.max_attributes_to_retrieve.max(max_attributes_to_retrieve);
|
||||
self.max_attributes_to_highlight =
|
||||
self.max_attributes_to_highlight.max(max_attributes_to_highlight);
|
||||
self.highlight_pre_tag |= highlight_pre_tag;
|
||||
self.highlight_post_tag |= highlight_post_tag;
|
||||
self.max_attributes_to_crop = self.max_attributes_to_crop.max(max_attributes_to_crop);
|
||||
self.crop_marker |= crop_marker;
|
||||
self.show_matches_position |= show_matches_position;
|
||||
self.crop_length |= crop_length;
|
||||
|
||||
// facets
|
||||
self.facets_sum_of_terms = self.facets_sum_of_terms.saturating_add(facets_sum_of_terms);
|
||||
self.facets_total_number_of_facets =
|
||||
self.facets_total_number_of_facets.saturating_add(facets_total_number_of_facets);
|
||||
|
||||
// matching strategy
|
||||
for (key, value) in matching_strategy.into_iter() {
|
||||
let matching_strategy = self.matching_strategy.entry(key).or_insert(0);
|
||||
*matching_strategy = matching_strategy.saturating_add(value);
|
||||
}
|
||||
|
||||
// scoring
|
||||
self.show_ranking_score |= show_ranking_score;
|
||||
self.show_ranking_score_details |= show_ranking_score_details;
|
||||
self.ranking_score_threshold |= ranking_score_threshold;
|
||||
|
||||
// locales
|
||||
self.locales.append(&mut locales);
|
||||
|
||||
self
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
let Self {
|
||||
total_received,
|
||||
total_succeeded,
|
||||
time_spent,
|
||||
sort_with_geo_point,
|
||||
sort_sum_of_criteria_terms,
|
||||
sort_total_number_of_criteria,
|
||||
distinct,
|
||||
filter_with_geo_radius,
|
||||
filter_with_geo_bounding_box,
|
||||
filter_sum_of_criteria_terms,
|
||||
filter_total_number_of_criteria,
|
||||
used_syntax,
|
||||
attributes_to_search_on_total_number_of_uses,
|
||||
max_terms_number,
|
||||
max_vector_size,
|
||||
retrieve_vectors,
|
||||
matching_strategy,
|
||||
max_limit,
|
||||
max_offset,
|
||||
finite_pagination,
|
||||
max_attributes_to_retrieve,
|
||||
max_attributes_to_highlight,
|
||||
highlight_pre_tag,
|
||||
highlight_post_tag,
|
||||
max_attributes_to_crop,
|
||||
crop_marker,
|
||||
show_matches_position,
|
||||
crop_length,
|
||||
facets_sum_of_terms,
|
||||
facets_total_number_of_facets,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
semantic_ratio,
|
||||
hybrid,
|
||||
total_degraded,
|
||||
total_used_negative_operator,
|
||||
ranking_score_threshold,
|
||||
locales,
|
||||
marker: _,
|
||||
} = *self;
|
||||
|
||||
// we get all the values in a sorted manner
|
||||
let time_spent = time_spent.into_sorted_vec();
|
||||
// the index of the 99th percentage of value
|
||||
let percentile_99th = time_spent.len() * 99 / 100;
|
||||
// We are only interested by the slowest value of the 99th fastest results
|
||||
let time_spent = time_spent.get(percentile_99th);
|
||||
|
||||
json!({
|
||||
"requests": {
|
||||
"99th_response_time": time_spent.map(|t| format!("{:.2}", t)),
|
||||
"total_succeeded": total_succeeded,
|
||||
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
|
||||
"total_received": total_received,
|
||||
"total_degraded": total_degraded,
|
||||
"total_used_negative_operator": total_used_negative_operator,
|
||||
},
|
||||
"sort": {
|
||||
"with_geoPoint": sort_with_geo_point,
|
||||
"avg_criteria_number": format!("{:.2}", sort_sum_of_criteria_terms as f64 / sort_total_number_of_criteria as f64),
|
||||
},
|
||||
"distinct": distinct,
|
||||
"filter": {
|
||||
"with_geoRadius": filter_with_geo_radius,
|
||||
"with_geoBoundingBox": filter_with_geo_bounding_box,
|
||||
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
|
||||
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
|
||||
},
|
||||
"attributes_to_search_on": {
|
||||
"total_number_of_uses": attributes_to_search_on_total_number_of_uses,
|
||||
},
|
||||
"q": {
|
||||
"max_terms_number": max_terms_number,
|
||||
},
|
||||
"vector": {
|
||||
"max_vector_size": max_vector_size,
|
||||
"retrieve_vectors": retrieve_vectors,
|
||||
},
|
||||
"hybrid": {
|
||||
"enabled": hybrid,
|
||||
"semantic_ratio": semantic_ratio,
|
||||
},
|
||||
"pagination": {
|
||||
"max_limit": max_limit,
|
||||
"max_offset": max_offset,
|
||||
"most_used_navigation": if finite_pagination > (total_received / 2) { "exhaustive" } else { "estimated" },
|
||||
},
|
||||
"formatting": {
|
||||
"max_attributes_to_retrieve": max_attributes_to_retrieve,
|
||||
"max_attributes_to_highlight": max_attributes_to_highlight,
|
||||
"highlight_pre_tag": highlight_pre_tag,
|
||||
"highlight_post_tag": highlight_post_tag,
|
||||
"max_attributes_to_crop": max_attributes_to_crop,
|
||||
"crop_marker": crop_marker,
|
||||
"show_matches_position": show_matches_position,
|
||||
"crop_length": crop_length,
|
||||
},
|
||||
"facets": {
|
||||
"avg_facets_number": format!("{:.2}", facets_sum_of_terms as f64 / facets_total_number_of_facets as f64),
|
||||
},
|
||||
"matching_strategy": {
|
||||
"most_used_strategy": matching_strategy.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
|
||||
},
|
||||
"locales": locales,
|
||||
"scoring": {
|
||||
"show_ranking_score": show_ranking_score,
|
||||
"show_ranking_score_details": show_ranking_score_details,
|
||||
"ranking_score_threshold": ranking_score_threshold,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -1,14 +1,15 @@
|
||||
use super::settings_analytics::*;
|
||||
use actix_web::web::Data;
|
||||
use actix_web::{web, HttpRequest, HttpResponse};
|
||||
use deserr::actix_web::AwebJson;
|
||||
use index_scheduler::IndexScheduler;
|
||||
use meilisearch_types::deserr::DeserrJsonError;
|
||||
use meilisearch_types::error::ResponseError;
|
||||
use meilisearch_types::facet_values_sort::FacetValuesSort;
|
||||
use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::milli::update::Setting;
|
||||
use meilisearch_types::settings::{settings, SecretPolicy, Settings, Unchecked};
|
||||
use meilisearch_types::settings::{settings, RankingRuleView, SecretPolicy, Settings, Unchecked};
|
||||
use meilisearch_types::tasks::KindWithContent;
|
||||
use serde_json::json;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::analytics::Analytics;
|
||||
@@ -19,7 +20,7 @@ use crate::Opt;
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! make_setting_route {
|
||||
($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics:ident) => {
|
||||
($route:literal, $update_verb:ident, $type:ty, $err_ty:ty, $attr:ident, $camelcase_attr:literal, $analytics_var:ident, $analytics:expr) => {
|
||||
pub mod $attr {
|
||||
use actix_web::web::Data;
|
||||
use actix_web::{web, HttpRequest, HttpResponse, Resource};
|
||||
@@ -79,7 +80,7 @@ macro_rules! make_setting_route {
|
||||
body: deserr::actix_web::AwebJson<Option<$type>, $err_ty>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
$analytics_var: web::Data<dyn Analytics>,
|
||||
) -> std::result::Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
@@ -87,10 +88,7 @@ macro_rules! make_setting_route {
|
||||
debug!(parameters = ?body, "Update settings");
|
||||
|
||||
#[allow(clippy::redundant_closure_call)]
|
||||
analytics.publish(
|
||||
$crate::routes::indexes::settings_analytics::$analytics::new(body.as_ref()).into_settings(),
|
||||
&req,
|
||||
);
|
||||
$analytics(&body, &req);
|
||||
|
||||
let new_settings = Settings {
|
||||
$attr: match body {
|
||||
@@ -162,7 +160,21 @@ make_setting_route!(
|
||||
>,
|
||||
filterable_attributes,
|
||||
"filterableAttributes",
|
||||
FilterableAttributesAnalytics
|
||||
analytics,
|
||||
|setting: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"FilterableAttributes Updated".to_string(),
|
||||
json!({
|
||||
"filterable_attributes": {
|
||||
"total": setting.as_ref().map(|filter| filter.len()).unwrap_or(0),
|
||||
"has_geo": setting.as_ref().map(|filter| filter.contains("_geo")).unwrap_or(false),
|
||||
}
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -174,7 +186,21 @@ make_setting_route!(
|
||||
>,
|
||||
sortable_attributes,
|
||||
"sortableAttributes",
|
||||
SortableAttributesAnalytics
|
||||
analytics,
|
||||
|setting: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"SortableAttributes Updated".to_string(),
|
||||
json!({
|
||||
"sortable_attributes": {
|
||||
"total": setting.as_ref().map(|sort| sort.len()),
|
||||
"has_geo": setting.as_ref().map(|sort| sort.contains("_geo")),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -186,7 +212,21 @@ make_setting_route!(
|
||||
>,
|
||||
displayed_attributes,
|
||||
"displayedAttributes",
|
||||
DisplayedAttributesAnalytics
|
||||
analytics,
|
||||
|displayed: &Option<Vec<String>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"DisplayedAttributes Updated".to_string(),
|
||||
json!({
|
||||
"displayed_attributes": {
|
||||
"total": displayed.as_ref().map(|displayed| displayed.len()),
|
||||
"with_wildcard": displayed.as_ref().map(|displayed| displayed.iter().any(|displayed| displayed == "*")),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -198,7 +238,40 @@ make_setting_route!(
|
||||
>,
|
||||
typo_tolerance,
|
||||
"typoTolerance",
|
||||
TypoToleranceAnalytics
|
||||
analytics,
|
||||
|setting: &Option<meilisearch_types::settings::TypoSettings>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"TypoTolerance Updated".to_string(),
|
||||
json!({
|
||||
"typo_tolerance": {
|
||||
"enabled": setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))),
|
||||
"disable_on_attributes": setting
|
||||
.as_ref()
|
||||
.and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())),
|
||||
"disable_on_words": setting
|
||||
.as_ref()
|
||||
.and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())),
|
||||
"min_word_size_for_one_typo": setting
|
||||
.as_ref()
|
||||
.and_then(|s| s.min_word_size_for_typos
|
||||
.as_ref()
|
||||
.set()
|
||||
.map(|s| s.one_typo.set()))
|
||||
.flatten(),
|
||||
"min_word_size_for_two_typos": setting
|
||||
.as_ref()
|
||||
.and_then(|s| s.min_word_size_for_typos
|
||||
.as_ref()
|
||||
.set()
|
||||
.map(|s| s.two_typos.set()))
|
||||
.flatten(),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -210,7 +283,21 @@ make_setting_route!(
|
||||
>,
|
||||
searchable_attributes,
|
||||
"searchableAttributes",
|
||||
SearchableAttributesAnalytics
|
||||
analytics,
|
||||
|setting: &Option<Vec<String>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"SearchableAttributes Updated".to_string(),
|
||||
json!({
|
||||
"searchable_attributes": {
|
||||
"total": setting.as_ref().map(|searchable| searchable.len()),
|
||||
"with_wildcard": setting.as_ref().map(|searchable| searchable.iter().any(|searchable| searchable == "*")),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -222,7 +309,20 @@ make_setting_route!(
|
||||
>,
|
||||
stop_words,
|
||||
"stopWords",
|
||||
StopWordsAnalytics
|
||||
analytics,
|
||||
|stop_words: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"StopWords Updated".to_string(),
|
||||
json!({
|
||||
"stop_words": {
|
||||
"total": stop_words.as_ref().map(|stop_words| stop_words.len()),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -234,7 +334,20 @@ make_setting_route!(
|
||||
>,
|
||||
non_separator_tokens,
|
||||
"nonSeparatorTokens",
|
||||
NonSeparatorTokensAnalytics
|
||||
analytics,
|
||||
|non_separator_tokens: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"nonSeparatorTokens Updated".to_string(),
|
||||
json!({
|
||||
"non_separator_tokens": {
|
||||
"total": non_separator_tokens.as_ref().map(|non_separator_tokens| non_separator_tokens.len()),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -246,7 +359,20 @@ make_setting_route!(
|
||||
>,
|
||||
separator_tokens,
|
||||
"separatorTokens",
|
||||
SeparatorTokensAnalytics
|
||||
analytics,
|
||||
|separator_tokens: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"separatorTokens Updated".to_string(),
|
||||
json!({
|
||||
"separator_tokens": {
|
||||
"total": separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -258,7 +384,20 @@ make_setting_route!(
|
||||
>,
|
||||
dictionary,
|
||||
"dictionary",
|
||||
DictionaryAnalytics
|
||||
analytics,
|
||||
|dictionary: &Option<std::collections::BTreeSet<String>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"dictionary Updated".to_string(),
|
||||
json!({
|
||||
"dictionary": {
|
||||
"total": dictionary.as_ref().map(|dictionary| dictionary.len()),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -270,7 +409,20 @@ make_setting_route!(
|
||||
>,
|
||||
synonyms,
|
||||
"synonyms",
|
||||
SynonymsAnalytics
|
||||
analytics,
|
||||
|synonyms: &Option<std::collections::BTreeMap<String, Vec<String>>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"Synonyms Updated".to_string(),
|
||||
json!({
|
||||
"synonyms": {
|
||||
"total": synonyms.as_ref().map(|synonyms| synonyms.len()),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -282,7 +434,19 @@ make_setting_route!(
|
||||
>,
|
||||
distinct_attribute,
|
||||
"distinctAttribute",
|
||||
DistinctAttributeAnalytics
|
||||
analytics,
|
||||
|distinct: &Option<String>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
analytics.publish(
|
||||
"DistinctAttribute Updated".to_string(),
|
||||
json!({
|
||||
"distinct_attribute": {
|
||||
"set": distinct.is_some(),
|
||||
}
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -294,7 +458,20 @@ make_setting_route!(
|
||||
>,
|
||||
proximity_precision,
|
||||
"proximityPrecision",
|
||||
ProximityPrecisionAnalytics
|
||||
analytics,
|
||||
|precision: &Option<meilisearch_types::settings::ProximityPrecisionView>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
analytics.publish(
|
||||
"ProximityPrecision Updated".to_string(),
|
||||
json!({
|
||||
"proximity_precision": {
|
||||
"set": precision.is_some(),
|
||||
"value": precision.unwrap_or_default(),
|
||||
}
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -306,7 +483,17 @@ make_setting_route!(
|
||||
>,
|
||||
localized_attributes,
|
||||
"localizedAttributes",
|
||||
LocalesAnalytics
|
||||
analytics,
|
||||
|rules: &Option<Vec<meilisearch_types::locales::LocalizedAttributesRuleView>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
analytics.publish(
|
||||
"LocalizedAttributesRules Updated".to_string(),
|
||||
json!({
|
||||
"locales": rules.as_ref().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::<std::collections::BTreeSet<_>>())
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -318,7 +505,26 @@ make_setting_route!(
|
||||
>,
|
||||
ranking_rules,
|
||||
"rankingRules",
|
||||
RankingRulesAnalytics
|
||||
analytics,
|
||||
|setting: &Option<Vec<meilisearch_types::settings::RankingRuleView>>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"RankingRules Updated".to_string(),
|
||||
json!({
|
||||
"ranking_rules": {
|
||||
"words_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words))),
|
||||
"typo_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo))),
|
||||
"proximity_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Proximity))),
|
||||
"attribute_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Attribute))),
|
||||
"sort_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort))),
|
||||
"exactness_position": setting.as_ref().map(|rr| rr.iter().position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Exactness))),
|
||||
"values": setting.as_ref().map(|rr| rr.iter().filter(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Asc(_) | meilisearch_types::settings::RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::<Vec<_>>().join(", ")),
|
||||
}
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -330,7 +536,25 @@ make_setting_route!(
|
||||
>,
|
||||
faceting,
|
||||
"faceting",
|
||||
FacetingAnalytics
|
||||
analytics,
|
||||
|setting: &Option<meilisearch_types::settings::FacetingSettings>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
use meilisearch_types::facet_values_sort::FacetValuesSort;
|
||||
|
||||
analytics.publish(
|
||||
"Faceting Updated".to_string(),
|
||||
json!({
|
||||
"faceting": {
|
||||
"max_values_per_facet": setting.as_ref().and_then(|s| s.max_values_per_facet.set()),
|
||||
"sort_facet_values_by_star_count": setting.as_ref().and_then(|s| {
|
||||
s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count))
|
||||
}),
|
||||
"sort_facet_values_by_total": setting.as_ref().and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -342,7 +566,20 @@ make_setting_route!(
|
||||
>,
|
||||
pagination,
|
||||
"pagination",
|
||||
PaginationAnalytics
|
||||
analytics,
|
||||
|setting: &Option<meilisearch_types::settings::PaginationSettings>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
|
||||
analytics.publish(
|
||||
"Pagination Updated".to_string(),
|
||||
json!({
|
||||
"pagination": {
|
||||
"max_total_hits": setting.as_ref().and_then(|s| s.max_total_hits.set()),
|
||||
},
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
@@ -354,9 +591,75 @@ make_setting_route!(
|
||||
>,
|
||||
embedders,
|
||||
"embedders",
|
||||
EmbeddersAnalytics
|
||||
analytics,
|
||||
|setting: &Option<std::collections::BTreeMap<String, Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>>>, req: &HttpRequest| {
|
||||
|
||||
|
||||
analytics.publish(
|
||||
"Embedders Updated".to_string(),
|
||||
serde_json::json!({"embedders": crate::routes::indexes::settings::embedder_analytics(setting.as_ref())}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
fn embedder_analytics(
|
||||
setting: Option<
|
||||
&std::collections::BTreeMap<
|
||||
String,
|
||||
Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>,
|
||||
>,
|
||||
>,
|
||||
) -> serde_json::Value {
|
||||
let mut sources = std::collections::HashSet::new();
|
||||
|
||||
if let Some(s) = &setting {
|
||||
for source in s
|
||||
.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.filter_map(|config| config.source.set())
|
||||
{
|
||||
use meilisearch_types::milli::vector::settings::EmbedderSource;
|
||||
match source {
|
||||
EmbedderSource::OpenAi => sources.insert("openAi"),
|
||||
EmbedderSource::HuggingFace => sources.insert("huggingFace"),
|
||||
EmbedderSource::UserProvided => sources.insert("userProvided"),
|
||||
EmbedderSource::Ollama => sources.insert("ollama"),
|
||||
EmbedderSource::Rest => sources.insert("rest"),
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
let document_template_used = setting.as_ref().map(|map| {
|
||||
map.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.any(|config| config.document_template.set().is_some())
|
||||
});
|
||||
|
||||
let document_template_max_bytes = setting.as_ref().and_then(|map| {
|
||||
map.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.filter_map(|config| config.document_template_max_bytes.set())
|
||||
.max()
|
||||
});
|
||||
|
||||
let binary_quantization_used = setting.as_ref().map(|map| {
|
||||
map.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.any(|config| config.binary_quantized.set().is_some())
|
||||
});
|
||||
|
||||
json!(
|
||||
{
|
||||
"total": setting.as_ref().map(|s| s.len()),
|
||||
"sources": sources,
|
||||
"document_template_used": document_template_used,
|
||||
"document_template_max_bytes": document_template_max_bytes,
|
||||
"binary_quantization_used": binary_quantization_used,
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
make_setting_route!(
|
||||
"/search-cutoff-ms",
|
||||
put,
|
||||
@@ -366,7 +669,14 @@ make_setting_route!(
|
||||
>,
|
||||
search_cutoff_ms,
|
||||
"searchCutoffMs",
|
||||
SearchCutoffMsAnalytics
|
||||
analytics,
|
||||
|setting: &Option<u64>, req: &HttpRequest| {
|
||||
analytics.publish(
|
||||
"Search Cutoff Updated".to_string(),
|
||||
serde_json::json!({"search_cutoff_ms": setting }),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
macro_rules! generate_configure {
|
||||
@@ -410,7 +720,7 @@ pub async fn update_all(
|
||||
body: AwebJson<Settings<Unchecked>, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
@@ -419,45 +729,104 @@ pub async fn update_all(
|
||||
let new_settings = validate_settings(new_settings, &index_scheduler)?;
|
||||
|
||||
analytics.publish(
|
||||
SettingsAnalytics {
|
||||
ranking_rules: RankingRulesAnalytics::new(new_settings.ranking_rules.as_ref().set()),
|
||||
searchable_attributes: SearchableAttributesAnalytics::new(
|
||||
new_settings.searchable_attributes.as_ref().set(),
|
||||
),
|
||||
displayed_attributes: DisplayedAttributesAnalytics::new(
|
||||
new_settings.displayed_attributes.as_ref().set(),
|
||||
),
|
||||
sortable_attributes: SortableAttributesAnalytics::new(
|
||||
new_settings.sortable_attributes.as_ref().set(),
|
||||
),
|
||||
filterable_attributes: FilterableAttributesAnalytics::new(
|
||||
new_settings.filterable_attributes.as_ref().set(),
|
||||
),
|
||||
distinct_attribute: DistinctAttributeAnalytics::new(
|
||||
new_settings.distinct_attribute.as_ref().set(),
|
||||
),
|
||||
proximity_precision: ProximityPrecisionAnalytics::new(
|
||||
new_settings.proximity_precision.as_ref().set(),
|
||||
),
|
||||
typo_tolerance: TypoToleranceAnalytics::new(new_settings.typo_tolerance.as_ref().set()),
|
||||
faceting: FacetingAnalytics::new(new_settings.faceting.as_ref().set()),
|
||||
pagination: PaginationAnalytics::new(new_settings.pagination.as_ref().set()),
|
||||
stop_words: StopWordsAnalytics::new(new_settings.stop_words.as_ref().set()),
|
||||
synonyms: SynonymsAnalytics::new(new_settings.synonyms.as_ref().set()),
|
||||
embedders: EmbeddersAnalytics::new(new_settings.embedders.as_ref().set()),
|
||||
search_cutoff_ms: SearchCutoffMsAnalytics::new(
|
||||
new_settings.search_cutoff_ms.as_ref().set(),
|
||||
),
|
||||
locales: LocalesAnalytics::new(new_settings.localized_attributes.as_ref().set()),
|
||||
dictionary: DictionaryAnalytics::new(new_settings.dictionary.as_ref().set()),
|
||||
separator_tokens: SeparatorTokensAnalytics::new(
|
||||
new_settings.separator_tokens.as_ref().set(),
|
||||
),
|
||||
non_separator_tokens: NonSeparatorTokensAnalytics::new(
|
||||
new_settings.non_separator_tokens.as_ref().set(),
|
||||
),
|
||||
},
|
||||
&req,
|
||||
"Settings Updated".to_string(),
|
||||
json!({
|
||||
"ranking_rules": {
|
||||
"words_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Words))),
|
||||
"typo_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Typo))),
|
||||
"proximity_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Proximity))),
|
||||
"attribute_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Attribute))),
|
||||
"sort_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Sort))),
|
||||
"exactness_position": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().position(|s| matches!(s, RankingRuleView::Exactness))),
|
||||
"values": new_settings.ranking_rules.as_ref().set().map(|rr| rr.iter().filter(|s| !matches!(s, RankingRuleView::Asc(_) | RankingRuleView::Desc(_)) ).map(|x| x.to_string()).collect::<Vec<_>>().join(", ")),
|
||||
},
|
||||
"searchable_attributes": {
|
||||
"total": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.len()),
|
||||
"with_wildcard": new_settings.searchable_attributes.as_ref().set().map(|searchable| searchable.iter().any(|searchable| searchable == "*")),
|
||||
},
|
||||
"displayed_attributes": {
|
||||
"total": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.len()),
|
||||
"with_wildcard": new_settings.displayed_attributes.as_ref().set().map(|displayed| displayed.iter().any(|displayed| displayed == "*")),
|
||||
},
|
||||
"sortable_attributes": {
|
||||
"total": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.len()),
|
||||
"has_geo": new_settings.sortable_attributes.as_ref().set().map(|sort| sort.iter().any(|s| s == "_geo")),
|
||||
},
|
||||
"filterable_attributes": {
|
||||
"total": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.len()),
|
||||
"has_geo": new_settings.filterable_attributes.as_ref().set().map(|filter| filter.iter().any(|s| s == "_geo")),
|
||||
},
|
||||
"distinct_attribute": {
|
||||
"set": new_settings.distinct_attribute.as_ref().set().is_some()
|
||||
},
|
||||
"proximity_precision": {
|
||||
"set": new_settings.proximity_precision.as_ref().set().is_some(),
|
||||
"value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default()
|
||||
},
|
||||
"typo_tolerance": {
|
||||
"enabled": new_settings.typo_tolerance
|
||||
.as_ref()
|
||||
.set()
|
||||
.and_then(|s| s.enabled.as_ref().set())
|
||||
.copied(),
|
||||
"disable_on_attributes": new_settings.typo_tolerance
|
||||
.as_ref()
|
||||
.set()
|
||||
.and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())),
|
||||
"disable_on_words": new_settings.typo_tolerance
|
||||
.as_ref()
|
||||
.set()
|
||||
.and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())),
|
||||
"min_word_size_for_one_typo": new_settings.typo_tolerance
|
||||
.as_ref()
|
||||
.set()
|
||||
.and_then(|s| s.min_word_size_for_typos
|
||||
.as_ref()
|
||||
.set()
|
||||
.map(|s| s.one_typo.set()))
|
||||
.flatten(),
|
||||
"min_word_size_for_two_typos": new_settings.typo_tolerance
|
||||
.as_ref()
|
||||
.set()
|
||||
.and_then(|s| s.min_word_size_for_typos
|
||||
.as_ref()
|
||||
.set()
|
||||
.map(|s| s.two_typos.set()))
|
||||
.flatten(),
|
||||
},
|
||||
"faceting": {
|
||||
"max_values_per_facet": new_settings.faceting
|
||||
.as_ref()
|
||||
.set()
|
||||
.and_then(|s| s.max_values_per_facet.as_ref().set()),
|
||||
"sort_facet_values_by_star_count": new_settings.faceting
|
||||
.as_ref()
|
||||
.set()
|
||||
.and_then(|s| {
|
||||
s.sort_facet_values_by.as_ref().set().map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count))
|
||||
}),
|
||||
"sort_facet_values_by_total": new_settings.faceting
|
||||
.as_ref()
|
||||
.set()
|
||||
.and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())),
|
||||
},
|
||||
"pagination": {
|
||||
"max_total_hits": new_settings.pagination
|
||||
.as_ref()
|
||||
.set()
|
||||
.and_then(|s| s.max_total_hits.as_ref().set()),
|
||||
},
|
||||
"stop_words": {
|
||||
"total": new_settings.stop_words.as_ref().set().map(|stop_words| stop_words.len()),
|
||||
},
|
||||
"synonyms": {
|
||||
"total": new_settings.synonyms.as_ref().set().map(|synonyms| synonyms.len()),
|
||||
},
|
||||
"embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set()),
|
||||
"search_cutoff_ms": new_settings.search_cutoff_ms.as_ref().set(),
|
||||
"locales": new_settings.localized_attributes.as_ref().set().map(|rules| rules.iter().flat_map(|rule| rule.locales.iter().cloned()).collect::<std::collections::BTreeSet<_>>()),
|
||||
}),
|
||||
Some(&req),
|
||||
);
|
||||
|
||||
let allow_index_creation = index_scheduler.filters().allow_index_creation(&index_uid);
|
||||
|
||||
@@ -1,621 +0,0 @@
|
||||
//! All the structures used to make the analytics on the settings works.
|
||||
//! The signatures of the `new` functions are not very rust idiomatic because they must match the types received
|
||||
//! through the sub-settings route directly without any manipulation.
|
||||
//! This is why we often use a `Option<&Vec<_>>` instead of a `Option<&[_]>`.
|
||||
|
||||
use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView};
|
||||
use meilisearch_types::milli::update::Setting;
|
||||
use meilisearch_types::milli::vector::settings::EmbeddingSettings;
|
||||
use meilisearch_types::settings::{
|
||||
FacetingSettings, PaginationSettings, ProximityPrecisionView, TypoSettings,
|
||||
};
|
||||
use meilisearch_types::{facet_values_sort::FacetValuesSort, settings::RankingRuleView};
|
||||
use serde::Serialize;
|
||||
use std::collections::{BTreeMap, BTreeSet, HashSet};
|
||||
|
||||
use crate::analytics::Aggregate;
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct SettingsAnalytics {
|
||||
pub ranking_rules: RankingRulesAnalytics,
|
||||
pub searchable_attributes: SearchableAttributesAnalytics,
|
||||
pub displayed_attributes: DisplayedAttributesAnalytics,
|
||||
pub sortable_attributes: SortableAttributesAnalytics,
|
||||
pub filterable_attributes: FilterableAttributesAnalytics,
|
||||
pub distinct_attribute: DistinctAttributeAnalytics,
|
||||
pub proximity_precision: ProximityPrecisionAnalytics,
|
||||
pub typo_tolerance: TypoToleranceAnalytics,
|
||||
pub faceting: FacetingAnalytics,
|
||||
pub pagination: PaginationAnalytics,
|
||||
pub stop_words: StopWordsAnalytics,
|
||||
pub synonyms: SynonymsAnalytics,
|
||||
pub embedders: EmbeddersAnalytics,
|
||||
pub search_cutoff_ms: SearchCutoffMsAnalytics,
|
||||
pub locales: LocalesAnalytics,
|
||||
pub dictionary: DictionaryAnalytics,
|
||||
pub separator_tokens: SeparatorTokensAnalytics,
|
||||
pub non_separator_tokens: NonSeparatorTokensAnalytics,
|
||||
}
|
||||
|
||||
impl Aggregate for SettingsAnalytics {
|
||||
fn event_name(&self) -> &'static str {
|
||||
"Settings Updated"
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
Box::new(Self {
|
||||
ranking_rules: RankingRulesAnalytics {
|
||||
words_position: new
|
||||
.ranking_rules
|
||||
.words_position
|
||||
.or(self.ranking_rules.words_position),
|
||||
typo_position: new.ranking_rules.typo_position.or(self.ranking_rules.typo_position),
|
||||
proximity_position: new
|
||||
.ranking_rules
|
||||
.proximity_position
|
||||
.or(self.ranking_rules.proximity_position),
|
||||
attribute_position: new
|
||||
.ranking_rules
|
||||
.attribute_position
|
||||
.or(self.ranking_rules.attribute_position),
|
||||
sort_position: new.ranking_rules.sort_position.or(self.ranking_rules.sort_position),
|
||||
exactness_position: new
|
||||
.ranking_rules
|
||||
.exactness_position
|
||||
.or(self.ranking_rules.exactness_position),
|
||||
values: new.ranking_rules.values.or(self.ranking_rules.values),
|
||||
},
|
||||
searchable_attributes: SearchableAttributesAnalytics {
|
||||
total: new.searchable_attributes.total.or(self.searchable_attributes.total),
|
||||
with_wildcard: new
|
||||
.searchable_attributes
|
||||
.with_wildcard
|
||||
.or(self.searchable_attributes.with_wildcard),
|
||||
},
|
||||
displayed_attributes: DisplayedAttributesAnalytics {
|
||||
total: new.displayed_attributes.total.or(self.displayed_attributes.total),
|
||||
with_wildcard: new
|
||||
.displayed_attributes
|
||||
.with_wildcard
|
||||
.or(self.displayed_attributes.with_wildcard),
|
||||
},
|
||||
sortable_attributes: SortableAttributesAnalytics {
|
||||
total: new.sortable_attributes.total.or(self.sortable_attributes.total),
|
||||
has_geo: new.sortable_attributes.has_geo.or(self.sortable_attributes.has_geo),
|
||||
},
|
||||
filterable_attributes: FilterableAttributesAnalytics {
|
||||
total: new.filterable_attributes.total.or(self.filterable_attributes.total),
|
||||
has_geo: new.filterable_attributes.has_geo.or(self.filterable_attributes.has_geo),
|
||||
},
|
||||
distinct_attribute: DistinctAttributeAnalytics {
|
||||
set: self.distinct_attribute.set | new.distinct_attribute.set,
|
||||
},
|
||||
proximity_precision: ProximityPrecisionAnalytics {
|
||||
set: self.proximity_precision.set | new.proximity_precision.set,
|
||||
value: new.proximity_precision.value.or(self.proximity_precision.value),
|
||||
},
|
||||
typo_tolerance: TypoToleranceAnalytics {
|
||||
enabled: new.typo_tolerance.enabled.or(self.typo_tolerance.enabled),
|
||||
disable_on_attributes: new
|
||||
.typo_tolerance
|
||||
.disable_on_attributes
|
||||
.or(self.typo_tolerance.disable_on_attributes),
|
||||
disable_on_words: new
|
||||
.typo_tolerance
|
||||
.disable_on_words
|
||||
.or(self.typo_tolerance.disable_on_words),
|
||||
min_word_size_for_one_typo: new
|
||||
.typo_tolerance
|
||||
.min_word_size_for_one_typo
|
||||
.or(self.typo_tolerance.min_word_size_for_one_typo),
|
||||
min_word_size_for_two_typos: new
|
||||
.typo_tolerance
|
||||
.min_word_size_for_two_typos
|
||||
.or(self.typo_tolerance.min_word_size_for_two_typos),
|
||||
},
|
||||
faceting: FacetingAnalytics {
|
||||
max_values_per_facet: new
|
||||
.faceting
|
||||
.max_values_per_facet
|
||||
.or(self.faceting.max_values_per_facet),
|
||||
sort_facet_values_by_star_count: new
|
||||
.faceting
|
||||
.sort_facet_values_by_star_count
|
||||
.or(self.faceting.sort_facet_values_by_star_count),
|
||||
sort_facet_values_by_total: new
|
||||
.faceting
|
||||
.sort_facet_values_by_total
|
||||
.or(self.faceting.sort_facet_values_by_total),
|
||||
},
|
||||
pagination: PaginationAnalytics {
|
||||
max_total_hits: new.pagination.max_total_hits.or(self.pagination.max_total_hits),
|
||||
},
|
||||
stop_words: StopWordsAnalytics {
|
||||
total: new.stop_words.total.or(self.stop_words.total),
|
||||
},
|
||||
synonyms: SynonymsAnalytics { total: new.synonyms.total.or(self.synonyms.total) },
|
||||
embedders: EmbeddersAnalytics {
|
||||
total: new.embedders.total.or(self.embedders.total),
|
||||
sources: match (self.embedders.sources, new.embedders.sources) {
|
||||
(None, None) => None,
|
||||
(Some(sources), None) | (None, Some(sources)) => Some(sources),
|
||||
(Some(this), Some(other)) => Some(this.union(&other).cloned().collect()),
|
||||
},
|
||||
document_template_used: match (
|
||||
self.embedders.document_template_used,
|
||||
new.embedders.document_template_used,
|
||||
) {
|
||||
(None, None) => None,
|
||||
(Some(used), None) | (None, Some(used)) => Some(used),
|
||||
(Some(this), Some(other)) => Some(this | other),
|
||||
},
|
||||
document_template_max_bytes: match (
|
||||
self.embedders.document_template_max_bytes,
|
||||
new.embedders.document_template_max_bytes,
|
||||
) {
|
||||
(None, None) => None,
|
||||
(Some(bytes), None) | (None, Some(bytes)) => Some(bytes),
|
||||
(Some(this), Some(other)) => Some(this.max(other)),
|
||||
},
|
||||
binary_quantization_used: match (
|
||||
self.embedders.binary_quantization_used,
|
||||
new.embedders.binary_quantization_used,
|
||||
) {
|
||||
(None, None) => None,
|
||||
(Some(bq), None) | (None, Some(bq)) => Some(bq),
|
||||
(Some(this), Some(other)) => Some(this | other),
|
||||
},
|
||||
},
|
||||
search_cutoff_ms: SearchCutoffMsAnalytics {
|
||||
search_cutoff_ms: new
|
||||
.search_cutoff_ms
|
||||
.search_cutoff_ms
|
||||
.or(self.search_cutoff_ms.search_cutoff_ms),
|
||||
},
|
||||
locales: LocalesAnalytics { locales: new.locales.locales.or(self.locales.locales) },
|
||||
dictionary: DictionaryAnalytics {
|
||||
total: new.dictionary.total.or(self.dictionary.total),
|
||||
},
|
||||
separator_tokens: SeparatorTokensAnalytics {
|
||||
total: new.non_separator_tokens.total.or(self.separator_tokens.total),
|
||||
},
|
||||
non_separator_tokens: NonSeparatorTokensAnalytics {
|
||||
total: new.non_separator_tokens.total.or(self.non_separator_tokens.total),
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::to_value(*self).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct RankingRulesAnalytics {
|
||||
pub words_position: Option<usize>,
|
||||
pub typo_position: Option<usize>,
|
||||
pub proximity_position: Option<usize>,
|
||||
pub attribute_position: Option<usize>,
|
||||
pub sort_position: Option<usize>,
|
||||
pub exactness_position: Option<usize>,
|
||||
pub values: Option<String>,
|
||||
}
|
||||
|
||||
impl RankingRulesAnalytics {
|
||||
pub fn new(rr: Option<&Vec<RankingRuleView>>) -> Self {
|
||||
RankingRulesAnalytics {
|
||||
words_position: rr.as_ref().and_then(|rr| {
|
||||
rr.iter()
|
||||
.position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Words))
|
||||
}),
|
||||
typo_position: rr.as_ref().and_then(|rr| {
|
||||
rr.iter()
|
||||
.position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Typo))
|
||||
}),
|
||||
proximity_position: rr.as_ref().and_then(|rr| {
|
||||
rr.iter().position(|s| {
|
||||
matches!(s, meilisearch_types::settings::RankingRuleView::Proximity)
|
||||
})
|
||||
}),
|
||||
attribute_position: rr.as_ref().and_then(|rr| {
|
||||
rr.iter().position(|s| {
|
||||
matches!(s, meilisearch_types::settings::RankingRuleView::Attribute)
|
||||
})
|
||||
}),
|
||||
sort_position: rr.as_ref().and_then(|rr| {
|
||||
rr.iter()
|
||||
.position(|s| matches!(s, meilisearch_types::settings::RankingRuleView::Sort))
|
||||
}),
|
||||
exactness_position: rr.as_ref().and_then(|rr| {
|
||||
rr.iter().position(|s| {
|
||||
matches!(s, meilisearch_types::settings::RankingRuleView::Exactness)
|
||||
})
|
||||
}),
|
||||
values: rr.as_ref().map(|rr| {
|
||||
rr.iter()
|
||||
.filter(|s| {
|
||||
matches!(
|
||||
s,
|
||||
meilisearch_types::settings::RankingRuleView::Asc(_)
|
||||
| meilisearch_types::settings::RankingRuleView::Desc(_)
|
||||
)
|
||||
})
|
||||
.map(|x| x.to_string())
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ")
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { ranking_rules: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct SearchableAttributesAnalytics {
|
||||
pub total: Option<usize>,
|
||||
pub with_wildcard: Option<bool>,
|
||||
}
|
||||
|
||||
impl SearchableAttributesAnalytics {
|
||||
pub fn new(setting: Option<&Vec<String>>) -> Self {
|
||||
Self {
|
||||
total: setting.as_ref().map(|searchable| searchable.len()),
|
||||
with_wildcard: setting
|
||||
.as_ref()
|
||||
.map(|searchable| searchable.iter().any(|searchable| searchable == "*")),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { searchable_attributes: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct DisplayedAttributesAnalytics {
|
||||
pub total: Option<usize>,
|
||||
pub with_wildcard: Option<bool>,
|
||||
}
|
||||
|
||||
impl DisplayedAttributesAnalytics {
|
||||
pub fn new(displayed: Option<&Vec<String>>) -> Self {
|
||||
Self {
|
||||
total: displayed.as_ref().map(|displayed| displayed.len()),
|
||||
with_wildcard: displayed
|
||||
.as_ref()
|
||||
.map(|displayed| displayed.iter().any(|displayed| displayed == "*")),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { displayed_attributes: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct SortableAttributesAnalytics {
|
||||
pub total: Option<usize>,
|
||||
pub has_geo: Option<bool>,
|
||||
}
|
||||
|
||||
impl SortableAttributesAnalytics {
|
||||
pub fn new(setting: Option<&BTreeSet<String>>) -> Self {
|
||||
Self {
|
||||
total: setting.as_ref().map(|sort| sort.len()),
|
||||
has_geo: setting.as_ref().map(|sort| sort.contains("_geo")),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { sortable_attributes: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct FilterableAttributesAnalytics {
|
||||
pub total: Option<usize>,
|
||||
pub has_geo: Option<bool>,
|
||||
}
|
||||
|
||||
impl FilterableAttributesAnalytics {
|
||||
pub fn new(setting: Option<&BTreeSet<String>>) -> Self {
|
||||
Self {
|
||||
total: setting.as_ref().map(|filter| filter.len()),
|
||||
has_geo: setting.as_ref().map(|filter| filter.contains("_geo")),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { filterable_attributes: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct DistinctAttributeAnalytics {
|
||||
pub set: bool,
|
||||
}
|
||||
|
||||
impl DistinctAttributeAnalytics {
|
||||
pub fn new(distinct: Option<&String>) -> Self {
|
||||
Self { set: distinct.is_some() }
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { distinct_attribute: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct ProximityPrecisionAnalytics {
|
||||
pub set: bool,
|
||||
pub value: Option<ProximityPrecisionView>,
|
||||
}
|
||||
|
||||
impl ProximityPrecisionAnalytics {
|
||||
pub fn new(precision: Option<&ProximityPrecisionView>) -> Self {
|
||||
Self { set: precision.is_some(), value: precision.cloned() }
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { proximity_precision: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct TypoToleranceAnalytics {
|
||||
pub enabled: Option<bool>,
|
||||
pub disable_on_attributes: Option<bool>,
|
||||
pub disable_on_words: Option<bool>,
|
||||
pub min_word_size_for_one_typo: Option<u8>,
|
||||
pub min_word_size_for_two_typos: Option<u8>,
|
||||
}
|
||||
|
||||
impl TypoToleranceAnalytics {
|
||||
pub fn new(setting: Option<&TypoSettings>) -> Self {
|
||||
Self {
|
||||
enabled: setting.as_ref().map(|s| !matches!(s.enabled, Setting::Set(false))),
|
||||
disable_on_attributes: setting
|
||||
.as_ref()
|
||||
.and_then(|s| s.disable_on_attributes.as_ref().set().map(|m| !m.is_empty())),
|
||||
disable_on_words: setting
|
||||
.as_ref()
|
||||
.and_then(|s| s.disable_on_words.as_ref().set().map(|m| !m.is_empty())),
|
||||
min_word_size_for_one_typo: setting
|
||||
.as_ref()
|
||||
.and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.one_typo.set()))
|
||||
.flatten(),
|
||||
min_word_size_for_two_typos: setting
|
||||
.as_ref()
|
||||
.and_then(|s| s.min_word_size_for_typos.as_ref().set().map(|s| s.two_typos.set()))
|
||||
.flatten(),
|
||||
}
|
||||
}
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { typo_tolerance: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct FacetingAnalytics {
|
||||
pub max_values_per_facet: Option<usize>,
|
||||
pub sort_facet_values_by_star_count: Option<bool>,
|
||||
pub sort_facet_values_by_total: Option<usize>,
|
||||
}
|
||||
|
||||
impl FacetingAnalytics {
|
||||
pub fn new(setting: Option<&FacetingSettings>) -> Self {
|
||||
Self {
|
||||
max_values_per_facet: setting.as_ref().and_then(|s| s.max_values_per_facet.set()),
|
||||
sort_facet_values_by_star_count: setting.as_ref().and_then(|s| {
|
||||
s.sort_facet_values_by
|
||||
.as_ref()
|
||||
.set()
|
||||
.map(|s| s.iter().any(|(k, v)| k == "*" && v == &FacetValuesSort::Count))
|
||||
}),
|
||||
sort_facet_values_by_total: setting
|
||||
.as_ref()
|
||||
.and_then(|s| s.sort_facet_values_by.as_ref().set().map(|s| s.len())),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { faceting: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct PaginationAnalytics {
|
||||
pub max_total_hits: Option<usize>,
|
||||
}
|
||||
|
||||
impl PaginationAnalytics {
|
||||
pub fn new(setting: Option<&PaginationSettings>) -> Self {
|
||||
Self { max_total_hits: setting.as_ref().and_then(|s| s.max_total_hits.set()) }
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { pagination: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct StopWordsAnalytics {
|
||||
pub total: Option<usize>,
|
||||
}
|
||||
|
||||
impl StopWordsAnalytics {
|
||||
pub fn new(stop_words: Option<&BTreeSet<String>>) -> Self {
|
||||
Self { total: stop_words.as_ref().map(|stop_words| stop_words.len()) }
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { stop_words: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct SynonymsAnalytics {
|
||||
pub total: Option<usize>,
|
||||
}
|
||||
|
||||
impl SynonymsAnalytics {
|
||||
pub fn new(synonyms: Option<&BTreeMap<String, Vec<String>>>) -> Self {
|
||||
Self { total: synonyms.as_ref().map(|synonyms| synonyms.len()) }
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { synonyms: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct EmbeddersAnalytics {
|
||||
// last
|
||||
pub total: Option<usize>,
|
||||
// Merge the sources
|
||||
pub sources: Option<HashSet<String>>,
|
||||
// |=
|
||||
pub document_template_used: Option<bool>,
|
||||
// max
|
||||
pub document_template_max_bytes: Option<usize>,
|
||||
// |=
|
||||
pub binary_quantization_used: Option<bool>,
|
||||
}
|
||||
|
||||
impl EmbeddersAnalytics {
|
||||
pub fn new(setting: Option<&BTreeMap<String, Setting<EmbeddingSettings>>>) -> Self {
|
||||
let mut sources = std::collections::HashSet::new();
|
||||
|
||||
if let Some(s) = &setting {
|
||||
for source in s
|
||||
.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.filter_map(|config| config.source.set())
|
||||
{
|
||||
use meilisearch_types::milli::vector::settings::EmbedderSource;
|
||||
match source {
|
||||
EmbedderSource::OpenAi => sources.insert("openAi".to_string()),
|
||||
EmbedderSource::HuggingFace => sources.insert("huggingFace".to_string()),
|
||||
EmbedderSource::UserProvided => sources.insert("userProvided".to_string()),
|
||||
EmbedderSource::Ollama => sources.insert("ollama".to_string()),
|
||||
EmbedderSource::Rest => sources.insert("rest".to_string()),
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
Self {
|
||||
total: setting.as_ref().map(|s| s.len()),
|
||||
sources: Some(sources),
|
||||
document_template_used: setting.as_ref().map(|map| {
|
||||
map.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.any(|config| config.document_template.set().is_some())
|
||||
}),
|
||||
document_template_max_bytes: setting.as_ref().and_then(|map| {
|
||||
map.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.filter_map(|config| config.document_template_max_bytes.set())
|
||||
.max()
|
||||
}),
|
||||
binary_quantization_used: setting.as_ref().map(|map| {
|
||||
map.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.any(|config| config.binary_quantized.set().is_some())
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { embedders: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
#[serde(transparent)]
|
||||
pub struct SearchCutoffMsAnalytics {
|
||||
pub search_cutoff_ms: Option<u64>,
|
||||
}
|
||||
|
||||
impl SearchCutoffMsAnalytics {
|
||||
pub fn new(setting: Option<&u64>) -> Self {
|
||||
Self { search_cutoff_ms: setting.copied() }
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { search_cutoff_ms: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
#[serde(transparent)]
|
||||
pub struct LocalesAnalytics {
|
||||
pub locales: Option<BTreeSet<Locale>>,
|
||||
}
|
||||
|
||||
impl LocalesAnalytics {
|
||||
pub fn new(rules: Option<&Vec<LocalizedAttributesRuleView>>) -> Self {
|
||||
LocalesAnalytics {
|
||||
locales: rules.as_ref().map(|rules| {
|
||||
rules
|
||||
.iter()
|
||||
.flat_map(|rule| rule.locales.iter().cloned())
|
||||
.collect::<std::collections::BTreeSet<_>>()
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { locales: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct DictionaryAnalytics {
|
||||
pub total: Option<usize>,
|
||||
}
|
||||
|
||||
impl DictionaryAnalytics {
|
||||
pub fn new(dictionary: Option<&BTreeSet<String>>) -> Self {
|
||||
Self { total: dictionary.as_ref().map(|dictionary| dictionary.len()) }
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { dictionary: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct SeparatorTokensAnalytics {
|
||||
pub total: Option<usize>,
|
||||
}
|
||||
|
||||
impl SeparatorTokensAnalytics {
|
||||
pub fn new(separator_tokens: Option<&BTreeSet<String>>) -> Self {
|
||||
Self { total: separator_tokens.as_ref().map(|separator_tokens| separator_tokens.len()) }
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { separator_tokens: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Serialize, Default)]
|
||||
pub struct NonSeparatorTokensAnalytics {
|
||||
pub total: Option<usize>,
|
||||
}
|
||||
|
||||
impl NonSeparatorTokensAnalytics {
|
||||
pub fn new(non_separator_tokens: Option<&BTreeSet<String>>) -> Self {
|
||||
Self {
|
||||
total: non_separator_tokens
|
||||
.as_ref()
|
||||
.map(|non_separator_tokens| non_separator_tokens.len()),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_settings(self) -> SettingsAnalytics {
|
||||
SettingsAnalytics { non_separator_tokens: self, ..Default::default() }
|
||||
}
|
||||
}
|
||||
@@ -13,10 +13,9 @@ use serde_json::Value;
|
||||
use tracing::debug;
|
||||
|
||||
use super::ActionPolicy;
|
||||
use crate::analytics::Analytics;
|
||||
use crate::analytics::{Analytics, SimilarAggregator};
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::routes::indexes::similar_analytics::{SimilarAggregator, SimilarGET, SimilarPOST};
|
||||
use crate::search::{
|
||||
add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind,
|
||||
SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
|
||||
@@ -35,13 +34,13 @@ pub async fn similar_get(
|
||||
index_uid: web::Path<String>,
|
||||
params: AwebQueryParameter<SimilarQueryGet, DeserrQueryParamError>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
let query = params.0.try_into()?;
|
||||
|
||||
let mut aggregate = SimilarAggregator::<SimilarGET>::from_query(&query);
|
||||
let mut aggregate = SimilarAggregator::from_query(&query, &req);
|
||||
|
||||
debug!(parameters = ?query, "Similar get");
|
||||
|
||||
@@ -50,7 +49,7 @@ pub async fn similar_get(
|
||||
if let Ok(similar) = &similar {
|
||||
aggregate.succeed(similar);
|
||||
}
|
||||
analytics.publish(aggregate, &req);
|
||||
analytics.get_similar(aggregate);
|
||||
|
||||
let similar = similar?;
|
||||
|
||||
@@ -63,21 +62,21 @@ pub async fn similar_post(
|
||||
index_uid: web::Path<String>,
|
||||
params: AwebJson<SimilarQuery, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
let query = params.into_inner();
|
||||
debug!(parameters = ?query, "Similar post");
|
||||
|
||||
let mut aggregate = SimilarAggregator::<SimilarPOST>::from_query(&query);
|
||||
let mut aggregate = SimilarAggregator::from_query(&query, &req);
|
||||
|
||||
let similar = similar(index_scheduler, index_uid, query).await;
|
||||
|
||||
if let Ok(similar) = &similar {
|
||||
aggregate.succeed(similar);
|
||||
}
|
||||
analytics.publish(aggregate, &req);
|
||||
analytics.post_similar(aggregate);
|
||||
|
||||
let similar = similar?;
|
||||
|
||||
|
||||
@@ -1,235 +0,0 @@
|
||||
use std::collections::{BinaryHeap, HashMap};
|
||||
|
||||
use once_cell::sync::Lazy;
|
||||
use regex::Regex;
|
||||
use serde_json::{json, Value};
|
||||
|
||||
use crate::{
|
||||
aggregate_methods,
|
||||
analytics::{Aggregate, AggregateMethod},
|
||||
search::{SimilarQuery, SimilarResult},
|
||||
};
|
||||
|
||||
aggregate_methods!(
|
||||
SimilarPOST => "Similar POST",
|
||||
SimilarGET => "Similar GET",
|
||||
);
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct SimilarAggregator<Method: AggregateMethod> {
|
||||
// requests
|
||||
total_received: usize,
|
||||
total_succeeded: usize,
|
||||
time_spent: BinaryHeap<usize>,
|
||||
|
||||
// filter
|
||||
filter_with_geo_radius: bool,
|
||||
filter_with_geo_bounding_box: bool,
|
||||
// every time a request has a filter, this field must be incremented by the number of terms it contains
|
||||
filter_sum_of_criteria_terms: usize,
|
||||
// every time a request has a filter, this field must be incremented by one
|
||||
filter_total_number_of_criteria: usize,
|
||||
used_syntax: HashMap<String, usize>,
|
||||
|
||||
// Whether a non-default embedder was specified
|
||||
retrieve_vectors: bool,
|
||||
|
||||
// pagination
|
||||
max_limit: usize,
|
||||
max_offset: usize,
|
||||
|
||||
// formatting
|
||||
max_attributes_to_retrieve: usize,
|
||||
|
||||
// scoring
|
||||
show_ranking_score: bool,
|
||||
show_ranking_score_details: bool,
|
||||
ranking_score_threshold: bool,
|
||||
|
||||
marker: std::marker::PhantomData<Method>,
|
||||
}
|
||||
|
||||
impl<Method: AggregateMethod> SimilarAggregator<Method> {
|
||||
#[allow(clippy::field_reassign_with_default)]
|
||||
pub fn from_query(query: &SimilarQuery) -> Self {
|
||||
let SimilarQuery {
|
||||
id: _,
|
||||
embedder: _,
|
||||
offset,
|
||||
limit,
|
||||
attributes_to_retrieve: _,
|
||||
retrieve_vectors,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
filter,
|
||||
ranking_score_threshold,
|
||||
} = query;
|
||||
|
||||
let mut ret = Self::default();
|
||||
|
||||
ret.total_received = 1;
|
||||
|
||||
if let Some(ref filter) = filter {
|
||||
static RE: Lazy<Regex> = Lazy::new(|| Regex::new("AND | OR").unwrap());
|
||||
ret.filter_total_number_of_criteria = 1;
|
||||
|
||||
let syntax = match filter {
|
||||
Value::String(_) => "string".to_string(),
|
||||
Value::Array(values) => {
|
||||
if values.iter().map(|v| v.to_string()).any(|s| RE.is_match(&s)) {
|
||||
"mixed".to_string()
|
||||
} else {
|
||||
"array".to_string()
|
||||
}
|
||||
}
|
||||
_ => "none".to_string(),
|
||||
};
|
||||
// convert the string to a HashMap
|
||||
ret.used_syntax.insert(syntax, 1);
|
||||
|
||||
let stringified_filters = filter.to_string();
|
||||
ret.filter_with_geo_radius = stringified_filters.contains("_geoRadius(");
|
||||
ret.filter_with_geo_bounding_box = stringified_filters.contains("_geoBoundingBox(");
|
||||
ret.filter_sum_of_criteria_terms = RE.split(&stringified_filters).count();
|
||||
}
|
||||
|
||||
ret.max_limit = *limit;
|
||||
ret.max_offset = *offset;
|
||||
|
||||
ret.show_ranking_score = *show_ranking_score;
|
||||
ret.show_ranking_score_details = *show_ranking_score_details;
|
||||
ret.ranking_score_threshold = ranking_score_threshold.is_some();
|
||||
|
||||
ret.retrieve_vectors = *retrieve_vectors;
|
||||
|
||||
ret
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self, result: &SimilarResult) {
|
||||
let SimilarResult { id: _, hits: _, processing_time_ms, hits_info: _ } = result;
|
||||
|
||||
self.total_succeeded = self.total_succeeded.saturating_add(1);
|
||||
|
||||
self.time_spent.push(*processing_time_ms as usize);
|
||||
}
|
||||
}
|
||||
|
||||
impl<Method: AggregateMethod> Aggregate for SimilarAggregator<Method> {
|
||||
fn event_name(&self) -> &'static str {
|
||||
Method::event_name()
|
||||
}
|
||||
|
||||
/// Aggregate one [SimilarAggregator] into another.
|
||||
fn aggregate(mut self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
let Self {
|
||||
total_received,
|
||||
total_succeeded,
|
||||
mut time_spent,
|
||||
filter_with_geo_radius,
|
||||
filter_with_geo_bounding_box,
|
||||
filter_sum_of_criteria_terms,
|
||||
filter_total_number_of_criteria,
|
||||
used_syntax,
|
||||
max_limit,
|
||||
max_offset,
|
||||
max_attributes_to_retrieve,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
ranking_score_threshold,
|
||||
retrieve_vectors,
|
||||
marker: _,
|
||||
} = *new;
|
||||
|
||||
// request
|
||||
self.total_received = self.total_received.saturating_add(total_received);
|
||||
self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded);
|
||||
self.time_spent.append(&mut time_spent);
|
||||
|
||||
// filter
|
||||
self.filter_with_geo_radius |= filter_with_geo_radius;
|
||||
self.filter_with_geo_bounding_box |= filter_with_geo_bounding_box;
|
||||
self.filter_sum_of_criteria_terms =
|
||||
self.filter_sum_of_criteria_terms.saturating_add(filter_sum_of_criteria_terms);
|
||||
self.filter_total_number_of_criteria =
|
||||
self.filter_total_number_of_criteria.saturating_add(filter_total_number_of_criteria);
|
||||
for (key, value) in used_syntax.into_iter() {
|
||||
let used_syntax = self.used_syntax.entry(key).or_insert(0);
|
||||
*used_syntax = used_syntax.saturating_add(value);
|
||||
}
|
||||
|
||||
self.retrieve_vectors |= retrieve_vectors;
|
||||
|
||||
// pagination
|
||||
self.max_limit = self.max_limit.max(max_limit);
|
||||
self.max_offset = self.max_offset.max(max_offset);
|
||||
|
||||
// formatting
|
||||
self.max_attributes_to_retrieve =
|
||||
self.max_attributes_to_retrieve.max(max_attributes_to_retrieve);
|
||||
|
||||
// scoring
|
||||
self.show_ranking_score |= show_ranking_score;
|
||||
self.show_ranking_score_details |= show_ranking_score_details;
|
||||
self.ranking_score_threshold |= ranking_score_threshold;
|
||||
|
||||
self
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
let Self {
|
||||
total_received,
|
||||
total_succeeded,
|
||||
time_spent,
|
||||
filter_with_geo_radius,
|
||||
filter_with_geo_bounding_box,
|
||||
filter_sum_of_criteria_terms,
|
||||
filter_total_number_of_criteria,
|
||||
used_syntax,
|
||||
max_limit,
|
||||
max_offset,
|
||||
max_attributes_to_retrieve,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
ranking_score_threshold,
|
||||
retrieve_vectors,
|
||||
marker: _,
|
||||
} = *self;
|
||||
|
||||
// we get all the values in a sorted manner
|
||||
let time_spent = time_spent.into_sorted_vec();
|
||||
// the index of the 99th percentage of value
|
||||
let percentile_99th = time_spent.len() * 99 / 100;
|
||||
// We are only interested by the slowest value of the 99th fastest results
|
||||
let time_spent = time_spent.get(percentile_99th);
|
||||
|
||||
json!({
|
||||
"requests": {
|
||||
"99th_response_time": time_spent.map(|t| format!("{:.2}", t)),
|
||||
"total_succeeded": total_succeeded,
|
||||
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
|
||||
"total_received": total_received,
|
||||
},
|
||||
"filter": {
|
||||
"with_geoRadius": filter_with_geo_radius,
|
||||
"with_geoBoundingBox": filter_with_geo_bounding_box,
|
||||
"avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64),
|
||||
"most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)),
|
||||
},
|
||||
"vector": {
|
||||
"retrieve_vectors": retrieve_vectors,
|
||||
},
|
||||
"pagination": {
|
||||
"max_limit": max_limit,
|
||||
"max_offset": max_offset,
|
||||
},
|
||||
"formatting": {
|
||||
"max_attributes_to_retrieve": max_attributes_to_retrieve,
|
||||
},
|
||||
"scoring": {
|
||||
"show_ranking_score": show_ranking_score,
|
||||
"show_ranking_score_details": show_ranking_score_details,
|
||||
"ranking_score_threshold": ranking_score_threshold,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -25,7 +25,6 @@ pub mod indexes;
|
||||
mod logs;
|
||||
mod metrics;
|
||||
mod multi_search;
|
||||
mod multi_search_analytics;
|
||||
mod snapshot;
|
||||
mod swap_indexes;
|
||||
pub mod tasks;
|
||||
|
||||
@@ -9,7 +9,7 @@ use meilisearch_types::keys::actions;
|
||||
use serde::Serialize;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::analytics::Analytics;
|
||||
use crate::analytics::{Analytics, MultiSearchAggregator};
|
||||
use crate::error::MeilisearchHttpError;
|
||||
use crate::extractors::authentication::policies::ActionPolicy;
|
||||
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
||||
@@ -21,8 +21,6 @@ use crate::search::{
|
||||
};
|
||||
use crate::search_queue::SearchQueue;
|
||||
|
||||
use super::multi_search_analytics::MultiSearchAggregator;
|
||||
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
cfg.service(web::resource("").route(web::post().to(SeqHandler(multi_search_with_post))));
|
||||
}
|
||||
@@ -37,7 +35,7 @@ pub async fn multi_search_with_post(
|
||||
search_queue: Data<SearchQueue>,
|
||||
params: AwebJson<FederatedSearch, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
// Since we don't want to process half of the search requests and then get a permit refused
|
||||
// we're going to get one permit for the whole duration of the multi-search request.
|
||||
@@ -45,7 +43,7 @@ pub async fn multi_search_with_post(
|
||||
|
||||
let federated_search = params.into_inner();
|
||||
|
||||
let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search);
|
||||
let mut multi_aggregate = MultiSearchAggregator::from_federated_search(&federated_search, &req);
|
||||
|
||||
let FederatedSearch { mut queries, federation } = federated_search;
|
||||
|
||||
@@ -89,7 +87,7 @@ pub async fn multi_search_with_post(
|
||||
multi_aggregate.succeed();
|
||||
}
|
||||
|
||||
analytics.publish(multi_aggregate, &req);
|
||||
analytics.post_multi_search(multi_aggregate);
|
||||
HttpResponse::Ok().json(search_result??)
|
||||
}
|
||||
None => {
|
||||
@@ -151,7 +149,7 @@ pub async fn multi_search_with_post(
|
||||
if search_results.is_ok() {
|
||||
multi_aggregate.succeed();
|
||||
}
|
||||
analytics.publish(multi_aggregate, &req);
|
||||
analytics.post_multi_search(multi_aggregate);
|
||||
|
||||
let search_results = search_results.map_err(|(mut err, query_index)| {
|
||||
// Add the query index that failed as context for the error message.
|
||||
|
||||
@@ -1,170 +0,0 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use serde_json::json;
|
||||
|
||||
use crate::{
|
||||
analytics::Aggregate,
|
||||
search::{FederatedSearch, SearchQueryWithIndex},
|
||||
};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct MultiSearchAggregator {
|
||||
// requests
|
||||
total_received: usize,
|
||||
total_succeeded: usize,
|
||||
|
||||
// sum of the number of distinct indexes in each single request, use with total_received to compute an avg
|
||||
total_distinct_index_count: usize,
|
||||
// number of queries with a single index, use with total_received to compute a proportion
|
||||
total_single_index: usize,
|
||||
|
||||
// sum of the number of search queries in the requests, use with total_received to compute an average
|
||||
total_search_count: usize,
|
||||
|
||||
// scoring
|
||||
show_ranking_score: bool,
|
||||
show_ranking_score_details: bool,
|
||||
|
||||
// federation
|
||||
use_federation: bool,
|
||||
}
|
||||
|
||||
impl MultiSearchAggregator {
|
||||
pub fn from_federated_search(federated_search: &FederatedSearch) -> Self {
|
||||
let use_federation = federated_search.federation.is_some();
|
||||
|
||||
let distinct_indexes: HashSet<_> = federated_search
|
||||
.queries
|
||||
.iter()
|
||||
.map(|query| {
|
||||
let query = &query;
|
||||
// make sure we get a compilation error if a field gets added to / removed from SearchQueryWithIndex
|
||||
let SearchQueryWithIndex {
|
||||
index_uid,
|
||||
federation_options: _,
|
||||
q: _,
|
||||
vector: _,
|
||||
offset: _,
|
||||
limit: _,
|
||||
page: _,
|
||||
hits_per_page: _,
|
||||
attributes_to_retrieve: _,
|
||||
retrieve_vectors: _,
|
||||
attributes_to_crop: _,
|
||||
crop_length: _,
|
||||
attributes_to_highlight: _,
|
||||
show_ranking_score: _,
|
||||
show_ranking_score_details: _,
|
||||
show_matches_position: _,
|
||||
filter: _,
|
||||
sort: _,
|
||||
distinct: _,
|
||||
facets: _,
|
||||
highlight_pre_tag: _,
|
||||
highlight_post_tag: _,
|
||||
crop_marker: _,
|
||||
matching_strategy: _,
|
||||
attributes_to_search_on: _,
|
||||
hybrid: _,
|
||||
ranking_score_threshold: _,
|
||||
locales: _,
|
||||
} = query;
|
||||
|
||||
index_uid.as_str()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let show_ranking_score =
|
||||
federated_search.queries.iter().any(|query| query.show_ranking_score);
|
||||
let show_ranking_score_details =
|
||||
federated_search.queries.iter().any(|query| query.show_ranking_score_details);
|
||||
|
||||
Self {
|
||||
total_received: 1,
|
||||
total_succeeded: 0,
|
||||
total_distinct_index_count: distinct_indexes.len(),
|
||||
total_single_index: if distinct_indexes.len() == 1 { 1 } else { 0 },
|
||||
total_search_count: federated_search.queries.len(),
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
use_federation,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn succeed(&mut self) {
|
||||
self.total_succeeded = self.total_succeeded.saturating_add(1);
|
||||
}
|
||||
}
|
||||
|
||||
impl Aggregate for MultiSearchAggregator {
|
||||
fn event_name(&self) -> &'static str {
|
||||
"Documents Searched by Multi-Search POST"
|
||||
}
|
||||
|
||||
/// Aggregate one [MultiSearchAggregator] into another.
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
// write the aggregate in a way that will cause a compilation error if a field is added.
|
||||
|
||||
// get ownership of self, replacing it by a default value.
|
||||
let this = *self;
|
||||
|
||||
let total_received = this.total_received.saturating_add(new.total_received);
|
||||
let total_succeeded = this.total_succeeded.saturating_add(new.total_succeeded);
|
||||
let total_distinct_index_count =
|
||||
this.total_distinct_index_count.saturating_add(new.total_distinct_index_count);
|
||||
let total_single_index = this.total_single_index.saturating_add(new.total_single_index);
|
||||
let total_search_count = this.total_search_count.saturating_add(new.total_search_count);
|
||||
let show_ranking_score = this.show_ranking_score || new.show_ranking_score;
|
||||
let show_ranking_score_details =
|
||||
this.show_ranking_score_details || new.show_ranking_score_details;
|
||||
let use_federation = this.use_federation || new.use_federation;
|
||||
|
||||
Box::new(Self {
|
||||
total_received,
|
||||
total_succeeded,
|
||||
total_distinct_index_count,
|
||||
total_single_index,
|
||||
total_search_count,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
use_federation,
|
||||
})
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
let Self {
|
||||
total_received,
|
||||
total_succeeded,
|
||||
total_distinct_index_count,
|
||||
total_single_index,
|
||||
total_search_count,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
use_federation,
|
||||
} = *self;
|
||||
|
||||
json!({
|
||||
"requests": {
|
||||
"total_succeeded": total_succeeded,
|
||||
"total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
|
||||
"total_received": total_received,
|
||||
},
|
||||
"indexes": {
|
||||
"total_single_index": total_single_index,
|
||||
"total_distinct_index_count": total_distinct_index_count,
|
||||
"avg_distinct_index_count": (total_distinct_index_count as f64) / (total_received as f64), // not 0 else returned early
|
||||
},
|
||||
"searches": {
|
||||
"total_search_count": total_search_count,
|
||||
"avg_search_count": (total_search_count as f64) / (total_received as f64),
|
||||
},
|
||||
"scoring": {
|
||||
"show_ranking_score": show_ranking_score,
|
||||
"show_ranking_score_details": show_ranking_score_details,
|
||||
},
|
||||
"federation": {
|
||||
"use_federation": use_federation,
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -3,6 +3,7 @@ use actix_web::{web, HttpRequest, HttpResponse};
|
||||
use index_scheduler::IndexScheduler;
|
||||
use meilisearch_types::error::ResponseError;
|
||||
use meilisearch_types::tasks::KindWithContent;
|
||||
use serde_json::json;
|
||||
use tracing::debug;
|
||||
|
||||
use crate::analytics::Analytics;
|
||||
@@ -16,15 +17,13 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
cfg.service(web::resource("").route(web::post().to(SeqHandler(create_snapshot))));
|
||||
}
|
||||
|
||||
crate::empty_analytics!(SnapshotAnalytics, "Snapshot Created");
|
||||
|
||||
pub async fn create_snapshot(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::SNAPSHOTS_CREATE }>, Data<IndexScheduler>>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
analytics.publish(SnapshotAnalytics::default(), &req);
|
||||
analytics.publish("Snapshot Created".to_string(), json!({}), Some(&req));
|
||||
|
||||
let task = KindWithContent::SnapshotCreation;
|
||||
let uid = get_task_id(&req, &opt)?;
|
||||
|
||||
@@ -8,10 +8,10 @@ use meilisearch_types::error::deserr_codes::InvalidSwapIndexes;
|
||||
use meilisearch_types::error::ResponseError;
|
||||
use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::tasks::{IndexSwap, KindWithContent};
|
||||
use serde::Serialize;
|
||||
use serde_json::json;
|
||||
|
||||
use super::{get_task_id, is_dry_run, SummarizedTaskView};
|
||||
use crate::analytics::{Aggregate, Analytics};
|
||||
use crate::analytics::Analytics;
|
||||
use crate::error::MeilisearchHttpError;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
||||
@@ -29,36 +29,21 @@ pub struct SwapIndexesPayload {
|
||||
indexes: Vec<IndexUid>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct IndexSwappedAnalytics {
|
||||
swap_operation_number: usize,
|
||||
}
|
||||
|
||||
impl Aggregate for IndexSwappedAnalytics {
|
||||
fn event_name(&self) -> &'static str {
|
||||
"Indexes Swapped"
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
Box::new(Self {
|
||||
swap_operation_number: self.swap_operation_number.max(new.swap_operation_number),
|
||||
})
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::to_value(*self).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn swap_indexes(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::INDEXES_SWAP }>, Data<IndexScheduler>>,
|
||||
params: AwebJson<Vec<SwapIndexesPayload>, DeserrJsonError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let params = params.into_inner();
|
||||
analytics.publish(IndexSwappedAnalytics { swap_operation_number: params.len() }, &req);
|
||||
analytics.publish(
|
||||
"Indexes Swapped".to_string(),
|
||||
json!({
|
||||
"swap_operation_number": params.len(),
|
||||
}),
|
||||
Some(&req),
|
||||
);
|
||||
let filters = index_scheduler.filters();
|
||||
|
||||
let mut swaps = vec![];
|
||||
|
||||
@@ -12,17 +12,18 @@ use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList};
|
||||
use meilisearch_types::task_view::TaskView;
|
||||
use meilisearch_types::tasks::{Kind, KindWithContent, Status};
|
||||
use serde::Serialize;
|
||||
use serde_json::json;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
use time::macros::format_description;
|
||||
use time::{Date, Duration, OffsetDateTime, Time};
|
||||
use tokio::task;
|
||||
|
||||
use super::{get_task_id, is_dry_run, SummarizedTaskView};
|
||||
use crate::analytics::{Aggregate, AggregateMethod, Analytics};
|
||||
use crate::analytics::Analytics;
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::{aggregate_methods, Opt};
|
||||
use crate::Opt;
|
||||
|
||||
const DEFAULT_LIMIT: u32 = 20;
|
||||
|
||||
@@ -157,69 +158,12 @@ impl TaskDeletionOrCancelationQuery {
|
||||
}
|
||||
}
|
||||
|
||||
aggregate_methods!(
|
||||
CancelTasks => "Tasks Canceled",
|
||||
DeleteTasks => "Tasks Deleted",
|
||||
);
|
||||
|
||||
#[derive(Serialize)]
|
||||
struct TaskFilterAnalytics<Method: AggregateMethod> {
|
||||
filtered_by_uid: bool,
|
||||
filtered_by_index_uid: bool,
|
||||
filtered_by_type: bool,
|
||||
filtered_by_status: bool,
|
||||
filtered_by_canceled_by: bool,
|
||||
filtered_by_before_enqueued_at: bool,
|
||||
filtered_by_after_enqueued_at: bool,
|
||||
filtered_by_before_started_at: bool,
|
||||
filtered_by_after_started_at: bool,
|
||||
filtered_by_before_finished_at: bool,
|
||||
filtered_by_after_finished_at: bool,
|
||||
|
||||
#[serde(skip)]
|
||||
marker: std::marker::PhantomData<Method>,
|
||||
}
|
||||
|
||||
impl<Method: AggregateMethod + 'static> Aggregate for TaskFilterAnalytics<Method> {
|
||||
fn event_name(&self) -> &'static str {
|
||||
Method::event_name()
|
||||
}
|
||||
|
||||
fn aggregate(self: Box<Self>, new: Box<Self>) -> Box<Self> {
|
||||
Box::new(Self {
|
||||
filtered_by_uid: self.filtered_by_uid | new.filtered_by_uid,
|
||||
filtered_by_index_uid: self.filtered_by_index_uid | new.filtered_by_index_uid,
|
||||
filtered_by_type: self.filtered_by_type | new.filtered_by_type,
|
||||
filtered_by_status: self.filtered_by_status | new.filtered_by_status,
|
||||
filtered_by_canceled_by: self.filtered_by_canceled_by | new.filtered_by_canceled_by,
|
||||
filtered_by_before_enqueued_at: self.filtered_by_before_enqueued_at
|
||||
| new.filtered_by_before_enqueued_at,
|
||||
filtered_by_after_enqueued_at: self.filtered_by_after_enqueued_at
|
||||
| new.filtered_by_after_enqueued_at,
|
||||
filtered_by_before_started_at: self.filtered_by_before_started_at
|
||||
| new.filtered_by_before_started_at,
|
||||
filtered_by_after_started_at: self.filtered_by_after_started_at
|
||||
| new.filtered_by_after_started_at,
|
||||
filtered_by_before_finished_at: self.filtered_by_before_finished_at
|
||||
| new.filtered_by_before_finished_at,
|
||||
filtered_by_after_finished_at: self.filtered_by_after_finished_at
|
||||
| new.filtered_by_after_finished_at,
|
||||
|
||||
marker: std::marker::PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
fn into_event(self: Box<Self>) -> serde_json::Value {
|
||||
serde_json::to_value(*self).unwrap_or_default()
|
||||
}
|
||||
}
|
||||
|
||||
async fn cancel_tasks(
|
||||
index_scheduler: GuardedData<ActionPolicy<{ actions::TASKS_CANCEL }>, Data<IndexScheduler>>,
|
||||
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let params = params.into_inner();
|
||||
|
||||
@@ -228,22 +172,21 @@ async fn cancel_tasks(
|
||||
}
|
||||
|
||||
analytics.publish(
|
||||
TaskFilterAnalytics::<CancelTasks> {
|
||||
filtered_by_uid: params.uids.is_some(),
|
||||
filtered_by_index_uid: params.index_uids.is_some(),
|
||||
filtered_by_type: params.types.is_some(),
|
||||
filtered_by_status: params.statuses.is_some(),
|
||||
filtered_by_canceled_by: params.canceled_by.is_some(),
|
||||
filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(),
|
||||
filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(),
|
||||
filtered_by_before_started_at: params.before_started_at.is_some(),
|
||||
filtered_by_after_started_at: params.after_started_at.is_some(),
|
||||
filtered_by_before_finished_at: params.before_finished_at.is_some(),
|
||||
filtered_by_after_finished_at: params.after_finished_at.is_some(),
|
||||
|
||||
marker: std::marker::PhantomData,
|
||||
},
|
||||
&req,
|
||||
"Tasks Canceled".to_string(),
|
||||
json!({
|
||||
"filtered_by_uid": params.uids.is_some(),
|
||||
"filtered_by_index_uid": params.index_uids.is_some(),
|
||||
"filtered_by_type": params.types.is_some(),
|
||||
"filtered_by_status": params.statuses.is_some(),
|
||||
"filtered_by_canceled_by": params.canceled_by.is_some(),
|
||||
"filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(),
|
||||
"filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(),
|
||||
"filtered_by_before_started_at": params.before_started_at.is_some(),
|
||||
"filtered_by_after_started_at": params.after_started_at.is_some(),
|
||||
"filtered_by_before_finished_at": params.before_finished_at.is_some(),
|
||||
"filtered_by_after_finished_at": params.after_finished_at.is_some(),
|
||||
}),
|
||||
Some(&req),
|
||||
);
|
||||
|
||||
let query = params.into_query();
|
||||
@@ -271,7 +214,7 @@ async fn delete_tasks(
|
||||
params: AwebQueryParameter<TaskDeletionOrCancelationQuery, DeserrQueryParamError>,
|
||||
req: HttpRequest,
|
||||
opt: web::Data<Opt>,
|
||||
analytics: web::Data<Analytics>,
|
||||
analytics: web::Data<dyn Analytics>,
|
||||
) -> Result<HttpResponse, ResponseError> {
|
||||
let params = params.into_inner();
|
||||
|
||||
@@ -280,24 +223,22 @@ async fn delete_tasks(
|
||||
}
|
||||
|
||||
analytics.publish(
|
||||
TaskFilterAnalytics::<DeleteTasks> {
|
||||
filtered_by_uid: params.uids.is_some(),
|
||||
filtered_by_index_uid: params.index_uids.is_some(),
|
||||
filtered_by_type: params.types.is_some(),
|
||||
filtered_by_status: params.statuses.is_some(),
|
||||
filtered_by_canceled_by: params.canceled_by.is_some(),
|
||||
filtered_by_before_enqueued_at: params.before_enqueued_at.is_some(),
|
||||
filtered_by_after_enqueued_at: params.after_enqueued_at.is_some(),
|
||||
filtered_by_before_started_at: params.before_started_at.is_some(),
|
||||
filtered_by_after_started_at: params.after_started_at.is_some(),
|
||||
filtered_by_before_finished_at: params.before_finished_at.is_some(),
|
||||
filtered_by_after_finished_at: params.after_finished_at.is_some(),
|
||||
|
||||
marker: std::marker::PhantomData,
|
||||
},
|
||||
&req,
|
||||
"Tasks Deleted".to_string(),
|
||||
json!({
|
||||
"filtered_by_uid": params.uids.is_some(),
|
||||
"filtered_by_index_uid": params.index_uids.is_some(),
|
||||
"filtered_by_type": params.types.is_some(),
|
||||
"filtered_by_status": params.statuses.is_some(),
|
||||
"filtered_by_canceled_by": params.canceled_by.is_some(),
|
||||
"filtered_by_before_enqueued_at": params.before_enqueued_at.is_some(),
|
||||
"filtered_by_after_enqueued_at": params.after_enqueued_at.is_some(),
|
||||
"filtered_by_before_started_at": params.before_started_at.is_some(),
|
||||
"filtered_by_after_started_at": params.after_started_at.is_some(),
|
||||
"filtered_by_before_finished_at": params.before_finished_at.is_some(),
|
||||
"filtered_by_after_finished_at": params.after_finished_at.is_some(),
|
||||
}),
|
||||
Some(&req),
|
||||
);
|
||||
|
||||
let query = params.into_query();
|
||||
|
||||
let (tasks, _) = index_scheduler.get_task_ids_from_authorized_indexes(
|
||||
|
||||
@@ -796,10 +796,8 @@ fn prepare_search<'t>(
|
||||
let span = tracing::trace_span!(target: "search::vector", "embed_one");
|
||||
let _entered = span.enter();
|
||||
|
||||
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(10);
|
||||
|
||||
embedder
|
||||
.embed_one(query.q.clone().unwrap(), Some(deadline))
|
||||
.embed_one(query.q.clone().unwrap())
|
||||
.map_err(milli::vector::Error::from)
|
||||
.map_err(milli::Error::from)?
|
||||
}
|
||||
@@ -1197,13 +1195,8 @@ impl<'a> HitMaker<'a> {
|
||||
let vectors_is_hidden = match (&displayed_ids, vectors_fid) {
|
||||
// displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid
|
||||
(None, _) => false,
|
||||
// vectors has no fid, so check its explicit name
|
||||
(Some(_), None) => {
|
||||
// unwrap as otherwise we'd go to the first one
|
||||
let displayed_names = index.displayed_fields(rtxn)?.unwrap();
|
||||
!displayed_names
|
||||
.contains(&milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME)
|
||||
}
|
||||
// displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field
|
||||
(Some(_), None) => true,
|
||||
// displayed_ids is a finit list, so hide if `_vectors` is not part of it
|
||||
(Some(map), Some(vectors_fid)) => map.contains(&vectors_fid),
|
||||
};
|
||||
@@ -1689,7 +1682,7 @@ fn add_non_formatted_ids_to_formatted_options(
|
||||
fn make_document(
|
||||
displayed_attributes: &BTreeSet<FieldId>,
|
||||
field_ids_map: &FieldsIdsMap,
|
||||
obkv: obkv::KvReaderU16,
|
||||
obkv: &obkv::KvReaderU16,
|
||||
) -> Result<Document, MeilisearchHttpError> {
|
||||
let mut document = serde_json::Map::new();
|
||||
|
||||
|
||||
@@ -381,6 +381,7 @@ pub fn default_settings(dir: impl AsRef<Path>) -> Opt {
|
||||
db_path: dir.as_ref().join("db"),
|
||||
dump_dir: dir.as_ref().join("dumps"),
|
||||
env: "development".to_owned(),
|
||||
#[cfg(feature = "analytics")]
|
||||
no_analytics: true,
|
||||
max_index_size: Byte::from_u64_with_unit(100, Unit::MiB).unwrap(),
|
||||
max_task_db_size: Byte::from_u64_with_unit(1, Unit::GiB).unwrap(),
|
||||
|
||||
@@ -9,9 +9,8 @@ use actix_web::test;
|
||||
use actix_web::test::TestRequest;
|
||||
use actix_web::web::Data;
|
||||
use index_scheduler::IndexScheduler;
|
||||
use meilisearch::analytics::Analytics;
|
||||
use meilisearch::search_queue::SearchQueue;
|
||||
use meilisearch::{create_app, Opt, SubscriberForSecondLayer};
|
||||
use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
|
||||
use meilisearch_auth::AuthController;
|
||||
use tracing::level_filters::LevelFilter;
|
||||
use tracing_subscriber::Layer;
|
||||
@@ -142,7 +141,7 @@ impl Service {
|
||||
Data::new(search_queue),
|
||||
self.options.clone(),
|
||||
(route_layer_handle, stderr_layer_handle),
|
||||
Data::new(Analytics::no_analytics()),
|
||||
analytics::MockAnalytics::new(&self.options),
|
||||
true,
|
||||
))
|
||||
.await
|
||||
|
||||
@@ -7,9 +7,8 @@ use std::str::FromStr;
|
||||
use actix_web::http::header::ContentType;
|
||||
use actix_web::web::Data;
|
||||
use meili_snap::snapshot;
|
||||
use meilisearch::analytics::Analytics;
|
||||
use meilisearch::search_queue::SearchQueue;
|
||||
use meilisearch::{create_app, Opt, SubscriberForSecondLayer};
|
||||
use meilisearch::{analytics, create_app, Opt, SubscriberForSecondLayer};
|
||||
use tracing::level_filters::LevelFilter;
|
||||
use tracing_subscriber::layer::SubscriberExt;
|
||||
use tracing_subscriber::Layer;
|
||||
@@ -55,7 +54,7 @@ async fn basic_test_log_stream_route() {
|
||||
Data::new(search_queue),
|
||||
server.service.options.clone(),
|
||||
(route_layer_handle, stderr_layer_handle),
|
||||
Data::new(Analytics::no_analytics()),
|
||||
analytics::MockAnalytics::new(&server.service.options),
|
||||
true,
|
||||
))
|
||||
.await;
|
||||
|
||||
@@ -568,57 +568,6 @@ async fn retrieve_vectors() {
|
||||
]
|
||||
"###);
|
||||
|
||||
// use explicit `_vectors` in displayed attributes
|
||||
let (response, code) = index
|
||||
.update_settings(json!({ "displayedAttributes": ["id", "title", "desc", "_vectors"]} ))
|
||||
.await;
|
||||
assert_eq!(202, code, "{:?}", response);
|
||||
index.wait_task(response.uid()).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"q": "Captain", "hybrid": {"embedder": "default", "semanticRatio": 0.2}, "retrieveVectors": true}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###"
|
||||
[
|
||||
{
|
||||
"title": "Captain Planet",
|
||||
"desc": "He's not part of the Marvel Cinematic Universe",
|
||||
"id": "2",
|
||||
"_vectors": {
|
||||
"default": {
|
||||
"embeddings": "[vectors]",
|
||||
"regenerate": true
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "3",
|
||||
"_vectors": {
|
||||
"default": {
|
||||
"embeddings": "[vectors]",
|
||||
"regenerate": true
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"title": "Shazam!",
|
||||
"desc": "a Captain Marvel ersatz",
|
||||
"id": "1",
|
||||
"_vectors": {
|
||||
"default": {
|
||||
"embeddings": "[vectors]",
|
||||
"regenerate": true
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
"###);
|
||||
|
||||
// remove `_vectors` from displayed attributes
|
||||
let (response, code) =
|
||||
index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await;
|
||||
|
||||
@@ -137,14 +137,13 @@ fn long_text() -> &'static str {
|
||||
}
|
||||
|
||||
async fn create_mock_tokenized() -> (MockServer, Value) {
|
||||
create_mock_with_template("{{doc.text}}", ModelDimensions::Large, false, false).await
|
||||
create_mock_with_template("{{doc.text}}", ModelDimensions::Large, false).await
|
||||
}
|
||||
|
||||
async fn create_mock_with_template(
|
||||
document_template: &str,
|
||||
model_dimensions: ModelDimensions,
|
||||
fallible: bool,
|
||||
slow: bool,
|
||||
) -> (MockServer, Value) {
|
||||
let mock_server = MockServer::start().await;
|
||||
const API_KEY: &str = "my-api-key";
|
||||
@@ -155,11 +154,7 @@ async fn create_mock_with_template(
|
||||
Mock::given(method("POST"))
|
||||
.and(path("/"))
|
||||
.respond_with(move |req: &Request| {
|
||||
// 0. wait for a long time
|
||||
if slow {
|
||||
std::thread::sleep(std::time::Duration::from_secs(1));
|
||||
}
|
||||
// 1. maybe return 500
|
||||
// 0. maybe return 500
|
||||
if fallible {
|
||||
let attempt = attempt.fetch_add(1, Ordering::Relaxed);
|
||||
let failed = matches!(attempt % 4, 0 | 1 | 3);
|
||||
@@ -172,7 +167,7 @@ async fn create_mock_with_template(
|
||||
}))
|
||||
}
|
||||
}
|
||||
// 3. check API key
|
||||
// 1. check API key
|
||||
match req.headers.get("Authorization") {
|
||||
Some(api_key) if api_key == API_KEY_BEARER => {
|
||||
{}
|
||||
@@ -207,7 +202,7 @@ async fn create_mock_with_template(
|
||||
)
|
||||
}
|
||||
}
|
||||
// 3. parse text inputs
|
||||
// 2. parse text inputs
|
||||
let query: serde_json::Value = match req.body_json() {
|
||||
Ok(query) => query,
|
||||
Err(_error) => return ResponseTemplate::new(400).set_body_json(
|
||||
@@ -228,7 +223,7 @@ async fn create_mock_with_template(
|
||||
panic!("Expected {model_dimensions:?}, got {query_model_dimensions:?}")
|
||||
}
|
||||
|
||||
// 4. for each text, find embedding in responses
|
||||
// 3. for each text, find embedding in responses
|
||||
let serde_json::Value::Array(inputs) = &query["input"] else {
|
||||
panic!("Unexpected `input` value")
|
||||
};
|
||||
@@ -288,7 +283,7 @@ async fn create_mock_with_template(
|
||||
"embedding": embedding,
|
||||
})).collect();
|
||||
|
||||
// 5. produce output from embeddings
|
||||
// 4. produce output from embeddings
|
||||
ResponseTemplate::new(200).set_body_json(json!({
|
||||
"object": "list",
|
||||
"data": data,
|
||||
@@ -322,27 +317,23 @@ const DOGGO_TEMPLATE: &str = r#"{%- if doc.gender == "F" -%}Une chienne nommée
|
||||
{%- endif %}, de race {{doc.breed}}."#;
|
||||
|
||||
async fn create_mock() -> (MockServer, Value) {
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, false, false).await
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, false).await
|
||||
}
|
||||
|
||||
async fn create_mock_dimensions() -> (MockServer, Value) {
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large512, false, false).await
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large512, false).await
|
||||
}
|
||||
|
||||
async fn create_mock_small_embedding_model() -> (MockServer, Value) {
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Small, false, false).await
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Small, false).await
|
||||
}
|
||||
|
||||
async fn create_mock_legacy_embedding_model() -> (MockServer, Value) {
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Ada, false, false).await
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Ada, false).await
|
||||
}
|
||||
|
||||
async fn create_fallible_mock() -> (MockServer, Value) {
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true, false).await
|
||||
}
|
||||
|
||||
async fn create_slow_mock() -> (MockServer, Value) {
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true, true).await
|
||||
create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true).await
|
||||
}
|
||||
|
||||
// basic test "it works"
|
||||
@@ -1882,114 +1873,4 @@ async fn it_still_works() {
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
// test with a server that responds 500 on 3 out of 4 calls
|
||||
#[actix_rt::test]
|
||||
async fn timeout() {
|
||||
let (_mock, setting) = create_slow_mock().await;
|
||||
let server = get_server_vector().await;
|
||||
let index = server.index("doggo");
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(json!({
|
||||
"embedders": {
|
||||
"default": setting,
|
||||
},
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
let task = server.wait_task(response.uid()).await;
|
||||
snapshot!(task["status"], @r###""succeeded""###);
|
||||
let documents = json!([
|
||||
{"id": 0, "name": "kefir", "gender": "M", "birthyear": 2023, "breed": "Patou"},
|
||||
]);
|
||||
let (value, code) = index.add_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
let task = index.wait_task(value.uid()).await;
|
||||
snapshot!(task, @r###"
|
||||
{
|
||||
"uid": "[uid]",
|
||||
"indexUid": "doggo",
|
||||
"status": "succeeded",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 1
|
||||
},
|
||||
"error": null,
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
let (documents, _code) = index
|
||||
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
|
||||
.await;
|
||||
snapshot!(json_string!(documents, {".results.*._vectors.default.embeddings" => "[vector]"}), @r###"
|
||||
{
|
||||
"results": [
|
||||
{
|
||||
"id": 0,
|
||||
"name": "kefir",
|
||||
"gender": "M",
|
||||
"birthyear": 2023,
|
||||
"breed": "Patou",
|
||||
"_vectors": {
|
||||
"default": {
|
||||
"embeddings": "[vector]",
|
||||
"regenerate": true
|
||||
}
|
||||
}
|
||||
}
|
||||
],
|
||||
"offset": 0,
|
||||
"limit": 20,
|
||||
"total": 1
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({
|
||||
"q": "grand chien de berger des montagnes",
|
||||
"hybrid": {"semanticRatio": 0.99, "embedder": "default"}
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["semanticHitCount"]), @"0");
|
||||
snapshot!(json_string!(response["hits"]), @"[]");
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({
|
||||
"q": "grand chien de berger des montagnes",
|
||||
"hybrid": {"semanticRatio": 0.99, "embedder": "default"}
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["semanticHitCount"]), @"1");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 0,
|
||||
"name": "kefir",
|
||||
"gender": "M",
|
||||
"birthyear": 2023,
|
||||
"breed": "Patou"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({
|
||||
"q": "grand chien de berger des montagnes",
|
||||
"hybrid": {"semanticRatio": 0.99, "embedder": "default"}
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["semanticHitCount"]), @"0");
|
||||
snapshot!(json_string!(response["hits"]), @"[]");
|
||||
}
|
||||
|
||||
// test with a server that wrongly responds 400
|
||||
|
||||
@@ -4,53 +4,6 @@ use crate::common::{GetAllDocumentsOptions, Server};
|
||||
use crate::json;
|
||||
use crate::vector::generate_default_user_provided_documents;
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn field_unavailable_for_source() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("doggo");
|
||||
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(value, @r###"
|
||||
{
|
||||
"vectorStore": true,
|
||||
"metrics": false,
|
||||
"logsRoute": false,
|
||||
"editDocumentsByFunction": false,
|
||||
"containsFilter": false
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(json!({
|
||||
"embedders": { "manual": {"source": "userProvided", "documentTemplate": "{{doc.documentTemplate}}"}},
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "`.embedders.manual`: Field `documentTemplate` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`, `rest`). Available fields: `source`, `dimensions`, `distribution`, `binaryQuantized`",
|
||||
"code": "invalid_settings_embedders",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(json!({
|
||||
"embedders": { "default": {"source": "openAi", "revision": "42"}},
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "`.embedders.default`: Field `revision` unavailable for source `openAi` (only available for sources: `huggingFace`). Available fields: `source`, `model`, `apiKey`, `documentTemplate`, `dimensions`, `distribution`, `url`, `binaryQuantized`",
|
||||
"code": "invalid_settings_embedders",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_settings_embedders"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn update_embedder() {
|
||||
let server = Server::new().await;
|
||||
|
||||
@@ -682,7 +682,7 @@ fn export_a_dump(
|
||||
format!("While iterating on content file {:?}", content_file_uuid)
|
||||
})? {
|
||||
dump_content_file
|
||||
.push_document(&obkv_to_object(&doc, &documents_batch_index)?)?;
|
||||
.push_document(&obkv_to_object(doc, &documents_batch_index)?)?;
|
||||
}
|
||||
dump_content_file.flush()?;
|
||||
count += 1;
|
||||
|
||||
@@ -12,12 +12,14 @@ readme.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
big_s = "1.0.2"
|
||||
bimap = { version = "0.6.3", features = ["serde"] }
|
||||
bincode = "1.3.3"
|
||||
bstr = "1.9.1"
|
||||
bytemuck = { version = "1.16.1", features = ["extern_crate_alloc"] }
|
||||
byteorder = "1.5.0"
|
||||
charabia = { version = "0.9.1", default-features = false }
|
||||
# charabia = { version = "0.9.0", default-features = false }
|
||||
charabia = { git = "https://github.com/meilisearch/charabia", branch = "mutualize-char-normalizer", default-features = false }
|
||||
concat-arrays = "0.1.2"
|
||||
crossbeam-channel = "0.5.13"
|
||||
deserr = "0.6.2"
|
||||
@@ -27,9 +29,9 @@ fst = "0.4.7"
|
||||
fxhash = "0.2.1"
|
||||
geoutils = "0.5.1"
|
||||
grenad = { version = "0.4.7", default-features = false, features = [
|
||||
"rayon",
|
||||
"tempfile",
|
||||
] }
|
||||
"rayon", # TODO Should we keep this feature
|
||||
"tempfile"
|
||||
], git = "https://github.com/meilisearch/grenad", branch = "various-improvements" }
|
||||
heed = { version = "0.20.3", default-features = false, features = [
|
||||
"serde-json",
|
||||
"serde-bincode",
|
||||
@@ -40,14 +42,14 @@ json-depth-checker = { path = "../json-depth-checker" }
|
||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||
memchr = "2.5.0"
|
||||
memmap2 = "0.9.4"
|
||||
obkv = "0.2.2"
|
||||
obkv = { git = "https://github.com/kerollmops/obkv", branch = "unsized-kvreader" }
|
||||
once_cell = "1.19.0"
|
||||
ordered-float = "4.2.1"
|
||||
rayon = "1.10.0"
|
||||
roaring = { version = "0.10.6", features = ["serde"] }
|
||||
rstar = { version = "0.12.0", features = ["serde"] }
|
||||
serde = { version = "1.0.204", features = ["derive"] }
|
||||
serde_json = { version = "1.0.120", features = ["preserve_order"] }
|
||||
serde_json = { version = "1.0.120", features = ["preserve_order", "raw_value"] }
|
||||
slice-group-by = "0.3.1"
|
||||
smallstr = { version = "0.3.0", features = ["serde"] }
|
||||
smallvec = "1.13.2"
|
||||
@@ -79,17 +81,17 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls",
|
||||
] }
|
||||
tiktoken-rs = "0.5.9"
|
||||
liquid = "0.26.6"
|
||||
rhai = { git = "https://github.com/rhaiscript/rhai", rev = "ef3df63121d27aacd838f366f2b83fd65f20a1e4", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
|
||||
arroy = "0.5.0"
|
||||
rhai = { version = "1.19.0", features = ["serde", "no_module", "no_custom_syntax", "no_time", "sync"] }
|
||||
arroy = { git = "https://github.com/meilisearch/arroy/", rev = "2386594dfb009ce08821a925ccc89fb8e30bf73d" }
|
||||
rand = "0.8.5"
|
||||
tracing = "0.1.40"
|
||||
ureq = { version = "2.10.0", features = ["json"] }
|
||||
url = "2.5.2"
|
||||
rayon-par-bridge = "0.1.0"
|
||||
hashbrown = "0.14.5"
|
||||
|
||||
[dev-dependencies]
|
||||
mimalloc = { version = "0.1.43", default-features = false }
|
||||
big_s = "1.0.2"
|
||||
insta = "1.39.0"
|
||||
maplit = "1.0.2"
|
||||
md5 = "0.7.0"
|
||||
@@ -98,7 +100,16 @@ rand = { version = "0.8.5", features = ["small_rng"] }
|
||||
|
||||
[features]
|
||||
all-tokenizations = [
|
||||
"charabia/default",
|
||||
"charabia/chinese",
|
||||
"charabia/hebrew",
|
||||
"charabia/japanese",
|
||||
"charabia/thai",
|
||||
"charabia/korean",
|
||||
"charabia/greek",
|
||||
"charabia/khmer",
|
||||
"charabia/vietnamese",
|
||||
"charabia/swedish-recomposition",
|
||||
"charabia/german-segmentation",
|
||||
]
|
||||
|
||||
# Use POSIX semaphores instead of SysV semaphores in LMDB
|
||||
@@ -137,8 +148,5 @@ german = ["charabia/german-segmentation"]
|
||||
# force swedish character recomposition
|
||||
swedish-recomposition = ["charabia/swedish-recomposition"]
|
||||
|
||||
# allow turkish specialized tokenization
|
||||
turkish = ["charabia/turkish"]
|
||||
|
||||
# allow CUDA support, see <https://github.com/meilisearch/meilisearch/issues/4306>
|
||||
cuda = ["candle-core/cuda"]
|
||||
|
||||
@@ -292,7 +292,7 @@ mod test {
|
||||
.unwrap()
|
||||
.into_cursor_and_fields_index();
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
val,
|
||||
@@ -321,7 +321,7 @@ mod test {
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
val,
|
||||
@@ -348,7 +348,7 @@ mod test {
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
val,
|
||||
@@ -375,7 +375,7 @@ mod test {
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
val,
|
||||
@@ -402,7 +402,7 @@ mod test {
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
val,
|
||||
@@ -429,7 +429,7 @@ mod test {
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
val,
|
||||
@@ -456,7 +456,7 @@ mod test {
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
val,
|
||||
@@ -483,7 +483,7 @@ mod test {
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
val,
|
||||
@@ -510,7 +510,7 @@ mod test {
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
val,
|
||||
@@ -555,7 +555,7 @@ mod test {
|
||||
.into_cursor_and_fields_index();
|
||||
|
||||
let doc = cursor.next_document().unwrap().unwrap();
|
||||
let val = obkv_to_object(&doc, &index).map(Value::from).unwrap();
|
||||
let val = obkv_to_object(doc, &index).map(Value::from).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
val,
|
||||
|
||||
@@ -69,7 +69,7 @@ impl<R: io::Read + io::Seek> EnrichedDocumentsBatchReader<R> {
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct EnrichedDocument<'a> {
|
||||
pub document: KvReader<'a, FieldId>,
|
||||
pub document: &'a KvReader<FieldId>,
|
||||
pub document_id: DocumentId,
|
||||
}
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ use crate::{FieldId, Object, Result};
|
||||
const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes();
|
||||
|
||||
/// Helper function to convert an obkv reader into a JSON object.
|
||||
pub fn obkv_to_object(obkv: &KvReader<'_, FieldId>, index: &DocumentsBatchIndex) -> Result<Object> {
|
||||
pub fn obkv_to_object(obkv: &KvReader<FieldId>, index: &DocumentsBatchIndex) -> Result<Object> {
|
||||
obkv.iter()
|
||||
.map(|(field_id, value)| {
|
||||
let field_name = index
|
||||
@@ -76,7 +76,7 @@ impl DocumentsBatchIndex {
|
||||
self.0.get_by_right(name).cloned()
|
||||
}
|
||||
|
||||
pub fn recreate_json(&self, document: &obkv::KvReaderU16<'_>) -> Result<Object> {
|
||||
pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result<Object> {
|
||||
let mut map = Object::new();
|
||||
|
||||
for (k, v) in document.iter() {
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
use std::borrow::Cow;
|
||||
use std::iter;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use serde_json::Value;
|
||||
use serde_json::{from_str, Value};
|
||||
|
||||
use crate::update::new::{CowStr, TopLevelMap};
|
||||
use crate::{FieldId, InternalError, Object, Result, UserError};
|
||||
|
||||
/// The symbol used to define levels in a nested primary key.
|
||||
@@ -52,7 +54,7 @@ impl<'a> PrimaryKey<'a> {
|
||||
|
||||
pub fn document_id(
|
||||
&self,
|
||||
document: &obkv::KvReader<'_, FieldId>,
|
||||
document: &obkv::KvReader<FieldId>,
|
||||
fields: &impl FieldIdMapper,
|
||||
) -> Result<StdResult<String, DocumentIdExtractionError>> {
|
||||
match self {
|
||||
@@ -100,6 +102,45 @@ impl<'a> PrimaryKey<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the document ID based on the primary and
|
||||
/// search for it recursively in zero-copy-deserialized documents.
|
||||
pub fn document_id_from_top_level_map<'p>(
|
||||
&self,
|
||||
document: &TopLevelMap<'p>,
|
||||
) -> Result<StdResult<CowStr<'p>, DocumentIdExtractionError>> {
|
||||
fn get_docid<'p>(
|
||||
document: &TopLevelMap<'p>,
|
||||
primary_key: &[&str],
|
||||
) -> Result<StdResult<CowStr<'p>, DocumentIdExtractionError>> {
|
||||
match primary_key {
|
||||
[] => unreachable!("arrrgh"), // would None be ok?
|
||||
[primary_key] => match document.0.get(*primary_key) {
|
||||
Some(value) => match from_str::<u64>(value.get()) {
|
||||
Ok(value) => Ok(Ok(CowStr(Cow::Owned(value.to_string())))),
|
||||
Err(_) => match from_str(value.get()) {
|
||||
Ok(document_id) => Ok(Ok(document_id)),
|
||||
Err(e) => Ok(Err(DocumentIdExtractionError::InvalidDocumentId(
|
||||
UserError::SerdeJson(e),
|
||||
))),
|
||||
},
|
||||
},
|
||||
None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
|
||||
},
|
||||
[head, tail @ ..] => match document.0.get(*head) {
|
||||
Some(value) => {
|
||||
let document = from_str(value.get()).map_err(InternalError::SerdeJson)?;
|
||||
get_docid(&document, tail)
|
||||
}
|
||||
None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// TODO do not allocate a vec everytime here
|
||||
let primary_key: Vec<_> = self.name().split(PRIMARY_KEY_SPLIT_SYMBOL).collect();
|
||||
get_docid(document, &primary_key)
|
||||
}
|
||||
|
||||
/// Returns an `Iterator` that gives all the possible fields names the primary key
|
||||
/// can have depending of the first level name and depth of the objects.
|
||||
pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
|
||||
|
||||
@@ -72,15 +72,24 @@ impl<R> DocumentsBatchCursor<R> {
|
||||
}
|
||||
|
||||
impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> {
|
||||
/// Returns a single document from the database.
|
||||
pub fn get(
|
||||
&mut self,
|
||||
offset: u32,
|
||||
) -> Result<Option<&KvReader<FieldId>>, DocumentsBatchCursorError> {
|
||||
match self.cursor.move_on_key_equal_to(offset.to_be_bytes())? {
|
||||
Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => Ok(Some(value.into())),
|
||||
_otherwise => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the next document, starting from the first one. Subsequent calls to
|
||||
/// `next_document` advance the document reader until all the documents have been read.
|
||||
pub fn next_document(
|
||||
&mut self,
|
||||
) -> Result<Option<KvReader<'_, FieldId>>, DocumentsBatchCursorError> {
|
||||
) -> Result<Option<&KvReader<FieldId>>, DocumentsBatchCursorError> {
|
||||
match self.cursor.move_on_next()? {
|
||||
Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => {
|
||||
Ok(Some(KvReader::new(value)))
|
||||
}
|
||||
Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => Ok(Some(value.into())),
|
||||
_otherwise => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -297,7 +297,6 @@ impl From<arroy::Error> for Error {
|
||||
arroy::Error::InvalidVecDimension { expected, received } => {
|
||||
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
|
||||
}
|
||||
arroy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation),
|
||||
arroy::Error::DatabaseFull
|
||||
| arroy::Error::InvalidItemAppend
|
||||
| arroy::Error::UnmatchingDistance { .. }
|
||||
|
||||
@@ -4,6 +4,9 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::FieldId;
|
||||
|
||||
mod global;
|
||||
pub use global::GlobalFieldsIdsMap;
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct FieldsIdsMap {
|
||||
names_ids: BTreeMap<String, FieldId>,
|
||||
|
||||
86
milli/src/fields_ids_map/global.rs
Normal file
86
milli/src/fields_ids_map/global.rs
Normal file
@@ -0,0 +1,86 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::RwLock;
|
||||
|
||||
use crate::{FieldId, FieldsIdsMap};
|
||||
|
||||
/// A fields ids map that can be globally updated to add fields
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct GlobalFieldsIdsMap<'indexing> {
|
||||
global: &'indexing RwLock<FieldsIdsMap>,
|
||||
local: LocalFieldsIdsMap,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct LocalFieldsIdsMap {
|
||||
names_ids: BTreeMap<String, FieldId>,
|
||||
ids_names: BTreeMap<FieldId, String>,
|
||||
}
|
||||
|
||||
impl LocalFieldsIdsMap {
|
||||
fn new(global: &RwLock<FieldsIdsMap>) -> Self {
|
||||
let global = global.read().unwrap();
|
||||
Self { names_ids: global.names_ids.clone(), ids_names: global.ids_names.clone() }
|
||||
}
|
||||
|
||||
fn insert(&mut self, name: &str, field_id: FieldId) {
|
||||
self.names_ids.insert(name.to_owned(), field_id);
|
||||
self.ids_names.insert(field_id, name.to_owned());
|
||||
}
|
||||
|
||||
fn name(&self, id: FieldId) -> Option<&str> {
|
||||
self.ids_names.get(&id).map(String::as_str)
|
||||
}
|
||||
|
||||
fn id(&self, name: &str) -> Option<FieldId> {
|
||||
self.names_ids.get(name).copied()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'indexing> GlobalFieldsIdsMap<'indexing> {
|
||||
pub fn new(global: &'indexing RwLock<FieldsIdsMap>) -> Self {
|
||||
Self { local: LocalFieldsIdsMap::new(global), global }
|
||||
}
|
||||
|
||||
/// Returns the field id related to a field name, it will create a new field id if the
|
||||
/// name is not already known. Returns `None` if the maximum field id as been reached.
|
||||
pub fn id_or_insert(&mut self, name: &str) -> Option<FieldId> {
|
||||
if let Some(field_id) = self.local.id(name) {
|
||||
return Some(field_id);
|
||||
}
|
||||
|
||||
{
|
||||
// optimistically lookup the global map
|
||||
let global = self.global.read().unwrap();
|
||||
|
||||
if let Some(field_id) = global.id(name) {
|
||||
self.local.insert(name, field_id);
|
||||
return Some(field_id);
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
let mut global = self.global.write().unwrap();
|
||||
|
||||
if let Some(field_id) = global.id(name) {
|
||||
self.local.insert(name, field_id);
|
||||
return Some(field_id);
|
||||
}
|
||||
|
||||
let field_id = global.insert(name)?;
|
||||
self.local.insert(name, field_id);
|
||||
Some(field_id)
|
||||
}
|
||||
}
|
||||
|
||||
/// Get the name of a field based on its id.
|
||||
pub fn name(&mut self, id: FieldId) -> Option<&str> {
|
||||
if self.local.name(id).is_none() {
|
||||
let global = self.global.read().unwrap();
|
||||
|
||||
let name = global.name(id)?;
|
||||
self.local.insert(name, id);
|
||||
}
|
||||
|
||||
self.local.name(id)
|
||||
}
|
||||
}
|
||||
@@ -6,10 +6,10 @@ use obkv::{KvReaderU16, KvWriterU16};
|
||||
pub struct ObkvCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for ObkvCodec {
|
||||
type DItem = KvReaderU16<'a>;
|
||||
type DItem = &'a KvReaderU16;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Ok(KvReaderU16::new(bytes))
|
||||
Ok(KvReaderU16::from_slice(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -122,7 +122,7 @@ impl CboRoaringBitmapCodec {
|
||||
|
||||
/// Merges a DelAdd delta into a CboRoaringBitmap.
|
||||
pub fn merge_deladd_into<'a>(
|
||||
deladd: KvReaderDelAdd<'_>,
|
||||
deladd: &KvReaderDelAdd,
|
||||
previous: &[u8],
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> io::Result<Option<&'a [u8]>> {
|
||||
|
||||
@@ -1251,12 +1251,20 @@ impl Index {
|
||||
|
||||
/* documents */
|
||||
|
||||
/// Returns a document by using the document id.
|
||||
pub fn document<'t>(&self, rtxn: &'t RoTxn, id: DocumentId) -> Result<&'t obkv::KvReaderU16> {
|
||||
self.documents
|
||||
.get(rtxn, &id)?
|
||||
.ok_or(UserError::UnknownInternalDocumentId { document_id: id })
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
/// Returns an iterator over the requested documents. The next item will be an error if a document is missing.
|
||||
pub fn iter_documents<'a, 't: 'a>(
|
||||
&'a self,
|
||||
rtxn: &'t RoTxn<'t>,
|
||||
ids: impl IntoIterator<Item = DocumentId> + 'a,
|
||||
) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
|
||||
) -> Result<impl Iterator<Item = Result<(DocumentId, &'t obkv::KvReaderU16)>> + 'a> {
|
||||
Ok(ids.into_iter().map(move |id| {
|
||||
let kv = self
|
||||
.documents
|
||||
@@ -1271,7 +1279,7 @@ impl Index {
|
||||
&self,
|
||||
rtxn: &'t RoTxn<'t>,
|
||||
ids: impl IntoIterator<Item = DocumentId>,
|
||||
) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
|
||||
) -> Result<Vec<(DocumentId, &'t obkv::KvReaderU16)>> {
|
||||
self.iter_documents(rtxn, ids)?.collect()
|
||||
}
|
||||
|
||||
@@ -1279,7 +1287,7 @@ impl Index {
|
||||
pub fn all_documents<'a, 't: 'a>(
|
||||
&'a self,
|
||||
rtxn: &'t RoTxn<'t>,
|
||||
) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
|
||||
) -> Result<impl Iterator<Item = Result<(DocumentId, &'t obkv::KvReaderU16)>> + 'a> {
|
||||
self.iter_documents(rtxn, self.documents_ids(rtxn)?)
|
||||
}
|
||||
|
||||
@@ -1303,7 +1311,7 @@ impl Index {
|
||||
})?;
|
||||
Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> {
|
||||
let (_docid, obkv) = entry?;
|
||||
match primary_key.document_id(&obkv, &fields)? {
|
||||
match primary_key.document_id(obkv, &fields)? {
|
||||
Ok(document_id) => Ok(document_id),
|
||||
Err(_) => Err(InternalError::DocumentsError(
|
||||
crate::documents::Error::InvalidDocumentFormat,
|
||||
@@ -1610,6 +1618,24 @@ impl Index {
|
||||
.unwrap_or_default())
|
||||
}
|
||||
|
||||
pub fn arroy_readers<'a>(
|
||||
&'a self,
|
||||
rtxn: &'a RoTxn<'a>,
|
||||
embedder_id: u8,
|
||||
quantized: bool,
|
||||
) -> impl Iterator<Item = Result<ArroyWrapper>> + 'a {
|
||||
crate::vector::arroy_db_range_for_embedder(embedder_id).map_while(move |k| {
|
||||
let reader = ArroyWrapper::new(self.vector_arroy, k, quantized);
|
||||
// Here we don't care about the dimensions, but we want to know if we can read
|
||||
// in the database or if its metadata are missing because there is no document with that many vectors.
|
||||
match reader.dimensions(rtxn) {
|
||||
Ok(_) => Some(Ok(reader)),
|
||||
Err(arroy::Error::MissingMetadata(_)) => None,
|
||||
Err(e) => Some(Err(e.into())),
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
|
||||
self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
|
||||
}
|
||||
@@ -1631,9 +1657,14 @@ impl Index {
|
||||
let embedding_configs = self.embedding_configs(rtxn)?;
|
||||
for config in embedding_configs {
|
||||
let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
|
||||
let reader =
|
||||
ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
|
||||
let embeddings = reader.item_vectors(rtxn, docid)?;
|
||||
let embeddings = self
|
||||
.arroy_readers(rtxn, embedder_id, config.config.quantized())
|
||||
.map_while(|reader| {
|
||||
reader
|
||||
.and_then(|r| r.item_vector(rtxn, docid).map_err(|e| e.into()))
|
||||
.transpose()
|
||||
})
|
||||
.collect::<Result<Vec<_>>>()?;
|
||||
res.insert(config.name.to_owned(), embeddings);
|
||||
}
|
||||
Ok(res)
|
||||
|
||||
@@ -55,7 +55,7 @@ pub use self::error::{
|
||||
};
|
||||
pub use self::external_documents_ids::ExternalDocumentsIds;
|
||||
pub use self::fieldids_weights_map::FieldidsWeightsMap;
|
||||
pub use self::fields_ids_map::FieldsIdsMap;
|
||||
pub use self::fields_ids_map::{FieldsIdsMap, GlobalFieldsIdsMap};
|
||||
pub use self::heed_codec::{
|
||||
BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
|
||||
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec,
|
||||
@@ -214,7 +214,7 @@ pub fn bucketed_position(relative: u16) -> u16 {
|
||||
pub fn obkv_to_json(
|
||||
displayed_fields: &[FieldId],
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
obkv: obkv::KvReaderU16<'_>,
|
||||
obkv: &obkv::KvReaderU16,
|
||||
) -> Result<Object> {
|
||||
displayed_fields
|
||||
.iter()
|
||||
@@ -232,10 +232,7 @@ pub fn obkv_to_json(
|
||||
}
|
||||
|
||||
/// Transform every field of a raw obkv store into a JSON Object.
|
||||
pub fn all_obkv_to_json(
|
||||
obkv: obkv::KvReaderU16<'_>,
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
) -> Result<Object> {
|
||||
pub fn all_obkv_to_json(obkv: &obkv::KvReaderU16, fields_ids_map: &FieldsIdsMap) -> Result<Object> {
|
||||
let all_keys = obkv.iter().map(|(k, _v)| k).collect::<Vec<_>>();
|
||||
obkv_to_json(all_keys.as_slice(), fields_ids_map, obkv)
|
||||
}
|
||||
@@ -434,7 +431,7 @@ mod tests {
|
||||
writer.insert(id1, b"1234").unwrap();
|
||||
writer.insert(id2, b"4321").unwrap();
|
||||
let contents = writer.into_inner().unwrap();
|
||||
let obkv = obkv::KvReaderU16::new(&contents);
|
||||
let obkv = obkv::KvReaderU16::from_slice(&contents);
|
||||
|
||||
let expected = json!({
|
||||
"field1": 1234,
|
||||
|
||||
@@ -30,13 +30,13 @@ impl ParsedValue {
|
||||
|
||||
impl<'a> Document<'a> {
|
||||
pub fn new(
|
||||
data: obkv::KvReaderU16<'a>,
|
||||
data: &'a obkv::KvReaderU16,
|
||||
side: DelAdd,
|
||||
inverted_field_map: &'a FieldsIdsMap,
|
||||
) -> Self {
|
||||
let mut out_data = BTreeMap::new();
|
||||
for (fid, raw) in data {
|
||||
let obkv = KvReaderDelAdd::new(raw);
|
||||
let obkv = KvReaderDelAdd::from_slice(raw);
|
||||
let Some(raw) = obkv.get(side) else {
|
||||
continue;
|
||||
};
|
||||
|
||||
@@ -111,7 +111,7 @@ impl Prompt {
|
||||
|
||||
pub fn render(
|
||||
&self,
|
||||
document: obkv::KvReaderU16<'_>,
|
||||
document: &obkv::KvReaderU16,
|
||||
side: DelAdd,
|
||||
field_id_map: &FieldsIdsMapWithMetadata,
|
||||
) -> Result<String, RenderPromptError> {
|
||||
|
||||
@@ -201,9 +201,7 @@ impl<'a> Search<'a> {
|
||||
let span = tracing::trace_span!(target: "search::hybrid", "embed_one");
|
||||
let _entered = span.enter();
|
||||
|
||||
let deadline = std::time::Instant::now() + std::time::Duration::from_secs(3);
|
||||
|
||||
match embedder.embed_one(query, Some(deadline)) {
|
||||
match embedder.embed_one(query) {
|
||||
Ok(embedding) => embedding,
|
||||
Err(error) => {
|
||||
tracing::error!(error=%error, "Embedding failed");
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::collections::hash_map::Entry;
|
||||
use std::hash::Hash;
|
||||
|
||||
use fxhash::FxHashMap;
|
||||
use grenad::MergeFunction;
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesEncode, Database, RoTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
@@ -11,7 +12,7 @@ use super::interner::Interned;
|
||||
use super::Word;
|
||||
use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
|
||||
use crate::update::MergeCboRoaringBitmaps;
|
||||
use crate::{
|
||||
CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
|
||||
};
|
||||
@@ -110,19 +111,21 @@ impl<'ctx> DatabaseCache<'ctx> {
|
||||
.map_err(Into::into)
|
||||
}
|
||||
|
||||
fn get_value_from_keys<'v, K1, KC>(
|
||||
fn get_value_from_keys<'v, K1, KC, MF>(
|
||||
txn: &'ctx RoTxn<'_>,
|
||||
cache_key: K1,
|
||||
db_keys: &'v [KC::EItem],
|
||||
cache: &mut FxHashMap<K1, Option<Cow<'ctx, [u8]>>>,
|
||||
db: Database<KC, Bytes>,
|
||||
universe: Option<&RoaringBitmap>,
|
||||
merger: MergeFn,
|
||||
merger: MF,
|
||||
) -> Result<Option<RoaringBitmap>>
|
||||
where
|
||||
K1: Copy + Eq + Hash,
|
||||
KC: BytesEncode<'v>,
|
||||
KC::EItem: Sized,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
if let Entry::Vacant(entry) = cache.entry(cache_key) {
|
||||
let bitmap_ptr: Option<Cow<'ctx, [u8]>> = match db_keys {
|
||||
@@ -138,7 +141,7 @@ impl<'ctx> DatabaseCache<'ctx> {
|
||||
if bitmaps.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(merger(&[], &bitmaps[..])?)
|
||||
Some(merger.merge(&[], &bitmaps[..])?)
|
||||
}
|
||||
}
|
||||
};
|
||||
@@ -213,17 +216,17 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _>(
|
||||
DatabaseCache::get_value_from_keys(
|
||||
self.txn,
|
||||
word,
|
||||
&keys[..],
|
||||
&mut self.db_cache.word_docids,
|
||||
self.index.word_fid_docids.remap_data_type::<Bytes>(),
|
||||
universe,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
MergeCboRoaringBitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _>(
|
||||
None => DatabaseCache::get_value(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
@@ -245,17 +248,17 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _>(
|
||||
DatabaseCache::get_value_from_keys(
|
||||
self.txn,
|
||||
word,
|
||||
&keys[..],
|
||||
&mut self.db_cache.exact_word_docids,
|
||||
self.index.word_fid_docids.remap_data_type::<Bytes>(),
|
||||
universe,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
MergeCboRoaringBitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _>(
|
||||
None => DatabaseCache::get_value(
|
||||
self.txn,
|
||||
word,
|
||||
self.word_interner.get(word).as_str(),
|
||||
@@ -302,17 +305,17 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.tolerant.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _>(
|
||||
DatabaseCache::get_value_from_keys(
|
||||
self.txn,
|
||||
prefix,
|
||||
&keys[..],
|
||||
&mut self.db_cache.word_prefix_docids,
|
||||
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
|
||||
universe,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
MergeCboRoaringBitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _>(
|
||||
None => DatabaseCache::get_value(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
@@ -334,17 +337,17 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
let keys: Vec<_> =
|
||||
restricted_fids.exact.iter().map(|(fid, _)| (interned, *fid)).collect();
|
||||
|
||||
DatabaseCache::get_value_from_keys::<_, _>(
|
||||
DatabaseCache::get_value_from_keys(
|
||||
self.txn,
|
||||
prefix,
|
||||
&keys[..],
|
||||
&mut self.db_cache.exact_word_prefix_docids,
|
||||
self.index.word_prefix_fid_docids.remap_data_type::<Bytes>(),
|
||||
universe,
|
||||
merge_cbo_roaring_bitmaps,
|
||||
MergeCboRoaringBitmaps,
|
||||
)
|
||||
}
|
||||
None => DatabaseCache::get_value::<_, _>(
|
||||
None => DatabaseCache::get_value(
|
||||
self.txn,
|
||||
prefix,
|
||||
self.word_interner.get(prefix).as_str(),
|
||||
@@ -405,7 +408,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
|
||||
Ok(docids)
|
||||
}
|
||||
ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _>(
|
||||
ProximityPrecision::ByWord => DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(proximity, word1, word2),
|
||||
&(
|
||||
@@ -538,7 +541,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
DatabaseCache::get_value::<_, _>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word, fid),
|
||||
&(self.word_interner.get(word).as_str(), fid),
|
||||
@@ -559,7 +562,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
DatabaseCache::get_value::<_, _>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word_prefix, fid),
|
||||
&(self.word_interner.get(word_prefix).as_str(), fid),
|
||||
@@ -629,7 +632,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
word: Interned<String>,
|
||||
position: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word, position),
|
||||
&(self.word_interner.get(word).as_str(), position),
|
||||
@@ -645,7 +648,7 @@ impl<'ctx> SearchContext<'ctx> {
|
||||
word_prefix: Interned<String>,
|
||||
position: u16,
|
||||
) -> Result<Option<RoaringBitmap>> {
|
||||
DatabaseCache::get_value::<_, _>(
|
||||
DatabaseCache::get_value(
|
||||
self.txn,
|
||||
(word_prefix, position),
|
||||
&(self.word_interner.get(word_prefix).as_str(), position),
|
||||
|
||||
@@ -553,7 +553,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "debug", skip_all, target = "search::universe")]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::universe")]
|
||||
pub fn filtered_universe(
|
||||
index: &Index,
|
||||
txn: &RoTxn<'_>,
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait};
|
||||
use crate::score_details::{self, ScoreDetails};
|
||||
use crate::vector::{ArroyWrapper, DistributionShift, Embedder};
|
||||
use crate::vector::{DistributionShift, Embedder};
|
||||
use crate::{DocumentId, Result, SearchContext, SearchLogger};
|
||||
|
||||
pub struct VectorSort<Q: RankingRuleQueryTrait> {
|
||||
@@ -52,9 +53,14 @@ impl<Q: RankingRuleQueryTrait> VectorSort<Q> {
|
||||
vector_candidates: &RoaringBitmap,
|
||||
) -> Result<()> {
|
||||
let target = &self.target;
|
||||
let mut results = Vec::new();
|
||||
|
||||
let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, self.quantized);
|
||||
let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
|
||||
for reader in ctx.index.arroy_readers(ctx.txn, self.embedder_index, self.quantized) {
|
||||
let nns_by_vector =
|
||||
reader?.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?;
|
||||
results.extend(nns_by_vector.into_iter());
|
||||
}
|
||||
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
|
||||
self.cached_sorted_docids = results.into_iter();
|
||||
|
||||
Ok(())
|
||||
@@ -83,7 +89,7 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for VectorSort<Q> {
|
||||
}
|
||||
|
||||
#[allow(clippy::only_used_in_recursion)]
|
||||
#[tracing::instrument(level = "debug", skip_all, target = "search::vector_sort")]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::vector_sort")]
|
||||
fn next_bucket(
|
||||
&mut self,
|
||||
ctx: &mut SearchContext<'ctx>,
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use ordered_float::OrderedFloat;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::score_details::{self, ScoreDetails};
|
||||
use crate::vector::{ArroyWrapper, Embedder};
|
||||
use crate::vector::Embedder;
|
||||
use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult};
|
||||
|
||||
pub struct Similar<'a> {
|
||||
@@ -70,13 +71,23 @@ impl<'a> Similar<'a> {
|
||||
.get(self.rtxn, &self.embedder_name)?
|
||||
.ok_or_else(|| crate::UserError::InvalidEmbedder(self.embedder_name.to_owned()))?;
|
||||
|
||||
let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized);
|
||||
let results = reader.nns_by_item(
|
||||
self.rtxn,
|
||||
self.id,
|
||||
self.limit + self.offset + 1,
|
||||
Some(&universe),
|
||||
)?;
|
||||
let mut results = Vec::new();
|
||||
|
||||
for reader in self.index.arroy_readers(self.rtxn, embedder_index, self.quantized) {
|
||||
let nns_by_item = reader?.nns_by_item(
|
||||
self.rtxn,
|
||||
self.id,
|
||||
self.limit + self.offset + 1,
|
||||
Some(&universe),
|
||||
)?;
|
||||
if let Some(mut nns_by_item) = nns_by_item {
|
||||
results.append(&mut nns_by_item);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance));
|
||||
|
||||
let mut documents_ids = Vec::with_capacity(self.limit);
|
||||
let mut document_scores = Vec::with_capacity(self.limit);
|
||||
|
||||
@@ -3,12 +3,12 @@ use std::ops::RangeInclusive;
|
||||
|
||||
use roaring::bitmap::{IntoIter, RoaringBitmap};
|
||||
|
||||
pub struct AvailableDocumentsIds {
|
||||
pub struct AvailableIds {
|
||||
iter: Chain<IntoIter, RangeInclusive<u32>>,
|
||||
}
|
||||
|
||||
impl AvailableDocumentsIds {
|
||||
pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
|
||||
impl AvailableIds {
|
||||
pub fn new(docids: &RoaringBitmap) -> AvailableIds {
|
||||
match docids.max() {
|
||||
Some(last_id) => {
|
||||
let mut available = RoaringBitmap::from_iter(0..last_id);
|
||||
@@ -20,17 +20,17 @@ impl AvailableDocumentsIds {
|
||||
None => 1..=0, // empty range iterator
|
||||
};
|
||||
|
||||
AvailableDocumentsIds { iter: available.into_iter().chain(iter) }
|
||||
AvailableIds { iter: available.into_iter().chain(iter) }
|
||||
}
|
||||
None => {
|
||||
let empty = RoaringBitmap::new().into_iter();
|
||||
AvailableDocumentsIds { iter: empty.chain(0..=u32::MAX) }
|
||||
AvailableIds { iter: empty.chain(0..=u32::MAX) }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterator for AvailableDocumentsIds {
|
||||
impl Iterator for AvailableIds {
|
||||
type Item = u32;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
@@ -45,7 +45,7 @@ mod tests {
|
||||
#[test]
|
||||
fn empty() {
|
||||
let base = RoaringBitmap::new();
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base);
|
||||
let left = AvailableIds::new(&base);
|
||||
let right = 0..=u32::MAX;
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
@@ -58,7 +58,7 @@ mod tests {
|
||||
base.insert(100);
|
||||
base.insert(405);
|
||||
|
||||
let left = AvailableDocumentsIds::from_documents_ids(&base);
|
||||
let left = AvailableIds::new(&base);
|
||||
let right = (0..=u32::MAX).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
|
||||
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
|
||||
}
|
||||
59
milli/src/update/concurrent_available_ids.rs
Normal file
59
milli/src/update/concurrent_available_ids.rs
Normal file
@@ -0,0 +1,59 @@
|
||||
use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
/// A concurrent ID generate that will never return the same ID twice.
|
||||
#[derive(Debug)]
|
||||
pub struct ConcurrentAvailableIds {
|
||||
/// The current tree node ID we should use if there is no other IDs available.
|
||||
current: AtomicU32,
|
||||
/// The total number of tree node IDs used.
|
||||
used: AtomicU64,
|
||||
|
||||
/// A list of IDs to exhaust before picking IDs from `current`.
|
||||
available: RoaringBitmap,
|
||||
/// The current Nth ID to select in the bitmap.
|
||||
select_in_bitmap: AtomicU32,
|
||||
/// Tells if you should look in the roaring bitmap or if all the IDs are already exhausted.
|
||||
look_into_bitmap: AtomicBool,
|
||||
}
|
||||
|
||||
impl ConcurrentAvailableIds {
|
||||
/// Creates an ID generator returning unique IDs, avoiding the specified used IDs.
|
||||
pub fn new(used: RoaringBitmap) -> ConcurrentAvailableIds {
|
||||
let last_id = used.max().map_or(0, |id| id + 1);
|
||||
let used_ids = used.len();
|
||||
let available = RoaringBitmap::from_sorted_iter(0..last_id).unwrap() - used;
|
||||
|
||||
ConcurrentAvailableIds {
|
||||
current: AtomicU32::new(last_id),
|
||||
used: AtomicU64::new(used_ids),
|
||||
select_in_bitmap: AtomicU32::new(0),
|
||||
look_into_bitmap: AtomicBool::new(!available.is_empty()),
|
||||
available,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a new unique ID and increase the count of IDs used.
|
||||
pub fn next(&self) -> Option<u32> {
|
||||
if self.used.fetch_add(1, Ordering::Relaxed) > u32::MAX as u64 {
|
||||
None
|
||||
} else if self.look_into_bitmap.load(Ordering::Relaxed) {
|
||||
let current = self.select_in_bitmap.fetch_add(1, Ordering::Relaxed);
|
||||
match self.available.select(current) {
|
||||
Some(id) => Some(id),
|
||||
None => {
|
||||
self.look_into_bitmap.store(false, Ordering::Relaxed);
|
||||
Some(self.current.fetch_add(1, Ordering::Relaxed))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
Some(self.current.fetch_add(1, Ordering::Relaxed))
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of used ids in total.
|
||||
pub fn used(&self) -> u64 {
|
||||
self.used.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
use obkv::Key;
|
||||
|
||||
pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
|
||||
pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;
|
||||
pub type KvReaderDelAdd = obkv::KvReader<DelAdd>;
|
||||
|
||||
/// DelAdd defines the new value to add in the database and old value to delete from the database.
|
||||
///
|
||||
@@ -36,7 +36,7 @@ impl Key for DelAdd {
|
||||
/// Addition: put all the values under DelAdd::Addition,
|
||||
/// DeletionAndAddition: put all the values under DelAdd::Deletion and DelAdd::Addition,
|
||||
pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
|
||||
reader: obkv::KvReader<'_, K>,
|
||||
reader: &obkv::KvReader<K>,
|
||||
operation: DelAddOperation,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
@@ -46,7 +46,7 @@ pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
|
||||
/// Akin to the [into_del_add_obkv] function but lets you
|
||||
/// conditionally define the `DelAdd` variant based on the obkv key.
|
||||
pub fn into_del_add_obkv_conditional_operation<K, F>(
|
||||
reader: obkv::KvReader<'_, K>,
|
||||
reader: &obkv::KvReader<K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
operation: F,
|
||||
) -> std::io::Result<()>
|
||||
@@ -86,8 +86,8 @@ pub enum DelAddOperation {
|
||||
/// putting each deletion obkv's keys under an DelAdd::Deletion
|
||||
/// and putting each addition obkv's keys under an DelAdd::Addition
|
||||
pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
|
||||
deletion: &obkv::KvReader<'_, K>,
|
||||
addition: &obkv::KvReader<'_, K>,
|
||||
deletion: &obkv::KvReader<K>,
|
||||
addition: &obkv::KvReader<K>,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<(), std::io::Error> {
|
||||
use itertools::merge_join_by;
|
||||
@@ -121,7 +121,7 @@ pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
|
||||
writer.finish()
|
||||
}
|
||||
|
||||
pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd<'_>) -> bool {
|
||||
pub fn is_noop_del_add_obkv(del_add: &KvReaderDelAdd) -> bool {
|
||||
del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
|
||||
}
|
||||
|
||||
@@ -136,5 +136,5 @@ pub fn deladd_serialize_add_side<'a>(
|
||||
obkv: &'a [u8],
|
||||
_buffer: &mut Vec<u8>,
|
||||
) -> crate::Result<&'a [u8]> {
|
||||
Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
|
||||
Ok(KvReaderDelAdd::from_slice(obkv).get(DelAdd::Addition).unwrap_or_default())
|
||||
}
|
||||
|
||||
@@ -14,7 +14,7 @@ use crate::heed_codec::facet::{
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
|
||||
use crate::update::MergeFn;
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
|
||||
|
||||
/// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases
|
||||
@@ -29,7 +29,7 @@ pub struct FacetsUpdateBulk<'i> {
|
||||
facet_type: FacetType,
|
||||
field_ids: Vec<FieldId>,
|
||||
// None if level 0 does not need to be updated
|
||||
delta_data: Option<Merger<BufReader<File>, MergeFn>>,
|
||||
delta_data: Option<Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>>,
|
||||
}
|
||||
|
||||
impl<'i> FacetsUpdateBulk<'i> {
|
||||
@@ -37,7 +37,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
index: &'i Index,
|
||||
field_ids: Vec<FieldId>,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
) -> FacetsUpdateBulk<'i> {
|
||||
@@ -90,7 +90,7 @@ impl<'i> FacetsUpdateBulk<'i> {
|
||||
/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
|
||||
pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
|
||||
pub db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
pub delta_data: Option<Merger<R, MergeFn>>,
|
||||
pub delta_data: Option<Merger<R, MergeDeladdCboRoaringBitmaps>>,
|
||||
pub group_size: u8,
|
||||
pub min_level_size: u8,
|
||||
}
|
||||
@@ -135,7 +135,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
if !valid_lmdb_key(key) {
|
||||
continue;
|
||||
}
|
||||
let value = KvReaderDelAdd::new(value);
|
||||
let value = KvReaderDelAdd::from_slice(value);
|
||||
|
||||
// DB is empty, it is safe to ignore Del operations
|
||||
let Some(value) = value.get(DelAdd::Addition) else {
|
||||
@@ -161,7 +161,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
|
||||
continue;
|
||||
}
|
||||
|
||||
let value = KvReaderDelAdd::new(value);
|
||||
let value = KvReaderDelAdd::from_slice(value);
|
||||
|
||||
// the value is a CboRoaringBitmap, but I still need to prepend the
|
||||
// group size for level 0 (= 1) to it
|
||||
|
||||
@@ -15,7 +15,7 @@ use crate::heed_codec::BytesRefCodec;
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::update::del_add::DelAdd;
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::update::MergeFn;
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||
|
||||
/// Enum used as a return value for the facet incremental indexing.
|
||||
@@ -57,14 +57,14 @@ enum ModificationResult {
|
||||
/// `facet_id_(string/f64)_docids` databases.
|
||||
pub struct FacetsUpdateIncremental {
|
||||
inner: FacetsUpdateIncrementalInner,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
}
|
||||
|
||||
impl FacetsUpdateIncremental {
|
||||
pub fn new(
|
||||
index: &Index,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
group_size: u8,
|
||||
min_level_size: u8,
|
||||
max_group_size: u8,
|
||||
@@ -109,7 +109,7 @@ impl FacetsUpdateIncremental {
|
||||
}
|
||||
current_field_id = Some(key.field_id);
|
||||
|
||||
let value = KvReader::new(value);
|
||||
let value = KvReader::from_slice(value);
|
||||
let docids_to_delete = value
|
||||
.get(DelAdd::Deletion)
|
||||
.map(CboRoaringBitmapCodec::bytes_decode)
|
||||
|
||||
@@ -86,12 +86,11 @@ use time::OffsetDateTime;
|
||||
use tracing::debug;
|
||||
|
||||
use self::incremental::FacetsUpdateIncremental;
|
||||
use super::FacetsUpdateBulk;
|
||||
use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::facet::FacetType;
|
||||
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||
use crate::heed_codec::BytesRefCodec;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
use crate::update::MergeFn;
|
||||
use crate::{try_split_array_at, FieldId, Index, Result};
|
||||
|
||||
pub mod bulk;
|
||||
@@ -105,8 +104,8 @@ pub struct FacetsUpdate<'i> {
|
||||
index: &'i Index,
|
||||
database: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeDeladdBtreesetString>>,
|
||||
group_size: u8,
|
||||
max_group_size: u8,
|
||||
min_level_size: u8,
|
||||
@@ -116,8 +115,8 @@ impl<'i> FacetsUpdate<'i> {
|
||||
pub fn new(
|
||||
index: &'i Index,
|
||||
facet_type: FacetType,
|
||||
delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeFn>>,
|
||||
delta_data: Merger<BufReader<File>, MergeDeladdCboRoaringBitmaps>,
|
||||
normalized_delta_data: Option<Merger<BufReader<File>, MergeDeladdBtreesetString>>,
|
||||
data_size: u64,
|
||||
) -> Self {
|
||||
let database = match facet_type {
|
||||
@@ -182,12 +181,12 @@ impl<'i> FacetsUpdate<'i> {
|
||||
|
||||
fn index_facet_search(
|
||||
wtxn: &mut heed::RwTxn<'_>,
|
||||
normalized_delta_data: Merger<BufReader<File>, MergeFn>,
|
||||
normalized_delta_data: Merger<BufReader<File>, MergeDeladdBtreesetString>,
|
||||
index: &Index,
|
||||
) -> Result<()> {
|
||||
let mut iter = normalized_delta_data.into_stream_merger_iter()?;
|
||||
while let Some((key_bytes, delta_bytes)) = iter.next()? {
|
||||
let deladd_reader = KvReaderDelAdd::new(delta_bytes);
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(delta_bytes);
|
||||
|
||||
let database_set = index
|
||||
.facet_id_normalized_string_strings
|
||||
@@ -298,8 +297,8 @@ pub(crate) mod test_helpers {
|
||||
use crate::search::facet::get_highest_level;
|
||||
use crate::snapshot_tests::display_bitmap;
|
||||
use crate::update::del_add::{DelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::merge_deladd_cbo_roaring_bitmaps;
|
||||
use crate::update::{FacetsUpdateIncrementalInner, MergeFn};
|
||||
use crate::update::index_documents::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::update::FacetsUpdateIncrementalInner;
|
||||
use crate::CboRoaringBitmapCodec;
|
||||
|
||||
/// Utility function to generate a string whose position in a lexicographically
|
||||
@@ -484,7 +483,7 @@ pub(crate) mod test_helpers {
|
||||
}
|
||||
writer.finish().unwrap();
|
||||
let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
builder.push(reader.into_cursor().unwrap());
|
||||
let merger = builder.build();
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
return match cursor.next_document()? {
|
||||
Some(first_document) => Ok(Err(UserError::MissingDocumentId {
|
||||
primary_key: primary_key.to_string(),
|
||||
document: obkv_to_object(&first_document, &documents_batch_index)?,
|
||||
document: obkv_to_object(first_document, &documents_batch_index)?,
|
||||
})),
|
||||
None => unreachable!("Called with reader.is_empty()"),
|
||||
};
|
||||
@@ -106,7 +106,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
let mut count = 0;
|
||||
while let Some(document) = cursor.next_document()? {
|
||||
let document_id = match fetch_or_generate_document_id(
|
||||
&document,
|
||||
document,
|
||||
&documents_batch_index,
|
||||
primary_key,
|
||||
autogenerate_docids,
|
||||
@@ -145,7 +145,7 @@ pub fn enrich_documents_batch<R: Read + Seek>(
|
||||
#[tracing::instrument(level = "trace", skip(uuid_buffer, documents_batch_index, document)
|
||||
target = "indexing::documents")]
|
||||
fn fetch_or_generate_document_id(
|
||||
document: &obkv::KvReader<'_, FieldId>,
|
||||
document: &obkv::KvReader<FieldId>,
|
||||
documents_batch_index: &DocumentsBatchIndex,
|
||||
primary_key: PrimaryKey<'_>,
|
||||
autogenerate_docids: bool,
|
||||
|
||||
@@ -8,7 +8,7 @@ use obkv::{KvReader, KvWriterU16};
|
||||
use roaring::RoaringBitmap;
|
||||
use serde_json::Value;
|
||||
|
||||
use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
|
||||
use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepLatestObkv};
|
||||
use crate::error::{InternalError, SerializationError};
|
||||
use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
@@ -35,7 +35,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
let mut documents_ids = RoaringBitmap::new();
|
||||
let mut docid_word_positions_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_latest_obkv,
|
||||
KeepLatestObkv,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -80,10 +80,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
.try_into()
|
||||
.map(u32::from_be_bytes)
|
||||
.map_err(|_| SerializationError::InvalidNumberSerialization)?;
|
||||
let obkv = KvReader::<FieldId>::new(value);
|
||||
let obkv = KvReader::<FieldId>::from_slice(value);
|
||||
|
||||
// if the searchable fields didn't change, skip the searchable indexing for this document.
|
||||
if !force_reindexing && !searchable_fields_changed(&obkv, settings_diff) {
|
||||
if !force_reindexing && !searchable_fields_changed(obkv, settings_diff) {
|
||||
continue;
|
||||
}
|
||||
|
||||
@@ -98,7 +98,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
|| {
|
||||
// deletions
|
||||
tokens_from_document(
|
||||
&obkv,
|
||||
obkv,
|
||||
&settings_diff.old,
|
||||
&del_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
@@ -109,7 +109,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
|| {
|
||||
// additions
|
||||
tokens_from_document(
|
||||
&obkv,
|
||||
obkv,
|
||||
&settings_diff.new,
|
||||
&add_tokenizer,
|
||||
max_positions_per_attributes,
|
||||
@@ -126,13 +126,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
// transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
|
||||
value_buffer.clear();
|
||||
del_add_from_two_obkvs(
|
||||
&KvReader::<FieldId>::new(del_obkv),
|
||||
&KvReader::<FieldId>::new(add_obkv),
|
||||
KvReader::<FieldId>::from_slice(del_obkv),
|
||||
KvReader::<FieldId>::from_slice(add_obkv),
|
||||
&mut value_buffer,
|
||||
)?;
|
||||
|
||||
// write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
|
||||
let obkv = KvReader::<FieldId>::new(&value_buffer);
|
||||
let obkv = KvReader::<FieldId>::from_slice(&value_buffer);
|
||||
for (field_id, value) in obkv.iter() {
|
||||
key_buffer.truncate(mem::size_of::<u32>());
|
||||
key_buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
@@ -146,13 +146,13 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
||||
|
||||
/// Check if any searchable fields of a document changed.
|
||||
fn searchable_fields_changed(
|
||||
obkv: &KvReader<'_, FieldId>,
|
||||
obkv: &KvReader<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
) -> bool {
|
||||
let searchable_fields = &settings_diff.new.searchable_fields_ids;
|
||||
for (field_id, field_bytes) in obkv.iter() {
|
||||
if searchable_fields.contains(&field_id) {
|
||||
let del_add = KvReaderDelAdd::new(field_bytes);
|
||||
let del_add = KvReaderDelAdd::from_slice(field_bytes);
|
||||
match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
|
||||
// if both fields are None, check the next field.
|
||||
(None, None) => (),
|
||||
@@ -189,7 +189,7 @@ fn tokenizer_builder<'a>(
|
||||
|
||||
/// Extract words mapped with their positions of a document.
|
||||
fn tokens_from_document<'a>(
|
||||
obkv: &KvReader<'a, FieldId>,
|
||||
obkv: &'a KvReader<FieldId>,
|
||||
settings: &InnerIndexSettings,
|
||||
tokenizer: &Tokenizer<'_>,
|
||||
max_positions_per_attributes: u32,
|
||||
@@ -202,7 +202,7 @@ fn tokens_from_document<'a>(
|
||||
// if field is searchable.
|
||||
if settings.searchable_fields_ids.contains(&field_id) {
|
||||
// extract deletion or addition only.
|
||||
if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
|
||||
if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) {
|
||||
// parse json.
|
||||
let value =
|
||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||
|
||||
@@ -4,7 +4,7 @@ use std::io::{self, BufReader};
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
|
||||
create_sorter, sorter_into_reader, GrenadParameters, MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::heed_codec::facet::{
|
||||
FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
|
||||
@@ -27,7 +27,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut facet_number_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -45,7 +45,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
|
||||
|
||||
buffer.clear();
|
||||
let mut obkv = KvWriterDelAdd::new(&mut buffer);
|
||||
for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
|
||||
for (deladd_key, _) in KvReaderDelAdd::from_slice(deladd_obkv_bytes).iter() {
|
||||
obkv.insert(deladd_key, document_id.to_ne_bytes())?;
|
||||
}
|
||||
obkv.finish()?;
|
||||
|
||||
@@ -15,7 +15,7 @@ use crate::heed_codec::{BEU16StrCodec, StrRefCodec};
|
||||
use crate::localized_attributes_rules::LocalizedFieldIds;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::{
|
||||
merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
|
||||
MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
@@ -56,7 +56,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -65,7 +65,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_btreeset_string,
|
||||
MergeDeladdBtreesetString,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -75,7 +75,7 @@ fn extract_facet_string_docids_document_update<R: io::Read + io::Seek>(
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
@@ -144,7 +144,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
|
||||
let mut facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -153,7 +153,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
|
||||
let mut normalized_facet_string_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_deladd_btreeset_string,
|
||||
MergeDeladdBtreesetString,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -163,7 +163,7 @@ fn extract_facet_string_docids_settings<R: io::Read + io::Seek>(
|
||||
let mut buffer = Vec::new();
|
||||
let mut cursor = docid_fid_facet_string.into_cursor()?;
|
||||
while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
|
||||
let deladd_reader = KvReaderDelAdd::new(deladd_original_value_bytes);
|
||||
let deladd_reader = KvReaderDelAdd::from_slice(deladd_original_value_bytes);
|
||||
|
||||
let is_same_value = deladd_reader.get(DelAdd::Deletion).is_some()
|
||||
&& deladd_reader.get(DelAdd::Addition).is_some();
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::convert::TryInto;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
use std::mem::size_of;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use bytemuck::bytes_of;
|
||||
use grenad::Sorter;
|
||||
@@ -15,13 +13,13 @@ use roaring::RoaringBitmap;
|
||||
use serde_json::{from_slice, Value};
|
||||
use FilterableValues::{Empty, Null, Values};
|
||||
|
||||
use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
|
||||
use super::helpers::{create_sorter, sorter_into_reader, GrenadParameters, KeepFirst};
|
||||
use crate::error::InternalError;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::{create_writer, writer_into_reader};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, MAX_FACET_VALUE_LENGTH};
|
||||
|
||||
/// The length of the elements that are always in the buffer when inserting new values.
|
||||
const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
|
||||
@@ -50,7 +48,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
|
||||
let mut fid_docid_facet_numbers_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
KeepFirst,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -59,7 +57,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
|
||||
let mut fid_docid_facet_strings_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
KeepFirst,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -83,10 +81,10 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
|
||||
if !settings_diff.settings_update_only || old_faceted_fids != new_faceted_fids {
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
let get_document_json_value = move |field_id, side| {
|
||||
obkv.get(field_id)
|
||||
.map(KvReaderDelAdd::new)
|
||||
.map(KvReaderDelAdd::from_slice)
|
||||
.and_then(|kv| kv.get(side))
|
||||
.map(from_slice)
|
||||
.transpose()
|
||||
@@ -330,15 +328,12 @@ fn truncate_str(s: &str) -> &str {
|
||||
|
||||
/// Computes the diff between both Del and Add numbers and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn insert_numbers_diff<MF>(
|
||||
fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
|
||||
fn insert_numbers_diff(
|
||||
fid_docid_facet_numbers_sorter: &mut Sorter<KeepFirst>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_numbers: Vec<f64>,
|
||||
mut add_numbers: Vec<f64>,
|
||||
) -> Result<()>
|
||||
where
|
||||
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
|
||||
{
|
||||
) -> Result<()> {
|
||||
// We sort and dedup the float numbers
|
||||
del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||
add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
|
||||
@@ -390,15 +385,12 @@ where
|
||||
|
||||
/// Computes the diff between both Del and Add strings and
|
||||
/// only inserts the parts that differ in the sorter.
|
||||
fn insert_strings_diff<MF>(
|
||||
fid_docid_facet_strings_sorter: &mut Sorter<MF>,
|
||||
fn insert_strings_diff(
|
||||
fid_docid_facet_strings_sorter: &mut Sorter<KeepFirst>,
|
||||
key_buffer: &mut Vec<u8>,
|
||||
mut del_strings: Vec<(String, String)>,
|
||||
mut add_strings: Vec<(String, String)>,
|
||||
) -> Result<()>
|
||||
where
|
||||
MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
|
||||
{
|
||||
) -> Result<()> {
|
||||
// We sort and dedup the normalized and original strings
|
||||
del_strings.sort_unstable();
|
||||
add_strings.sort_unstable();
|
||||
|
||||
@@ -4,8 +4,8 @@ use std::io::{self, BufReader};
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||
GrenadParameters,
|
||||
create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
@@ -30,7 +30,7 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut fid_word_count_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -45,19 +45,23 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
let deletion = del_add_reader
|
||||
// get deleted words
|
||||
.get(DelAdd::Deletion)
|
||||
// count deleted words
|
||||
.map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count())
|
||||
.map(|deletion| {
|
||||
KvReaderU16::from_slice(deletion).iter().take(MAX_COUNTED_WORDS + 1).count()
|
||||
})
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
let addition = del_add_reader
|
||||
// get added words
|
||||
.get(DelAdd::Addition)
|
||||
// count added words
|
||||
.map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count())
|
||||
.map(|addition| {
|
||||
KvReaderU16::from_slice(addition).iter().take(MAX_COUNTED_WORDS + 1).count()
|
||||
})
|
||||
// keep the count if under or equal to MAX_COUNTED_WORDS
|
||||
.filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
|
||||
|
||||
|
||||
@@ -29,22 +29,20 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
||||
|
||||
let mut cursor = obkv_documents.into_cursor()?;
|
||||
while let Some((docid_bytes, value)) = cursor.move_on_next()? {
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
// since we only need the primary key when we throw an error
|
||||
// we create this getter to lazily get it when needed
|
||||
let document_id = || -> Value {
|
||||
let reader = KvReaderDelAdd::new(obkv.get(primary_key_id).unwrap());
|
||||
let reader = KvReaderDelAdd::from_slice(obkv.get(primary_key_id).unwrap());
|
||||
let document_id =
|
||||
reader.get(DelAdd::Deletion).or(reader.get(DelAdd::Addition)).unwrap();
|
||||
serde_json::from_slice(document_id).unwrap()
|
||||
};
|
||||
|
||||
// extract old version
|
||||
let del_lat_lng =
|
||||
extract_lat_lng(&obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
|
||||
let del_lat_lng = extract_lat_lng(obkv, &settings_diff.old, DelAdd::Deletion, document_id)?;
|
||||
// extract new version
|
||||
let add_lat_lng =
|
||||
extract_lat_lng(&obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
|
||||
let add_lat_lng = extract_lat_lng(obkv, &settings_diff.new, DelAdd::Addition, document_id)?;
|
||||
|
||||
if del_lat_lng != add_lat_lng {
|
||||
let mut obkv = KvWriterDelAdd::memory();
|
||||
@@ -68,15 +66,17 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
|
||||
|
||||
/// Extract the finite floats lat and lng from two bytes slices.
|
||||
fn extract_lat_lng(
|
||||
document: &obkv::KvReader<'_, FieldId>,
|
||||
document: &obkv::KvReader<FieldId>,
|
||||
settings: &InnerIndexSettings,
|
||||
deladd: DelAdd,
|
||||
document_id: impl Fn() -> Value,
|
||||
) -> Result<Option<[f64; 2]>> {
|
||||
match settings.geo_fields_ids {
|
||||
Some((lat_fid, lng_fid)) => {
|
||||
let lat = document.get(lat_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
|
||||
let lng = document.get(lng_fid).map(KvReaderDelAdd::new).and_then(|r| r.get(deladd));
|
||||
let lat =
|
||||
document.get(lat_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd));
|
||||
let lng =
|
||||
document.get(lng_fid).map(KvReaderDelAdd::from_slice).and_then(|r| r.get(deladd));
|
||||
let (lat, lng) = match (lat, lng) {
|
||||
(Some(lat), Some(lng)) => (lat, lng),
|
||||
(Some(_), None) => {
|
||||
|
||||
@@ -313,7 +313,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||
let docid = DocumentId::from_be_bytes(docid_bytes);
|
||||
|
||||
let obkv = obkv::KvReader::new(value);
|
||||
let obkv = obkv::KvReader::from_slice(value);
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(docid_bytes.as_slice());
|
||||
|
||||
@@ -481,7 +481,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
||||
#[allow(clippy::too_many_arguments)] // feel free to find efficient way to factor arguments
|
||||
fn extract_vector_document_diff(
|
||||
docid: DocumentId,
|
||||
obkv: obkv::KvReader<'_, FieldId>,
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
prompt: &Prompt,
|
||||
(add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap),
|
||||
(old, new): (VectorState, VectorState),
|
||||
@@ -526,7 +526,7 @@ fn extract_vector_document_diff(
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
|
||||
if document_is_kept {
|
||||
@@ -562,7 +562,7 @@ fn extract_vector_document_diff(
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
if embedder_is_manual {
|
||||
@@ -588,7 +588,7 @@ fn extract_vector_document_diff(
|
||||
// Do we keep this document?
|
||||
let document_is_kept = obkv
|
||||
.iter()
|
||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||
.map(|(_, deladd)| KvReaderDelAdd::from_slice(deladd))
|
||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||
if document_is_kept {
|
||||
// if the new version of documents has the vectors in the DB,
|
||||
@@ -606,7 +606,7 @@ fn extract_vector_document_diff(
|
||||
}
|
||||
|
||||
fn regenerate_if_prompt_changed(
|
||||
obkv: obkv::KvReader<'_, FieldId>,
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
(old_prompt, new_prompt): (&Prompt, &Prompt),
|
||||
(old_fields_ids_map, new_fields_ids_map): (
|
||||
&FieldsIdsMapWithMetadata,
|
||||
@@ -624,7 +624,7 @@ fn regenerate_if_prompt_changed(
|
||||
}
|
||||
|
||||
fn regenerate_prompt(
|
||||
obkv: obkv::KvReader<'_, FieldId>,
|
||||
obkv: &obkv::KvReader<FieldId>,
|
||||
prompt: &Prompt,
|
||||
new_fields_ids_map: &FieldsIdsMapWithMetadata,
|
||||
) -> Result<VectorStateDelta> {
|
||||
|
||||
@@ -7,8 +7,8 @@ use obkv::KvReaderU16;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
|
||||
writer_into_reader, GrenadParameters,
|
||||
create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::heed_codec::StrBEU16Codec;
|
||||
@@ -16,7 +16,6 @@ use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::helpers::sorter_into_reader;
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::update::MergeFn;
|
||||
use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result};
|
||||
|
||||
/// Extracts the word and the documents ids where this word appear.
|
||||
@@ -40,7 +39,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut word_fid_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -58,17 +57,17 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
let fid = u16::from_be_bytes(fid_bytes);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (_pos, word) in KvReaderU16::new(deletion).iter() {
|
||||
for (_pos, word) in KvReaderU16::from_slice(deletion).iter() {
|
||||
del_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (_pos, word) in KvReaderU16::new(addition).iter() {
|
||||
for (_pos, word) in KvReaderU16::from_slice(addition).iter() {
|
||||
add_words.insert(word.to_vec());
|
||||
}
|
||||
}
|
||||
@@ -94,7 +93,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -103,7 +102,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut exact_word_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -115,7 +114,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
// NOTE: replacing sorters by bitmap merging is less efficient, so, use sorters.
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
// only keep the value if their is a change to apply in the DB.
|
||||
if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
|
||||
if !is_noop_del_add_obkv(KvReaderDelAdd::from_slice(value)) {
|
||||
word_fid_docids_writer.insert(key, value)?;
|
||||
}
|
||||
|
||||
@@ -123,7 +122,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
|
||||
.map_err(|_| SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
|
||||
|
||||
// merge all deletions
|
||||
let obkv = KvReaderDelAdd::new(value);
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = obkv.get(DelAdd::Deletion) {
|
||||
let delete_from_exact = settings_diff.old.exact_attributes.contains(&fid);
|
||||
buffer.clear();
|
||||
@@ -163,7 +162,7 @@ fn words_into_sorter(
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_words: &BTreeSet<Vec<u8>>,
|
||||
add_words: &BTreeSet<Vec<u8>>,
|
||||
word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
word_fid_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
@@ -6,8 +6,8 @@ use std::{cmp, io};
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
|
||||
writer_into_reader, GrenadParameters, MergeFn,
|
||||
create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
@@ -44,7 +44,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
.map(|_| {
|
||||
create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -92,8 +92,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
}
|
||||
|
||||
// deletions
|
||||
if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
|
||||
for (position, word) in KvReaderU16::new(deletion).iter() {
|
||||
if let Some(deletion) = KvReaderDelAdd::from_slice(value).get(DelAdd::Deletion) {
|
||||
for (position, word) in KvReaderU16::from_slice(deletion).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while del_word_positions.front().map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
@@ -125,8 +125,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
|
||||
}
|
||||
|
||||
// additions
|
||||
if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||
for (position, word) in KvReaderU16::new(addition).iter() {
|
||||
if let Some(addition) = KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
|
||||
for (position, word) in KvReaderU16::from_slice(addition).iter() {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while add_word_positions.front().map_or(false, |(_w, p)| {
|
||||
index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
|
||||
@@ -197,7 +197,7 @@ fn document_word_positions_into_sorter(
|
||||
document_id: DocumentId,
|
||||
del_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
add_word_pair_proximity: &BTreeMap<(String, String), u8>,
|
||||
word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>],
|
||||
word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeDeladdCboRoaringBitmaps>],
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
@@ -5,14 +5,13 @@ use std::io::{self, BufReader};
|
||||
use obkv::KvReaderU16;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
|
||||
GrenadParameters,
|
||||
create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
};
|
||||
use crate::error::SerializationError;
|
||||
use crate::index::db_name::DOCID_WORD_POSITIONS;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::update::MergeFn;
|
||||
use crate::{bucketed_position, DocumentId, Result};
|
||||
|
||||
/// Extracts the word positions and the documents ids where this word appear.
|
||||
@@ -29,7 +28,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
|
||||
let mut word_position_docids_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Unstable,
|
||||
merge_deladd_cbo_roaring_bitmaps,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
indexer.max_nb_chunks,
|
||||
@@ -60,10 +59,10 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
|
||||
current_document_id = Some(document_id);
|
||||
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
// extract all unique words to remove.
|
||||
if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
|
||||
for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
|
||||
for (position, word_bytes) in KvReaderU16::from_slice(deletion).iter() {
|
||||
let position = bucketed_position(position);
|
||||
del_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
@@ -71,7 +70,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
|
||||
|
||||
// extract all unique additional words.
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
for (position, word_bytes) in KvReaderU16::new(addition).iter() {
|
||||
for (position, word_bytes) in KvReaderU16::from_slice(addition).iter() {
|
||||
let position = bucketed_position(position);
|
||||
add_word_positions.insert((position, word_bytes.to_vec()));
|
||||
}
|
||||
@@ -100,7 +99,7 @@ fn words_position_into_sorter(
|
||||
key_buffer: &mut Vec<u8>,
|
||||
del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
|
||||
word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
|
||||
word_position_docids_sorter: &mut grenad::Sorter<MergeDeladdCboRoaringBitmaps>,
|
||||
) -> Result<()> {
|
||||
use itertools::merge_join_by;
|
||||
use itertools::EitherOrBoth::{Both, Left, Right};
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
use std::borrow::Cow;
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, BufWriter, Seek};
|
||||
|
||||
use grenad::{CompressionType, Sorter};
|
||||
use grenad::{CompressionType, MergeFunction, Sorter};
|
||||
use heed::types::Bytes;
|
||||
|
||||
use super::{ClonableMmap, MergeFn};
|
||||
use super::ClonableMmap;
|
||||
use crate::update::index_documents::valid_lmdb_key;
|
||||
use crate::Result;
|
||||
|
||||
@@ -31,14 +30,14 @@ pub fn create_writer<R: io::Write>(
|
||||
/// A helper function that creates a grenad sorter
|
||||
/// with the given parameters. The max memory is
|
||||
/// clamped to something reasonable.
|
||||
pub fn create_sorter(
|
||||
pub fn create_sorter<MF: MergeFunction>(
|
||||
sort_algorithm: grenad::SortAlgorithm,
|
||||
merge: MergeFn,
|
||||
merge: MF,
|
||||
chunk_compression_type: grenad::CompressionType,
|
||||
chunk_compression_level: Option<u32>,
|
||||
max_nb_chunks: Option<usize>,
|
||||
max_memory: Option<usize>,
|
||||
) -> grenad::Sorter<MergeFn> {
|
||||
) -> grenad::Sorter<MF> {
|
||||
let mut builder = grenad::Sorter::builder(merge);
|
||||
builder.chunk_compression_type(chunk_compression_type);
|
||||
if let Some(level) = chunk_compression_level {
|
||||
@@ -57,10 +56,14 @@ pub fn create_sorter(
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
||||
pub fn sorter_into_reader(
|
||||
sorter: grenad::Sorter<MergeFn>,
|
||||
pub fn sorter_into_reader<MF>(
|
||||
sorter: grenad::Sorter<MF>,
|
||||
indexer: GrenadParameters,
|
||||
) -> Result<grenad::Reader<BufReader<File>>> {
|
||||
) -> Result<grenad::Reader<BufReader<File>>>
|
||||
where
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut writer = create_writer(
|
||||
indexer.chunk_compression_type,
|
||||
indexer.chunk_compression_level,
|
||||
@@ -169,8 +172,8 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
|
||||
/// Write provided sorter in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::grenad")]
|
||||
pub fn write_sorter_into_database<K, V, FS, FM>(
|
||||
sorter: Sorter<MergeFn>,
|
||||
pub fn write_sorter_into_database<K, V, FS, FM, MF>(
|
||||
sorter: Sorter<MF>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut heed::RwTxn<'_>,
|
||||
index_is_empty: bool,
|
||||
@@ -180,6 +183,8 @@ pub fn write_sorter_into_database<K, V, FS, FM>(
|
||||
where
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
@@ -207,8 +212,3 @@ where
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Used when trying to merge readers, but you don't actually care about the values.
|
||||
pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(Cow::Owned(Vec::new()))
|
||||
}
|
||||
|
||||
@@ -3,6 +3,8 @@ use std::collections::BTreeSet;
|
||||
use std::io;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use either::Either;
|
||||
use grenad::MergeFunction;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
@@ -10,7 +12,8 @@ use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||
use crate::update::index_documents::transform::Operation;
|
||||
use crate::Result;
|
||||
|
||||
pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
|
||||
pub type EitherObkvMerge =
|
||||
Either<ObkvsKeepLastAdditionMergeDeletions, ObkvsMergeAdditionsAndDeletions>;
|
||||
|
||||
pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
|
||||
buffer.clear();
|
||||
@@ -18,35 +21,53 @@ pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) ->
|
||||
bitmap.serialize_into(buffer)
|
||||
}
|
||||
|
||||
pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let merged = values
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.map(RoaringBitmap::deserialize_from)
|
||||
.map(StdResult::unwrap)
|
||||
.reduce(|a, b| a | b)
|
||||
.unwrap();
|
||||
let mut buffer = Vec::new();
|
||||
serialize_roaring_bitmap(&merged, &mut buffer)?;
|
||||
Ok(Cow::Owned(buffer))
|
||||
pub struct MergeRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let merged = values
|
||||
.iter()
|
||||
.map(AsRef::as_ref)
|
||||
.map(RoaringBitmap::deserialize_from)
|
||||
.map(StdResult::unwrap)
|
||||
.reduce(|a, b| a | b)
|
||||
.unwrap();
|
||||
let mut buffer = Vec::new();
|
||||
serialize_roaring_bitmap(&merged, &mut buffer)?;
|
||||
Ok(Cow::Owned(buffer))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(values[0].clone())
|
||||
pub struct KeepFirst;
|
||||
|
||||
impl MergeFunction for KeepFirst {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(values[0].clone())
|
||||
}
|
||||
}
|
||||
|
||||
/// Only the last value associated with an id is kept.
|
||||
pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(obkvs.last().unwrap().clone())
|
||||
pub struct KeepLatestObkv;
|
||||
|
||||
impl MergeFunction for KeepLatestObkv {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
Ok(obkvs.last().unwrap().clone())
|
||||
}
|
||||
}
|
||||
|
||||
pub fn merge_two_del_add_obkvs(
|
||||
base: obkv::KvReaderU16<'_>,
|
||||
update: obkv::KvReaderU16<'_>,
|
||||
base: &obkv::KvReaderU16,
|
||||
update: &obkv::KvReaderU16,
|
||||
merge_additions: bool,
|
||||
buffer: &mut Vec<u8>,
|
||||
) {
|
||||
@@ -66,7 +87,7 @@ pub fn merge_two_del_add_obkvs(
|
||||
// If merge_additions is false, recreate an obkv keeping the deletions only.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::new(v);
|
||||
let base_reader = KvReaderDelAdd::from_slice(v);
|
||||
|
||||
if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
|
||||
value_writer.insert(DelAdd::Deletion, deletion).unwrap();
|
||||
@@ -80,8 +101,8 @@ pub fn merge_two_del_add_obkvs(
|
||||
// merge deletions and additions.
|
||||
value_buffer.clear();
|
||||
let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
|
||||
let base_reader = KvReaderDelAdd::new(base);
|
||||
let update_reader = KvReaderDelAdd::new(update);
|
||||
let base_reader = KvReaderDelAdd::from_slice(base);
|
||||
let update_reader = KvReaderDelAdd::from_slice(update);
|
||||
|
||||
// keep newest deletion.
|
||||
if let Some(deletion) = update_reader
|
||||
@@ -131,8 +152,8 @@ fn inner_merge_del_add_obkvs<'a>(
|
||||
break;
|
||||
}
|
||||
|
||||
let newest = obkv::KvReader::new(&acc);
|
||||
let oldest = obkv::KvReader::new(¤t[1..]);
|
||||
let newest = obkv::KvReader::from_slice(&acc);
|
||||
let oldest = obkv::KvReader::from_slice(¤t[1..]);
|
||||
merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
|
||||
|
||||
// we want the result of the merge into our accumulator.
|
||||
@@ -145,65 +166,79 @@ fn inner_merge_del_add_obkvs<'a>(
|
||||
}
|
||||
|
||||
/// Merge all the obkvs from the newest to the oldest.
|
||||
pub fn obkvs_merge_additions_and_deletions<'a>(
|
||||
_key: &[u8],
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, true)
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ObkvsMergeAdditionsAndDeletions;
|
||||
|
||||
impl MergeFunction for ObkvsMergeAdditionsAndDeletions {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, true)
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
|
||||
pub fn obkvs_keep_last_addition_merge_deletions<'a>(
|
||||
_key: &[u8],
|
||||
obkvs: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, false)
|
||||
#[derive(Copy, Clone)]
|
||||
pub struct ObkvsKeepLastAdditionMergeDeletions;
|
||||
|
||||
impl MergeFunction for ObkvsKeepLastAdditionMergeDeletions {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
inner_merge_del_add_obkvs(obkvs, false)
|
||||
}
|
||||
}
|
||||
|
||||
/// Do a union of all the CboRoaringBitmaps in the values.
|
||||
pub fn merge_cbo_roaring_bitmaps<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let mut vec = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
|
||||
Ok(Cow::from(vec))
|
||||
pub struct MergeCboRoaringBitmaps;
|
||||
|
||||
impl MergeFunction for MergeCboRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
let mut vec = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(values, &mut vec)?;
|
||||
Ok(Cow::from(vec))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
|
||||
/// separately and outputs a new DelAdd with both unions.
|
||||
pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_bitmaps_bytes = Vec::new();
|
||||
let mut add_bitmaps_bytes = Vec::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
|
||||
del_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
|
||||
add_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
}
|
||||
pub struct MergeDeladdCboRoaringBitmaps;
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let mut buffer = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
impl MergeFunction for MergeDeladdCboRoaringBitmaps {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_bitmaps_bytes = Vec::new();
|
||||
let mut add_bitmaps_bytes = Vec::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
|
||||
del_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
|
||||
add_bitmaps_bytes.push(bitmap_bytes);
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let mut buffer = Vec::new();
|
||||
CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -217,7 +252,7 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> Result<Option<&'a [u8]>> {
|
||||
Ok(CboRoaringBitmapCodec::merge_deladd_into(
|
||||
KvReaderDelAdd::new(deladd_obkv),
|
||||
KvReaderDelAdd::from_slice(deladd_obkv),
|
||||
previous,
|
||||
buffer,
|
||||
)?)
|
||||
@@ -225,37 +260,55 @@ pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
|
||||
|
||||
/// Do a union of BtreeSet on both sides of a DelAdd obkv
|
||||
/// separately and outputs a new DelAdd with both unions.
|
||||
pub fn merge_deladd_btreeset_string<'a>(
|
||||
_key: &[u8],
|
||||
values: &[Cow<'a, [u8]>],
|
||||
) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_set = BTreeSet::new();
|
||||
let mut add_set = BTreeSet::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::new(value);
|
||||
if let Some(bytes) = obkv.get(DelAdd::Deletion) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
del_set.insert(value);
|
||||
}
|
||||
}
|
||||
if let Some(bytes) = obkv.get(DelAdd::Addition) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
add_set.insert(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
pub struct MergeDeladdBtreesetString;
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let del = serde_json::to_vec(&del_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
|
||||
let add = serde_json::to_vec(&add_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &add)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
impl MergeFunction for MergeDeladdBtreesetString {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(&self, _key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
|
||||
if values.len() == 1 {
|
||||
Ok(values[0].clone())
|
||||
} else {
|
||||
// Retrieve the bitmaps from both sides
|
||||
let mut del_set = BTreeSet::new();
|
||||
let mut add_set = BTreeSet::new();
|
||||
for value in values {
|
||||
let obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(bytes) = obkv.get(DelAdd::Deletion) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
del_set.insert(value);
|
||||
}
|
||||
}
|
||||
if let Some(bytes) = obkv.get(DelAdd::Addition) {
|
||||
let set = serde_json::from_slice::<BTreeSet<String>>(bytes).unwrap();
|
||||
for value in set {
|
||||
add_set.insert(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut output_deladd_obkv = KvWriterDelAdd::memory();
|
||||
let del = serde_json::to_vec(&del_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Deletion, &del)?;
|
||||
let add = serde_json::to_vec(&add_set).unwrap();
|
||||
output_deladd_obkv.insert(DelAdd::Addition, &add)?;
|
||||
output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Used when trying to merge readers, but you don't actually care about the values.
|
||||
pub struct MergeIgnoreValues;
|
||||
|
||||
impl MergeFunction for MergeIgnoreValues {
|
||||
type Error = crate::Error;
|
||||
|
||||
fn merge<'a>(
|
||||
&self,
|
||||
_key: &[u8],
|
||||
_values: &[Cow<'a, [u8]>],
|
||||
) -> std::result::Result<Cow<'a, [u8]>, Self::Error> {
|
||||
Ok(Cow::Owned(Vec::new()))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,17 +7,8 @@ use std::convert::{TryFrom, TryInto};
|
||||
|
||||
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
pub use grenad_helpers::{
|
||||
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
||||
merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
|
||||
GrenadParameters,
|
||||
};
|
||||
pub use merge_functions::{
|
||||
keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_deladd_btreeset_string,
|
||||
merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
|
||||
merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
|
||||
obkvs_merge_additions_and_deletions, MergeFn,
|
||||
};
|
||||
pub use grenad_helpers::*;
|
||||
pub use merge_functions::*;
|
||||
|
||||
use crate::MAX_WORD_LENGTH;
|
||||
|
||||
|
||||
@@ -27,13 +27,7 @@ use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk};
|
||||
|
||||
use self::enrich::enrich_documents_batch;
|
||||
pub use self::enrich::{extract_finite_float_from_value, DocumentId};
|
||||
pub use self::helpers::{
|
||||
as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
|
||||
fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
|
||||
valid_lmdb_key, write_sorter_into_database, writer_into_reader, MergeFn,
|
||||
};
|
||||
use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
|
||||
pub use self::helpers::*;
|
||||
pub use self::transform::{Transform, TransformOutput};
|
||||
use crate::documents::{obkv_to_object, DocumentsBatchBuilder, DocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
@@ -605,7 +599,7 @@ where
|
||||
let cloneable_chunk =
|
||||
unsafe { as_cloneable_grenad(&word_docids_reader)? };
|
||||
let word_docids = word_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
|
||||
MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
|
||||
});
|
||||
word_docids.push(cloneable_chunk.into_cursor()?);
|
||||
let cloneable_chunk =
|
||||
@@ -613,14 +607,14 @@ where
|
||||
let exact_word_docids =
|
||||
exact_word_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(
|
||||
merge_deladd_cbo_roaring_bitmaps as MergeFn,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
)
|
||||
});
|
||||
exact_word_docids.push(cloneable_chunk.into_cursor()?);
|
||||
let cloneable_chunk =
|
||||
unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
|
||||
let word_fid_docids = word_fid_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn)
|
||||
MergerBuilder::new(MergeDeladdCboRoaringBitmaps)
|
||||
});
|
||||
word_fid_docids.push(cloneable_chunk.into_cursor()?);
|
||||
TypedChunk::WordDocids {
|
||||
@@ -634,7 +628,7 @@ where
|
||||
let word_position_docids =
|
||||
word_position_docids.get_or_insert_with(|| {
|
||||
MergerBuilder::new(
|
||||
merge_deladd_cbo_roaring_bitmaps as MergeFn,
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
)
|
||||
});
|
||||
word_position_docids.push(cloneable_chunk.into_cursor()?);
|
||||
@@ -689,8 +683,9 @@ where
|
||||
key: None,
|
||||
},
|
||||
)?;
|
||||
let first_id = crate::vector::arroy_db_range_for_embedder(index).next().unwrap();
|
||||
let reader =
|
||||
ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized);
|
||||
ArroyWrapper::new(self.index.vector_arroy, first_id, action.was_quantized);
|
||||
let dim = reader.dimensions(self.wtxn)?;
|
||||
dimension.insert(name.to_string(), dim);
|
||||
}
|
||||
@@ -699,7 +694,6 @@ where
|
||||
for (embedder_name, dimension) in dimension {
|
||||
let wtxn = &mut *self.wtxn;
|
||||
let vector_arroy = self.index.vector_arroy;
|
||||
let cancel = &self.should_abort;
|
||||
|
||||
let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
||||
@@ -713,8 +707,17 @@ where
|
||||
let is_quantizing = embedder_config.map_or(false, |action| action.is_being_quantized);
|
||||
|
||||
pool.install(|| {
|
||||
let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized);
|
||||
writer.build_and_quantize(wtxn, &mut rng, dimension, is_quantizing, cancel)?;
|
||||
for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
|
||||
let mut writer = ArroyWrapper::new(vector_arroy, k, was_quantized);
|
||||
if is_quantizing {
|
||||
writer.quantize(wtxn, k, dimension)?;
|
||||
}
|
||||
if writer.need_build(wtxn, dimension)? {
|
||||
writer.build(wtxn, &mut rng, dimension)?;
|
||||
} else if writer.is_empty(wtxn, dimension)? {
|
||||
break;
|
||||
}
|
||||
}
|
||||
Result::Ok(())
|
||||
})
|
||||
.map_err(InternalError::from)??;
|
||||
@@ -738,10 +741,10 @@ where
|
||||
)]
|
||||
pub fn execute_prefix_databases(
|
||||
self,
|
||||
word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
exact_word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
word_position_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
word_fid_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
|
||||
word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
|
||||
exact_word_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
|
||||
word_position_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
|
||||
word_fid_docids: Option<Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>>,
|
||||
) -> Result<()>
|
||||
where
|
||||
FP: Fn(UpdateIndexingStep) + Sync,
|
||||
@@ -921,7 +924,7 @@ where
|
||||
)]
|
||||
fn execute_word_prefix_docids(
|
||||
txn: &mut heed::RwTxn<'_>,
|
||||
merger: Merger<CursorClonableMmap, MergeFn>,
|
||||
merger: Merger<CursorClonableMmap, MergeDeladdCboRoaringBitmaps>,
|
||||
word_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||
word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
|
||||
indexer_config: &IndexerConfig,
|
||||
|
||||
@@ -31,14 +31,14 @@ impl<'t> ImmutableObkvs<'t> {
|
||||
}
|
||||
|
||||
/// Returns the OBKVs identified by the given ID.
|
||||
pub fn obkv(&self, docid: DocumentId) -> heed::Result<Option<KvReaderU16<'t>>> {
|
||||
pub fn obkv(&self, docid: DocumentId) -> heed::Result<Option<&'t KvReaderU16>> {
|
||||
match self
|
||||
.ids
|
||||
.rank(docid)
|
||||
.checked_sub(1)
|
||||
.and_then(|offset| self.slices.get(offset as usize))
|
||||
{
|
||||
Some(bytes) => Ok(Some(KvReaderU16::new(bytes))),
|
||||
Some(&bytes) => Ok(Some(bytes.into())),
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::collections::{BTreeMap, HashMap, HashSet};
|
||||
use std::fs::File;
|
||||
use std::io::{Read, Seek};
|
||||
|
||||
use either::Either;
|
||||
use fxhash::FxHashMap;
|
||||
use itertools::Itertools;
|
||||
use obkv::{KvReader, KvReaderU16, KvWriter};
|
||||
@@ -13,10 +14,10 @@ use serde_json::Value;
|
||||
use smartstring::SmartString;
|
||||
|
||||
use super::helpers::{
|
||||
create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions,
|
||||
obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn,
|
||||
create_sorter, create_writer, sorter_into_reader, EitherObkvMerge,
|
||||
ObkvsKeepLastAdditionMergeDeletions, ObkvsMergeAdditionsAndDeletions,
|
||||
};
|
||||
use super::{IndexDocumentsMethod, IndexerConfig};
|
||||
use super::{IndexDocumentsMethod, IndexerConfig, KeepFirst};
|
||||
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
|
||||
use crate::error::{Error, InternalError, UserError};
|
||||
use crate::index::{db_name, main_key};
|
||||
@@ -26,7 +27,7 @@ use crate::update::del_add::{
|
||||
};
|
||||
use crate::update::index_documents::GrenadParameters;
|
||||
use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff};
|
||||
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
|
||||
use crate::update::{AvailableIds, UpdateIndexingStep};
|
||||
use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
|
||||
use crate::vector::settings::WriteBackToDocuments;
|
||||
use crate::vector::ArroyWrapper;
|
||||
@@ -55,13 +56,13 @@ pub struct Transform<'a, 'i> {
|
||||
|
||||
indexer_settings: &'a IndexerConfig,
|
||||
pub index_documents_method: IndexDocumentsMethod,
|
||||
available_documents_ids: AvailableDocumentsIds,
|
||||
available_documents_ids: AvailableIds,
|
||||
|
||||
// Both grenad follows the same format:
|
||||
// key | value
|
||||
// u32 | 1 byte for the Operation byte, the rest is the obkv of the document stored
|
||||
original_sorter: grenad::Sorter<MergeFn>,
|
||||
flattened_sorter: grenad::Sorter<MergeFn>,
|
||||
original_sorter: grenad::Sorter<EitherObkvMerge>,
|
||||
flattened_sorter: grenad::Sorter<EitherObkvMerge>,
|
||||
|
||||
replaced_documents_ids: RoaringBitmap,
|
||||
new_documents_ids: RoaringBitmap,
|
||||
@@ -109,17 +110,19 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
index_documents_method: IndexDocumentsMethod,
|
||||
_autogenerate_docids: bool,
|
||||
) -> Result<Self> {
|
||||
use IndexDocumentsMethod::{ReplaceDocuments, UpdateDocuments};
|
||||
|
||||
// We must choose the appropriate merge function for when two or more documents
|
||||
// with the same user id must be merged or fully replaced in the same batch.
|
||||
let merge_function = match index_documents_method {
|
||||
IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions,
|
||||
IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions,
|
||||
ReplaceDocuments => Either::Left(ObkvsKeepLastAdditionMergeDeletions),
|
||||
UpdateDocuments => Either::Right(ObkvsMergeAdditionsAndDeletions),
|
||||
};
|
||||
|
||||
// We initialize the sorter with the user indexing settings.
|
||||
let original_sorter = create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
merge_function,
|
||||
merge_function.clone(),
|
||||
indexer_settings.chunk_compression_type,
|
||||
indexer_settings.chunk_compression_level,
|
||||
indexer_settings.max_nb_chunks,
|
||||
@@ -141,7 +144,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
index,
|
||||
fields_ids_map: index.fields_ids_map(wtxn)?,
|
||||
indexer_settings,
|
||||
available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
|
||||
available_documents_ids: AvailableIds::new(&documents_ids),
|
||||
original_sorter,
|
||||
flattened_sorter,
|
||||
index_documents_method,
|
||||
@@ -279,21 +282,21 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(base_obkv),
|
||||
KvReaderU16::from_slice(base_obkv),
|
||||
deladd_operation,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
self.original_sorter
|
||||
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
let base_obkv = KvReader::new(base_obkv);
|
||||
let base_obkv = KvReader::from_slice(base_obkv);
|
||||
if let Some(flattened_obkv) =
|
||||
Self::flatten_from_fields_ids_map(&base_obkv, &mut self.fields_ids_map)?
|
||||
Self::flatten_from_fields_ids_map(base_obkv, &mut self.fields_ids_map)?
|
||||
{
|
||||
// we recreate our buffer with the flattened documents
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&flattened_obkv),
|
||||
KvReaderU16::from_slice(&flattened_obkv),
|
||||
deladd_operation,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
@@ -312,7 +315,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&obkv_buffer),
|
||||
KvReaderU16::from_slice(&obkv_buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?;
|
||||
@@ -320,14 +323,14 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
self.original_sorter
|
||||
.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
|
||||
let flattened_obkv = KvReader::new(&obkv_buffer);
|
||||
let flattened_obkv = KvReader::from_slice(&obkv_buffer);
|
||||
if let Some(obkv) =
|
||||
Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)?
|
||||
Self::flatten_from_fields_ids_map(flattened_obkv, &mut self.fields_ids_map)?
|
||||
{
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&obkv),
|
||||
KvReaderU16::from_slice(&obkv),
|
||||
DelAddOperation::Addition,
|
||||
&mut document_sorter_value_buffer,
|
||||
)?
|
||||
@@ -520,22 +523,22 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Deletion as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(base_obkv),
|
||||
KvReaderU16::from_slice(base_obkv),
|
||||
DelAddOperation::Deletion,
|
||||
document_sorter_value_buffer,
|
||||
)?;
|
||||
self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
|
||||
|
||||
// flatten it and push it as to delete in the flattened_sorter
|
||||
let flattened_obkv = KvReader::new(base_obkv);
|
||||
let flattened_obkv = KvReader::from_slice(base_obkv);
|
||||
if let Some(obkv) =
|
||||
Self::flatten_from_fields_ids_map(&flattened_obkv, &mut self.fields_ids_map)?
|
||||
Self::flatten_from_fields_ids_map(flattened_obkv, &mut self.fields_ids_map)?
|
||||
{
|
||||
// we recreate our buffer with the flattened documents
|
||||
document_sorter_value_buffer.clear();
|
||||
document_sorter_value_buffer.push(Operation::Deletion as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&obkv),
|
||||
KvReaderU16::from_slice(&obkv),
|
||||
DelAddOperation::Deletion,
|
||||
document_sorter_value_buffer,
|
||||
)?;
|
||||
@@ -553,7 +556,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
target = "indexing::transform"
|
||||
)]
|
||||
fn flatten_from_fields_ids_map(
|
||||
obkv: &KvReader<'_, FieldId>,
|
||||
obkv: &KvReader<FieldId>,
|
||||
fields_ids_map: &mut FieldsIdsMap,
|
||||
) -> Result<Option<Vec<u8>>> {
|
||||
if obkv
|
||||
@@ -721,10 +724,10 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
total_documents: self.documents_count,
|
||||
});
|
||||
|
||||
for (key, value) in KvReader::new(val) {
|
||||
let reader = KvReaderDelAdd::new(value);
|
||||
for (key, value) in KvReader::from_slice(val) {
|
||||
let reader = KvReaderDelAdd::from_slice(value);
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
(None, None) => (),
|
||||
(None, Some(_)) => {
|
||||
// New field
|
||||
let name = self.fields_ids_map.name(key).ok_or(
|
||||
@@ -838,7 +841,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
/// then fill the provided buffers with delta documents using KvWritterDelAdd.
|
||||
#[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo
|
||||
fn rebind_existing_document(
|
||||
old_obkv: KvReader<'_, FieldId>,
|
||||
old_obkv: &KvReader<FieldId>,
|
||||
settings_diff: &InnerIndexSettingsDiff,
|
||||
modified_faceted_fields: &HashSet<String>,
|
||||
mut injected_vectors: serde_json::Map<String, serde_json::Value>,
|
||||
@@ -926,7 +929,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
}
|
||||
|
||||
let data = obkv_writer.into_inner()?;
|
||||
let obkv = KvReader::<FieldId>::new(&data);
|
||||
let obkv = KvReader::<FieldId>::from_slice(&data);
|
||||
|
||||
if let Some(original_obkv_buffer) = original_obkv_buffer {
|
||||
original_obkv_buffer.clear();
|
||||
@@ -936,8 +939,8 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
if let Some(flattened_obkv_buffer) = flattened_obkv_buffer {
|
||||
// take the non-flattened version if flatten_from_fields_ids_map returns None.
|
||||
let mut fields_ids_map = settings_diff.new.fields_ids_map.clone();
|
||||
let flattened = Self::flatten_from_fields_ids_map(&obkv, &mut fields_ids_map)?;
|
||||
let flattened = flattened.as_deref().map_or(obkv, KvReader::new);
|
||||
let flattened = Self::flatten_from_fields_ids_map(obkv, &mut fields_ids_map)?;
|
||||
let flattened = flattened.as_deref().map_or(obkv, KvReader::from_slice);
|
||||
|
||||
flattened_obkv_buffer.clear();
|
||||
into_del_add_obkv_conditional_operation(flattened, flattened_obkv_buffer, |id| {
|
||||
@@ -980,7 +983,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
let mut original_sorter = if settings_diff.reindex_vectors() {
|
||||
Some(create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
KeepFirst,
|
||||
self.indexer_settings.chunk_compression_type,
|
||||
self.indexer_settings.chunk_compression_level,
|
||||
self.indexer_settings.max_nb_chunks,
|
||||
@@ -990,24 +993,27 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
None
|
||||
};
|
||||
|
||||
let readers: BTreeMap<&str, (ArroyWrapper, &RoaringBitmap)> = settings_diff
|
||||
let readers: Result<BTreeMap<&str, (Vec<ArroyWrapper>, &RoaringBitmap)>> = settings_diff
|
||||
.embedding_config_updates
|
||||
.iter()
|
||||
.filter_map(|(name, action)| {
|
||||
if let Some(WriteBackToDocuments { embedder_id, user_provided }) =
|
||||
action.write_back()
|
||||
{
|
||||
let reader = ArroyWrapper::new(
|
||||
self.index.vector_arroy,
|
||||
*embedder_id,
|
||||
action.was_quantized,
|
||||
);
|
||||
Some((name.as_str(), (reader, user_provided)))
|
||||
let readers: Result<Vec<_>> = self
|
||||
.index
|
||||
.arroy_readers(wtxn, *embedder_id, action.was_quantized)
|
||||
.collect();
|
||||
match readers {
|
||||
Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))),
|
||||
Err(error) => Some(Err(error)),
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let readers = readers?;
|
||||
|
||||
let old_vectors_fid = settings_diff
|
||||
.old
|
||||
@@ -1019,7 +1025,7 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
if settings_diff.reindex_searchable() || settings_diff.reindex_facets() {
|
||||
Some(create_sorter(
|
||||
grenad::SortAlgorithm::Stable,
|
||||
keep_first,
|
||||
KeepFirst,
|
||||
self.indexer_settings.chunk_compression_type,
|
||||
self.indexer_settings.chunk_compression_level,
|
||||
self.indexer_settings.max_nb_chunks,
|
||||
@@ -1045,24 +1051,34 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
arroy::Error,
|
||||
> = readers
|
||||
.iter()
|
||||
.filter_map(|(name, (reader, user_provided))| {
|
||||
.filter_map(|(name, (readers, user_provided))| {
|
||||
if !user_provided.contains(docid) {
|
||||
return None;
|
||||
}
|
||||
match reader.item_vectors(wtxn, docid) {
|
||||
Ok(vectors) if vectors.is_empty() => None,
|
||||
Ok(vectors) => Some(Ok((
|
||||
name.to_string(),
|
||||
serde_json::to_value(ExplicitVectors {
|
||||
embeddings: Some(
|
||||
VectorOrArrayOfVectors::from_array_of_vectors(vectors),
|
||||
),
|
||||
regenerate: false,
|
||||
})
|
||||
.unwrap(),
|
||||
))),
|
||||
Err(e) => Some(Err(e)),
|
||||
let mut vectors = Vec::new();
|
||||
for reader in readers {
|
||||
let Some(vector) = reader.item_vector(wtxn, docid).transpose() else {
|
||||
break;
|
||||
};
|
||||
|
||||
match vector {
|
||||
Ok(vector) => vectors.push(vector),
|
||||
Err(error) => return Some(Err(error)),
|
||||
}
|
||||
}
|
||||
if vectors.is_empty() {
|
||||
return None;
|
||||
}
|
||||
Some(Ok((
|
||||
name.to_string(),
|
||||
serde_json::to_value(ExplicitVectors {
|
||||
embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
|
||||
vectors,
|
||||
)),
|
||||
regenerate: false,
|
||||
})
|
||||
.unwrap(),
|
||||
)))
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -1091,9 +1107,11 @@ impl<'a, 'i> Transform<'a, 'i> {
|
||||
}
|
||||
|
||||
// delete all vectors from the embedders that need removal
|
||||
for (_, (reader, _)) in readers {
|
||||
let dimensions = reader.dimensions(wtxn)?;
|
||||
reader.clear(wtxn, dimensions)?;
|
||||
for (_, (readers, _)) in readers {
|
||||
for reader in readers {
|
||||
let dimensions = reader.dimensions(wtxn)?;
|
||||
reader.clear(wtxn, dimensions)?;
|
||||
}
|
||||
}
|
||||
|
||||
let grenad_params = GrenadParameters {
|
||||
@@ -1137,6 +1155,8 @@ fn drop_and_reuse<U, T>(mut vec: Vec<U>) -> Vec<T> {
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use grenad::MergeFunction;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
@@ -1148,21 +1168,21 @@ mod test {
|
||||
kv_writer.insert(0_u8, [0]).unwrap();
|
||||
let buffer = kv_writer.into_inner().unwrap();
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
KvReaderU16::from_slice(&buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut additive_doc_0,
|
||||
)
|
||||
.unwrap();
|
||||
additive_doc_0.insert(0, Operation::Addition as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
KvReaderU16::from_slice(&buffer),
|
||||
DelAddOperation::Deletion,
|
||||
&mut deletive_doc_0,
|
||||
)
|
||||
.unwrap();
|
||||
deletive_doc_0.insert(0, Operation::Deletion as u8);
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
KvReaderU16::from_slice(&buffer),
|
||||
DelAddOperation::DeletionAndAddition,
|
||||
&mut del_add_doc_0,
|
||||
)
|
||||
@@ -1174,7 +1194,7 @@ mod test {
|
||||
kv_writer.insert(1_u8, [1]).unwrap();
|
||||
let buffer = kv_writer.into_inner().unwrap();
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
KvReaderU16::from_slice(&buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut additive_doc_1,
|
||||
)
|
||||
@@ -1187,32 +1207,39 @@ mod test {
|
||||
kv_writer.insert(1_u8, [1]).unwrap();
|
||||
let buffer = kv_writer.into_inner().unwrap();
|
||||
into_del_add_obkv(
|
||||
KvReaderU16::new(&buffer),
|
||||
KvReaderU16::from_slice(&buffer),
|
||||
DelAddOperation::Addition,
|
||||
&mut additive_doc_0_1,
|
||||
)
|
||||
.unwrap();
|
||||
additive_doc_0_1.insert(0, Operation::Addition as u8);
|
||||
|
||||
let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])
|
||||
.unwrap();
|
||||
let ret = MergeFunction::merge(
|
||||
&ObkvsMergeAdditionsAndDeletions,
|
||||
&[],
|
||||
&[Cow::from(additive_doc_0.as_slice())],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, additive_doc_0);
|
||||
|
||||
let ret = obkvs_merge_additions_and_deletions(
|
||||
let ret = MergeFunction::merge(
|
||||
&ObkvsMergeAdditionsAndDeletions,
|
||||
&[],
|
||||
&[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, del_add_doc_0);
|
||||
|
||||
let ret = obkvs_merge_additions_and_deletions(
|
||||
let ret = MergeFunction::merge(
|
||||
&ObkvsMergeAdditionsAndDeletions,
|
||||
&[],
|
||||
&[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, deletive_doc_0);
|
||||
|
||||
let ret = obkvs_merge_additions_and_deletions(
|
||||
let ret = MergeFunction::merge(
|
||||
&ObkvsMergeAdditionsAndDeletions,
|
||||
&[],
|
||||
&[
|
||||
Cow::from(additive_doc_1.as_slice()),
|
||||
@@ -1223,21 +1250,24 @@ mod test {
|
||||
.unwrap();
|
||||
assert_eq!(*ret, del_add_doc_0);
|
||||
|
||||
let ret = obkvs_merge_additions_and_deletions(
|
||||
let ret = MergeFunction::merge(
|
||||
&ObkvsMergeAdditionsAndDeletions,
|
||||
&[],
|
||||
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, additive_doc_0_1);
|
||||
|
||||
let ret = obkvs_keep_last_addition_merge_deletions(
|
||||
let ret = MergeFunction::merge(
|
||||
&ObkvsKeepLastAdditionMergeDeletions,
|
||||
&[],
|
||||
&[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
|
||||
)
|
||||
.unwrap();
|
||||
assert_eq!(*ret, additive_doc_0);
|
||||
|
||||
let ret = obkvs_keep_last_addition_merge_deletions(
|
||||
let ret = MergeFunction::merge(
|
||||
&ObkvsKeepLastAdditionMergeDeletions,
|
||||
&[],
|
||||
&[
|
||||
Cow::from(deletive_doc_0.as_slice()),
|
||||
|
||||
@@ -4,18 +4,17 @@ use std::fs::File;
|
||||
use std::io::{self, BufReader};
|
||||
|
||||
use bytemuck::allocation::pod_collect_to_vec;
|
||||
use grenad::{Merger, MergerBuilder};
|
||||
use grenad::{MergeFunction, Merger, MergerBuilder};
|
||||
use heed::types::Bytes;
|
||||
use heed::{BytesDecode, RwTxn};
|
||||
use obkv::{KvReader, KvWriter};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::helpers::{
|
||||
self, keep_first, merge_deladd_btreeset_string, merge_deladd_cbo_roaring_bitmaps,
|
||||
merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, valid_lmdb_key,
|
||||
CursorClonableMmap,
|
||||
self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
|
||||
CursorClonableMmap, KeepFirst, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps,
|
||||
MergeIgnoreValues,
|
||||
};
|
||||
use super::MergeFn;
|
||||
use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
|
||||
use crate::facet::FacetType;
|
||||
use crate::index::db_name::DOCUMENTS;
|
||||
@@ -24,7 +23,7 @@ use crate::proximity::MAX_DISTANCE;
|
||||
use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
|
||||
use crate::update::facet::FacetsUpdate;
|
||||
use crate::update::index_documents::helpers::{
|
||||
as_cloneable_grenad, keep_latest_obkv, try_split_array_at,
|
||||
as_cloneable_grenad, try_split_array_at, KeepLatestObkv,
|
||||
};
|
||||
use crate::update::settings::InnerIndexSettingsDiff;
|
||||
use crate::vector::ArroyWrapper;
|
||||
@@ -141,7 +140,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let vectors_fid =
|
||||
fields_ids_map.id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME);
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_latest_obkv as MergeFn);
|
||||
let mut builder = MergerBuilder::new(KeepLatestObkv);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::Documents(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
@@ -163,7 +162,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let mut vectors_buffer = Vec::new();
|
||||
while let Some((key, reader)) = iter.next()? {
|
||||
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||
let reader: KvReader<'_, FieldId> = KvReader::new(reader);
|
||||
let reader: &KvReader<FieldId> = reader.into();
|
||||
|
||||
let (document_id_bytes, external_id_bytes) = try_split_array_at(key)
|
||||
.ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?;
|
||||
@@ -171,7 +170,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let external_id = std::str::from_utf8(external_id_bytes)?;
|
||||
|
||||
for (field_id, value) in reader.iter() {
|
||||
let del_add_reader = KvReaderDelAdd::new(value);
|
||||
let del_add_reader = KvReaderDelAdd::from_slice(value);
|
||||
|
||||
if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
|
||||
let addition = if vectors_fid == Some(field_id) {
|
||||
@@ -235,7 +234,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_word_count_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdWordCountDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
@@ -258,13 +257,10 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut word_docids_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut exact_word_docids_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut word_fid_docids_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut fst_merger_builder = MergerBuilder::new(merge_ignore_values as MergeFn);
|
||||
let mut word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut exact_word_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut word_fid_docids_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut fst_merger_builder = MergerBuilder::new(MergeIgnoreValues);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordDocids {
|
||||
word_docids_reader,
|
||||
@@ -329,7 +325,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "word_position_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordPositionDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
@@ -353,7 +349,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
tracing::trace_span!(target: "indexing::write_db","field_id_facet_number_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut data_size = 0;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids) = typed_chunk
|
||||
@@ -375,10 +371,9 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_string_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut facet_id_string_builder =
|
||||
MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut facet_id_string_builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
let mut normalized_facet_id_string_builder =
|
||||
MergerBuilder::new(merge_deladd_btreeset_string as MergeFn);
|
||||
MergerBuilder::new(MergeDeladdBtreesetString);
|
||||
let mut data_size = 0;
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetStringDocids((
|
||||
@@ -412,7 +407,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_exists_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetExistsDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
@@ -436,7 +431,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_null_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetIsNullDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
@@ -459,7 +454,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "field_id_facet_is_empty_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdFacetIsEmptyDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
@@ -483,7 +478,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
tracing::trace_span!(target: "indexing::write_db", "word_pair_proximity_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(merge_deladd_cbo_roaring_bitmaps as MergeFn);
|
||||
let mut builder = MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::WordPairProximityDocids(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
@@ -516,7 +511,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_numbers");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut builder = MergerBuilder::new(KeepFirst);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdDocidFacetNumbers(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
@@ -530,7 +525,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
index.field_id_docid_facet_f64s.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let reader = KvReaderDelAdd::new(value);
|
||||
let reader = KvReaderDelAdd::from_slice(value);
|
||||
if valid_lmdb_key(key) {
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
@@ -550,7 +545,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
tracing::trace_span!(target: "indexing::write_db", "field_id_docid_facet_strings");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut builder = MergerBuilder::new(KeepFirst);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::FieldIdDocidFacetStrings(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
@@ -564,7 +559,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
index.field_id_docid_facet_strings.remap_types::<Bytes, Bytes>();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
let reader = KvReaderDelAdd::new(value);
|
||||
let reader = KvReaderDelAdd::from_slice(value);
|
||||
if valid_lmdb_key(key) {
|
||||
match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
|
||||
(None, None) => {}
|
||||
@@ -583,7 +578,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "geo_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut builder = MergerBuilder::new(KeepFirst);
|
||||
for typed_chunk in typed_chunks {
|
||||
let TypedChunk::GeoPoints(chunk) = typed_chunk else {
|
||||
unreachable!();
|
||||
@@ -601,7 +596,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
// convert the key back to a u32 (4 bytes)
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
|
||||
let deladd_obkv = KvReaderDelAdd::new(value);
|
||||
let deladd_obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = deladd_obkv.get(DelAdd::Deletion) {
|
||||
let geopoint = extract_geo_point(value, docid);
|
||||
rtree.remove(&geopoint);
|
||||
@@ -620,9 +615,9 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let span = tracing::trace_span!(target: "indexing::write_db", "vector_points");
|
||||
let _entered = span.enter();
|
||||
|
||||
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||
let mut remove_vectors_builder = MergerBuilder::new(KeepFirst);
|
||||
let mut manual_vectors_builder = MergerBuilder::new(KeepFirst);
|
||||
let mut embeddings_builder = MergerBuilder::new(KeepFirst);
|
||||
let mut add_to_user_provided = RoaringBitmap::new();
|
||||
let mut remove_from_user_provided = RoaringBitmap::new();
|
||||
let mut params = None;
|
||||
@@ -673,14 +668,22 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
.get(&embedder_name)
|
||||
.map_or(false, |conf| conf.2);
|
||||
// FIXME: allow customizing distance
|
||||
let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized);
|
||||
let writers: Vec<_> = crate::vector::arroy_db_range_for_embedder(embedder_index)
|
||||
.map(|k| ArroyWrapper::new(index.vector_arroy, k, binary_quantized))
|
||||
.collect();
|
||||
|
||||
// remove vectors for docids we want them removed
|
||||
let merger = remove_vectors_builder.build();
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, _)) = iter.next()? {
|
||||
let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
|
||||
writer.del_items(wtxn, expected_dimension, docid)?;
|
||||
|
||||
for writer in &writers {
|
||||
// Uses invariant: vectors are packed in the first writers.
|
||||
if !writer.del_item(wtxn, expected_dimension, docid)? {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// add generated embeddings
|
||||
@@ -708,7 +711,9 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
embeddings.embedding_count(),
|
||||
)));
|
||||
}
|
||||
writer.add_items(wtxn, docid, &embeddings)?;
|
||||
for (embedding, writer) in embeddings.iter().zip(&writers) {
|
||||
writer.add_item(wtxn, expected_dimension, docid, embedding)?;
|
||||
}
|
||||
}
|
||||
|
||||
// perform the manual diff
|
||||
@@ -719,18 +724,55 @@ pub(crate) fn write_typed_chunk_into_index(
|
||||
let (left, _index) = try_split_array_at(key).unwrap();
|
||||
let docid = DocumentId::from_be_bytes(left);
|
||||
|
||||
let vector_deladd_obkv = KvReaderDelAdd::new(value);
|
||||
let vector_deladd_obkv = KvReaderDelAdd::from_slice(value);
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
|
||||
let vector: Vec<f32> = pod_collect_to_vec(value);
|
||||
|
||||
writer.del_item(wtxn, docid, &vector)?;
|
||||
let mut deleted_index = None;
|
||||
for (index, writer) in writers.iter().enumerate() {
|
||||
let Some(candidate) = writer.item_vector(wtxn, docid)? else {
|
||||
// uses invariant: vectors are packed in the first writers.
|
||||
break;
|
||||
};
|
||||
if candidate == vector {
|
||||
writer.del_item(wtxn, expected_dimension, docid)?;
|
||||
deleted_index = Some(index);
|
||||
}
|
||||
}
|
||||
|
||||
// 🥲 enforce invariant: vectors are packed in the first writers.
|
||||
if let Some(deleted_index) = deleted_index {
|
||||
let mut last_index_with_a_vector = None;
|
||||
for (index, writer) in writers.iter().enumerate().skip(deleted_index) {
|
||||
let Some(candidate) = writer.item_vector(wtxn, docid)? else {
|
||||
break;
|
||||
};
|
||||
last_index_with_a_vector = Some((index, candidate));
|
||||
}
|
||||
if let Some((last_index, vector)) = last_index_with_a_vector {
|
||||
// unwrap: computed the index from the list of writers
|
||||
let writer = writers.get(last_index).unwrap();
|
||||
writer.del_item(wtxn, expected_dimension, docid)?;
|
||||
writers.get(deleted_index).unwrap().add_item(
|
||||
wtxn,
|
||||
expected_dimension,
|
||||
docid,
|
||||
&vector,
|
||||
)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
|
||||
let vector = pod_collect_to_vec(value);
|
||||
|
||||
// overflow was detected during vector extraction.
|
||||
writer.add_item(wtxn, docid, &vector)?;
|
||||
for writer in &writers {
|
||||
if !writer.contains_item(wtxn, expected_dimension, docid)? {
|
||||
writer.add_item(wtxn, expected_dimension, docid, &vector)?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -750,9 +792,13 @@ fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
|
||||
GeoPoint::new(xyz_point, (docid, point))
|
||||
}
|
||||
|
||||
fn merge_word_docids_reader_into_fst(
|
||||
merger: Merger<CursorClonableMmap, MergeFn>,
|
||||
) -> Result<fst::Set<Vec<u8>>> {
|
||||
fn merge_word_docids_reader_into_fst<MF>(
|
||||
merger: Merger<CursorClonableMmap, MF>,
|
||||
) -> Result<fst::Set<Vec<u8>>>
|
||||
where
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
let mut builder = fst::SetBuilder::memory();
|
||||
|
||||
@@ -766,8 +812,8 @@ fn merge_word_docids_reader_into_fst(
|
||||
/// Write provided entries in database using serialize_value function.
|
||||
/// merge_values function is used if an entry already exist in the database.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn write_entries_into_database<R, K, V, FS, FM>(
|
||||
merger: Merger<R, MergeFn>,
|
||||
fn write_entries_into_database<R, K, V, FS, FM, MF>(
|
||||
merger: Merger<R, MF>,
|
||||
database: &heed::Database<K, V>,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
serialize_value: FS,
|
||||
@@ -777,6 +823,8 @@ where
|
||||
R: io::Read + io::Seek,
|
||||
FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
|
||||
FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut buffer = Vec::new();
|
||||
let database = database.remap_types::<Bytes, Bytes>();
|
||||
@@ -803,20 +851,22 @@ where
|
||||
/// Akin to the `write_entries_into_database` function but specialized
|
||||
/// for the case when we only index additional searchable fields only.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::write_db")]
|
||||
fn write_proximity_entries_into_database_additional_searchables<R>(
|
||||
merger: Merger<R, MergeFn>,
|
||||
fn write_proximity_entries_into_database_additional_searchables<R, MF>(
|
||||
merger: Merger<R, MF>,
|
||||
database: &heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||
wtxn: &mut RwTxn<'_>,
|
||||
) -> Result<()>
|
||||
where
|
||||
R: io::Read + io::Seek,
|
||||
MF: MergeFunction,
|
||||
crate::Error: From<MF::Error>,
|
||||
{
|
||||
let mut iter = merger.into_stream_merger_iter()?;
|
||||
while let Some((key, value)) = iter.next()? {
|
||||
if valid_lmdb_key(key) {
|
||||
let (proximity_to_insert, word1, word2) =
|
||||
U8StrStrCodec::bytes_decode(key).map_err(heed::Error::Decoding)?;
|
||||
let data_to_insert = match KvReaderDelAdd::new(value).get(DelAdd::Addition) {
|
||||
let data_to_insert = match KvReaderDelAdd::from_slice(value).get(DelAdd::Addition) {
|
||||
Some(value) => {
|
||||
CboRoaringBitmapCodec::bytes_decode(value).map_err(heed::Error::Decoding)?
|
||||
}
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
pub use self::available_documents_ids::AvailableDocumentsIds;
|
||||
pub use self::available_ids::AvailableIds;
|
||||
pub use self::clear_documents::ClearDocuments;
|
||||
pub use self::concurrent_available_ids::ConcurrentAvailableIds;
|
||||
pub use self::facet::bulk::FacetsUpdateBulk;
|
||||
pub use self::facet::incremental::FacetsUpdateIncrementalInner;
|
||||
pub use self::index_documents::{
|
||||
merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, DocumentAdditionResult, DocumentId,
|
||||
IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, MergeFn,
|
||||
};
|
||||
pub use self::index_documents::*;
|
||||
pub use self::indexer_config::IndexerConfig;
|
||||
pub use self::settings::{validate_embedding_settings, Setting, Settings};
|
||||
pub use self::update_step::UpdateIndexingStep;
|
||||
@@ -13,12 +11,14 @@ pub use self::word_prefix_docids::WordPrefixDocids;
|
||||
pub use self::words_prefix_integer_docids::WordPrefixIntegerDocids;
|
||||
pub use self::words_prefixes_fst::WordsPrefixesFst;
|
||||
|
||||
mod available_documents_ids;
|
||||
mod available_ids;
|
||||
mod clear_documents;
|
||||
mod concurrent_available_ids;
|
||||
pub(crate) mod del_add;
|
||||
pub(crate) mod facet;
|
||||
mod index_documents;
|
||||
mod indexer_config;
|
||||
pub mod new;
|
||||
mod settings;
|
||||
mod update_step;
|
||||
mod word_prefix_docids;
|
||||
|
||||
522
milli/src/update/new/channel.rs
Normal file
522
milli/src/update/new/channel.rs
Normal file
@@ -0,0 +1,522 @@
|
||||
use std::marker::PhantomData;
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
|
||||
use heed::types::Bytes;
|
||||
use memmap2::Mmap;
|
||||
|
||||
use super::extract::{FacetKind, HashMapMerger};
|
||||
use super::StdResult;
|
||||
use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY};
|
||||
use crate::update::new::KvReaderFieldId;
|
||||
use crate::{DocumentId, Index};
|
||||
|
||||
/// The capacity of the channel is currently in number of messages.
|
||||
pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) {
|
||||
let (sender, receiver) = crossbeam_channel::bounded(cap);
|
||||
(
|
||||
MergerSender {
|
||||
sender,
|
||||
send_count: Default::default(),
|
||||
writer_contentious_count: Default::default(),
|
||||
merger_contentious_count: Default::default(),
|
||||
},
|
||||
WriterReceiver(receiver),
|
||||
)
|
||||
}
|
||||
|
||||
/// The capacity of the channel is currently in number of messages.
|
||||
pub fn extractors_merger_channels(cap: usize) -> (ExtractorSender, MergerReceiver) {
|
||||
let (sender, receiver) = crossbeam_channel::bounded(cap);
|
||||
(ExtractorSender(sender), MergerReceiver(receiver))
|
||||
}
|
||||
|
||||
pub enum KeyValueEntry {
|
||||
SmallInMemory { key_length: usize, data: Box<[u8]> },
|
||||
LargeOnDisk { key: Box<[u8]>, value: Mmap },
|
||||
}
|
||||
|
||||
impl KeyValueEntry {
|
||||
pub fn from_small_key_value(key: &[u8], value: &[u8]) -> Self {
|
||||
let mut data = Vec::with_capacity(key.len() + value.len());
|
||||
data.extend_from_slice(key);
|
||||
data.extend_from_slice(value);
|
||||
KeyValueEntry::SmallInMemory { key_length: key.len(), data: data.into_boxed_slice() }
|
||||
}
|
||||
|
||||
pub fn from_large_key_value(key: &[u8], value: Mmap) -> Self {
|
||||
KeyValueEntry::LargeOnDisk { key: key.to_vec().into_boxed_slice(), value }
|
||||
}
|
||||
|
||||
pub fn key(&self) -> &[u8] {
|
||||
match self {
|
||||
KeyValueEntry::SmallInMemory { key_length, data } => &data.as_ref()[..*key_length],
|
||||
KeyValueEntry::LargeOnDisk { key, value: _ } => key.as_ref(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn value(&self) -> &[u8] {
|
||||
match self {
|
||||
KeyValueEntry::SmallInMemory { key_length, data } => &data.as_ref()[*key_length..],
|
||||
KeyValueEntry::LargeOnDisk { key: _, value } => value.as_ref(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct KeyEntry {
|
||||
data: Box<[u8]>,
|
||||
}
|
||||
|
||||
impl KeyEntry {
|
||||
pub fn from_key(key: &[u8]) -> Self {
|
||||
KeyEntry { data: key.to_vec().into_boxed_slice() }
|
||||
}
|
||||
|
||||
pub fn entry(&self) -> &[u8] {
|
||||
self.data.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
pub enum EntryOperation {
|
||||
Delete(KeyEntry),
|
||||
Write(KeyValueEntry),
|
||||
}
|
||||
|
||||
pub struct DocumentEntry {
|
||||
docid: DocumentId,
|
||||
content: Box<[u8]>,
|
||||
}
|
||||
|
||||
impl DocumentEntry {
|
||||
pub fn new_uncompressed(docid: DocumentId, content: Box<KvReaderFieldId>) -> Self {
|
||||
DocumentEntry { docid, content: content.into() }
|
||||
}
|
||||
|
||||
pub fn new_compressed(docid: DocumentId, content: Box<[u8]>) -> Self {
|
||||
DocumentEntry { docid, content }
|
||||
}
|
||||
|
||||
pub fn key(&self) -> [u8; 4] {
|
||||
self.docid.to_be_bytes()
|
||||
}
|
||||
|
||||
pub fn content(&self) -> &[u8] {
|
||||
&self.content
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentDeletionEntry(DocumentId);
|
||||
|
||||
impl DocumentDeletionEntry {
|
||||
pub fn key(&self) -> [u8; 4] {
|
||||
self.0.to_be_bytes()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WriterOperation {
|
||||
database: Database,
|
||||
entry: EntryOperation,
|
||||
}
|
||||
|
||||
pub enum Database {
|
||||
Documents,
|
||||
ExactWordDocids,
|
||||
FidWordCountDocids,
|
||||
Main,
|
||||
WordDocids,
|
||||
WordFidDocids,
|
||||
WordPairProximityDocids,
|
||||
WordPositionDocids,
|
||||
FacetIdIsNullDocids,
|
||||
FacetIdIsEmptyDocids,
|
||||
FacetIdExistsDocids,
|
||||
FacetIdF64NumberDocids,
|
||||
FacetIdStringDocids,
|
||||
}
|
||||
|
||||
impl Database {
|
||||
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
|
||||
match self {
|
||||
Database::Documents => index.documents.remap_types(),
|
||||
Database::ExactWordDocids => index.exact_word_docids.remap_types(),
|
||||
Database::Main => index.main.remap_types(),
|
||||
Database::WordDocids => index.word_docids.remap_types(),
|
||||
Database::WordFidDocids => index.word_fid_docids.remap_types(),
|
||||
Database::WordPositionDocids => index.word_position_docids.remap_types(),
|
||||
Database::FidWordCountDocids => index.field_id_word_count_docids.remap_types(),
|
||||
Database::WordPairProximityDocids => index.word_pair_proximity_docids.remap_types(),
|
||||
Database::FacetIdIsNullDocids => index.facet_id_is_null_docids.remap_types(),
|
||||
Database::FacetIdIsEmptyDocids => index.facet_id_is_empty_docids.remap_types(),
|
||||
Database::FacetIdExistsDocids => index.facet_id_exists_docids.remap_types(),
|
||||
Database::FacetIdF64NumberDocids => index.facet_id_f64_docids.remap_types(),
|
||||
Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl WriterOperation {
|
||||
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
|
||||
self.database.database(index)
|
||||
}
|
||||
|
||||
pub fn entry(self) -> EntryOperation {
|
||||
self.entry
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WriterReceiver(Receiver<WriterOperation>);
|
||||
|
||||
impl IntoIterator for WriterReceiver {
|
||||
type Item = WriterOperation;
|
||||
type IntoIter = IntoIter<Self::Item>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.0.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MergerSender {
|
||||
sender: Sender<WriterOperation>,
|
||||
/// The number of message we send in total in the channel.
|
||||
send_count: std::sync::atomic::AtomicUsize,
|
||||
/// The number of times we sent something in a channel that was full.
|
||||
writer_contentious_count: std::sync::atomic::AtomicUsize,
|
||||
/// The number of times we sent something in a channel that was empty.
|
||||
merger_contentious_count: std::sync::atomic::AtomicUsize,
|
||||
}
|
||||
|
||||
impl Drop for MergerSender {
|
||||
fn drop(&mut self) {
|
||||
eprintln!(
|
||||
"Merger channel stats: {} sends, {} writer contentions ({}%), {} merger contentions ({}%)",
|
||||
self.send_count.load(Ordering::SeqCst),
|
||||
self.writer_contentious_count.load(Ordering::SeqCst),
|
||||
(self.writer_contentious_count.load(Ordering::SeqCst) as f32 / self.send_count.load(Ordering::SeqCst) as f32) * 100.0,
|
||||
self.merger_contentious_count.load(Ordering::SeqCst),
|
||||
(self.merger_contentious_count.load(Ordering::SeqCst) as f32 / self.send_count.load(Ordering::SeqCst) as f32) * 100.0
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
impl MergerSender {
|
||||
pub fn main(&self) -> MainSender<'_> {
|
||||
MainSender(self)
|
||||
}
|
||||
|
||||
pub fn docids<D: DatabaseType>(&self) -> WordDocidsSender<'_, D> {
|
||||
WordDocidsSender { sender: self, _marker: PhantomData }
|
||||
}
|
||||
|
||||
pub fn facet_docids(&self) -> FacetDocidsSender<'_> {
|
||||
FacetDocidsSender { sender: self }
|
||||
}
|
||||
|
||||
pub fn documents(&self) -> DocumentsSender<'_> {
|
||||
DocumentsSender(self)
|
||||
}
|
||||
|
||||
pub fn send_documents_ids(&self, bitmap: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(
|
||||
DOCUMENTS_IDS_KEY.as_bytes(),
|
||||
bitmap,
|
||||
));
|
||||
match self.send(WriterOperation { database: Database::Main, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
fn send(&self, op: WriterOperation) -> StdResult<(), SendError<()>> {
|
||||
if self.sender.is_full() {
|
||||
self.writer_contentious_count.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
if self.sender.is_empty() {
|
||||
self.merger_contentious_count.fetch_add(1, Ordering::SeqCst);
|
||||
}
|
||||
self.send_count.fetch_add(1, Ordering::SeqCst);
|
||||
match self.sender.send(op) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MainSender<'a>(&'a MergerSender);
|
||||
|
||||
impl MainSender<'_> {
|
||||
pub fn write_words_fst(&self, value: Mmap) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_large_key_value(
|
||||
WORDS_FST_KEY.as_bytes(),
|
||||
value,
|
||||
));
|
||||
match self.0.send(WriterOperation { database: Database::Main, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
||||
match self.0.send(WriterOperation { database: Database::Main, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum ExactWordDocids {}
|
||||
pub enum FidWordCountDocids {}
|
||||
pub enum WordDocids {}
|
||||
pub enum WordFidDocids {}
|
||||
pub enum WordPairProximityDocids {}
|
||||
pub enum WordPositionDocids {}
|
||||
pub enum FacetDocids {}
|
||||
|
||||
pub trait DatabaseType {
|
||||
const DATABASE: Database;
|
||||
}
|
||||
|
||||
pub trait MergerOperationType {
|
||||
fn new_merger_operation(merger: HashMapMerger) -> MergerOperation;
|
||||
}
|
||||
|
||||
impl DatabaseType for ExactWordDocids {
|
||||
const DATABASE: Database = Database::ExactWordDocids;
|
||||
}
|
||||
|
||||
impl MergerOperationType for ExactWordDocids {
|
||||
fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
|
||||
MergerOperation::ExactWordDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for FidWordCountDocids {
|
||||
const DATABASE: Database = Database::FidWordCountDocids;
|
||||
}
|
||||
|
||||
impl MergerOperationType for FidWordCountDocids {
|
||||
fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
|
||||
MergerOperation::FidWordCountDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for WordDocids {
|
||||
const DATABASE: Database = Database::WordDocids;
|
||||
}
|
||||
|
||||
impl MergerOperationType for WordDocids {
|
||||
fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
|
||||
MergerOperation::WordDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for WordFidDocids {
|
||||
const DATABASE: Database = Database::WordFidDocids;
|
||||
}
|
||||
|
||||
impl MergerOperationType for WordFidDocids {
|
||||
fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
|
||||
MergerOperation::WordFidDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for WordPairProximityDocids {
|
||||
const DATABASE: Database = Database::WordPairProximityDocids;
|
||||
}
|
||||
|
||||
impl MergerOperationType for WordPairProximityDocids {
|
||||
fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
|
||||
MergerOperation::WordPairProximityDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for WordPositionDocids {
|
||||
const DATABASE: Database = Database::WordPositionDocids;
|
||||
}
|
||||
|
||||
impl MergerOperationType for WordPositionDocids {
|
||||
fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
|
||||
MergerOperation::WordPositionDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
impl MergerOperationType for FacetDocids {
|
||||
fn new_merger_operation(merger: HashMapMerger) -> MergerOperation {
|
||||
MergerOperation::FacetDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
pub trait DocidsSender {
|
||||
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>>;
|
||||
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>>;
|
||||
}
|
||||
|
||||
pub struct WordDocidsSender<'a, D> {
|
||||
sender: &'a MergerSender,
|
||||
_marker: PhantomData<D>,
|
||||
}
|
||||
|
||||
impl<D: DatabaseType> DocidsSender for WordDocidsSender<'_, D> {
|
||||
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
|
||||
match self.sender.send(WriterOperation { database: D::DATABASE, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
||||
match self.sender.send(WriterOperation { database: D::DATABASE, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetDocidsSender<'a> {
|
||||
sender: &'a MergerSender,
|
||||
}
|
||||
|
||||
impl DocidsSender for FacetDocidsSender<'_> {
|
||||
fn write(&self, key: &[u8], value: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let (database, key) = self.extract_database(key);
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(key, value));
|
||||
match self.sender.send(WriterOperation { database, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
fn delete(&self, key: &[u8]) -> StdResult<(), SendError<()>> {
|
||||
let (database, key) = self.extract_database(key);
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(key));
|
||||
match self.sender.send(WriterOperation { database, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl FacetDocidsSender<'_> {
|
||||
fn extract_database<'a>(&self, key: &'a [u8]) -> (Database, &'a [u8]) {
|
||||
let database = match FacetKind::from(key[0]) {
|
||||
FacetKind::Number => Database::FacetIdF64NumberDocids,
|
||||
FacetKind::String => Database::FacetIdStringDocids,
|
||||
FacetKind::Null => Database::FacetIdIsNullDocids,
|
||||
FacetKind::Empty => Database::FacetIdIsEmptyDocids,
|
||||
FacetKind::Exists => Database::FacetIdExistsDocids,
|
||||
};
|
||||
(database, &key[1..])
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentsSender<'a>(&'a MergerSender);
|
||||
|
||||
impl DocumentsSender<'_> {
|
||||
/// TODO do that efficiently
|
||||
pub fn uncompressed(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
document: &KvReaderFieldId,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Write(KeyValueEntry::from_small_key_value(
|
||||
&docid.to_be_bytes(),
|
||||
document.as_bytes(),
|
||||
));
|
||||
match self.0.send(WriterOperation { database: Database::Documents, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> {
|
||||
let entry = EntryOperation::Delete(KeyEntry::from_key(&docid.to_be_bytes()));
|
||||
match self.0.send(WriterOperation { database: Database::Documents, entry }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum MergerOperation {
|
||||
ExactWordDocidsMerger(HashMapMerger),
|
||||
FidWordCountDocidsMerger(HashMapMerger),
|
||||
WordDocidsMerger(HashMapMerger),
|
||||
WordFidDocidsMerger(HashMapMerger),
|
||||
WordPairProximityDocidsMerger(HashMapMerger),
|
||||
WordPositionDocidsMerger(HashMapMerger),
|
||||
FacetDocidsMerger(HashMapMerger),
|
||||
DeleteDocument { docid: DocumentId },
|
||||
InsertDocument { docid: DocumentId, document: Box<KvReaderFieldId> },
|
||||
FinishedDocument,
|
||||
}
|
||||
|
||||
pub struct MergerReceiver(Receiver<MergerOperation>);
|
||||
|
||||
impl IntoIterator for MergerReceiver {
|
||||
type Item = MergerOperation;
|
||||
type IntoIter = IntoIter<Self::Item>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
self.0.into_iter()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExtractorSender(Sender<MergerOperation>);
|
||||
|
||||
impl ExtractorSender {
|
||||
pub fn document_sender(&self) -> DocumentSender<'_> {
|
||||
DocumentSender(Some(&self.0))
|
||||
}
|
||||
|
||||
pub fn send_searchable<D: MergerOperationType>(
|
||||
&self,
|
||||
merger: HashMapMerger,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
match self.0.send(D::new_merger_operation(merger)) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct DocumentSender<'a>(Option<&'a Sender<MergerOperation>>);
|
||||
|
||||
impl DocumentSender<'_> {
|
||||
pub fn insert(
|
||||
&self,
|
||||
docid: DocumentId,
|
||||
document: Box<KvReaderFieldId>,
|
||||
) -> StdResult<(), SendError<()>> {
|
||||
let sender = self.0.unwrap();
|
||||
match sender.send(MergerOperation::InsertDocument { docid, document }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn delete(&self, docid: DocumentId) -> StdResult<(), SendError<()>> {
|
||||
let sender = self.0.unwrap();
|
||||
match sender.send(MergerOperation::DeleteDocument { docid }) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn finish(mut self) -> StdResult<(), SendError<()>> {
|
||||
let sender = self.0.take().unwrap();
|
||||
match sender.send(MergerOperation::FinishedDocument) {
|
||||
Ok(()) => Ok(()),
|
||||
Err(SendError(_)) => Err(SendError(())),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for DocumentSender<'_> {
|
||||
fn drop(&mut self) {
|
||||
if let Some(sender) = self.0.take() {
|
||||
sender.send(MergerOperation::FinishedDocument);
|
||||
}
|
||||
}
|
||||
}
|
||||
96
milli/src/update/new/document_change.rs
Normal file
96
milli/src/update/new/document_change.rs
Normal file
@@ -0,0 +1,96 @@
|
||||
use heed::RoTxn;
|
||||
use obkv::KvReader;
|
||||
|
||||
use crate::update::new::KvReaderFieldId;
|
||||
use crate::{DocumentId, FieldId, Index, Result};
|
||||
|
||||
pub enum DocumentChange {
|
||||
Deletion(Deletion),
|
||||
Update(Update),
|
||||
Insertion(Insertion),
|
||||
}
|
||||
|
||||
pub struct Deletion {
|
||||
docid: DocumentId,
|
||||
current: Box<KvReaderFieldId>,
|
||||
}
|
||||
|
||||
pub struct Update {
|
||||
docid: DocumentId,
|
||||
current: Box<KvReaderFieldId>,
|
||||
new: Box<KvReaderFieldId>,
|
||||
}
|
||||
|
||||
pub struct Insertion {
|
||||
docid: DocumentId,
|
||||
new: Box<KvReaderFieldId>,
|
||||
}
|
||||
|
||||
impl DocumentChange {
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
match &self {
|
||||
Self::Deletion(inner) => inner.docid(),
|
||||
Self::Update(inner) => inner.docid(),
|
||||
Self::Insertion(inner) => inner.docid(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deletion {
|
||||
pub fn create(docid: DocumentId, current: Box<KvReaderFieldId>) -> Self {
|
||||
Self { docid, current }
|
||||
}
|
||||
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
self.docid
|
||||
}
|
||||
|
||||
// TODO shouldn't we use the one in self?
|
||||
pub fn current<'a>(
|
||||
&self,
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<&'a KvReader<FieldId>>> {
|
||||
index.documents.get(rtxn, &self.docid).map_err(crate::Error::from)
|
||||
}
|
||||
}
|
||||
|
||||
impl Insertion {
|
||||
pub fn create(docid: DocumentId, new: Box<KvReaderFieldId>) -> Self {
|
||||
Insertion { docid, new }
|
||||
}
|
||||
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
self.docid
|
||||
}
|
||||
|
||||
pub fn new(&self) -> &KvReader<FieldId> {
|
||||
self.new.as_ref()
|
||||
}
|
||||
}
|
||||
|
||||
impl Update {
|
||||
pub fn create(
|
||||
docid: DocumentId,
|
||||
current: Box<KvReaderFieldId>,
|
||||
new: Box<KvReaderFieldId>,
|
||||
) -> Self {
|
||||
Update { docid, current, new }
|
||||
}
|
||||
|
||||
pub fn docid(&self) -> DocumentId {
|
||||
self.docid
|
||||
}
|
||||
|
||||
pub fn current<'a>(
|
||||
&self,
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<&'a KvReader<FieldId>>> {
|
||||
index.documents.get(rtxn, &self.docid).map_err(crate::Error::from)
|
||||
}
|
||||
|
||||
pub fn new(&self) -> &KvReader<FieldId> {
|
||||
self.new.as_ref()
|
||||
}
|
||||
}
|
||||
149
milli/src/update/new/extract/cache.rs
Normal file
149
milli/src/update/new/extract/cache.rs
Normal file
@@ -0,0 +1,149 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use roaring::RoaringBitmap;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
pub const KEY_SIZE: usize = 12;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct CboCachedSorter {
|
||||
cache: HashMap<SmallVec<[u8; KEY_SIZE]>, DelAddRoaringBitmap>,
|
||||
total_insertions: usize,
|
||||
fitted_in_key: usize,
|
||||
}
|
||||
|
||||
impl CboCachedSorter {
|
||||
pub fn new() -> Self {
|
||||
CboCachedSorter { cache: HashMap::new(), total_insertions: 0, fitted_in_key: 0 }
|
||||
}
|
||||
}
|
||||
|
||||
impl CboCachedSorter {
|
||||
pub fn insert_del_u32(&mut self, key: &[u8], n: u32) {
|
||||
match self.cache.get_mut(key) {
|
||||
Some(DelAddRoaringBitmap { del, add: _ }) => {
|
||||
del.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||
}
|
||||
None => {
|
||||
self.total_insertions += 1;
|
||||
self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
|
||||
let value = DelAddRoaringBitmap::new_del_u32(n);
|
||||
assert!(self.cache.insert(key.into(), value).is_none());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_del(&mut self, key: &[u8], bitmap: RoaringBitmap) {
|
||||
match self.cache.get_mut(key) {
|
||||
Some(DelAddRoaringBitmap { del, add: _ }) => {
|
||||
*del.get_or_insert_with(RoaringBitmap::default) |= bitmap;
|
||||
}
|
||||
None => {
|
||||
self.total_insertions += 1;
|
||||
self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
|
||||
let value = DelAddRoaringBitmap::new_del(bitmap);
|
||||
assert!(self.cache.insert(key.into(), value).is_none());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_add_u32(&mut self, key: &[u8], n: u32) {
|
||||
match self.cache.get_mut(key) {
|
||||
Some(DelAddRoaringBitmap { del: _, add }) => {
|
||||
add.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||
}
|
||||
None => {
|
||||
self.total_insertions += 1;
|
||||
self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
|
||||
let value = DelAddRoaringBitmap::new_add_u32(n);
|
||||
assert!(self.cache.insert(key.into(), value).is_none());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_add(&mut self, key: &[u8], bitmap: RoaringBitmap) {
|
||||
match self.cache.get_mut(key) {
|
||||
Some(DelAddRoaringBitmap { del: _, add }) => {
|
||||
*add.get_or_insert_with(RoaringBitmap::default) |= bitmap;
|
||||
}
|
||||
None => {
|
||||
self.total_insertions += 1;
|
||||
self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
|
||||
let value = DelAddRoaringBitmap::new_add(bitmap);
|
||||
assert!(self.cache.insert(key.into(), value).is_none());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_del_add_u32(&mut self, key: &[u8], n: u32) {
|
||||
match self.cache.get_mut(key) {
|
||||
Some(DelAddRoaringBitmap { del, add }) => {
|
||||
del.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||
add.get_or_insert_with(RoaringBitmap::default).insert(n);
|
||||
}
|
||||
None => {
|
||||
self.total_insertions += 1;
|
||||
self.fitted_in_key += (key.len() <= KEY_SIZE) as usize;
|
||||
let value = DelAddRoaringBitmap::new_del_add_u32(n);
|
||||
assert!(self.cache.insert(key.into(), value).is_none());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn into_sorter(self) -> HashMap<SmallVec<[u8; KEY_SIZE]>, DelAddRoaringBitmap> {
|
||||
eprintln!(
|
||||
"LruCache stats: {} <= {KEY_SIZE} bytes ({}%) on a total of {} insertions",
|
||||
self.fitted_in_key,
|
||||
(self.fitted_in_key as f32 / self.total_insertions as f32) * 100.0,
|
||||
self.total_insertions,
|
||||
);
|
||||
|
||||
self.cache
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default)]
|
||||
pub struct DelAddRoaringBitmap {
|
||||
pub(crate) del: Option<RoaringBitmap>,
|
||||
pub(crate) add: Option<RoaringBitmap>,
|
||||
}
|
||||
|
||||
impl DelAddRoaringBitmap {
|
||||
fn new_del_add_u32(n: u32) -> Self {
|
||||
DelAddRoaringBitmap {
|
||||
del: Some(RoaringBitmap::from([n])),
|
||||
add: Some(RoaringBitmap::from([n])),
|
||||
}
|
||||
}
|
||||
|
||||
fn new_del(bitmap: RoaringBitmap) -> Self {
|
||||
DelAddRoaringBitmap { del: Some(bitmap), add: None }
|
||||
}
|
||||
|
||||
fn new_del_u32(n: u32) -> Self {
|
||||
DelAddRoaringBitmap { del: Some(RoaringBitmap::from([n])), add: None }
|
||||
}
|
||||
|
||||
fn new_add(bitmap: RoaringBitmap) -> Self {
|
||||
DelAddRoaringBitmap { del: None, add: Some(bitmap) }
|
||||
}
|
||||
|
||||
fn new_add_u32(n: u32) -> Self {
|
||||
DelAddRoaringBitmap { del: None, add: Some(RoaringBitmap::from([n])) }
|
||||
}
|
||||
|
||||
pub fn merge_with(&mut self, other: DelAddRoaringBitmap) {
|
||||
self.del = match (self.del.take(), other.del) {
|
||||
(None, None) => None,
|
||||
(None, Some(other)) => Some(other),
|
||||
(Some(this), None) => Some(this),
|
||||
(Some(this), Some(other)) => Some(this | other),
|
||||
};
|
||||
self.add = match (self.add.take(), other.add) {
|
||||
(None, None) => None,
|
||||
(None, Some(other)) => Some(other),
|
||||
(Some(this), None) => Some(this),
|
||||
(Some(this), Some(other)) => Some(this | other),
|
||||
};
|
||||
}
|
||||
}
|
||||
240
milli/src/update/new/extract/faceted/extract_facets.rs
Normal file
240
milli/src/update/new/extract/faceted/extract_facets.rs
Normal file
@@ -0,0 +1,240 @@
|
||||
use std::collections::HashSet;
|
||||
|
||||
use heed::RoTxn;
|
||||
use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
|
||||
use serde_json::Value;
|
||||
|
||||
use super::super::cache::CboCachedSorter;
|
||||
use super::facet_document::extract_document_facets;
|
||||
use super::FacetKind;
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::update::new::extract::{DocidsExtractor, HashMapMerger};
|
||||
use crate::update::new::{DocumentChange, ItemsPool};
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::{DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result, MAX_FACET_VALUE_LENGTH};
|
||||
pub struct FacetedDocidsExtractor;
|
||||
|
||||
impl FacetedDocidsExtractor {
|
||||
fn extract_document_change(
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
buffer: &mut Vec<u8>,
|
||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
attributes_to_extract: &[&str],
|
||||
cached_sorter: &mut CboCachedSorter,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.current(rtxn, index)?.unwrap(),
|
||||
fields_ids_map,
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
buffer,
|
||||
cached_sorter,
|
||||
CboCachedSorter::insert_del_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
),
|
||||
DocumentChange::Update(inner) => {
|
||||
extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.current(rtxn, index)?.unwrap(),
|
||||
fields_ids_map,
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
buffer,
|
||||
cached_sorter,
|
||||
CboCachedSorter::insert_del_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)?;
|
||||
|
||||
extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.new(),
|
||||
fields_ids_map,
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
buffer,
|
||||
cached_sorter,
|
||||
CboCachedSorter::insert_add_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
)
|
||||
}
|
||||
DocumentChange::Insertion(inner) => extract_document_facets(
|
||||
attributes_to_extract,
|
||||
inner.new(),
|
||||
fields_ids_map,
|
||||
&mut |fid, value| {
|
||||
Self::facet_fn_with_options(
|
||||
buffer,
|
||||
cached_sorter,
|
||||
CboCachedSorter::insert_add_u32,
|
||||
inner.docid(),
|
||||
fid,
|
||||
value,
|
||||
)
|
||||
},
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
fn facet_fn_with_options(
|
||||
buffer: &mut Vec<u8>,
|
||||
cached_sorter: &mut CboCachedSorter,
|
||||
cache_fn: impl Fn(&mut CboCachedSorter, &[u8], u32),
|
||||
docid: DocumentId,
|
||||
fid: FieldId,
|
||||
value: &Value,
|
||||
) -> Result<()> {
|
||||
// Exists
|
||||
// key: fid
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Exists as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
cache_fn(cached_sorter, &*buffer, docid);
|
||||
|
||||
match value {
|
||||
// Number
|
||||
// key: fid - level - orderedf64 - orignalf64
|
||||
Value::Number(number) => {
|
||||
if let Some((n, ordered)) =
|
||||
number.as_f64().and_then(|n| f64_into_bytes(n).map(|ordered| (n, ordered)))
|
||||
{
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Number as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(1); // level 0
|
||||
buffer.extend_from_slice(&ordered);
|
||||
buffer.extend_from_slice(&n.to_be_bytes());
|
||||
|
||||
Ok(cache_fn(cached_sorter, &*buffer, docid))
|
||||
} else {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
// String
|
||||
// key: fid - level - truncated_string
|
||||
Value::String(s) => {
|
||||
let truncated = truncate_str(s);
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::String as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(1); // level 0
|
||||
buffer.extend_from_slice(truncated.as_bytes());
|
||||
Ok(cache_fn(cached_sorter, &*buffer, docid))
|
||||
}
|
||||
// Null
|
||||
// key: fid
|
||||
Value::Null => {
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Null as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
Ok(cache_fn(cached_sorter, &*buffer, docid))
|
||||
}
|
||||
// Empty
|
||||
// key: fid
|
||||
Value::Array(a) if a.is_empty() => {
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Empty as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
Ok(cache_fn(cached_sorter, &*buffer, docid))
|
||||
}
|
||||
Value::Object(o) if o.is_empty() => {
|
||||
buffer.clear();
|
||||
buffer.push(FacetKind::Empty as u8);
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
Ok(cache_fn(cached_sorter, &*buffer, docid))
|
||||
}
|
||||
// Otherwise, do nothing
|
||||
/// TODO: What about Value::Bool?
|
||||
_ => Ok(()),
|
||||
}
|
||||
}
|
||||
|
||||
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<HashSet<String>> {
|
||||
index.user_defined_faceted_fields(rtxn)
|
||||
}
|
||||
}
|
||||
|
||||
/// Truncates a string to the biggest valid LMDB key size.
|
||||
fn truncate_str(s: &str) -> &str {
|
||||
let index = s
|
||||
.char_indices()
|
||||
.map(|(idx, _)| idx)
|
||||
.chain(std::iter::once(s.len()))
|
||||
.take_while(|idx| idx <= &MAX_FACET_VALUE_LENGTH)
|
||||
.last();
|
||||
|
||||
&s[..index.unwrap_or(0)]
|
||||
}
|
||||
|
||||
impl DocidsExtractor for FacetedDocidsExtractor {
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
|
||||
fn run_extraction(
|
||||
index: &Index,
|
||||
fields_ids_map: &GlobalFieldsIdsMap,
|
||||
indexer: GrenadParameters,
|
||||
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
|
||||
) -> Result<HashMapMerger> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let rtxn = index.read_txn()?;
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
||||
let attributes_to_extract: Vec<_> =
|
||||
attributes_to_extract.iter().map(|s| s.as_ref()).collect();
|
||||
|
||||
let context_pool = ItemsPool::new(|| {
|
||||
Ok((index.read_txn()?, fields_ids_map.clone(), Vec::new(), CboCachedSorter::new()))
|
||||
});
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
document_changes.into_par_iter().try_for_each(|document_change| {
|
||||
context_pool.with(|(rtxn, fields_ids_map, buffer, cached_sorter)| {
|
||||
Self::extract_document_change(
|
||||
&*rtxn,
|
||||
index,
|
||||
buffer,
|
||||
fields_ids_map,
|
||||
&attributes_to_extract,
|
||||
cached_sorter,
|
||||
document_change?,
|
||||
)
|
||||
})
|
||||
})?;
|
||||
}
|
||||
{
|
||||
let mut builder = HashMapMerger::new();
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
|
||||
let _entered = span.enter();
|
||||
|
||||
let readers: Vec<_> = context_pool
|
||||
.into_items()
|
||||
.par_bridge()
|
||||
.map(|(_rtxn, _tokenizer, _fields_ids_map, cached_sorter)| {
|
||||
cached_sorter.into_sorter()
|
||||
})
|
||||
.collect();
|
||||
|
||||
builder.extend(readers);
|
||||
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
}
|
||||
52
milli/src/update/new/extract/faceted/facet_document.rs
Normal file
52
milli/src/update/new/extract/faceted/facet_document.rs
Normal file
@@ -0,0 +1,52 @@
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::update::new::extract::perm_json_p;
|
||||
use crate::update::new::KvReaderFieldId;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError};
|
||||
|
||||
pub fn extract_document_facets(
|
||||
attributes_to_extract: &[&str],
|
||||
obkv: &KvReaderFieldId,
|
||||
field_id_map: &mut GlobalFieldsIdsMap,
|
||||
facet_fn: &mut impl FnMut(FieldId, &Value) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
let mut field_name = String::new();
|
||||
for (field_id, field_bytes) in obkv {
|
||||
let Some(field_name) = field_id_map.name(field_id).map(|s| {
|
||||
field_name.clear();
|
||||
field_name.push_str(s);
|
||||
&field_name
|
||||
}) else {
|
||||
unreachable!("field id not found in field id map");
|
||||
};
|
||||
|
||||
let mut tokenize_field = |name: &str, value: &Value| match field_id_map.id_or_insert(name) {
|
||||
Some(field_id) => facet_fn(field_id, value),
|
||||
None => Err(UserError::AttributeLimitReached.into()),
|
||||
};
|
||||
|
||||
// if the current field is searchable or contains a searchable attribute
|
||||
if perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]) {
|
||||
// parse json.
|
||||
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(object) => perm_json_p::seek_leaf_values_in_object(
|
||||
&object,
|
||||
Some(attributes_to_extract),
|
||||
&[], // skip no attributes
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
Value::Array(array) => perm_json_p::seek_leaf_values_in_array(
|
||||
&array,
|
||||
Some(attributes_to_extract),
|
||||
&[], // skip no attributes
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
value => tokenize_field(field_name, &value)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
26
milli/src/update/new/extract/faceted/mod.rs
Normal file
26
milli/src/update/new/extract/faceted/mod.rs
Normal file
@@ -0,0 +1,26 @@
|
||||
mod extract_facets;
|
||||
mod facet_document;
|
||||
|
||||
pub use extract_facets::FacetedDocidsExtractor;
|
||||
|
||||
#[repr(u8)]
|
||||
pub enum FacetKind {
|
||||
Number = 0,
|
||||
String = 1,
|
||||
Null = 2,
|
||||
Empty = 3,
|
||||
Exists,
|
||||
}
|
||||
|
||||
impl From<u8> for FacetKind {
|
||||
fn from(value: u8) -> Self {
|
||||
match value {
|
||||
0 => Self::Number,
|
||||
1 => Self::String,
|
||||
2 => Self::Null,
|
||||
3 => Self::Empty,
|
||||
4 => Self::Exists,
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
}
|
||||
234
milli/src/update/new/extract/lru.rs
Normal file
234
milli/src/update/new/extract/lru.rs
Normal file
@@ -0,0 +1,234 @@
|
||||
use std::borrow::Borrow;
|
||||
use std::hash::{BuildHasher, Hash};
|
||||
use std::iter::repeat_with;
|
||||
use std::mem;
|
||||
use std::num::NonZeroUsize;
|
||||
|
||||
use hashbrown::hash_map::{DefaultHashBuilder, Entry};
|
||||
use hashbrown::HashMap;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct Lru<K, V, S = DefaultHashBuilder> {
|
||||
lookup: HashMap<K, usize, S>,
|
||||
storage: FixedSizeList<LruNode<K, V>>,
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash, V> Lru<K, V> {
|
||||
/// Creates a new LRU cache that holds at most `capacity` elements.
|
||||
pub fn new(capacity: NonZeroUsize) -> Self {
|
||||
Self { lookup: HashMap::new(), storage: FixedSizeList::new(capacity.get()) }
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
|
||||
/// Creates a new LRU cache that holds at most `capacity` elements
|
||||
/// and uses the provided hash builder to hash keys.
|
||||
pub fn with_hasher(capacity: NonZeroUsize, hash_builder: S) -> Lru<K, V, S> {
|
||||
Self {
|
||||
lookup: HashMap::with_hasher(hash_builder),
|
||||
storage: FixedSizeList::new(capacity.get()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
|
||||
/// Returns a mutable reference to the value of the key in the cache or `None` if it is not present in the cache.
|
||||
///
|
||||
/// Moves the key to the head of the LRU list if it exists.
|
||||
pub fn get_mut<Q>(&mut self, key: &Q) -> Option<&mut V>
|
||||
where
|
||||
K: Borrow<Q>,
|
||||
Q: Hash + Eq + ?Sized,
|
||||
{
|
||||
let idx = *self.lookup.get(key)?;
|
||||
self.storage.move_front(idx).map(|node| &mut node.value)
|
||||
}
|
||||
}
|
||||
|
||||
impl<K: Clone + Eq + Hash, V, S: BuildHasher> Lru<K, V, S> {
|
||||
pub fn push(&mut self, key: K, value: V) -> Option<(K, V)> {
|
||||
match self.lookup.entry(key) {
|
||||
Entry::Occupied(occ) => {
|
||||
// It's fine to unwrap here because:
|
||||
// * the entry already exists
|
||||
let node = self.storage.move_front(*occ.get()).unwrap();
|
||||
let old_value = mem::replace(&mut node.value, value);
|
||||
let old_key = occ.replace_key();
|
||||
Some((old_key, old_value))
|
||||
}
|
||||
Entry::Vacant(vac) => {
|
||||
let key = vac.key().clone();
|
||||
if self.storage.is_full() {
|
||||
// It's fine to unwrap here because:
|
||||
// * the cache capacity is non zero
|
||||
// * the cache is full
|
||||
let idx = self.storage.back_idx();
|
||||
let node = self.storage.move_front(idx).unwrap();
|
||||
let LruNode { key, value } = mem::replace(node, LruNode { key, value });
|
||||
vac.insert(idx);
|
||||
self.lookup.remove(&key);
|
||||
Some((key, value))
|
||||
} else {
|
||||
// It's fine to unwrap here because:
|
||||
// * the cache capacity is non zero
|
||||
// * the cache is not full
|
||||
let (idx, _) = self.storage.push_front(LruNode { key, value }).unwrap();
|
||||
vac.insert(idx);
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<K, V, S> IntoIterator for Lru<K, V, S> {
|
||||
type Item = (K, V);
|
||||
type IntoIter = IntoIter<K, V>;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
IntoIter { lookup_iter: self.lookup.into_iter(), nodes: self.storage.nodes }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct IntoIter<K, V> {
|
||||
lookup_iter: hashbrown::hash_map::IntoIter<K, usize>,
|
||||
nodes: Box<[Option<FixedSizeListNode<LruNode<K, V>>>]>,
|
||||
}
|
||||
|
||||
impl<K, V> Iterator for IntoIter<K, V> {
|
||||
type Item = (K, V);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
let (_key, idx) = self.lookup_iter.next()?;
|
||||
let LruNode { key, value } = self.nodes.get_mut(idx)?.take()?.data;
|
||||
Some((key, value))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct LruNode<K, V> {
|
||||
key: K,
|
||||
value: V,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FixedSizeListNode<T> {
|
||||
prev: usize,
|
||||
next: usize,
|
||||
data: T,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct FixedSizeList<T> {
|
||||
nodes: Box<[Option<FixedSizeListNode<T>>]>,
|
||||
/// Also corresponds to the first `None` in the nodes.
|
||||
length: usize,
|
||||
// TODO Also, we probably do not need one of the front and back cursors.
|
||||
front: usize,
|
||||
back: usize,
|
||||
}
|
||||
|
||||
impl<T> FixedSizeList<T> {
|
||||
fn new(capacity: usize) -> Self {
|
||||
Self {
|
||||
nodes: repeat_with(|| None).take(capacity).collect::<Vec<_>>().into_boxed_slice(),
|
||||
length: 0,
|
||||
front: usize::MAX,
|
||||
back: usize::MAX,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn capacity(&self) -> usize {
|
||||
self.nodes.len()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn len(&self) -> usize {
|
||||
self.length
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_empty(&self) -> bool {
|
||||
self.len() == 0
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn is_full(&self) -> bool {
|
||||
self.len() == self.capacity()
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn back_idx(&self) -> usize {
|
||||
self.back
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn next(&mut self) -> Option<usize> {
|
||||
if self.is_full() {
|
||||
None
|
||||
} else {
|
||||
let current_free = self.length;
|
||||
self.length += 1;
|
||||
Some(current_free)
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn node_mut(&mut self, idx: usize) -> Option<&mut FixedSizeListNode<T>> {
|
||||
self.nodes.get_mut(idx).and_then(|node| node.as_mut())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn node_ref(&self, idx: usize) -> Option<&FixedSizeListNode<T>> {
|
||||
self.nodes.get(idx).and_then(|node| node.as_ref())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn move_front(&mut self, idx: usize) -> Option<&mut T> {
|
||||
let node = self.nodes.get_mut(idx)?.take()?;
|
||||
if let Some(prev) = self.node_mut(node.prev) {
|
||||
prev.next = node.next;
|
||||
} else {
|
||||
self.front = node.next;
|
||||
}
|
||||
if let Some(next) = self.node_mut(node.next) {
|
||||
next.prev = node.prev;
|
||||
} else {
|
||||
self.back = node.prev;
|
||||
}
|
||||
|
||||
if let Some(front) = self.node_mut(self.front) {
|
||||
front.prev = idx;
|
||||
}
|
||||
if self.node_ref(self.back).is_none() {
|
||||
self.back = idx;
|
||||
}
|
||||
|
||||
let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode {
|
||||
prev: usize::MAX,
|
||||
next: self.front,
|
||||
data: node.data,
|
||||
});
|
||||
self.front = idx;
|
||||
Some(&mut node.data)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn push_front(&mut self, data: T) -> Option<(usize, &mut T)> {
|
||||
let idx = self.next()?;
|
||||
if let Some(front) = self.node_mut(self.front) {
|
||||
front.prev = idx;
|
||||
}
|
||||
if self.node_ref(self.back).is_none() {
|
||||
self.back = idx;
|
||||
}
|
||||
let node = self.nodes.get_mut(idx).unwrap().insert(FixedSizeListNode {
|
||||
prev: usize::MAX,
|
||||
next: self.front,
|
||||
data,
|
||||
});
|
||||
self.front = idx;
|
||||
Some((idx, &mut node.data))
|
||||
}
|
||||
}
|
||||
218
milli/src/update/new/extract/mod.rs
Normal file
218
milli/src/update/new/extract/mod.rs
Normal file
@@ -0,0 +1,218 @@
|
||||
mod cache;
|
||||
mod faceted;
|
||||
mod lru;
|
||||
mod searchable;
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::mem;
|
||||
|
||||
pub use faceted::*;
|
||||
use grenad::MergeFunction;
|
||||
use rayon::iter::{IntoParallelIterator, ParallelIterator as _};
|
||||
use rayon::slice::ParallelSliceMut as _;
|
||||
pub use searchable::*;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
use super::DocumentChange;
|
||||
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::{GlobalFieldsIdsMap, Index, Result};
|
||||
|
||||
pub trait DocidsExtractor {
|
||||
fn run_extraction(
|
||||
index: &Index,
|
||||
fields_ids_map: &GlobalFieldsIdsMap,
|
||||
indexer: GrenadParameters,
|
||||
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
|
||||
) -> Result<HashMapMerger>;
|
||||
}
|
||||
|
||||
pub struct HashMapMerger {
|
||||
maps: Vec<HashMap<SmallVec<[u8; cache::KEY_SIZE]>, cache::DelAddRoaringBitmap>>,
|
||||
}
|
||||
|
||||
impl HashMapMerger {
|
||||
pub fn new() -> HashMapMerger {
|
||||
HashMapMerger { maps: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn extend<I>(&mut self, iter: I)
|
||||
where
|
||||
I: IntoIterator<
|
||||
Item = HashMap<SmallVec<[u8; cache::KEY_SIZE]>, cache::DelAddRoaringBitmap>,
|
||||
>,
|
||||
{
|
||||
self.maps.extend(iter);
|
||||
}
|
||||
}
|
||||
|
||||
impl IntoIterator for HashMapMerger {
|
||||
type Item = (SmallVec<[u8; 12]>, cache::DelAddRoaringBitmap);
|
||||
type IntoIter = IntoIter;
|
||||
|
||||
fn into_iter(self) -> Self::IntoIter {
|
||||
let mut entries = {
|
||||
let span = tracing::trace_span!(target: "indexing::documents::merge", "into_par_iter");
|
||||
let _entered = span.enter();
|
||||
let entries: Vec<_> =
|
||||
self.maps.into_par_iter().flat_map(|m| m.into_par_iter()).collect();
|
||||
eprintln!("There are {} entries in the HashMapMerger", entries.len());
|
||||
entries
|
||||
};
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::merge", "par_sort_unstable_by");
|
||||
let _entered = span.enter();
|
||||
entries.par_sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb));
|
||||
IntoIter {
|
||||
sorted_entries: entries.into_iter(),
|
||||
current_key: None,
|
||||
current_deladd: cache::DelAddRoaringBitmap::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct IntoIter {
|
||||
sorted_entries: std::vec::IntoIter<(SmallVec<[u8; 12]>, cache::DelAddRoaringBitmap)>,
|
||||
current_key: Option<SmallVec<[u8; 12]>>,
|
||||
current_deladd: cache::DelAddRoaringBitmap,
|
||||
}
|
||||
|
||||
impl Iterator for IntoIter {
|
||||
type Item = (SmallVec<[u8; 12]>, cache::DelAddRoaringBitmap);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
loop {
|
||||
match self.sorted_entries.next() {
|
||||
Some((k, deladd)) => {
|
||||
if self.current_key.as_deref() == Some(k.as_slice()) {
|
||||
self.current_deladd.merge_with(deladd);
|
||||
} else {
|
||||
let previous_key = self.current_key.replace(k);
|
||||
let previous_deladd = mem::replace(&mut self.current_deladd, deladd);
|
||||
if let Some(previous_key) = previous_key {
|
||||
return Some((previous_key, previous_deladd));
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let current_deladd = mem::take(&mut self.current_deladd);
|
||||
return self.current_key.take().map(|ck| (ck, current_deladd));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// TODO move in permissive json pointer
|
||||
pub mod perm_json_p {
|
||||
use serde_json::{Map, Value};
|
||||
|
||||
use crate::Result;
|
||||
const SPLIT_SYMBOL: char = '.';
|
||||
|
||||
/// Returns `true` if the `selector` match the `key`.
|
||||
///
|
||||
/// ```text
|
||||
/// Example:
|
||||
/// `animaux` match `animaux`
|
||||
/// `animaux.chien` match `animaux`
|
||||
/// `animaux.chien` match `animaux`
|
||||
/// `animaux.chien.nom` match `animaux`
|
||||
/// `animaux.chien.nom` match `animaux.chien`
|
||||
/// -----------------------------------------
|
||||
/// `animaux` doesn't match `animaux.chien`
|
||||
/// `animaux.` doesn't match `animaux`
|
||||
/// `animaux.ch` doesn't match `animaux.chien`
|
||||
/// `animau` doesn't match `animaux`
|
||||
/// ```
|
||||
pub fn contained_in(selector: &str, key: &str) -> bool {
|
||||
selector.starts_with(key)
|
||||
&& selector[key.len()..].chars().next().map(|c| c == SPLIT_SYMBOL).unwrap_or(true)
|
||||
}
|
||||
|
||||
pub fn seek_leaf_values_in_object(
|
||||
value: &Map<String, Value>,
|
||||
selectors: Option<&[&str]>,
|
||||
skip_selectors: &[&str],
|
||||
base_key: &str,
|
||||
seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
if value.is_empty() {
|
||||
seeker(&base_key, &Value::Object(Map::with_capacity(0)))?;
|
||||
}
|
||||
|
||||
for (key, value) in value.iter() {
|
||||
let base_key = if base_key.is_empty() {
|
||||
key.to_string()
|
||||
} else {
|
||||
format!("{}{}{}", base_key, SPLIT_SYMBOL, key)
|
||||
};
|
||||
|
||||
// here if the user only specified `doggo` we need to iterate in all the fields of `doggo`
|
||||
// so we check the contained_in on both side
|
||||
let should_continue = select_field(&base_key, selectors, skip_selectors);
|
||||
if should_continue {
|
||||
match value {
|
||||
Value::Object(object) => seek_leaf_values_in_object(
|
||||
object,
|
||||
selectors,
|
||||
skip_selectors,
|
||||
&base_key,
|
||||
seeker,
|
||||
),
|
||||
Value::Array(array) => seek_leaf_values_in_array(
|
||||
array,
|
||||
selectors,
|
||||
skip_selectors,
|
||||
&base_key,
|
||||
seeker,
|
||||
),
|
||||
value => seeker(&base_key, value),
|
||||
}?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn seek_leaf_values_in_array(
|
||||
values: &[Value],
|
||||
selectors: Option<&[&str]>,
|
||||
skip_selectors: &[&str],
|
||||
base_key: &str,
|
||||
seeker: &mut impl FnMut(&str, &Value) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
if values.is_empty() {
|
||||
seeker(&base_key, &Value::Array(vec![]))?;
|
||||
}
|
||||
|
||||
for value in values {
|
||||
match value {
|
||||
Value::Object(object) => {
|
||||
seek_leaf_values_in_object(object, selectors, skip_selectors, base_key, seeker)
|
||||
}
|
||||
Value::Array(array) => {
|
||||
seek_leaf_values_in_array(array, selectors, skip_selectors, base_key, seeker)
|
||||
}
|
||||
value => seeker(base_key, value),
|
||||
}?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn select_field(
|
||||
field_name: &str,
|
||||
selectors: Option<&[&str]>,
|
||||
skip_selectors: &[&str],
|
||||
) -> bool {
|
||||
selectors.map_or(true, |selectors| {
|
||||
selectors.iter().any(|selector| {
|
||||
contained_in(selector, &field_name) || contained_in(&field_name, selector)
|
||||
})
|
||||
}) && !skip_selectors.iter().any(|skip_selector| {
|
||||
contained_in(skip_selector, &field_name) || contained_in(&field_name, skip_selector)
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,124 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use heed::RoTxn;
|
||||
|
||||
use super::tokenize_document::DocumentTokenizer;
|
||||
use super::SearchableExtractor;
|
||||
use crate::update::new::extract::cache::CboCachedSorter;
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
pub struct FidWordCountDocidsExtractor;
|
||||
impl SearchableExtractor for FidWordCountDocidsExtractor {
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
// This method is reimplemented to count the number of words in the document in each field
|
||||
// and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS.
|
||||
fn extract_document_change(
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
cached_sorter: &mut CboCachedSorter,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
let mut key_buffer = Vec::new();
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let mut fid_word_count = HashMap::new();
|
||||
let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
|
||||
fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
|
||||
Ok(())
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index)?.unwrap(),
|
||||
fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
|
||||
// The docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS are deleted.
|
||||
for (fid, count) in fid_word_count.iter() {
|
||||
if *count <= MAX_COUNTED_WORDS {
|
||||
let key = build_key(*fid, *count as u8, &mut key_buffer);
|
||||
cached_sorter.insert_del_u32(key, inner.docid());
|
||||
}
|
||||
}
|
||||
}
|
||||
DocumentChange::Update(inner) => {
|
||||
let mut fid_word_count = HashMap::new();
|
||||
let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
|
||||
fid_word_count
|
||||
.entry(fid)
|
||||
.and_modify(|(current_count, _new_count)| *current_count += 1)
|
||||
.or_insert((1, 0));
|
||||
Ok(())
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index)?.unwrap(),
|
||||
fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
|
||||
let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
|
||||
fid_word_count
|
||||
.entry(fid)
|
||||
.and_modify(|(_current_count, new_count)| *new_count += 1)
|
||||
.or_insert((0, 1));
|
||||
Ok(())
|
||||
};
|
||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||
|
||||
// Only the fields that have a change in the number of words are updated.
|
||||
for (fid, (current_count, new_count)) in fid_word_count.iter() {
|
||||
if *current_count != *new_count {
|
||||
if *current_count <= MAX_COUNTED_WORDS {
|
||||
let key = build_key(*fid, *current_count as u8, &mut key_buffer);
|
||||
cached_sorter.insert_del_u32(key, inner.docid());
|
||||
}
|
||||
if *new_count <= MAX_COUNTED_WORDS {
|
||||
let key = build_key(*fid, *new_count as u8, &mut key_buffer);
|
||||
cached_sorter.insert_add_u32(key, inner.docid());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
DocumentChange::Insertion(inner) => {
|
||||
let mut fid_word_count = HashMap::new();
|
||||
let mut token_fn = |_fname: &str, fid: FieldId, _pos: u16, _word: &str| {
|
||||
fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
|
||||
Ok(())
|
||||
};
|
||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||
|
||||
// The docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS are stored.
|
||||
for (fid, count) in fid_word_count.iter() {
|
||||
if *count <= MAX_COUNTED_WORDS {
|
||||
let key = build_key(*fid, *count as u8, &mut key_buffer);
|
||||
cached_sorter.insert_add_u32(key, inner.docid());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn build_key(fid: FieldId, count: u8, key_buffer: &mut Vec<u8>) -> &[u8] {
|
||||
key_buffer.clear();
|
||||
key_buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
key_buffer.push(count);
|
||||
key_buffer.as_slice()
|
||||
}
|
||||
594
milli/src/update/new/extract/searchable/extract_word_docids.rs
Normal file
594
milli/src/update/new/extract/searchable/extract_word_docids.rs
Normal file
@@ -0,0 +1,594 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
use std::fs::File;
|
||||
use std::num::NonZero;
|
||||
|
||||
use grenad::{Merger, MergerBuilder};
|
||||
use heed::RoTxn;
|
||||
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||
|
||||
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||
use super::SearchableExtractor;
|
||||
use crate::update::new::extract::cache::CboCachedSorter;
|
||||
use crate::update::new::extract::perm_json_p::contained_in;
|
||||
use crate::update::new::extract::HashMapMerger;
|
||||
use crate::update::new::{DocumentChange, ItemsPool};
|
||||
use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::{
|
||||
bucketed_position, DocumentId, FieldId, GlobalFieldsIdsMap, Index, Result,
|
||||
MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
trait ProtoWordDocidsExtractor {
|
||||
fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>;
|
||||
fn attributes_to_extract<'a>(
|
||||
_rtxn: &'a RoTxn,
|
||||
_index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>>;
|
||||
|
||||
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
|
||||
}
|
||||
|
||||
impl<T> SearchableExtractor for T
|
||||
where
|
||||
T: ProtoWordDocidsExtractor,
|
||||
{
|
||||
fn extract_document_change(
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
cached_sorter: &mut CboCachedSorter,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let mut token_fn = |_fname: &str, fid, pos, word: &str| {
|
||||
let key = Self::build_key(fid, pos, word);
|
||||
Ok(cached_sorter.insert_del_u32(&key, inner.docid()))
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index)?.unwrap(),
|
||||
fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Update(inner) => {
|
||||
let mut token_fn = |_fname: &str, fid, pos, word: &str| {
|
||||
let key = Self::build_key(fid, pos, word);
|
||||
Ok(cached_sorter.insert_del_u32(&key, inner.docid()))
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index)?.unwrap(),
|
||||
fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
|
||||
let mut token_fn = |_fname: &str, fid, pos, word: &str| {
|
||||
let key = Self::build_key(fid, pos, word);
|
||||
Ok(cached_sorter.insert_add_u32(&key, inner.docid()))
|
||||
};
|
||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||
}
|
||||
DocumentChange::Insertion(inner) => {
|
||||
let mut token_fn = |_fname: &str, fid, pos, word: &str| {
|
||||
let key = Self::build_key(fid, pos, word);
|
||||
Ok(cached_sorter.insert_add_u32(&key, inner.docid()))
|
||||
};
|
||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
Self::attributes_to_extract(rtxn, index)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Self::attributes_to_skip(rtxn, index)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordDocidsExtractor;
|
||||
impl ProtoWordDocidsExtractor for WordDocidsExtractor {
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
// exact attributes must be skipped and stored in a separate DB, see `ExactWordDocidsExtractor`.
|
||||
index.exact_attributes(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
/// TODO write in an external Vec buffer
|
||||
fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
|
||||
Cow::Borrowed(word.as_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ExactWordDocidsExtractor;
|
||||
impl ProtoWordDocidsExtractor for ExactWordDocidsExtractor {
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
let exact_attributes = index.exact_attributes(rtxn)?;
|
||||
// If there are no user-defined searchable fields, we return all exact attributes.
|
||||
// Otherwise, we return the intersection of exact attributes and user-defined searchable fields.
|
||||
if let Some(searchable_attributes) = index.user_defined_searchable_fields(rtxn)? {
|
||||
let attributes = exact_attributes
|
||||
.into_iter()
|
||||
.filter(|attr| searchable_attributes.contains(attr))
|
||||
.collect();
|
||||
Ok(Some(attributes))
|
||||
} else {
|
||||
Ok(Some(exact_attributes))
|
||||
}
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
|
||||
Cow::Borrowed(word.as_bytes())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordFidDocidsExtractor;
|
||||
impl ProtoWordDocidsExtractor for WordFidDocidsExtractor {
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
fn build_key(field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
|
||||
let mut key = Vec::new();
|
||||
key.extend_from_slice(word.as_bytes());
|
||||
key.push(0);
|
||||
key.extend_from_slice(&field_id.to_be_bytes());
|
||||
Cow::Owned(key)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordPositionDocidsExtractor;
|
||||
impl ProtoWordDocidsExtractor for WordPositionDocidsExtractor {
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
fn build_key(_field_id: FieldId, position: u16, word: &str) -> Cow<[u8]> {
|
||||
// position must be bucketed to reduce the number of keys in the DB.
|
||||
let position = bucketed_position(position);
|
||||
let mut key = Vec::new();
|
||||
key.extend_from_slice(word.as_bytes());
|
||||
key.push(0);
|
||||
key.extend_from_slice(&position.to_be_bytes());
|
||||
Cow::Owned(key)
|
||||
}
|
||||
}
|
||||
|
||||
// V2
|
||||
|
||||
struct WordDocidsCachedSorters {
|
||||
word_fid_docids: CboCachedSorter,
|
||||
word_docids: CboCachedSorter,
|
||||
exact_word_docids: CboCachedSorter,
|
||||
word_position_docids: CboCachedSorter,
|
||||
fid_word_count_docids: CboCachedSorter,
|
||||
fid_word_count: HashMap<FieldId, (usize, usize)>,
|
||||
current_docid: Option<DocumentId>,
|
||||
}
|
||||
|
||||
impl WordDocidsCachedSorters {
|
||||
pub fn new(
|
||||
indexer: GrenadParameters,
|
||||
max_memory: Option<usize>,
|
||||
capacity: NonZero<usize>,
|
||||
) -> Self {
|
||||
let max_memory = max_memory.map(|max_memory| max_memory / 4);
|
||||
|
||||
let word_fid_docids = CboCachedSorter::new();
|
||||
let word_docids = CboCachedSorter::new();
|
||||
let exact_word_docids = CboCachedSorter::new();
|
||||
let word_position_docids = CboCachedSorter::new();
|
||||
let fid_word_count_docids = CboCachedSorter::new();
|
||||
|
||||
Self {
|
||||
word_fid_docids,
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
fid_word_count_docids,
|
||||
fid_word_count: HashMap::new(),
|
||||
current_docid: None,
|
||||
}
|
||||
}
|
||||
|
||||
fn insert_add_u32(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
position: u16,
|
||||
word: &str,
|
||||
exact: bool,
|
||||
docid: u32,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<()> {
|
||||
let key = word.as_bytes();
|
||||
if exact {
|
||||
self.exact_word_docids.insert_add_u32(key, docid);
|
||||
} else {
|
||||
self.word_docids.insert_add_u32(key, docid);
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word.as_bytes());
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&position.to_be_bytes());
|
||||
self.word_fid_docids.insert_add_u32(buffer, docid);
|
||||
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word.as_bytes());
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
self.word_position_docids.insert_add_u32(buffer, docid);
|
||||
|
||||
if self.current_docid.map_or(false, |id| docid != id) {
|
||||
self.flush_fid_word_count(buffer)?;
|
||||
}
|
||||
|
||||
self.fid_word_count
|
||||
.entry(field_id)
|
||||
.and_modify(|(_current_count, new_count)| *new_count += 1)
|
||||
.or_insert((0, 1));
|
||||
self.current_docid = Some(docid);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn insert_del_u32(
|
||||
&mut self,
|
||||
field_id: FieldId,
|
||||
position: u16,
|
||||
word: &str,
|
||||
exact: bool,
|
||||
docid: u32,
|
||||
buffer: &mut Vec<u8>,
|
||||
) -> Result<()> {
|
||||
let key = word.as_bytes();
|
||||
if exact {
|
||||
self.exact_word_docids.insert_del_u32(key, docid);
|
||||
} else {
|
||||
self.word_docids.insert_del_u32(key, docid);
|
||||
}
|
||||
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word.as_bytes());
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&position.to_be_bytes());
|
||||
self.word_fid_docids.insert_del_u32(buffer, docid);
|
||||
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(word.as_bytes());
|
||||
buffer.push(0);
|
||||
buffer.extend_from_slice(&field_id.to_be_bytes());
|
||||
self.word_position_docids.insert_del_u32(buffer, docid);
|
||||
|
||||
if self.current_docid.map_or(false, |id| docid != id) {
|
||||
self.flush_fid_word_count(buffer)?;
|
||||
}
|
||||
|
||||
self.fid_word_count
|
||||
.entry(field_id)
|
||||
.and_modify(|(current_count, _new_count)| *current_count += 1)
|
||||
.or_insert((1, 0));
|
||||
self.current_docid = Some(docid);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn flush_fid_word_count(&mut self, buffer: &mut Vec<u8>) -> Result<()> {
|
||||
for (fid, (current_count, new_count)) in self.fid_word_count.drain() {
|
||||
if current_count != new_count {
|
||||
if current_count <= MAX_COUNTED_WORDS {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(current_count as u8);
|
||||
self.fid_word_count_docids.insert_del_u32(buffer, self.current_docid.unwrap());
|
||||
}
|
||||
if new_count <= MAX_COUNTED_WORDS {
|
||||
buffer.clear();
|
||||
buffer.extend_from_slice(&fid.to_be_bytes());
|
||||
buffer.push(new_count as u8);
|
||||
self.fid_word_count_docids.insert_add_u32(buffer, self.current_docid.unwrap());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct WordDocidsMergerBuilders {
|
||||
word_fid_docids: HashMapMerger,
|
||||
word_docids: HashMapMerger,
|
||||
exact_word_docids: HashMapMerger,
|
||||
word_position_docids: HashMapMerger,
|
||||
fid_word_count_docids: HashMapMerger,
|
||||
}
|
||||
|
||||
pub struct WordDocidsMergers {
|
||||
pub word_fid_docids: HashMapMerger,
|
||||
pub word_docids: HashMapMerger,
|
||||
pub exact_word_docids: HashMapMerger,
|
||||
pub word_position_docids: HashMapMerger,
|
||||
pub fid_word_count_docids: HashMapMerger,
|
||||
}
|
||||
|
||||
impl WordDocidsMergerBuilders {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
word_fid_docids: HashMapMerger::new(),
|
||||
word_docids: HashMapMerger::new(),
|
||||
exact_word_docids: HashMapMerger::new(),
|
||||
word_position_docids: HashMapMerger::new(),
|
||||
fid_word_count_docids: HashMapMerger::new(),
|
||||
}
|
||||
}
|
||||
|
||||
fn add_sorters(&mut self, other: WordDocidsCachedSorters) -> Result<()> {
|
||||
let WordDocidsCachedSorters {
|
||||
word_fid_docids,
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
fid_word_count_docids,
|
||||
fid_word_count: _,
|
||||
current_docid: _,
|
||||
} = other;
|
||||
|
||||
let mut word_fid_docids_readers = HashMap::new();
|
||||
let mut word_docids_readers = HashMap::new();
|
||||
let mut exact_word_docids_readers = HashMap::new();
|
||||
let mut word_position_docids_readers = HashMap::new();
|
||||
let mut fid_word_count_docids_readers = HashMap::new();
|
||||
rayon::scope(|s| {
|
||||
s.spawn(|_| {
|
||||
word_fid_docids_readers = word_fid_docids.into_sorter();
|
||||
});
|
||||
s.spawn(|_| {
|
||||
word_docids_readers = word_docids.into_sorter();
|
||||
});
|
||||
s.spawn(|_| {
|
||||
exact_word_docids_readers = exact_word_docids.into_sorter();
|
||||
});
|
||||
s.spawn(|_| {
|
||||
word_position_docids_readers = word_position_docids.into_sorter();
|
||||
});
|
||||
s.spawn(|_| {
|
||||
fid_word_count_docids_readers = fid_word_count_docids.into_sorter();
|
||||
});
|
||||
});
|
||||
self.word_fid_docids.extend([word_fid_docids_readers]);
|
||||
self.word_docids.extend([word_docids_readers]);
|
||||
self.exact_word_docids.extend([exact_word_docids_readers]);
|
||||
self.word_position_docids.extend([word_position_docids_readers]);
|
||||
self.fid_word_count_docids.extend([fid_word_count_docids_readers]);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn build(self) -> WordDocidsMergers {
|
||||
WordDocidsMergers {
|
||||
word_fid_docids: self.word_fid_docids,
|
||||
word_docids: self.word_docids,
|
||||
exact_word_docids: self.exact_word_docids,
|
||||
word_position_docids: self.word_position_docids,
|
||||
fid_word_count_docids: self.fid_word_count_docids,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct WordDocidsExtractors;
|
||||
|
||||
impl WordDocidsExtractors {
|
||||
pub fn run_extraction(
|
||||
index: &Index,
|
||||
fields_ids_map: &GlobalFieldsIdsMap,
|
||||
indexer: GrenadParameters,
|
||||
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
|
||||
) -> Result<WordDocidsMergers> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let rtxn = index.read_txn()?;
|
||||
let stop_words = index.stop_words(&rtxn)?;
|
||||
let allowed_separators = index.allowed_separators(&rtxn)?;
|
||||
let allowed_separators: Option<Vec<_>> =
|
||||
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary = index.dictionary(&rtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let builder = tokenizer_builder(
|
||||
stop_words.as_ref(),
|
||||
allowed_separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
);
|
||||
let tokenizer = builder.into_tokenizer();
|
||||
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
||||
let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
|
||||
let localized_attributes_rules =
|
||||
index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tokenizer,
|
||||
attribute_to_extract: attributes_to_extract.as_deref(),
|
||||
attribute_to_skip: attributes_to_skip.as_slice(),
|
||||
localized_attributes_rules: &localized_attributes_rules,
|
||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
let context_pool = ItemsPool::new(|| {
|
||||
Ok((
|
||||
index.read_txn()?,
|
||||
&document_tokenizer,
|
||||
fields_ids_map.clone(),
|
||||
WordDocidsCachedSorters::new(
|
||||
indexer,
|
||||
max_memory,
|
||||
// TODO use a better value
|
||||
200_000.try_into().unwrap(),
|
||||
),
|
||||
))
|
||||
});
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
document_changes.into_par_iter().try_for_each(|document_change| {
|
||||
context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| {
|
||||
Self::extract_document_change(
|
||||
&*rtxn,
|
||||
index,
|
||||
document_tokenizer,
|
||||
fields_ids_map,
|
||||
cached_sorter,
|
||||
document_change?,
|
||||
)
|
||||
})
|
||||
})?;
|
||||
}
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
|
||||
let _entered = span.enter();
|
||||
let mut builder = WordDocidsMergerBuilders::new();
|
||||
for (_rtxn, _tokenizer, _fields_ids_map, cache) in context_pool.into_items() {
|
||||
builder.add_sorters(cache)?;
|
||||
}
|
||||
|
||||
Ok(builder.build())
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_document_change(
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
cached_sorter: &mut WordDocidsCachedSorters,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
let exact_attributes = index.exact_attributes(rtxn)?;
|
||||
let is_exact_attribute =
|
||||
|fname: &str| exact_attributes.iter().any(|attr| contained_in(fname, attr));
|
||||
let mut buffer = Vec::new();
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter
|
||||
.insert_del_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
&mut buffer,
|
||||
)
|
||||
.map_err(crate::Error::from)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index)?.unwrap(),
|
||||
fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Update(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter
|
||||
.insert_del_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
&mut buffer,
|
||||
)
|
||||
.map_err(crate::Error::from)
|
||||
};
|
||||
document_tokenizer.tokenize_document(
|
||||
inner.current(rtxn, index)?.unwrap(),
|
||||
fields_ids_map,
|
||||
&mut token_fn,
|
||||
)?;
|
||||
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter
|
||||
.insert_add_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
&mut buffer,
|
||||
)
|
||||
.map_err(crate::Error::from)
|
||||
};
|
||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||
}
|
||||
DocumentChange::Insertion(inner) => {
|
||||
let mut token_fn = |fname: &str, fid, pos, word: &str| {
|
||||
cached_sorter
|
||||
.insert_add_u32(
|
||||
fid,
|
||||
pos,
|
||||
word,
|
||||
is_exact_attribute(fname),
|
||||
inner.docid(),
|
||||
&mut buffer,
|
||||
)
|
||||
.map_err(crate::Error::from)
|
||||
};
|
||||
document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
|
||||
}
|
||||
}
|
||||
|
||||
cached_sorter.flush_fid_word_count(&mut buffer)
|
||||
}
|
||||
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,160 @@
|
||||
use std::collections::VecDeque;
|
||||
use std::rc::Rc;
|
||||
|
||||
use heed::RoTxn;
|
||||
use obkv::KvReader;
|
||||
|
||||
use super::tokenize_document::DocumentTokenizer;
|
||||
use super::SearchableExtractor;
|
||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||
use crate::update::new::extract::cache::CboCachedSorter;
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
|
||||
|
||||
pub struct WordPairProximityDocidsExtractor;
|
||||
impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
fn attributes_to_extract<'a>(
|
||||
rtxn: &'a RoTxn,
|
||||
index: &'a Index,
|
||||
) -> Result<Option<Vec<&'a str>>> {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
// This method is reimplemented to count the number of words in the document in each field
|
||||
// and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS.
|
||||
fn extract_document_change(
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
cached_sorter: &mut CboCachedSorter,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()> {
|
||||
let mut key_buffer = Vec::new();
|
||||
let mut del_word_pair_proximity = Vec::new();
|
||||
let mut add_word_pair_proximity = Vec::new();
|
||||
let mut word_positions: VecDeque<(Rc<str>, u16)> =
|
||||
VecDeque::with_capacity(MAX_DISTANCE as usize);
|
||||
|
||||
let docid = document_change.docid();
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let document = inner.current(rtxn, index)?.unwrap();
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
del_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Update(inner) => {
|
||||
let document = inner.current(rtxn, index)?.unwrap();
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
del_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
let document = inner.new();
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
add_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
DocumentChange::Insertion(inner) => {
|
||||
let document = inner.new();
|
||||
process_document_tokens(
|
||||
document,
|
||||
document_tokenizer,
|
||||
fields_ids_map,
|
||||
&mut word_positions,
|
||||
&mut |(w1, w2), prox| {
|
||||
add_word_pair_proximity.push(((w1, w2), prox));
|
||||
},
|
||||
)?;
|
||||
}
|
||||
}
|
||||
|
||||
del_word_pair_proximity.sort_unstable();
|
||||
del_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
|
||||
for ((w1, w2), prox) in del_word_pair_proximity.iter() {
|
||||
let key = build_key(*prox, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_del_u32(key, docid);
|
||||
}
|
||||
|
||||
add_word_pair_proximity.sort_unstable();
|
||||
add_word_pair_proximity.dedup_by(|(k1, _), (k2, _)| k1 == k2);
|
||||
for ((w1, w2), prox) in add_word_pair_proximity.iter() {
|
||||
let key = build_key(*prox, w1, w2, &mut key_buffer);
|
||||
cached_sorter.insert_add_u32(key, docid);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec<u8>) -> &'a [u8] {
|
||||
key_buffer.clear();
|
||||
key_buffer.push(prox);
|
||||
key_buffer.extend_from_slice(w1.as_bytes());
|
||||
key_buffer.push(0);
|
||||
key_buffer.extend_from_slice(w2.as_bytes());
|
||||
key_buffer.as_slice()
|
||||
}
|
||||
|
||||
fn word_positions_into_word_pair_proximity(
|
||||
word_positions: &mut VecDeque<(Rc<str>, u16)>,
|
||||
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
|
||||
) -> Result<()> {
|
||||
let (head_word, head_position) = word_positions.pop_front().unwrap();
|
||||
for (word, position) in word_positions.iter() {
|
||||
let prox = index_proximity(head_position as u32, *position as u32) as u8;
|
||||
if prox > 0 && prox < MAX_DISTANCE as u8 {
|
||||
word_pair_proximity((head_word.clone(), word.clone()), prox);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn process_document_tokens(
|
||||
document: &KvReader<FieldId>,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
word_positions: &mut VecDeque<(Rc<str>, u16)>,
|
||||
word_pair_proximity: &mut impl FnMut((Rc<str>, Rc<str>), u8),
|
||||
) -> Result<()> {
|
||||
let mut token_fn = |_fname: &str, _fid: FieldId, pos: u16, word: &str| {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while word_positions
|
||||
.front()
|
||||
.map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
|
||||
{
|
||||
word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
|
||||
}
|
||||
|
||||
// insert the new word.
|
||||
word_positions.push_back((Rc::from(word), pos));
|
||||
Ok(())
|
||||
};
|
||||
document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
|
||||
|
||||
while !word_positions.is_empty() {
|
||||
word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
126
milli/src/update/new/extract/searchable/mod.rs
Normal file
126
milli/src/update/new/extract/searchable/mod.rs
Normal file
@@ -0,0 +1,126 @@
|
||||
mod extract_fid_word_count_docids;
|
||||
mod extract_word_docids;
|
||||
mod extract_word_pair_proximity_docids;
|
||||
mod tokenize_document;
|
||||
|
||||
use std::fs::File;
|
||||
|
||||
pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers};
|
||||
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
|
||||
use grenad::Merger;
|
||||
use heed::RoTxn;
|
||||
use rayon::iter::{IntoParallelIterator, ParallelBridge, ParallelIterator};
|
||||
use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
||||
|
||||
use super::cache::CboCachedSorter;
|
||||
use super::{DocidsExtractor, HashMapMerger};
|
||||
use crate::update::new::{DocumentChange, ItemsPool};
|
||||
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps};
|
||||
use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE};
|
||||
|
||||
pub trait SearchableExtractor {
|
||||
fn run_extraction(
|
||||
index: &Index,
|
||||
fields_ids_map: &GlobalFieldsIdsMap,
|
||||
indexer: GrenadParameters,
|
||||
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
|
||||
) -> Result<HashMapMerger> {
|
||||
let max_memory = indexer.max_memory_by_thread();
|
||||
|
||||
let rtxn = index.read_txn()?;
|
||||
let stop_words = index.stop_words(&rtxn)?;
|
||||
let allowed_separators = index.allowed_separators(&rtxn)?;
|
||||
let allowed_separators: Option<Vec<_>> =
|
||||
allowed_separators.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let dictionary = index.dictionary(&rtxn)?;
|
||||
let dictionary: Option<Vec<_>> =
|
||||
dictionary.as_ref().map(|s| s.iter().map(String::as_str).collect());
|
||||
let builder = tokenizer_builder(
|
||||
stop_words.as_ref(),
|
||||
allowed_separators.as_deref(),
|
||||
dictionary.as_deref(),
|
||||
);
|
||||
let tokenizer = builder.into_tokenizer();
|
||||
|
||||
let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?;
|
||||
let attributes_to_skip = Self::attributes_to_skip(&rtxn, index)?;
|
||||
let localized_attributes_rules =
|
||||
index.localized_attributes_rules(&rtxn)?.unwrap_or_default();
|
||||
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tokenizer,
|
||||
attribute_to_extract: attributes_to_extract.as_deref(),
|
||||
attribute_to_skip: attributes_to_skip.as_slice(),
|
||||
localized_attributes_rules: &localized_attributes_rules,
|
||||
max_positions_per_attributes: MAX_POSITION_PER_ATTRIBUTE,
|
||||
};
|
||||
|
||||
let context_pool = ItemsPool::new(|| {
|
||||
Ok((
|
||||
index.read_txn()?,
|
||||
&document_tokenizer,
|
||||
fields_ids_map.clone(),
|
||||
CboCachedSorter::new(),
|
||||
))
|
||||
});
|
||||
|
||||
{
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "docids_extraction");
|
||||
let _entered = span.enter();
|
||||
document_changes.into_par_iter().try_for_each(|document_change| {
|
||||
context_pool.with(|(rtxn, document_tokenizer, fields_ids_map, cached_sorter)| {
|
||||
Self::extract_document_change(
|
||||
&*rtxn,
|
||||
index,
|
||||
document_tokenizer,
|
||||
fields_ids_map,
|
||||
cached_sorter,
|
||||
document_change?,
|
||||
)
|
||||
})
|
||||
})?;
|
||||
}
|
||||
{
|
||||
let mut builder = HashMapMerger::new();
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
|
||||
let _entered = span.enter();
|
||||
|
||||
let readers: Vec<_> = context_pool
|
||||
.into_items()
|
||||
.par_bridge()
|
||||
.map(|(_rtxn, _tokenizer, _fields_ids_map, cached_sorter)| {
|
||||
cached_sorter.into_sorter()
|
||||
})
|
||||
.collect();
|
||||
builder.extend(readers);
|
||||
Ok(builder)
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_document_change(
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
document_tokenizer: &DocumentTokenizer,
|
||||
fields_ids_map: &mut GlobalFieldsIdsMap,
|
||||
cached_sorter: &mut CboCachedSorter,
|
||||
document_change: DocumentChange,
|
||||
) -> Result<()>;
|
||||
|
||||
fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index)
|
||||
-> Result<Option<Vec<&'a str>>>;
|
||||
|
||||
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>>;
|
||||
}
|
||||
|
||||
impl<T: SearchableExtractor> DocidsExtractor for T {
|
||||
fn run_extraction(
|
||||
index: &Index,
|
||||
fields_ids_map: &GlobalFieldsIdsMap,
|
||||
indexer: GrenadParameters,
|
||||
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
|
||||
) -> Result<HashMapMerger> {
|
||||
Self::run_extraction(index, fields_ids_map, indexer, document_changes)
|
||||
}
|
||||
}
|
||||
273
milli/src/update/new/extract/searchable/tokenize_document.rs
Normal file
273
milli/src/update/new/extract/searchable/tokenize_document.rs
Normal file
@@ -0,0 +1,273 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::proximity::MAX_DISTANCE;
|
||||
use crate::update::new::extract::perm_json_p::{
|
||||
seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
|
||||
};
|
||||
use crate::update::new::KvReaderFieldId;
|
||||
use crate::{
|
||||
FieldId, GlobalFieldsIdsMap, InternalError, LocalizedAttributesRule, Result, UserError,
|
||||
MAX_WORD_LENGTH,
|
||||
};
|
||||
|
||||
pub struct DocumentTokenizer<'a> {
|
||||
pub tokenizer: &'a Tokenizer<'a>,
|
||||
pub attribute_to_extract: Option<&'a [&'a str]>,
|
||||
pub attribute_to_skip: &'a [&'a str],
|
||||
pub localized_attributes_rules: &'a [LocalizedAttributesRule],
|
||||
pub max_positions_per_attributes: u32,
|
||||
}
|
||||
|
||||
impl<'a> DocumentTokenizer<'a> {
|
||||
pub fn tokenize_document(
|
||||
&self,
|
||||
obkv: &KvReaderFieldId,
|
||||
field_id_map: &mut GlobalFieldsIdsMap,
|
||||
token_fn: &mut impl FnMut(&str, FieldId, u16, &str) -> Result<()>,
|
||||
) -> Result<()> {
|
||||
let mut field_position = HashMap::new();
|
||||
let mut field_name = String::new();
|
||||
for (field_id, field_bytes) in obkv {
|
||||
let Some(field_name) = field_id_map.name(field_id).map(|s| {
|
||||
field_name.clear();
|
||||
field_name.push_str(s);
|
||||
&field_name
|
||||
}) else {
|
||||
unreachable!("field id not found in field id map");
|
||||
};
|
||||
|
||||
let mut tokenize_field = |name: &str, value: &Value| {
|
||||
let Some(field_id) = field_id_map.id_or_insert(name) else {
|
||||
return Err(UserError::AttributeLimitReached.into());
|
||||
};
|
||||
|
||||
let position = field_position
|
||||
.entry(field_id)
|
||||
.and_modify(|counter| *counter += MAX_DISTANCE)
|
||||
.or_insert(0);
|
||||
if *position as u32 >= self.max_positions_per_attributes {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
match value {
|
||||
Value::Number(n) => {
|
||||
let token = n.to_string();
|
||||
if let Ok(position) = (*position).try_into() {
|
||||
token_fn(name, field_id, position, token.as_str())?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Value::String(text) => {
|
||||
// create an iterator of token with their positions.
|
||||
let locales = self
|
||||
.localized_attributes_rules
|
||||
.iter()
|
||||
.find(|rule| rule.match_str(field_name))
|
||||
.map(|rule| rule.locales());
|
||||
let tokens = process_tokens(
|
||||
*position,
|
||||
self.tokenizer.tokenize_with_allow_list(text.as_str(), locales),
|
||||
)
|
||||
.take_while(|(p, _)| (*p as u32) < self.max_positions_per_attributes);
|
||||
|
||||
for (index, token) in tokens {
|
||||
// keep a word only if it is not empty and fit in a LMDB key.
|
||||
let token = token.lemma().trim();
|
||||
if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
|
||||
*position = index;
|
||||
if let Ok(position) = (*position).try_into() {
|
||||
token_fn(name, field_id, position, token)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
_ => Ok(()),
|
||||
}
|
||||
};
|
||||
|
||||
// if the current field is searchable or contains a searchable attribute
|
||||
if select_field(field_name, self.attribute_to_extract, self.attribute_to_skip) {
|
||||
// parse json.
|
||||
match serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)? {
|
||||
Value::Object(object) => seek_leaf_values_in_object(
|
||||
&object,
|
||||
self.attribute_to_extract,
|
||||
self.attribute_to_skip,
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
Value::Array(array) => seek_leaf_values_in_array(
|
||||
&array,
|
||||
self.attribute_to_extract,
|
||||
self.attribute_to_skip,
|
||||
field_name,
|
||||
&mut tokenize_field,
|
||||
)?,
|
||||
value => tokenize_field(field_name, &value)?,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// take an iterator on tokens and compute their relative position depending on separator kinds
|
||||
/// if it's an `Hard` separator we add an additional relative proximity of MAX_DISTANCE between words,
|
||||
/// else we keep the standard proximity of 1 between words.
|
||||
fn process_tokens<'a>(
|
||||
start_offset: u32,
|
||||
tokens: impl Iterator<Item = Token<'a>>,
|
||||
) -> impl Iterator<Item = (u32, Token<'a>)> {
|
||||
tokens
|
||||
.skip_while(|token| token.is_separator())
|
||||
.scan((start_offset, None), |(offset, prev_kind), mut token| {
|
||||
match token.kind {
|
||||
TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
|
||||
*offset += match *prev_kind {
|
||||
Some(TokenKind::Separator(SeparatorKind::Hard)) => MAX_DISTANCE,
|
||||
Some(_) => 1,
|
||||
None => 0,
|
||||
};
|
||||
*prev_kind = Some(token.kind)
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Hard) => {
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
TokenKind::Separator(SeparatorKind::Soft)
|
||||
if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) =>
|
||||
{
|
||||
*prev_kind = Some(token.kind);
|
||||
}
|
||||
_ => token.kind = TokenKind::Unknown,
|
||||
}
|
||||
Some((*offset, token))
|
||||
})
|
||||
.filter(|(_, t)| t.is_word())
|
||||
}
|
||||
|
||||
/// Factorize tokenizer building.
|
||||
pub fn tokenizer_builder<'a>(
|
||||
stop_words: Option<&'a fst::Set<&'a [u8]>>,
|
||||
allowed_separators: Option<&'a [&str]>,
|
||||
dictionary: Option<&'a [&str]>,
|
||||
) -> TokenizerBuilder<'a, &'a [u8]> {
|
||||
let mut tokenizer_builder = TokenizerBuilder::new();
|
||||
if let Some(stop_words) = stop_words {
|
||||
tokenizer_builder.stop_words(stop_words);
|
||||
}
|
||||
if let Some(dictionary) = dictionary {
|
||||
tokenizer_builder.words_dict(dictionary);
|
||||
}
|
||||
if let Some(separators) = allowed_separators {
|
||||
tokenizer_builder.separators(separators);
|
||||
}
|
||||
|
||||
tokenizer_builder
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use charabia::TokenizerBuilder;
|
||||
use meili_snap::snapshot;
|
||||
use obkv::KvReader;
|
||||
use serde_json::json;
|
||||
|
||||
use super::*;
|
||||
use crate::FieldsIdsMap;
|
||||
|
||||
#[test]
|
||||
fn test_tokenize_document() {
|
||||
let mut fields_ids_map = FieldsIdsMap::new();
|
||||
|
||||
let field_1 = json!({
|
||||
"name": "doggo",
|
||||
"age": 10,
|
||||
});
|
||||
|
||||
let field_2 = json!({
|
||||
"catto": {
|
||||
"name": "pesti",
|
||||
"age": 23,
|
||||
}
|
||||
});
|
||||
|
||||
let field_3 = json!(["doggo", "catto"]);
|
||||
let field_4 = json!("UNSEARCHABLE");
|
||||
let field_5 = json!({"nope": "unsearchable"});
|
||||
|
||||
let mut obkv = obkv::KvWriter::memory();
|
||||
let field_1_id = fields_ids_map.insert("doggo").unwrap();
|
||||
let field_1 = serde_json::to_string(&field_1).unwrap();
|
||||
obkv.insert(field_1_id, field_1.as_bytes()).unwrap();
|
||||
let field_2_id = fields_ids_map.insert("catto").unwrap();
|
||||
let field_2 = serde_json::to_string(&field_2).unwrap();
|
||||
obkv.insert(field_2_id, field_2.as_bytes()).unwrap();
|
||||
let field_3_id = fields_ids_map.insert("doggo.name").unwrap();
|
||||
let field_3 = serde_json::to_string(&field_3).unwrap();
|
||||
obkv.insert(field_3_id, field_3.as_bytes()).unwrap();
|
||||
let field_4_id = fields_ids_map.insert("not-me").unwrap();
|
||||
let field_4 = serde_json::to_string(&field_4).unwrap();
|
||||
obkv.insert(field_4_id, field_4.as_bytes()).unwrap();
|
||||
let field_5_id = fields_ids_map.insert("me-nether").unwrap();
|
||||
let field_5 = serde_json::to_string(&field_5).unwrap();
|
||||
obkv.insert(field_5_id, field_5.as_bytes()).unwrap();
|
||||
let value = obkv.into_inner().unwrap();
|
||||
let obkv = KvReader::from_slice(value.as_slice());
|
||||
|
||||
let mut tb = TokenizerBuilder::default();
|
||||
let document_tokenizer = DocumentTokenizer {
|
||||
tokenizer: &tb.build(),
|
||||
attribute_to_extract: None,
|
||||
attribute_to_skip: &["not-me", "me-nether.nope"],
|
||||
localized_attributes_rules: &[],
|
||||
max_positions_per_attributes: 1000,
|
||||
};
|
||||
|
||||
let fields_ids_map_lock = std::sync::RwLock::new(fields_ids_map);
|
||||
let mut global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
|
||||
|
||||
let mut words = std::collections::BTreeMap::new();
|
||||
document_tokenizer
|
||||
.tokenize_document(obkv, &mut global_fields_ids_map, &mut |_fname, fid, pos, word| {
|
||||
words.insert([fid, pos], word.to_string());
|
||||
Ok(())
|
||||
})
|
||||
.unwrap();
|
||||
|
||||
snapshot!(format!("{:#?}", words), @r###"
|
||||
{
|
||||
[
|
||||
2,
|
||||
0,
|
||||
]: "doggo",
|
||||
[
|
||||
2,
|
||||
MAX_DISTANCE,
|
||||
]: "doggo",
|
||||
[
|
||||
2,
|
||||
16,
|
||||
]: "catto",
|
||||
[
|
||||
3,
|
||||
0,
|
||||
]: "10",
|
||||
[
|
||||
4,
|
||||
0,
|
||||
]: "pesti",
|
||||
[
|
||||
5,
|
||||
0,
|
||||
]: "23",
|
||||
}
|
||||
"###);
|
||||
}
|
||||
}
|
||||
42
milli/src/update/new/indexer/document_deletion.rs
Normal file
42
milli/src/update/new/indexer/document_deletion.rs
Normal file
@@ -0,0 +1,42 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::DocumentChanges;
|
||||
use crate::update::new::{Deletion, DocumentChange, ItemsPool};
|
||||
use crate::{FieldsIdsMap, Index, Result};
|
||||
|
||||
pub struct DocumentDeletion {
|
||||
pub to_delete: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl DocumentDeletion {
|
||||
pub fn new() -> Self {
|
||||
Self { to_delete: Default::default() }
|
||||
}
|
||||
|
||||
pub fn delete_documents_by_docids(&mut self, docids: RoaringBitmap) {
|
||||
self.to_delete |= docids;
|
||||
}
|
||||
}
|
||||
|
||||
impl<'p> DocumentChanges<'p> for DocumentDeletion {
|
||||
type Parameter = &'p Index;
|
||||
|
||||
fn document_changes(
|
||||
self,
|
||||
_fields_ids_map: &mut FieldsIdsMap,
|
||||
param: Self::Parameter,
|
||||
) -> Result<impl IndexedParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
|
||||
let index = param;
|
||||
let items = Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from)));
|
||||
let to_delete: Vec<_> = self.to_delete.into_iter().collect();
|
||||
Ok(to_delete.into_par_iter().map_with(items, |items, docid| {
|
||||
items.with(|rtxn| {
|
||||
let current = index.document(rtxn, docid)?;
|
||||
Ok(DocumentChange::Deletion(Deletion::create(docid, current.boxed())))
|
||||
})
|
||||
}))
|
||||
}
|
||||
}
|
||||
392
milli/src/update/new/indexer/document_operation.rs
Normal file
392
milli/src/update/new/indexer/document_operation.rs
Normal file
@@ -0,0 +1,392 @@
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::sync::Arc;
|
||||
|
||||
use heed::types::Bytes;
|
||||
use heed::RoTxn;
|
||||
use memmap2::Mmap;
|
||||
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
|
||||
use IndexDocumentsMethod as Idm;
|
||||
|
||||
use super::super::document_change::DocumentChange;
|
||||
use super::super::items_pool::ItemsPool;
|
||||
use super::super::{CowStr, TopLevelMap};
|
||||
use super::DocumentChanges;
|
||||
use crate::documents::{DocumentIdExtractionError, PrimaryKey};
|
||||
use crate::update::new::{Deletion, Insertion, KvReaderFieldId, KvWriterFieldId, Update};
|
||||
use crate::update::{AvailableIds, IndexDocumentsMethod};
|
||||
use crate::{DocumentId, Error, FieldsIdsMap, Index, Result, UserError};
|
||||
|
||||
pub struct DocumentOperation<'pl> {
|
||||
operations: Vec<Payload<'pl>>,
|
||||
index_documents_method: IndexDocumentsMethod,
|
||||
}
|
||||
|
||||
pub enum Payload<'pl> {
|
||||
Addition(&'pl [u8]),
|
||||
Deletion(Vec<String>),
|
||||
}
|
||||
|
||||
pub struct PayloadStats {
|
||||
pub document_count: usize,
|
||||
pub bytes: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
enum InnerDocOp<'pl> {
|
||||
Addition(DocumentOffset<'pl>),
|
||||
Deletion,
|
||||
}
|
||||
|
||||
/// Represents an offset where a document lives
|
||||
/// in an mmapped grenad reader file.
|
||||
#[derive(Clone)]
|
||||
pub struct DocumentOffset<'pl> {
|
||||
/// The mmapped payload files.
|
||||
pub content: &'pl [u8],
|
||||
}
|
||||
|
||||
impl<'pl> DocumentOperation<'pl> {
|
||||
pub fn new(method: IndexDocumentsMethod) -> Self {
|
||||
Self { operations: Default::default(), index_documents_method: method }
|
||||
}
|
||||
|
||||
/// TODO please give me a type
|
||||
/// The payload is expected to be in the grenad format
|
||||
pub fn add_documents(&mut self, payload: &'pl Mmap) -> Result<PayloadStats> {
|
||||
payload.advise(memmap2::Advice::Sequential)?;
|
||||
let document_count =
|
||||
memchr::memmem::find_iter(&payload[..], "}{").count().saturating_add(1);
|
||||
self.operations.push(Payload::Addition(&payload[..]));
|
||||
Ok(PayloadStats { bytes: payload.len() as u64, document_count })
|
||||
}
|
||||
|
||||
pub fn delete_documents(&mut self, to_delete: Vec<String>) {
|
||||
self.operations.push(Payload::Deletion(to_delete))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'p, 'pl: 'p> DocumentChanges<'p> for DocumentOperation<'pl> {
|
||||
type Parameter = (&'p Index, &'p RoTxn<'p>, &'p PrimaryKey<'p>);
|
||||
|
||||
fn document_changes(
|
||||
self,
|
||||
fields_ids_map: &mut FieldsIdsMap,
|
||||
param: Self::Parameter,
|
||||
) -> Result<impl IndexedParallelIterator<Item = Result<DocumentChange>> + Clone + 'p> {
|
||||
let (index, rtxn, primary_key) = param;
|
||||
|
||||
let documents_ids = index.documents_ids(rtxn)?;
|
||||
let mut available_docids = AvailableIds::new(&documents_ids);
|
||||
let mut docids_version_offsets = HashMap::<CowStr<'pl>, _>::new();
|
||||
|
||||
for operation in self.operations {
|
||||
match operation {
|
||||
Payload::Addition(payload) => {
|
||||
let mut iter =
|
||||
serde_json::Deserializer::from_slice(payload).into_iter::<TopLevelMap>();
|
||||
|
||||
/// TODO manage the error
|
||||
let mut previous_offset = 0;
|
||||
while let Some(document) = iter.next().transpose().unwrap() {
|
||||
// TODO Fetch all document fields to fill the fields ids map
|
||||
document.0.keys().for_each(|key| {
|
||||
fields_ids_map.insert(key.as_ref());
|
||||
});
|
||||
|
||||
// TODO we must manage the TooManyDocumentIds,InvalidDocumentId
|
||||
// we must manage the unwrap
|
||||
let external_document_id =
|
||||
match primary_key.document_id_from_top_level_map(&document)? {
|
||||
Ok(document_id) => Ok(document_id),
|
||||
Err(DocumentIdExtractionError::InvalidDocumentId(e)) => Err(e),
|
||||
Err(DocumentIdExtractionError::MissingDocumentId) => {
|
||||
Err(UserError::MissingDocumentId {
|
||||
primary_key: primary_key.name().to_string(),
|
||||
document: document.try_into().unwrap(),
|
||||
})
|
||||
}
|
||||
Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
|
||||
Err(UserError::TooManyDocumentIds {
|
||||
primary_key: primary_key.name().to_string(),
|
||||
document: document.try_into().unwrap(),
|
||||
})
|
||||
}
|
||||
}?;
|
||||
|
||||
let current_offset = iter.byte_offset();
|
||||
let document_operation = InnerDocOp::Addition(DocumentOffset {
|
||||
content: &payload[previous_offset..current_offset],
|
||||
});
|
||||
|
||||
match docids_version_offsets.get_mut(external_document_id.as_ref()) {
|
||||
None => {
|
||||
let docid = match index
|
||||
.external_documents_ids()
|
||||
.get(rtxn, &external_document_id)?
|
||||
{
|
||||
Some(docid) => docid,
|
||||
None => available_docids
|
||||
.next()
|
||||
.ok_or(Error::UserError(UserError::DocumentLimitReached))?,
|
||||
};
|
||||
|
||||
docids_version_offsets.insert(
|
||||
external_document_id,
|
||||
(docid, vec![document_operation]),
|
||||
);
|
||||
}
|
||||
Some((_, offsets)) => {
|
||||
let useless_previous_addition = match self.index_documents_method {
|
||||
IndexDocumentsMethod::ReplaceDocuments => {
|
||||
MergeDocumentForReplacement::USELESS_PREVIOUS_CHANGES
|
||||
}
|
||||
IndexDocumentsMethod::UpdateDocuments => {
|
||||
MergeDocumentForUpdates::USELESS_PREVIOUS_CHANGES
|
||||
}
|
||||
};
|
||||
|
||||
if useless_previous_addition {
|
||||
offsets.clear();
|
||||
}
|
||||
|
||||
offsets.push(document_operation);
|
||||
}
|
||||
}
|
||||
|
||||
previous_offset = iter.byte_offset();
|
||||
}
|
||||
}
|
||||
Payload::Deletion(to_delete) => {
|
||||
for external_document_id in to_delete {
|
||||
match docids_version_offsets.get_mut(external_document_id.as_str()) {
|
||||
None => {
|
||||
let docid = match index
|
||||
.external_documents_ids()
|
||||
.get(rtxn, &external_document_id)?
|
||||
{
|
||||
Some(docid) => docid,
|
||||
None => available_docids
|
||||
.next()
|
||||
.ok_or(Error::UserError(UserError::DocumentLimitReached))?,
|
||||
};
|
||||
|
||||
docids_version_offsets.insert(
|
||||
CowStr(external_document_id.into()),
|
||||
(docid, vec![InnerDocOp::Deletion]),
|
||||
);
|
||||
}
|
||||
Some((_, offsets)) => {
|
||||
offsets.clear();
|
||||
offsets.push(InnerDocOp::Deletion);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// TODO is it the best way to provide FieldsIdsMap to the parallel iterator?
|
||||
let fields_ids_map = fields_ids_map.clone();
|
||||
// TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
|
||||
let mut docids_version_offsets: Vec<_> = docids_version_offsets.drain().collect();
|
||||
// Reorder the offsets to make sure we iterate on the file sequentially
|
||||
let sort_function_key = match self.index_documents_method {
|
||||
Idm::ReplaceDocuments => MergeDocumentForReplacement::sort_key,
|
||||
Idm::UpdateDocuments => MergeDocumentForUpdates::sort_key,
|
||||
};
|
||||
|
||||
// And finally sort them
|
||||
docids_version_offsets.sort_unstable_by_key(|(_, (_, docops))| sort_function_key(docops));
|
||||
|
||||
Ok(docids_version_offsets.into_par_iter().map_with(
|
||||
Arc::new(ItemsPool::new(|| index.read_txn().map_err(crate::Error::from))),
|
||||
move |context_pool, (external_docid, (internal_docid, operations))| {
|
||||
context_pool.with(|rtxn| {
|
||||
let document_merge_function = match self.index_documents_method {
|
||||
Idm::ReplaceDocuments => MergeDocumentForReplacement::merge,
|
||||
Idm::UpdateDocuments => MergeDocumentForUpdates::merge,
|
||||
};
|
||||
|
||||
document_merge_function(
|
||||
rtxn,
|
||||
index,
|
||||
&fields_ids_map,
|
||||
internal_docid,
|
||||
external_docid.to_string(), // TODO do not clone
|
||||
&operations,
|
||||
)
|
||||
})
|
||||
},
|
||||
))
|
||||
}
|
||||
}
|
||||
|
||||
trait MergeChanges {
|
||||
/// Wether the payloads in the list of operations are useless or not.
|
||||
const USELESS_PREVIOUS_CHANGES: bool;
|
||||
|
||||
/// Returns a key that is used to order the payloads the right way.
|
||||
fn sort_key(docops: &[InnerDocOp]) -> usize;
|
||||
|
||||
fn merge(
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
docid: DocumentId,
|
||||
external_docid: String,
|
||||
operations: &[InnerDocOp],
|
||||
) -> Result<DocumentChange>;
|
||||
}
|
||||
|
||||
struct MergeDocumentForReplacement;
|
||||
|
||||
impl MergeChanges for MergeDocumentForReplacement {
|
||||
const USELESS_PREVIOUS_CHANGES: bool = true;
|
||||
|
||||
/// Reorders to read only the last change.
|
||||
fn sort_key(docops: &[InnerDocOp]) -> usize {
|
||||
let f = |ido: &_| match ido {
|
||||
InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize),
|
||||
InnerDocOp::Deletion => None,
|
||||
};
|
||||
docops.iter().rev().find_map(f).unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Returns only the most recent version of a document based on the updates from the payloads.
|
||||
///
|
||||
/// This function is only meant to be used when doing a replacement and not an update.
|
||||
fn merge(
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
docid: DocumentId,
|
||||
external_docid: String,
|
||||
operations: &[InnerDocOp],
|
||||
) -> Result<DocumentChange> {
|
||||
let current = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
|
||||
let current: Option<&KvReaderFieldId> = current.map(Into::into);
|
||||
|
||||
match operations.last() {
|
||||
Some(InnerDocOp::Addition(DocumentOffset { content })) => {
|
||||
let map: TopLevelMap = serde_json::from_slice(content).unwrap();
|
||||
let mut document_entries = Vec::new();
|
||||
for (key, v) in map.0 {
|
||||
let id = fields_ids_map.id(key.as_ref()).unwrap();
|
||||
document_entries.push((id, v));
|
||||
}
|
||||
|
||||
document_entries.sort_unstable_by_key(|(id, _)| *id);
|
||||
|
||||
let mut writer = KvWriterFieldId::memory();
|
||||
document_entries
|
||||
.into_iter()
|
||||
.for_each(|(id, value)| writer.insert(id, value.get()).unwrap());
|
||||
let new = writer.into_boxed();
|
||||
|
||||
match current {
|
||||
Some(current) => {
|
||||
let update = Update::create(docid, current.boxed(), new);
|
||||
Ok(DocumentChange::Update(update))
|
||||
}
|
||||
None => Ok(DocumentChange::Insertion(Insertion::create(docid, new))),
|
||||
}
|
||||
}
|
||||
Some(InnerDocOp::Deletion) => {
|
||||
let deletion = match current {
|
||||
Some(current) => Deletion::create(docid, current.boxed()),
|
||||
None => todo!("Do that with Louis"),
|
||||
};
|
||||
Ok(DocumentChange::Deletion(deletion))
|
||||
}
|
||||
None => unreachable!("We must not have empty set of operations on a document"),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct MergeDocumentForUpdates;
|
||||
|
||||
impl MergeChanges for MergeDocumentForUpdates {
|
||||
const USELESS_PREVIOUS_CHANGES: bool = false;
|
||||
|
||||
/// Reorders to read the first changes first so that it's faster to read the first one and then the rest.
|
||||
fn sort_key(docops: &[InnerDocOp]) -> usize {
|
||||
let f = |ido: &_| match ido {
|
||||
InnerDocOp::Addition(add) => Some(add.content.as_ptr() as usize),
|
||||
InnerDocOp::Deletion => None,
|
||||
};
|
||||
docops.iter().find_map(f).unwrap_or(0)
|
||||
}
|
||||
|
||||
/// Reads the previous version of a document from the database, the new versions
|
||||
/// in the grenad update files and merges them to generate a new boxed obkv.
|
||||
///
|
||||
/// This function is only meant to be used when doing an update and not a replacement.
|
||||
fn merge(
|
||||
rtxn: &RoTxn,
|
||||
index: &Index,
|
||||
fields_ids_map: &FieldsIdsMap,
|
||||
docid: DocumentId,
|
||||
external_docid: String,
|
||||
operations: &[InnerDocOp],
|
||||
) -> Result<DocumentChange> {
|
||||
let mut document = BTreeMap::<_, Cow<_>>::new();
|
||||
let current = index.documents.remap_data_type::<Bytes>().get(rtxn, &docid)?;
|
||||
let current: Option<&KvReaderFieldId> = current.map(Into::into);
|
||||
|
||||
if operations.is_empty() {
|
||||
unreachable!("We must not have empty set of operations on a document");
|
||||
}
|
||||
|
||||
let last_deletion = operations.iter().rposition(|op| matches!(op, InnerDocOp::Deletion));
|
||||
let operations = &operations[last_deletion.map_or(0, |i| i + 1)..];
|
||||
|
||||
// If there was a deletion we must not start
|
||||
// from the original document but from scratch.
|
||||
if last_deletion.is_none() {
|
||||
if let Some(current) = current {
|
||||
current.into_iter().for_each(|(k, v)| {
|
||||
document.insert(k, v.into());
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
if operations.is_empty() {
|
||||
let deletion = match current {
|
||||
Some(current) => Deletion::create(docid, current.boxed()),
|
||||
None => todo!("Do that with Louis"),
|
||||
};
|
||||
return Ok(DocumentChange::Deletion(deletion));
|
||||
}
|
||||
|
||||
for operation in operations {
|
||||
let DocumentOffset { content } = match operation {
|
||||
InnerDocOp::Addition(offset) => offset,
|
||||
InnerDocOp::Deletion => {
|
||||
unreachable!("Deletion in document operations")
|
||||
}
|
||||
};
|
||||
|
||||
let map: TopLevelMap = serde_json::from_slice(content).unwrap();
|
||||
for (key, v) in map.0 {
|
||||
let id = fields_ids_map.id(key.as_ref()).unwrap();
|
||||
document.insert(id, v.get().as_bytes().to_vec().into());
|
||||
}
|
||||
}
|
||||
|
||||
let mut writer = KvWriterFieldId::memory();
|
||||
document.into_iter().for_each(|(id, value)| writer.insert(id, value).unwrap());
|
||||
let new = writer.into_boxed();
|
||||
|
||||
match current {
|
||||
Some(current) => {
|
||||
let update = Update::create(docid, current.boxed(), new);
|
||||
Ok(DocumentChange::Update(update))
|
||||
}
|
||||
None => {
|
||||
let insertion = Insertion::create(docid, new);
|
||||
Ok(DocumentChange::Insertion(insertion))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
316
milli/src/update/new/indexer/mod.rs
Normal file
316
milli/src/update/new/indexer/mod.rs
Normal file
@@ -0,0 +1,316 @@
|
||||
use std::sync::RwLock;
|
||||
use std::thread::{self, Builder};
|
||||
|
||||
use big_s::S;
|
||||
pub use document_deletion::DocumentDeletion;
|
||||
pub use document_operation::DocumentOperation;
|
||||
use heed::{RoTxn, RwTxn};
|
||||
pub use partial_dump::PartialDump;
|
||||
use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator};
|
||||
use rayon::ThreadPool;
|
||||
pub use update_by_function::UpdateByFunction;
|
||||
|
||||
use super::channel::*;
|
||||
use super::document_change::DocumentChange;
|
||||
use super::extract::*;
|
||||
use super::merger::merge_grenad_entries;
|
||||
use super::{ItemsPool, StdResult, TopLevelMap};
|
||||
use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
|
||||
use crate::update::new::channel::ExtractorSender;
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
|
||||
|
||||
mod document_deletion;
|
||||
mod document_operation;
|
||||
mod partial_dump;
|
||||
mod update_by_function;
|
||||
|
||||
pub trait DocumentChanges<'p> {
|
||||
type Parameter: 'p;
|
||||
|
||||
fn document_changes(
|
||||
self,
|
||||
fields_ids_map: &mut FieldsIdsMap,
|
||||
param: Self::Parameter,
|
||||
) -> Result<impl IndexedParallelIterator<Item = Result<DocumentChange>> + Clone + 'p>;
|
||||
}
|
||||
|
||||
/// This is the main function of this crate.
|
||||
///
|
||||
/// Give it the output of the [`Indexer::document_changes`] method and it will execute it in the [`rayon::ThreadPool`].
|
||||
///
|
||||
/// TODO return stats
|
||||
pub fn index<PI>(
|
||||
wtxn: &mut RwTxn,
|
||||
index: &Index,
|
||||
fields_ids_map: FieldsIdsMap,
|
||||
pool: &ThreadPool,
|
||||
document_changes: PI,
|
||||
) -> Result<()>
|
||||
where
|
||||
PI: IndexedParallelIterator<Item = Result<DocumentChange>> + Send + Clone,
|
||||
{
|
||||
let (merger_sender, writer_receiver) = merger_writer_channel(10_000);
|
||||
// This channel acts as a rendezvous point to ensure that we are one task ahead
|
||||
let (extractor_sender, merger_receiver) = extractors_merger_channels(4);
|
||||
|
||||
let fields_ids_map_lock = RwLock::new(fields_ids_map);
|
||||
let global_fields_ids_map = GlobalFieldsIdsMap::new(&fields_ids_map_lock);
|
||||
let global_fields_ids_map_clone = global_fields_ids_map.clone();
|
||||
|
||||
thread::scope(|s| {
|
||||
// TODO manage the errors correctly
|
||||
let current_span = tracing::Span::current();
|
||||
let handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
|
||||
pool.in_place_scope(|_s| {
|
||||
let span = tracing::trace_span!(target: "indexing::documents", parent: ¤t_span, "extract");
|
||||
let _entered = span.enter();
|
||||
let document_changes = document_changes.into_par_iter();
|
||||
|
||||
// document but we need to create a function that collects and compresses documents.
|
||||
let document_sender = extractor_sender.document_sender();
|
||||
document_changes.clone().into_par_iter().try_for_each(|result| {
|
||||
match result? {
|
||||
DocumentChange::Deletion(deletion) => {
|
||||
let docid = deletion.docid();
|
||||
document_sender.delete(docid).unwrap();
|
||||
}
|
||||
DocumentChange::Update(update) => {
|
||||
let docid = update.docid();
|
||||
let content = update.new();
|
||||
document_sender.insert(docid, content.boxed()).unwrap();
|
||||
}
|
||||
DocumentChange::Insertion(insertion) => {
|
||||
let docid = insertion.docid();
|
||||
let content = insertion.new();
|
||||
document_sender.insert(docid, content.boxed()).unwrap();
|
||||
// extracted_dictionary_sender.send(self, dictionary: &[u8]);
|
||||
}
|
||||
}
|
||||
Ok(()) as Result<_>
|
||||
})?;
|
||||
|
||||
document_sender.finish().unwrap();
|
||||
|
||||
const TEN_GIB: usize = 10 * 1024 * 1024 * 1024;
|
||||
let max_memory = TEN_GIB / dbg!(rayon::current_num_threads());
|
||||
let grenad_parameters = GrenadParameters {
|
||||
max_memory: Some(max_memory),
|
||||
..GrenadParameters::default()
|
||||
};
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
|
||||
let _entered = span.enter();
|
||||
extract_and_send_docids::<
|
||||
FacetedDocidsExtractor,
|
||||
FacetDocids,
|
||||
>(
|
||||
index,
|
||||
&global_fields_ids_map,
|
||||
grenad_parameters,
|
||||
document_changes.clone(),
|
||||
&extractor_sender,
|
||||
)?;
|
||||
}
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
|
||||
let _entered = span.enter();
|
||||
|
||||
let WordDocidsMergers {
|
||||
word_fid_docids,
|
||||
word_docids,
|
||||
exact_word_docids,
|
||||
word_position_docids,
|
||||
fid_word_count_docids,
|
||||
} = WordDocidsExtractors::run_extraction(index, &global_fields_ids_map, grenad_parameters, document_changes.clone())?;
|
||||
extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
|
||||
extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
|
||||
extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
|
||||
extractor_sender.send_searchable::<WordPositionDocids>(word_position_docids).unwrap();
|
||||
extractor_sender.send_searchable::<FidWordCountDocids>(fid_word_count_docids).unwrap();
|
||||
}
|
||||
|
||||
// {
|
||||
// let span = tracing::trace_span!(target: "indexing::documents::extract", "exact_word_docids");
|
||||
// let _entered = span.enter();
|
||||
// extract_and_send_docids::<ExactWordDocidsExtractor, ExactWordDocids>(
|
||||
// index,
|
||||
// &global_fields_ids_map,
|
||||
// grenad_parameters,
|
||||
// document_changes.clone(),
|
||||
// &extractor_sender,
|
||||
// )?;
|
||||
// }
|
||||
|
||||
// {
|
||||
// let span = tracing::trace_span!(target: "indexing::documents::extract", "word_position_docids");
|
||||
// let _entered = span.enter();
|
||||
// extract_and_send_docids::<WordPositionDocidsExtractor, WordPositionDocids>(
|
||||
// index,
|
||||
// &global_fields_ids_map,
|
||||
// grenad_parameters,
|
||||
// document_changes.clone(),
|
||||
// &extractor_sender,
|
||||
// )?;
|
||||
// }
|
||||
|
||||
// {
|
||||
// let span = tracing::trace_span!(target: "indexing::documents::extract", "fid_word_count_docids");
|
||||
// let _entered = span.enter();
|
||||
// extract_and_send_docids::<FidWordCountDocidsExtractor, FidWordCountDocids>(
|
||||
// index,
|
||||
// &global_fields_ids_map,
|
||||
// GrenadParameters::default(),
|
||||
// document_changes.clone(),
|
||||
// &extractor_sender,
|
||||
// )?;
|
||||
// }
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
|
||||
let _entered = span.enter();
|
||||
extract_and_send_docids::<
|
||||
WordPairProximityDocidsExtractor,
|
||||
WordPairProximityDocids,
|
||||
>(
|
||||
index,
|
||||
&global_fields_ids_map,
|
||||
grenad_parameters,
|
||||
document_changes.clone(),
|
||||
&extractor_sender,
|
||||
)?;
|
||||
}
|
||||
|
||||
{
|
||||
let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH");
|
||||
let _entered = span.enter();
|
||||
}
|
||||
|
||||
// TODO THIS IS TOO MUCH
|
||||
// - [ ] Extract fieldid docid facet number
|
||||
// - [ ] Extract fieldid docid facet string
|
||||
// - [ ] Extract facetid string fst
|
||||
// - [ ] Extract facetid normalized string strings
|
||||
|
||||
// TODO Inverted Indexes again
|
||||
// - [x] Extract fieldid facet isempty docids
|
||||
// - [x] Extract fieldid facet isnull docids
|
||||
// - [x] Extract fieldid facet exists docids
|
||||
|
||||
// TODO This is the normal system
|
||||
// - [x] Extract fieldid facet number docids
|
||||
// - [x] Extract fieldid facet string docids
|
||||
|
||||
Ok(()) as Result<_>
|
||||
})
|
||||
})?;
|
||||
|
||||
// TODO manage the errors correctly
|
||||
let current_span = tracing::Span::current();
|
||||
let handle2 = Builder::new().name(S("indexer-merger")).spawn_scoped(s, move || {
|
||||
let span =
|
||||
tracing::trace_span!(target: "indexing::documents", parent: ¤t_span, "merge");
|
||||
let _entered = span.enter();
|
||||
|
||||
let rtxn_pool = ItemsPool::new(|| index.read_txn().map_err(Into::into));
|
||||
merge_grenad_entries(
|
||||
merger_receiver,
|
||||
merger_sender,
|
||||
&rtxn_pool,
|
||||
index,
|
||||
global_fields_ids_map_clone,
|
||||
)
|
||||
})?;
|
||||
|
||||
let mut entries_count = 0;
|
||||
for operation in writer_receiver {
|
||||
let database = operation.database(index);
|
||||
match operation.entry() {
|
||||
EntryOperation::Delete(e) => {
|
||||
if !database.delete(wtxn, e.entry())? {
|
||||
unreachable!("We tried to delete an unknown key")
|
||||
}
|
||||
}
|
||||
EntryOperation::Write(e) => {
|
||||
entries_count += 1;
|
||||
database.put(wtxn, e.key(), e.value())?
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!("We saw {entries_count}");
|
||||
|
||||
/// TODO handle the panicking threads
|
||||
handle.join().unwrap()?;
|
||||
handle2.join().unwrap()?;
|
||||
|
||||
Ok(()) as Result<_>
|
||||
})?;
|
||||
|
||||
let fields_ids_map = fields_ids_map_lock.into_inner().unwrap();
|
||||
index.put_fields_ids_map(wtxn, &fields_ids_map)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// TODO: GrenadParameters::default() should be removed in favor a passed parameter
|
||||
/// TODO: manage the errors correctly
|
||||
/// TODO: we must have a single trait that also gives the extractor type
|
||||
fn extract_and_send_docids<E: DocidsExtractor, D: MergerOperationType>(
|
||||
index: &Index,
|
||||
fields_ids_map: &GlobalFieldsIdsMap,
|
||||
indexer: GrenadParameters,
|
||||
document_changes: impl IntoParallelIterator<Item = Result<DocumentChange>>,
|
||||
sender: &ExtractorSender,
|
||||
) -> Result<()> {
|
||||
let merger = E::run_extraction(index, fields_ids_map, indexer, document_changes)?;
|
||||
Ok(sender.send_searchable::<D>(merger).unwrap())
|
||||
}
|
||||
|
||||
/// Returns the primary key *field id* that has already been set for this index or the
|
||||
/// one we will guess by searching for the first key that contains "id" as a substring.
|
||||
/// TODO move this elsewhere
|
||||
pub fn retrieve_or_guess_primary_key<'a>(
|
||||
rtxn: &'a RoTxn<'a>,
|
||||
index: &Index,
|
||||
fields_ids_map: &mut FieldsIdsMap,
|
||||
first_document: Option<&'a TopLevelMap<'_>>,
|
||||
) -> Result<StdResult<PrimaryKey<'a>, UserError>> {
|
||||
match index.primary_key(rtxn)? {
|
||||
Some(primary_key) => match PrimaryKey::new(primary_key, fields_ids_map) {
|
||||
Some(primary_key) => Ok(Ok(primary_key)),
|
||||
None => unreachable!("Why is the primary key not in the fidmap?"),
|
||||
},
|
||||
None => {
|
||||
let first_document = match first_document {
|
||||
Some(document) => document,
|
||||
None => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
||||
};
|
||||
|
||||
let mut guesses: Vec<&str> = first_document
|
||||
.keys()
|
||||
.map(AsRef::as_ref)
|
||||
.filter(|name| name.to_lowercase().ends_with(DEFAULT_PRIMARY_KEY))
|
||||
.collect();
|
||||
|
||||
// sort the keys in lexicographical order, so that fields are always in the same order.
|
||||
guesses.sort_unstable();
|
||||
|
||||
match guesses.as_slice() {
|
||||
[] => Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
|
||||
[name] => {
|
||||
tracing::info!("Primary key was not specified in index. Inferred to '{name}'");
|
||||
match fields_ids_map.insert(name) {
|
||||
Some(field_id) => Ok(Ok(PrimaryKey::Flat { name, field_id })),
|
||||
None => Ok(Err(UserError::AttributeLimitReached)),
|
||||
}
|
||||
}
|
||||
multiple => Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
|
||||
candidates: multiple.iter().map(|candidate| candidate.to_string()).collect(),
|
||||
})),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user