diff --git a/Cargo.lock b/Cargo.lock index 0a2a226fa..1baecd1c8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -55,7 +55,7 @@ dependencies = [ "flate2", "foldhash", "futures-core", - "h2 0.3.26", + "h2 0.3.27", "http 0.2.12", "httparse", "httpdate", @@ -65,7 +65,7 @@ dependencies = [ "mime", "percent-encoding", "pin-project-lite", - "rand 0.9.1", + "rand 0.9.2", "sha1", "smallvec", "tokio", @@ -80,7 +80,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e01ed3140b2f8d422c68afa1ed2e85d996ea619c988ac834d255db32138655cb" dependencies = [ "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -121,7 +121,7 @@ dependencies = [ "futures-core", "futures-util", "mio", - "socket2", + "socket2 0.5.10", "tokio", "tracing", ] @@ -202,7 +202,7 @@ dependencies = [ "serde_json", "serde_urlencoded", "smallvec", - "socket2", + "socket2 0.5.10", "time", "tracing", "url", @@ -217,14 +217,14 @@ dependencies = [ "actix-router", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] name = "actix-web-lab" -version = "0.24.1" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a33034dd88446a5deb20e42156dbfe43d07e0499345db3ae65b3f51854190531" +checksum = "781b41742426e73ab2fe531a7d204b9ffa0815a63e358bcd24f2c3c69cda644a" dependencies = [ "actix-http", "actix-router", @@ -268,9 +268,9 @@ dependencies = [ [[package]] name = "adler2" -version = "2.0.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" [[package]] name = "adler32" @@ -358,9 +358,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.18" +version = "0.6.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" +checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" dependencies = [ "anstyle", "anstyle-parse", @@ -373,44 +373,44 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" [[package]] name = "anstyle-parse" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.2" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79947af37f4177cfead1110013d678905c37501914fba0efea834c3fe9a8d60c" +checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "anstyle-wincon" -version = "3.0.8" +version = "3.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6680de5231bd6ee4c6191b8a1325daa282b415391ec9d3a37bd34f2060dc73fa" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "anyhow" -version = "1.0.98" +version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" +checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" dependencies = [ "backtrace", ] @@ -444,9 +444,9 @@ checksum = 
"7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arroy" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "733ce4c7a5250d770985c56466fac41238ffdaec0502bee64a4289e300164c5e" +checksum = "8578a72223dfa13dfd9fc144d15260d134361789ebdea9b16e85a511edc73c7d" dependencies = [ "bytemuck", "byteorder", @@ -460,7 +460,7 @@ dependencies = [ "rayon", "roaring", "tempfile", - "thiserror 2.0.12", + "thiserror 2.0.14", "tracing", ] @@ -492,7 +492,7 @@ dependencies = [ "secrecy", "serde", "serde_json", - "thiserror 2.0.12", + "thiserror 2.0.14", "tokio", "tokio-stream", "tokio-util", @@ -506,7 +506,7 @@ source = "git+https://github.com/meilisearch/async-openai?branch=better-error-ha dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -517,7 +517,7 @@ checksum = "e539d3fca749fcee5236ab05e93a52867dd549cc157c8cb7f99595f3cedffdb5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -528,9 +528,9 @@ checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" [[package]] name = "autocfg" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "backoff" @@ -624,6 +624,26 @@ dependencies = [ "serde", ] +[[package]] +name = "bincode" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36eaf5d7b090263e8150820482d5d93cd964a81e4019913c972f4edcc6edb740" +dependencies = [ + "bincode_derive", + "serde", + "unty", +] + +[[package]] +name = "bincode_derive" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf95709a440f45e986983918d0e8a1f30a9b1df04918fc828670606804ac3c09" +dependencies = [ 
+ "virtue", +] + [[package]] name = "bindgen" version = "0.70.1" @@ -639,7 +659,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -733,7 +753,7 @@ dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -779,9 +799,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.18.1" +version = "3.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "793db76d6187cd04dff33004d8e6c9cc4e05cd330500379d2394209271b4aeee" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" dependencies = [ "allocator-api2 0.2.21", "serde", @@ -796,7 +816,7 @@ dependencies = [ "allocator-api2 0.2.21", "bitpacking", "bumpalo", - "hashbrown 0.15.4", + "hashbrown 0.15.5", "serde", "serde_json", ] @@ -842,22 +862,22 @@ checksum = "175812e0be2bccb6abe50bb8d566126198344f707e304f45c648fd8f2cc0365e" [[package]] name = "bytemuck" -version = "1.23.1" +version = "1.23.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c76a5792e44e4abe34d3abf15636779261d45a7450612059293d1d2cfc63422" +checksum = "3995eaeebcdf32f91f980d360f78732ddc061097ab4e39991ae7a6ace9194677" dependencies = [ "bytemuck_derive", ] [[package]] name = "bytemuck_derive" -version = "1.9.3" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ecc273b49b3205b83d648f0690daa588925572cc5063745bfe547fe7ec8e1a1" +checksum = "4f154e572231cb6ba2bd1176980827e3d5dc04cc183a75dea38109fbdd672d29" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -883,28 +903,18 @@ dependencies = [ [[package]] name = "bzip2" -version = "0.5.2" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" +checksum = "bea8dcd42434048e4f7a304411d9273a411f647446c1234a65ce0554923f4cff" 
dependencies = [ - "bzip2-sys", -] - -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", + "libbz2-rs-sys", ] [[package]] name = "camino" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0da45bc31171d8d6960122e222a67740df867c1dd53b4d51caa297084c185cab" +checksum = "5d07aa9a93b00c76f71bc35d598bed923f6d4f3a9ca5c24b7737ae1a292841c0" dependencies = [ "serde", ] @@ -923,7 +933,7 @@ dependencies = [ "memmap2", "num-traits", "num_cpus", - "rand 0.9.1", + "rand 0.9.2", "rand_distr", "rayon", "safetensors", @@ -969,7 +979,7 @@ dependencies = [ "candle-nn", "fancy-regex", "num-traits", - "rand 0.9.1", + "rand 0.9.2", "rayon", "serde", "serde_json", @@ -997,7 +1007,7 @@ dependencies = [ "serde-untagged", "serde-value", "thiserror 1.0.69", - "toml", + "toml 0.8.23", "unicode-xid", "url", ] @@ -1014,17 +1024,17 @@ dependencies = [ "semver", "serde", "serde_json", - "thiserror 2.0.12", + "thiserror 2.0.14", ] [[package]] name = "cargo_toml" -version = "0.22.1" +version = "0.22.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02260d489095346e5cafd04dea8e8cb54d1d74fcd759022a9b72986ebe9a1257" +checksum = "374b7c592d9c00c1f4972ea58390ac6b18cbb6ab79011f3bdc90a0b82ca06b77" dependencies = [ "serde", - "toml", + "toml 0.9.5", ] [[package]] @@ -1035,9 +1045,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.2.25" +version = "1.2.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0fc897dc1e865cc67c0e05a836d9d3f1df3cbe442aa4a9473b18e12624a4951" +checksum = "2352e5597e9c544d5e6d9c95190d5d27738ade584fa8db0a16e130e5c2b5296e" dependencies = [ "jobserver", "libc", @@ -1064,9 +1074,9 @@ dependencies = [ 
[[package]] name = "cfg-if" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" [[package]] name = "cfg_aliases" @@ -1156,9 +1166,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.40" +version = "4.5.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" +checksum = "1fc0e74a703892159f5ae7d3aac52c8e6c392f5ae5f359c70b5881d60aaac318" dependencies = [ "clap_builder", "clap_derive", @@ -1166,9 +1176,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.40" +version = "4.5.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" +checksum = "b3e7f4214277f3c7aa526a59dd3fbe306a370daee1f8b7b8c987069cd8e888a8" dependencies = [ "anstream", "anstyle", @@ -1178,21 +1188,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.40" +version = "4.5.45" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" +checksum = "14cb31bb0a7d536caef2639baa7fad459e15c3144efefa6dbd1c84562c4739f6" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] name = "clap_lex" -version = "0.7.4" +version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" [[package]] name = "color-spantrace" @@ -1208,9 +1218,9 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "5b63caa9aa9397e2d9480a9b13673856c78d8ac123288526c37d7839f2a86990" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" [[package]] name = "concat-arrays" @@ -1327,9 +1337,9 @@ dependencies = [ [[package]] name = "crc32fast" -version = "1.4.2" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" dependencies = [ "cfg-if", ] @@ -1412,9 +1422,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" [[package]] name = "crypto-common" @@ -1449,9 +1459,9 @@ dependencies = [ [[package]] name = "cudarc" -version = "0.16.4" +version = "0.16.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9574894139a982bf26fbb44473a9d416c015e779c51ef0fbc0789f1a1c17b25" +checksum = "17200eb07e7d85a243aa1bf4569a7aa998385ba98d14833973a817a63cc86e92" dependencies = [ "half", "libloading", @@ -1502,7 +1512,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -1524,7 +1534,7 @@ checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" dependencies = [ "darling_core 0.20.11", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -1584,7 +1594,7 @@ checksum = "30542c1ad912e0e3d22a1935c290e12e8a29d704a420177a31faad4a601a0800" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -1626,7 +1636,7 @@ dependencies = [ "darling 0.20.11", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ 
-1646,7 +1656,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" dependencies = [ "derive_builder_core 0.20.2", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -1666,7 +1676,7 @@ checksum = "bda628edc44c4bb645fbe0f758797143e4e07926f7ebf4e9bdfbd3d2ce621df3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", "unicode-xid", ] @@ -1696,7 +1706,7 @@ dependencies = [ "convert_case 0.6.0", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -1760,7 +1770,7 @@ checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -1790,7 +1800,7 @@ dependencies = [ "serde_json", "tar", "tempfile", - "thiserror 2.0.12", + "thiserror 2.0.14", "time", "tracing", "uuid", @@ -1921,7 +1931,7 @@ dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -1941,7 +1951,7 @@ checksum = "a1ab991c1362ac86c61ab6f556cff143daa22e5a15e4e189df818b2fd19fe65b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -1962,12 +1972,12 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.12" +version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea14ef9355e3beab063703aa9dab15afd25f0667c341310c1e5274bb1d0da18" +checksum = "778e2ac28f6c47af28e4907f13ffd1e1ddbd400980a9abd7c8df189bf578a5ad" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -2009,7 +2019,7 @@ name = "file-store" version = "1.20.0" dependencies = [ "tempfile", - "thiserror 2.0.12", + "thiserror 2.0.14", "tracing", "uuid", ] @@ -2154,7 +2164,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -2493,7 
+2503,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi 0.11.1+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2532,9 +2542,9 @@ dependencies = [ [[package]] name = "glob" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" [[package]] name = "grenad" @@ -2551,9 +2561,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.26" +version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" +checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" dependencies = [ "bytes", "fnv", @@ -2570,9 +2580,9 @@ dependencies = [ [[package]] name = "h2" -version = "0.4.10" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9421a676d1b147b16b82c9225157dc629087ef8ec4d5e2960f9437a90dac0a5" +checksum = "f3c0b69cfcb4e1b9f1bf2f53f95f766e4661169728ec61cd3fe5a0166f2d1386" dependencies = [ "atomic-waker", "bytes", @@ -2597,10 +2607,33 @@ dependencies = [ "cfg-if", "crunchy", "num-traits", - "rand 0.9.1", + "rand 0.9.2", "rand_distr", ] +[[package]] +name = "hannoy" +version = "0.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b6a412d145918473a8257706599a1088c505047eef9cc6c63c494c95786044f" +dependencies = [ + "bytemuck", + "byteorder", + "hashbrown 0.15.5", + "heed", + "min-max-heap", + "page_size", + "papaya", + "rand 0.8.5", + "rayon", + "roaring", + "rustc-hash 2.1.1", + "steppe", + "thiserror 2.0.14", + "tinyvec", + "tracing", +] + [[package]] name = "hash32" version = "0.3.1" @@ -2631,9 +2664,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.4" +version = "0.15.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" dependencies = [ "allocator-api2 0.2.21", "equivalent", @@ -2687,7 +2720,7 @@ version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c255bdf46e07fb840d120a36dcc81f385140d7191c76a7391672675c01a55d" dependencies = [ - "bincode", + "bincode 1.3.3", "byteorder", "heed-traits", "serde", @@ -2696,9 +2729,9 @@ dependencies = [ [[package]] name = "hermit-abi" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f154ce46856750ed433c8649605bf7ed2de3bc35fd9d2a9f30cddd873c80cb08" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" [[package]] name = "hex" @@ -2797,7 +2830,7 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.10", + "h2 0.4.12", "http 1.3.1", "http-body", "httparse", @@ -2811,9 +2844,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.27.6" +version = "0.27.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03a01595e11bdcec50946522c32dde3fc6914743000a68b93000965f2f02406d" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" dependencies = [ "http 1.3.1", "hyper", @@ -2824,14 +2857,14 @@ dependencies = [ "tokio", "tokio-rustls", "tower-service", - "webpki-roots 1.0.0", + "webpki-roots 1.0.2", ] [[package]] name = "hyper-util" -version = "0.1.13" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1c293b6b3d21eca78250dc7dbebd6b9210ec5530e038cbfe0661b5c47ab06e8" +checksum = "8d9b05277c7e8da2c93a568989bb6207bef0112e8d17df7a6eda4a3cf143bc5e" dependencies = [ "base64 0.22.1", "bytes", @@ -2845,7 +2878,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2", + 
"socket2 0.6.0", "tokio", "tower-service", "tracing", @@ -2990,7 +3023,7 @@ dependencies = [ "libflate", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -3000,7 +3033,7 @@ dependencies = [ "anyhow", "backoff", "big_s", - "bincode", + "bincode 1.3.3", "bumpalo", "bumparaw-collections", "byte-unit", @@ -3026,7 +3059,7 @@ dependencies = [ "serde_json", "synchronoise", "tempfile", - "thiserror 2.0.12", + "thiserror 2.0.14", "time", "tracing", "ureq", @@ -3035,12 +3068,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.9.0" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" dependencies = [ "equivalent", - "hashbrown 0.15.4", + "hashbrown 0.15.5", "serde", ] @@ -3090,6 +3123,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "io-uring" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d93587f37623a1a17d94ef2bc9ada592f5465fe7732084ab7beefabe5c77c0c4" +dependencies = [ + "bitflags 2.9.1", + "cfg-if", + "libc", +] + [[package]] name = "ipnet" version = "2.11.0" @@ -3196,9 +3240,9 @@ dependencies = [ [[package]] name = "jieba-rs" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b06096b4b61fb4bfdbf16c6a968ea2d6be1ac9617cf3db741c3b641e6c290a35" +checksum = "f5dd552bbb95d578520ee68403bf8aaf0dbbb2ce55b0854d019f9350ad61040a" dependencies = [ "cedarwood", "fxhash", @@ -3293,10 +3337,16 @@ dependencies = [ ] [[package]] -name = "libc" -version = "0.2.172" +name = "libbz2-rs-sys" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" +checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" + +[[package]] 
+name = "libc" +version = "0.2.175" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" [[package]] name = "libflate" @@ -3324,9 +3374,9 @@ dependencies = [ [[package]] name = "libgit2-sys" -version = "0.18.1+1.9.0" +version = "0.18.2+1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1dcb20f84ffcdd825c7a311ae347cce604a6f084a767dec4a4929829645290e" +checksum = "1c42fe03df2bd3c53a3a9c7317ad91d80c81cd1fb0caec8d7cc4cd2bfa10c222" dependencies = [ "cc", "libc", @@ -3341,14 +3391,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "07033963ba89ebaf1584d767badaa2e8fcec21aedea6b8c0346d487d49c28667" dependencies = [ "cfg-if", - "windows-targets 0.52.6", + "windows-targets 0.53.3", ] [[package]] name = "liblzma" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66352d7a8ac12d4877b6e6ea5a9b7650ee094257dc40889955bea5bc5b08c1d0" +checksum = "0791ab7e08ccc8e0ce893f6906eb2703ed8739d8e89b57c0714e71bad09024c8" dependencies = [ "liblzma-sys", ] @@ -3393,9 +3443,9 @@ dependencies = [ [[package]] name = "libredox" -version = "0.1.3" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c0ff37bd590ca25063e35af745c343cb7a0271906fb7b37e4813e8f79f00268d" +checksum = "391290121bad3d37fbddad76d8f5d1c1c314cfc646d143d7e07a3086ddff0ce3" dependencies = [ "bitflags 2.9.1", "libc", @@ -3425,12 +3475,12 @@ dependencies = [ [[package]] name = "lindera" -version = "0.43.1" +version = "0.43.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f20720cb4206e87b6844b05c66b23301e7bb532718f200ff55bbbdfbce9b7f2b" +checksum = "877750979d709bb5fb2d616a2c9968301ead80147db2270c8f23d8239467f159" dependencies = [ "anyhow", - "bincode", + "bincode 2.0.1", "byteorder", "csv", "kanaria", @@ -3455,11 +3505,11 @@ dependencies = 
[ [[package]] name = "lindera-cc-cedict" -version = "0.43.1" +version = "0.43.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f6ddd4aeaeaf1ce47ea5785bd6a273179d32df4af4b306d9b65a7a7f81a0e61" +checksum = "4efb27e037efbd41fdce30e3771084a46040cdfdbec718425879f8aa7dfa16f2" dependencies = [ - "bincode", + "bincode 2.0.1", "byteorder", "lindera-dictionary", "once_cell", @@ -3468,12 +3518,12 @@ dependencies = [ [[package]] name = "lindera-dictionary" -version = "0.43.1" +version = "0.43.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9b5e417c4c6e001459e019b178f65f759be9c2cbf2d9bd803ec5d8ed0e62124" +checksum = "4cbb45e81527092e1af24c45bf068376df34faaa9f98d44a99ec59c7edfdb45a" dependencies = [ "anyhow", - "bincode", + "bincode 2.0.1", "byteorder", "csv", "derive_builder 0.20.2", @@ -3485,22 +3535,22 @@ dependencies = [ "log", "md5", "once_cell", - "rand 0.9.1", + "rand 0.9.2", "reqwest", "serde", "tar", - "thiserror 2.0.12", + "thiserror 2.0.14", "tokio", "yada", ] [[package]] name = "lindera-ipadic" -version = "0.43.1" +version = "0.43.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2867975f1b92d1093ccbb52c5c1664a56dfbd27a2fece0166c765ad1f043f31" +checksum = "447887ebb06c9faf7e9b41c03b491ae3e12aca2c0e119a865b682863a1b538aa" dependencies = [ - "bincode", + "bincode 2.0.1", "byteorder", "lindera-dictionary", "once_cell", @@ -3509,11 +3559,11 @@ dependencies = [ [[package]] name = "lindera-ipadic-neologd" -version = "0.43.1" +version = "0.43.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c54c4c2d3fb8b380d0ace5ae97111ca444bcfa7721966f552117d57f07d8b3b1" +checksum = "8da7829c17dee5b1068f241c1989f83a2d98307ed5d3b87fcc91bd06b7a7b44e" dependencies = [ - "bincode", + "bincode 2.0.1", "byteorder", "lindera-dictionary", "once_cell", @@ -3522,11 +3572,11 @@ dependencies = [ [[package]] name = "lindera-ko-dic" -version = "0.43.1" +version = "0.43.3" 
source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f495e64f62deee60d9b71dbe3fd39b69b8688c9d591842f81f94e200eb4d81f" +checksum = "8dc88354c6f8fbdeb7aa3d44673eb0ca03b7ad6e8d4edb62332604bb7806d17b" dependencies = [ - "bincode", + "bincode 2.0.1", "byteorder", "lindera-dictionary", "once_cell", @@ -3535,11 +3585,11 @@ dependencies = [ [[package]] name = "lindera-unidic" -version = "0.43.1" +version = "0.43.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e85ff97ce04c519fbca0f05504ea028761ccc456b1e84cf1e75fac57f9b3caf1" +checksum = "4cf374587e0202193c73f443cb0ba0d34ff0013949b2355aeeeb0f840679681e" dependencies = [ - "bincode", + "bincode 2.0.1", "byteorder", "lindera-dictionary", "once_cell", @@ -3601,7 +3651,7 @@ checksum = "de66c928222984aea59fcaed8ba627f388aaac3c1f57dcb05cc25495ef8faefe" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -3674,7 +3724,7 @@ version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f8cc7106155f10bdf99a6f379688f543ad6596a415375b36a59a054ceda1198" dependencies = [ - "hashbrown 0.15.4", + "hashbrown 0.15.5", ] [[package]] @@ -3708,7 +3758,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -3813,10 +3863,10 @@ dependencies = [ "temp-env", "tempfile", "termcolor", - "thiserror 2.0.12", + "thiserror 2.0.14", "time", "tokio", - "toml", + "toml 0.8.23", "tracing", "tracing-actix-web", "tracing-subscriber", @@ -3828,7 +3878,7 @@ dependencies = [ "uuid", "wiremock", "yaup", - "zip 4.1.0", + "zip 4.3.0", ] [[package]] @@ -3845,7 +3895,7 @@ dependencies = [ "serde", "serde_json", "sha2", - "thiserror 2.0.12", + "thiserror 2.0.14", "time", "uuid", ] @@ -3878,7 +3928,7 @@ dependencies = [ "serde_json", "tar", "tempfile", - "thiserror 2.0.12", + "thiserror 2.0.14", "time", "tokio", "utoipa", @@ -3928,7 +3978,7 @@ dependencies = [ "bbqueue", "big_s", 
"bimap", - "bincode", + "bincode 1.3.3", "bstr", "bumpalo", "bumparaw-collections", @@ -3952,7 +4002,8 @@ dependencies = [ "fxhash", "geoutils", "grenad", - "hashbrown 0.15.4", + "hannoy", + "hashbrown 0.15.5", "heed", "hf-hub", "indexmap", @@ -3983,8 +4034,9 @@ dependencies = [ "smallstr", "smallvec", "smartstring", + "steppe", "tempfile", - "thiserror 2.0.12", + "thiserror 2.0.14", "thread_local", "tiktoken-rs", "time", @@ -4022,6 +4074,12 @@ dependencies = [ "unicase", ] +[[package]] +name = "min-max-heap" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2687e6cf9c00f48e9284cf9fd15f2ef341d03cc7743abf9df4c5f07fdee50b18" + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -4030,9 +4088,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" [[package]] name = "miniz_oxide" -version = "0.8.8" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" dependencies = [ "adler2", ] @@ -4045,7 +4103,7 @@ checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" dependencies = [ "libc", "log", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi 0.11.1+wasi-snapshot-preview1", "windows-sys 0.59.0", ] @@ -4067,7 +4125,7 @@ checksum = "c402a4092d5e204f32c9e155431046831fa712637043c58cb73bc6bc6c9663b5" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -4229,23 +4287,24 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e613fc340b2220f734a8595782c551f1250e969d87d3be1ae0579e8d4065179" +checksum = "a973b4e44ce6cad84ce69d797acf9a044532e4184c4f267913d1b546a0727b7a" dependencies = [ "num_enum_derive", + "rustversion", ] [[package]] name = 
"num_enum_derive" -version = "0.7.3" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af1844ef2428cc3e1cb900be36181049ef3d3193c63e43026cfe202983b27a56" +checksum = "77e878c846a8abae00dd069496dbe8751b16ac1c3d6bd2a7283a938e8228f90d" dependencies = [ "proc-macro-crate", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -4392,9 +4451,9 @@ dependencies = [ [[package]] name = "owo-colors" -version = "4.2.1" +version = "4.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26995317201fa17f3656c36716aed4a7c81743a9634ac4c99c0eeda495db0cec" +checksum = "48dd4f4a2c8405440fd0462561f0e5806bd0f77e86f51c761481bdd4018b545e" [[package]] name = "page_size" @@ -4406,6 +4465,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "papaya" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f92dd0b07c53a0a0c764db2ace8c541dc47320dad97c2200c2a637ab9dd2328f" +dependencies = [ + "equivalent", + "seize", +] + [[package]] name = "parking_lot" version = "0.12.4" @@ -4486,20 +4555,20 @@ dependencies = [ [[package]] name = "pest" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "198db74531d58c70a361c42201efde7e2591e976d518caf7662a47dc5720e7b6" +checksum = "1db05f56d34358a8b1066f67cbb203ee3e7ed2ba674a6263a1d5ec6db2204323" dependencies = [ "memchr", - "thiserror 2.0.12", + "thiserror 2.0.14", "ucd-trie", ] [[package]] name = "pest_derive" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d725d9cfd79e87dccc9341a2ef39d1b6f6353d68c4b33c177febbe1a402c97c5" +checksum = "bb056d9e8ea77922845ec74a1c4e8fb17e7c218cc4fc11a15c5d25e189aa40bc" dependencies = [ "pest", "pest_generator", @@ -4507,24 +4576,23 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.8.0" +version = "2.8.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "db7d01726be8ab66ab32f9df467ae8b1148906685bbe75c82d1e65d7f5b3f841" +checksum = "87e404e638f781eb3202dc82db6760c8ae8a1eeef7fb3fa8264b2ef280504966" dependencies = [ "pest", "pest_meta", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] name = "pest_meta" -version = "2.8.0" +version = "2.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f9f832470494906d1fca5329f8ab5791cc60beb230c74815dff541cbd2b5ca0" +checksum = "edd1101f170f5903fde0914f899bb503d9ff5271d7ba76bbb70bea63690cc0d5" dependencies = [ - "once_cell", "pest", "sha2", ] @@ -4569,7 +4637,7 @@ dependencies = [ "phf_shared", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -4598,7 +4666,7 @@ checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -4664,9 +4732,9 @@ dependencies = [ [[package]] name = "portable-atomic" -version = "1.11.0" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" +checksum = "f84267b20a16ea918e43c6a88433c2d54fa145c92a811b5b047ccbe153674483" [[package]] name = "potential_utf" @@ -4683,6 +4751,12 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "ppmd-rust" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c834641d8ad1b348c9ee86dec3b9840d805acd5f24daa5f90c788951a52ff59b" + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -4703,9 +4777,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.95" +version = "1.0.97" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +checksum = "d61789d7719defeb74ea5fe81f2fdfdbd28a803847077cecce2ff14e1472f6f1" dependencies = [ "unicode-ident", ] @@ -4746,7 +4820,7 @@ dependencies = [ "parking_lot", "procfs", "protobuf", - "thiserror 2.0.12", + "thiserror 2.0.14", ] [[package]] @@ -4828,8 +4902,8 @@ dependencies = [ "quinn-udp", "rustc-hash 2.1.1", "rustls", - "socket2", - "thiserror 2.0.12", + "socket2 0.5.10", + "thiserror 2.0.14", "tokio", "tracing", "web-time", @@ -4844,13 +4918,13 @@ dependencies = [ "bytes", "getrandom 0.3.3", "lru-slab", - "rand 0.9.1", + "rand 0.9.2", "ring", "rustc-hash 2.1.1", "rustls", "rustls-pki-types", "slab", - "thiserror 2.0.12", + "thiserror 2.0.14", "tinyvec", "tracing", "web-time", @@ -4858,14 +4932,14 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.12" +version = "0.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee4e529991f949c5e25755532370b8af5d114acae52326361d68d47af64aa842" +checksum = "fcebb1209ee276352ef14ff8732e24cc2b02bbac986cd74a4c81bcb2f9881970" dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2", + "socket2 0.5.10", "tracing", "windows-sys 0.59.0", ] @@ -4881,9 +4955,9 @@ dependencies = [ [[package]] name = "r-efi" -version = "5.2.0" +version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" [[package]] name = "radium" @@ -4904,9 +4978,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.1" +version = "0.9.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.3", @@ -4957,7 +5031,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" dependencies = [ "num-traits", - "rand 0.9.1", + "rand 0.9.2", ] [[package]] @@ -4980,9 +5054,9 @@ dependencies = [ [[package]] name = "rayon" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" dependencies = [ "either", "rayon-core", @@ -5001,9 +5075,9 @@ dependencies = [ [[package]] name = "rayon-core" -version = "1.12.1" +version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" dependencies = [ "crossbeam-deque", "crossbeam-utils", @@ -5017,9 +5091,9 @@ checksum = "03251193000f4bd3b042892be858ee50e8b3719f2b08e5833ac4353724632430" [[package]] name = "redox_syscall" -version = "0.5.12" +version = "0.5.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "928fca9cf2aa042393a8325b9ead81d2f0df4cb12e1e24cef072922ccd99c5af" +checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" dependencies = [ "bitflags 2.9.1", ] @@ -5081,9 +5155,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.20" +version = "0.12.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eabf4c97d9130e2bf606614eb937e86edac8292eaa6f422f995d7e8de1eb1813" +checksum = "d429f34c8092b2d42c7c93cec323bb4adeb7c67698f70839adec842ec10c7ceb" dependencies = [ "base64 0.22.1", "bytes", @@ -5120,7 +5194,7 @@ dependencies = [ "wasm-bindgen-futures", "wasm-streams", "web-sys", - "webpki-roots 1.0.0", + "webpki-roots 1.0.2", ] [[package]] @@ -5166,7 +5240,7 @@ checksum = 
"a5a11a05ee1ce44058fa3d5961d05194fdbe3ad6b40f904af764d81b86450e6b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -5243,9 +5317,9 @@ dependencies = [ [[package]] name = "rust_decimal" -version = "1.37.1" +version = "1.37.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faa7de2ba56ac291bd90c6b9bece784a52ae1411f9506544b3eae36dd2356d50" +checksum = "b203a6425500a03e0919c42d3c47caca51e79f1132046626d2c8871c5092035d" dependencies = [ "arrayvec", "borsh", @@ -5259,9 +5333,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.24" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" [[package]] name = "rustc-hash" @@ -5290,22 +5364,22 @@ dependencies = [ [[package]] name = "rustix" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c71e83d6afe7ff64890ec6b71d6a69bb8a610ab78ce364b3352876bb4c801266" +checksum = "11181fbabf243db407ef8df94a6ce0b2f9a733bd8be4ad02b4eda9602296cac8" dependencies = [ "bitflags 2.9.1", "errno", "libc", "linux-raw-sys 0.9.4", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] name = "rustls" -version = "0.23.28" +version = "0.23.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7160e3e10bf4535308537f3c4e1641468cd0e485175d6163087c0393c7d46643" +checksum = "c0ebcbd2f03de0fc1122ad9bb24b127a5a6cd51d72604a3f3c50ac459762b6cc" dependencies = [ "log", "once_cell", @@ -5349,9 +5423,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.3" +version = "0.103.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4a72fe2bcf7a6ac6fd7d0b9e5cb68aeb7d4c0a0271730218b3e92d43b4eb435" +checksum = 
"0a17884ae0c1b773f1ccd2bd4a8c72f16da897310a98b0e84bf349ad5ead92fc" dependencies = [ "ring", "rustls-pki-types", @@ -5360,9 +5434,9 @@ dependencies = [ [[package]] name = "rustversion" -version = "1.0.21" +version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" [[package]] name = "ryu" @@ -5422,9 +5496,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "3.2.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271720403f46ca04f7ba6f55d438f8bd878d6b8ca0a1046e8228c4145bcbb316" +checksum = "80fb1d92c5028aa318b4b8bd7302a5bfcf48be96a37fc6fc790f806b0004ee0c" dependencies = [ "bitflags 2.9.1", "core-foundation", @@ -5453,10 +5527,20 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror 2.0.12", + "thiserror 2.0.14", "time", ] +[[package]] +name = "seize" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "semver" version = "1.0.26" @@ -5492,9 +5576,9 @@ dependencies = [ [[package]] name = "serde-untagged" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "299d9c19d7d466db4ab10addd5703e4c615dec2a5a16dbbafe191045e87ee66e" +checksum = "34836a629bcbc6f1afdf0907a744870039b1e14c0561cb26094fa683b158eff3" dependencies = [ "erased-serde", "serde", @@ -5519,7 +5603,7 @@ checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -5537,9 +5621,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.140" +version = "1.0.142" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" +checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7" dependencies = [ "indexmap", "itoa", @@ -5576,6 +5660,15 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_spanned" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40734c41988f7306bb04f0ecf60ec0f3f1caa34290e4e8ea471dcd3346483b83" +dependencies = [ + "serde", +] + [[package]] name = "serde_urlencoded" version = "0.7.1" @@ -5651,9 +5744,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" [[package]] name = "signal-hook-registry" -version = "1.4.5" +version = "1.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9203b8055f63a2a00e2f593bb0510367fe707d7ff1e5c872de2f537b339e5410" +checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" dependencies = [ "libc", ] @@ -5684,7 +5777,7 @@ checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" dependencies = [ "num-bigint", "num-traits", - "thiserror 2.0.12", + "thiserror 2.0.14", "time", ] @@ -5696,12 +5789,9 @@ checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" [[package]] name = "slab" -version = "0.4.9" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f92a496fb766b417c996b9c5e57daf2f7ad3b0bebe1ccfca4856390e3d3bb67" -dependencies = [ - "autocfg", -] +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" [[package]] name = "slice-group-by" @@ -5711,9 +5801,9 @@ checksum = "826167069c09b99d56f31e9ae5c99049e932a98c9dc2dac47645b08dbbf76ba7" [[package]] name = "smallstr" -version = "0.3.0" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63b1aefdf380735ff8ded0b15f31aab05daf1f70216c01c02a12926badd1df9d" 
+checksum = "862077b1e764f04c251fe82a2ef562fd78d7cadaeb072ca7c2bcaf7217b1ff3b" dependencies = [ "serde", "smallvec", @@ -5750,6 +5840,16 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "socket2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + [[package]] name = "socks" version = "0.3.4" @@ -5811,6 +5911,16 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" +[[package]] +name = "steppe" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dead99cdf718f37bcd1d22dda9b498f35c5aa22894b755bfd94bf8c2daec9427" +dependencies = [ + "convert_case 0.8.0", + "indexmap", +] + [[package]] name = "strsim" version = "0.10.0" @@ -5825,24 +5935,23 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" [[package]] name = "strum" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f64def088c51c9510a8579e3c5d67c65349dcf755e5479ad3d010aa6454e2c32" +checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" dependencies = [ "strum_macros", ] [[package]] name = "strum_macros" -version = "0.27.1" +version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c77a8c5abcaf0f9ce05d62342b7d298c346515365c36b673df4ebe3ced01fde8" +checksum = "7695ce3845ea4b33927c055a39dc438a45b059f7c1b3d91d38d10355fb8cbca7" dependencies = [ "heck", "proc-macro2", "quote", - "rustversion", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -5864,9 +5973,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.101" +version = "2.0.105" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +checksum = "7bc3fcb250e53458e712715cf74285c1f889686520d79294a9ef3bd7aa1fc619" dependencies = [ "proc-macro2", "quote", @@ -5899,7 +6008,7 @@ checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -5979,7 +6088,7 @@ dependencies = [ "fastrand", "getrandom 0.3.3", "once_cell", - "rustix 1.0.7", + "rustix 1.0.8", "windows-sys 0.59.0", ] @@ -6012,11 +6121,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.12" +version = "2.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +checksum = "0b0949c3a6c842cbde3f1686d6eea5a010516deb7085f79db747562d4102f41e" dependencies = [ - "thiserror-impl 2.0.12", + "thiserror-impl 2.0.14", ] [[package]] @@ -6027,18 +6136,18 @@ checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] name = "thiserror-impl" -version = "2.0.12" +version = "2.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +checksum = "cc5b44b4ab9c2fdd0e0512e6bece8388e214c0749f5862b114cc5b7a25daf227" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -6174,20 +6283,22 @@ dependencies = [ [[package]] name = "tokio" -version = "1.45.1" +version = "1.47.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75ef51a33ef1da925cea3e4eb122833cb377c61439ca401b770f54902b806779" +checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" dependencies = [ "backtrace", "bytes", + "io-uring", "libc", "mio", "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2", + "slab", + "socket2 
0.6.0", "tokio-macros", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -6198,7 +6309,7 @@ checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -6224,9 +6335,9 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.15" +version = "0.7.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "66a539a9ad6d5d281510d5bd368c973d636c02dbf8a67300bfb6b950696ad7df" +checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" dependencies = [ "bytes", "futures-core", @@ -6242,11 +6353,26 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" dependencies = [ "serde", - "serde_spanned", - "toml_datetime", + "serde_spanned 0.6.9", + "toml_datetime 0.6.11", "toml_edit", ] +[[package]] +name = "toml" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75129e1dc5000bfbaa9fee9d1b21f974f9fbad9daec557a521ee6e080825f6e8" +dependencies = [ + "indexmap", + "serde", + "serde_spanned 1.0.0", + "toml_datetime 0.7.0", + "toml_parser", + "toml_writer", + "winnow", +] + [[package]] name = "toml_datetime" version = "0.6.11" @@ -6256,6 +6382,15 @@ dependencies = [ "serde", ] +[[package]] +name = "toml_datetime" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bade1c3e902f58d73d3f294cd7f20391c1cb2fbcb643b73566bc773971df91e3" +dependencies = [ + "serde", +] + [[package]] name = "toml_edit" version = "0.22.27" @@ -6264,18 +6399,33 @@ checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" dependencies = [ "indexmap", "serde", - "serde_spanned", - "toml_datetime", + "serde_spanned 0.6.9", + "toml_datetime 0.6.11", "toml_write", "winnow", ] +[[package]] +name = "toml_parser" +version = "1.0.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b551886f449aa90d4fe2bdaa9f4a2577ad2dde302c61ecf262d80b116db95c10" +dependencies = [ + "winnow", +] + [[package]] name = "toml_write" version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" +[[package]] +name = "toml_writer" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcc842091f2def52017664b53082ecbbeb5c7731092bad69d2c63050401dfd64" + [[package]] name = "tower" version = "0.5.2" @@ -6335,9 +6485,9 @@ dependencies = [ [[package]] name = "tracing-actix-web" -version = "0.7.18" +version = "0.7.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2340b7722695166c7fc9b3e3cd1166e7c74fedb9075b8f0c74d3822d2e41caf5" +checksum = "5360edd490ec8dee9fedfc6a9fd83ac2f01b3e1996e3261b9ad18a61971fe064" dependencies = [ "actix-web", "mutually_exclusive_features", @@ -6348,20 +6498,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.28" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" +checksum = "81383ab64e72a7a8b8e13130c49e3dab29def6d0c7d76a03087b3cf71c5c6903" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] name = "tracing-core" -version = "0.1.33" +version = "0.1.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" +checksum = "b9d12581f227e93f094d3af2ae690a574abb8a2b9b7a96e7cfe9647b2b617678" dependencies = [ "once_cell", "valuable", @@ -6501,7 +6651,7 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c01d12e3a56a4432a8b436f293c25f4808bdf9e9f9f98f9260bba1f1bc5a1f26" dependencies = [ - "thiserror 2.0.12", + 
"thiserror 2.0.14", ] [[package]] @@ -6548,9 +6698,9 @@ checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" [[package]] name = "unicode-width" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" +checksum = "4a1a07cc7db3810833284e8d372ccdc6da29741639ecc70c9ec107df0fa6154c" [[package]] name = "unicode-xid" @@ -6576,6 +6726,12 @@ version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "unty" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d49784317cd0d1ee7ec5c716dd598ec5b4483ea832a2dced265471cc0f690ae" + [[package]] name = "ureq" version = "2.12.1" @@ -6652,7 +6808,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.101", + "syn 2.0.105", "uuid", ] @@ -6670,9 +6826,9 @@ dependencies = [ [[package]] name = "uuid" -version = "1.17.0" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cf4199d1e5d15ddd86a694e4d0dffa9c323ce759fea589f00fef9d81cc1931d" +checksum = "f33196643e165781c20a5ead5582283a7dacbb87855d867fbc2df3f81eddc1be" dependencies = [ "getrandom 0.3.3", "js-sys", @@ -6736,6 +6892,12 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "virtue" +version = "0.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "051eb1abcf10076295e815102942cc58f9d5e3b4560e46e53c21e8ff6f3af7b1" + [[package]] name = "walkdir" version = "2.5.0" @@ -6768,9 +6930,9 @@ dependencies = [ [[package]] name = "wasi" -version = "0.11.0+wasi-snapshot-preview1" +version = "0.11.1+wasi-snapshot-preview1" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasi" @@ -6803,7 +6965,7 @@ dependencies = [ "log", "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", "wasm-bindgen-shared", ] @@ -6838,7 +7000,7 @@ checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -6891,14 +7053,14 @@ version = "0.26.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" dependencies = [ - "webpki-roots 1.0.0", + "webpki-roots 1.0.2", ] [[package]] name = "webpki-roots" -version = "1.0.0" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2853738d1cc4f2da3a225c18ec6c3721abb31961096e9dbf5ab35fa88b19cfdb" +checksum = "7e8983c3ab33d6fb807cfcdad2491c4ea8cbc8ed839181c7dfd9c67c83e261b2" dependencies = [ "rustls-pki-types", ] @@ -6998,7 +7160,7 @@ checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -7009,7 +7171,7 @@ checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -7073,6 +7235,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.3", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -7097,13 +7268,30 @@ dependencies = [ "windows_aarch64_gnullvm 
0.52.6", "windows_aarch64_msvc 0.52.6", "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm", + "windows_i686_gnullvm 0.52.6", "windows_i686_msvc 0.52.6", "windows_x86_64_gnu 0.52.6", "windows_x86_64_gnullvm 0.52.6", "windows_x86_64_msvc 0.52.6", ] +[[package]] +name = "windows-targets" +version = "0.53.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", +] + [[package]] name = "windows-threading" version = "0.1.0" @@ -7125,6 +7313,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -7137,6 +7331,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -7149,12 +7349,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + [[package]] name = "windows_i686_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -7167,6 +7379,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -7179,6 +7397,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" @@ -7191,6 +7415,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" @@ -7204,19 +7434,25 
@@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] -name = "winnow" -version = "0.7.10" +name = "windows_x86_64_msvc" +version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06928c8748d81b05c9be96aad92e1b6ff01833332f281e8cfca3be4b35fc9ec" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + +[[package]] +name = "winnow" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3edebf492c8125044983378ecb5766203ad3b4c2f7a922bd7dd207f6d443e95" dependencies = [ "memchr", ] [[package]] name = "wiremock" -version = "0.6.3" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "101681b74cd87b5899e87bcf5a64e83334dd313fcd3053ea72e6dba18928e301" +checksum = "a2b8b99d4cdbf36b239a9532e31fe4fb8acc38d1897c1761e161550a7dc78e6a" dependencies = [ "assert-json-diff", "async-trait", @@ -7262,12 +7498,12 @@ dependencies = [ [[package]] name = "xattr" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d65cbf2f12c15564212d48f4e3dfb87923d25d611f2aed18f4cb23f0413d89e" +checksum = "af3a19837351dc82ba89f8a125e22a3c475f05aba604acc023d62b2739ae2909" dependencies = [ "libc", - "rustix 1.0.7", + "rustix 1.0.8", ] [[package]] @@ -7342,7 +7578,7 @@ checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", "synstructure", ] @@ -7354,28 +7590,28 @@ checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", "synstructure", ] [[package]] name = "zerocopy" -version = "0.8.25" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" +checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.25" +version = "0.8.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" +checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -7395,7 +7631,7 @@ checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", "synstructure", ] @@ -7416,7 +7652,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -7432,9 +7668,9 @@ dependencies = [ [[package]] name = "zerovec" -version = "0.11.2" +version = "0.11.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a05eb080e015ba39cc9e23bbe5e7fb04d5fb040350f99f34e338d5fdd294428" +checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" dependencies = [ "yoke 0.8.0", "zerofrom", @@ -7449,7 +7685,7 @@ checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.101", + "syn 2.0.105", ] [[package]] @@ -7469,9 +7705,9 @@ dependencies = [ [[package]] name = "zip" -version = "4.1.0" +version = "4.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af7dcdb4229c0e79c2531a24de7726a0e980417a74fb4d030a35f535665439a0" +checksum = "9aed4ac33e8eb078c89e6cbb1d5c4c7703ec6d299fc3e7c3695af8f8b423468b" dependencies = [ "aes", "arbitrary", @@ -7486,6 +7722,7 @@ dependencies = [ "liblzma", "memchr", "pbkdf2", + "ppmd-rust", "sha1", "time", "zeroize", diff 
--git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index a2b72e0e5..fdbd701be 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -331,6 +331,7 @@ pub(crate) mod test { facet_search: Setting::NotSet, prefix_search: Setting::NotSet, chat: Setting::NotSet, + vector_store: Setting::NotSet, _kind: std::marker::PhantomData, }; settings.check() diff --git a/crates/dump/src/reader/compat/v5_to_v6.rs b/crates/dump/src/reader/compat/v5_to_v6.rs index cdfa6847e..8261746e0 100644 --- a/crates/dump/src/reader/compat/v5_to_v6.rs +++ b/crates/dump/src/reader/compat/v5_to_v6.rs @@ -421,6 +421,7 @@ impl From> for v6::Settings { facet_search: v6::Setting::NotSet, prefix_search: v6::Setting::NotSet, chat: v6::Setting::NotSet, + vector_store: v6::Setting::NotSet, _kind: std::marker::PhantomData, } } diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 75ff2ebe6..b5549ec65 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -4,7 +4,7 @@ use std::io::{BufRead, BufReader, ErrorKind}; use std::path::Path; pub use meilisearch_types::milli; -use meilisearch_types::milli::vector::hf::OverridePooling; +use meilisearch_types::milli::vector::embedder::hf::OverridePooling; use tempfile::TempDir; use time::OffsetDateTime; use tracing::debug; diff --git a/crates/index-scheduler/src/features.rs b/crates/index-scheduler/src/features.rs index 1b01f89de..5646a5d80 100644 --- a/crates/index-scheduler/src/features.rs +++ b/crates/index-scheduler/src/features.rs @@ -158,6 +158,19 @@ impl RoFeatures { .into()) } } + + pub fn check_vector_store_setting(&self, disabled_action: &'static str) -> Result<()> { + if self.runtime.vector_store_setting { + Ok(()) + } else { + Err(FeatureNotEnabledError { + disabled_action, + feature: "vector_store_setting", + issue_link: "https://github.com/orgs/meilisearch/discussions/860", + } + .into()) + } + } } impl FeatureData { diff --git 
a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index d578b03dd..6103fe7fc 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -143,10 +143,10 @@ impl IndexStats { /// /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result { - let arroy_stats = index.arroy_stats(rtxn)?; + let vector_store_stats = index.vector_store_stats(rtxn)?; Ok(IndexStats { - number_of_embeddings: Some(arroy_stats.number_of_embeddings), - number_of_embedded_documents: Some(arroy_stats.documents.len()), + number_of_embeddings: Some(vector_store_stats.number_of_embeddings), + number_of_embedded_documents: Some(vector_store_stats.documents.len()), documents_database_stats: index.documents_stats(rtxn)?.unwrap_or_default(), number_of_documents: None, database_size: index.on_disk_size()?, diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 4129c57af..efa137cdb 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -147,7 +147,6 @@ impl IndexScheduler { }; let mut index_wtxn = index.write_txn()?; - let index_version = index.get_version(&index_wtxn)?.unwrap_or((1, 12, 0)); let package_version = (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH); if index_version != package_version { diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap index a52f18079..568635d05 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: 
NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, 
stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap index b99e15852..c9c8869a1 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, 
non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: 
PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: 
Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap index 12e03a28b..ecfecbd9c 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: 
Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { 
source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: 
Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, 
distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap index 2ea2ebb17..64d6d7713 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: 
NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, 
document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: 
NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, 
dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap index a2a263b6f..d6fd6f2bf 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: 
WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: 
NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: 
Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: 
Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap index 29fc6abf4..af0ad504d 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { 
settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: 
NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, 
revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, 
request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap index ae943bf48..b238b65af 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] 
---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, 
sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, 
faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, 
document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap index 9ada7580a..95283cec6 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), 
searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: 
NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: 
Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: 
NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap index 96d93de51..32d40e3a7 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_documents.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: 
NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), 
sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap index 76a77e5c0..6885e9a48 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_adding_the_settings.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, 
non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, 
ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap index 422bed51f..0685ab6fa 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/after_removing_the_documents.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: 
NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, 
non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_document_ids: 1, deleted_documents: Some(1) }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} 3 {uid: 3, batch_uid: 2, status: failed, error: ResponseError { code: 200, message: "Index `doggos`: Invalid type for filter subexpression: expected: String, Array, found: true.", error_code: "invalid_document_filter", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#invalid_document_filter" }, details: { original_filter: true, deleted_documents: Some(0) }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap index d8996f82c..7c873a31a 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_document_deletions.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, 
separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 3, indexed_documents: Some(3) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_document_ids: 1, deleted_documents: None }, kind: DocumentDeletion { index_uid: "doggos", documents_ids: ["1"] }} 3 {uid: 3, status: enqueued, details: { original_filter: true, deleted_documents: None }, kind: DocumentDeletionByFilter { index_uid: "doggos", filter_expr: Bool(true) }} diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap 
b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap index e7b06eb31..bf4753018 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/fail_in_process_batch_for_document_deletion/registered_the_setting_and_document_addition.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: 
false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: Set([Field("catto")]), sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: NotSet, search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, vector_store: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 3, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: ReplaceDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 3, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index f651b2352..0f1782568 100644 --- a/crates/meilisearch-types/src/error.rs +++ 
b/crates/meilisearch-types/src/error.rs @@ -396,6 +396,7 @@ InvalidDocumentEditionContext , InvalidRequest , BAD_REQU InvalidDocumentEditionFunctionFilter , InvalidRequest , BAD_REQUEST ; EditDocumentsByFunctionError , InvalidRequest , BAD_REQUEST ; InvalidSettingsIndexChat , InvalidRequest , BAD_REQUEST ; +InvalidSettingsVectorStore , InvalidRequest , BAD_REQUEST ; // Export InvalidExportUrl , InvalidRequest , BAD_REQUEST ; InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; diff --git a/crates/meilisearch-types/src/features.rs b/crates/meilisearch-types/src/features.rs index cf66422b2..9c2a6a135 100644 --- a/crates/meilisearch-types/src/features.rs +++ b/crates/meilisearch-types/src/features.rs @@ -21,6 +21,7 @@ pub struct RuntimeTogglableFeatures { pub composite_embedders: bool, pub chat_completions: bool, pub multimodal: bool, + pub vector_store_setting: bool, } #[derive(Default, Debug, Clone, Copy)] diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index 9e107a5c3..f73a66c4b 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -14,6 +14,7 @@ use milli::proximity::ProximityPrecision; pub use milli::update::ChatSettings; use milli::update::Setting; use milli::vector::db::IndexEmbeddingConfig; +use milli::vector::VectorStoreBackend; use milli::{Criterion, CriterionError, FilterableAttributesRule, Index, DEFAULT_VALUES_PER_FACET}; use serde::{Deserialize, Serialize, Serializer}; use utoipa::ToSchema; @@ -320,6 +321,11 @@ pub struct Settings { #[schema(value_type = Option)] pub chat: Setting, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option)] + pub vector_store: Setting, + #[serde(skip)] #[deserr(skip)] pub _kind: PhantomData, @@ -386,6 +392,7 @@ impl Settings { facet_search: Setting::Reset, prefix_search: Setting::Reset, chat: Setting::Reset, + vector_store: Setting::Reset, 
_kind: PhantomData, } } @@ -413,6 +420,7 @@ impl Settings { facet_search, prefix_search, chat, + vector_store, _kind, } = self; @@ -437,6 +445,7 @@ impl Settings { localized_attributes: localized_attributes_rules, facet_search, prefix_search, + vector_store, chat, _kind: PhantomData, } @@ -489,6 +498,7 @@ impl Settings { facet_search: self.facet_search, prefix_search: self.prefix_search, chat: self.chat, + vector_store: self.vector_store, _kind: PhantomData, } } @@ -569,6 +579,7 @@ impl Settings { facet_search: other.facet_search.or(self.facet_search), prefix_search: other.prefix_search.or(self.prefix_search), chat: other.chat.clone().or(self.chat.clone()), + vector_store: other.vector_store.or(self.vector_store), _kind: PhantomData, } } @@ -608,6 +619,7 @@ pub fn apply_settings_to_builder( facet_search, prefix_search, chat, + vector_store, _kind, } = settings; @@ -825,6 +837,12 @@ pub fn apply_settings_to_builder( Setting::Reset => builder.reset_chat(), Setting::NotSet => (), } + + match vector_store { + Setting::Set(vector_store) => builder.set_vector_store(*vector_store), + Setting::Reset => builder.reset_vector_store(), + Setting::NotSet => (), + } } pub enum SecretPolicy { @@ -922,6 +940,9 @@ pub fn settings( (name, SettingEmbeddingSettings { inner: Setting::Set(config.into()) }) }) .collect(); + + let vector_store = index.get_vector_store(rtxn)?; + let embedders = Setting::Set(embedders); let search_cutoff_ms = index.search_cutoff(rtxn)?; let localized_attributes_rules = index.localized_attributes_rules(rtxn)?; @@ -968,6 +989,10 @@ pub fn settings( facet_search: Setting::Set(facet_search), prefix_search: Setting::Set(prefix_search.unwrap_or_default()), chat: Setting::Set(chat), + vector_store: match vector_store { + Some(vector_store) => Setting::Set(vector_store), + None => Setting::Reset, + }, _kind: PhantomData, }; @@ -1197,6 +1222,7 @@ pub(crate) mod test { facet_search: Setting::NotSet, prefix_search: Setting::NotSet, chat: Setting::NotSet, + 
vector_store: Setting::NotSet, _kind: PhantomData::, }; @@ -1229,6 +1255,7 @@ pub(crate) mod test { facet_search: Setting::NotSet, prefix_search: Setting::NotSet, chat: Setting::NotSet, + vector_store: Setting::NotSet, _kind: PhantomData::, }; diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index a2a0f0c05..04020a74f 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -205,6 +205,7 @@ struct Infos { experimental_no_snapshot_compaction: bool, experimental_no_edition_2024_for_dumps: bool, experimental_no_edition_2024_for_settings: bool, + experimental_vector_store_setting: bool, gpu_enabled: bool, db_path: bool, import_dump: bool, @@ -307,6 +308,7 @@ impl Infos { composite_embedders, chat_completions, multimodal, + vector_store_setting, } = features; // We're going to override every sensible information. @@ -332,6 +334,7 @@ impl Infos { experimental_embedding_cache_entries, experimental_no_snapshot_compaction, experimental_no_edition_2024_for_dumps, + experimental_vector_store_setting: vector_store_setting, gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), import_dump: import_dump.is_some(), diff --git a/crates/meilisearch/src/routes/features.rs b/crates/meilisearch/src/routes/features.rs index 1a1f89b2d..3d4219a12 100644 --- a/crates/meilisearch/src/routes/features.rs +++ b/crates/meilisearch/src/routes/features.rs @@ -55,6 +55,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { composite_embedders: Some(false), chat_completions: Some(false), multimodal: Some(false), + vector_store_setting: Some(false), })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { @@ -103,6 +104,8 @@ pub struct RuntimeTogglableFeatures { pub chat_completions: Option, 
#[deserr(default)] pub multimodal: Option, + #[deserr(default)] + pub vector_store_setting: Option, } impl From for RuntimeTogglableFeatures { @@ -117,6 +120,7 @@ impl From for RuntimeTogg composite_embedders, chat_completions, multimodal, + vector_store_setting, } = value; Self { @@ -129,6 +133,7 @@ impl From for RuntimeTogg composite_embedders: Some(composite_embedders), chat_completions: Some(chat_completions), multimodal: Some(multimodal), + vector_store_setting: Some(vector_store_setting), } } } @@ -144,6 +149,7 @@ pub struct PatchExperimentalFeatureAnalytics { composite_embedders: bool, chat_completions: bool, multimodal: bool, + vector_store_setting: bool, } impl Aggregate for PatchExperimentalFeatureAnalytics { @@ -162,6 +168,7 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { composite_embedders: new.composite_embedders, chat_completions: new.chat_completions, multimodal: new.multimodal, + vector_store_setting: new.vector_store_setting, }) } @@ -189,6 +196,7 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { composite_embedders: Some(false), chat_completions: Some(false), multimodal: Some(false), + vector_store_setting: Some(false), })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { @@ -232,6 +240,10 @@ async fn patch_features( .unwrap_or(old_features.composite_embedders), chat_completions: new_features.0.chat_completions.unwrap_or(old_features.chat_completions), multimodal: new_features.0.multimodal.unwrap_or(old_features.multimodal), + vector_store_setting: new_features + .0 + .vector_store_setting + .unwrap_or(old_features.vector_store_setting), }; // explicitly destructure for analytics rather than using the `Serialize` implementation, because @@ -247,6 +259,7 @@ async fn patch_features( composite_embedders, chat_completions, multimodal, + vector_store_setting, } = new_features; analytics.publish( @@ -260,6 +273,7 @@ async fn patch_features( 
composite_embedders, chat_completions, multimodal, + vector_store_setting, }, &req, ); diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index 10120ebff..cc825f893 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -520,6 +520,17 @@ make_setting_routes!( camelcase_attr: "chat", analytics: ChatAnalytics }, + { + route: "/vector-store", + update_verb: patch, + value_type: meilisearch_types::milli::vector::VectorStoreBackend, + err_type: meilisearch_types::deserr::DeserrJsonError< + meilisearch_types::error::deserr_codes::InvalidSettingsVectorStore, + >, + attr: vector_store, + camelcase_attr: "vectorStore", + analytics: VectorStoreAnalytics + }, ); #[utoipa::path( @@ -610,6 +621,7 @@ pub async fn update_all( facet_search: FacetSearchAnalytics::new(new_settings.facet_search.as_ref().set()), prefix_search: PrefixSearchAnalytics::new(new_settings.prefix_search.as_ref().set()), chat: ChatAnalytics::new(new_settings.chat.as_ref().set()), + vector_store: VectorStoreAnalytics::new(new_settings.vector_store.as_ref().set()), }, &req, ); @@ -665,10 +677,17 @@ pub async fn get_all( let index = index_scheduler.index(&index_uid)?; let rtxn = index.read_txn()?; let mut new_settings = settings(&index, &rtxn, SecretPolicy::HideSecrets)?; - if index_scheduler.features().check_chat_completions("showing index `chat` settings").is_err() { + + let features = index_scheduler.features(); + + if features.check_chat_completions("showing index `chat` settings").is_err() { new_settings.chat = Setting::NotSet; } + if features.check_vector_store_setting("showing index `vectorStore` settings").is_err() { + new_settings.vector_store = Setting::NotSet; + } + debug!(returns = ?new_settings, "Get all settings"); Ok(HttpResponse::Ok().json(new_settings)) } @@ -770,5 +789,9 @@ fn validate_settings( features.check_chat_completions("setting `chat` in the index 
settings")?; } + if let Setting::Set(_) = &settings.vector_store { + features.check_vector_store_setting("setting `vectorStore` in the index settings")?; + } + Ok(settings.validate()?) } diff --git a/crates/meilisearch/src/routes/indexes/settings_analytics.rs b/crates/meilisearch/src/routes/indexes/settings_analytics.rs index 1b8d0e244..cd573099f 100644 --- a/crates/meilisearch/src/routes/indexes/settings_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/settings_analytics.rs @@ -8,6 +8,7 @@ use std::collections::{BTreeMap, BTreeSet, HashSet}; use meilisearch_types::facet_values_sort::FacetValuesSort; use meilisearch_types::locales::{Locale, LocalizedAttributesRuleView}; use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::vector::VectorStoreBackend; use meilisearch_types::milli::FilterableAttributesRule; use meilisearch_types::settings::{ ChatSettings, FacetingSettings, PaginationSettings, PrefixSearchSettings, @@ -40,6 +41,7 @@ pub struct SettingsAnalytics { pub facet_search: FacetSearchAnalytics, pub prefix_search: PrefixSearchAnalytics, pub chat: ChatAnalytics, + pub vector_store: VectorStoreAnalytics, } impl Aggregate for SettingsAnalytics { @@ -200,6 +202,10 @@ impl Aggregate for SettingsAnalytics { value: new.prefix_search.value.or(self.prefix_search.value), }, chat: ChatAnalytics { set: new.chat.set | self.chat.set }, + vector_store: VectorStoreAnalytics { + set: new.vector_store.set | self.vector_store.set, + value: new.vector_store.value.or(self.vector_store.value), + }, }) } @@ -693,3 +699,19 @@ impl ChatAnalytics { SettingsAnalytics { chat: self, ..Default::default() } } } + +#[derive(Serialize, Default)] +pub struct VectorStoreAnalytics { + pub set: bool, + pub value: Option, +} + +impl VectorStoreAnalytics { + pub fn new(settings: Option<&VectorStoreBackend>) -> Self { + Self { set: settings.is_some(), value: settings.copied() } + } + + pub fn into_settings(self) -> SettingsAnalytics { + SettingsAnalytics { 
vector_store: self, ..Default::default() } + } +} diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index fd6e777de..5a6780cbb 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -102,7 +102,7 @@ mod webhooks; url = "/", description = "Local server", )), - components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, Export, WebhookSettings, WebhookResults, WebhookWithMetadata)) + components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, 
TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, Export, WebhookSettings, WebhookResults, WebhookWithMetadata, meilisearch_types::milli::vector::VectorStoreBackend)) )] pub struct MeilisearchApi; diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index f1bac5297..a1db8efcd 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -2189,7 +2189,8 @@ async fn import_dump_v6_containing_experimental_features() { "getTaskDocumentsRoute": false, "compositeEmbedders": false, "chatCompletions": false, - "multimodal": false + "multimodal": false, + "vectorStoreSetting": false } "###); @@ -2316,7 +2317,8 @@ async fn import_dump_v6_containing_batches_and_enqueued_tasks() { "getTaskDocumentsRoute": false, "compositeEmbedders": false, "chatCompletions": false, - "multimodal": false + "multimodal": false, + "vectorStoreSetting": false } "###); @@ -2423,7 +2425,8 @@ async fn generate_and_import_dump_containing_vectors() { "getTaskDocumentsRoute": false, "compositeEmbedders": false, "chatCompletions": false, - "multimodal": false + "multimodal": false, + "vectorStoreSetting": false } "###); diff --git a/crates/meilisearch/tests/features/mod.rs b/crates/meilisearch/tests/features/mod.rs index ec5838d35..e0f1afb9b 100644 --- a/crates/meilisearch/tests/features/mod.rs +++ b/crates/meilisearch/tests/features/mod.rs @@ -26,7 +26,8 @@ async fn experimental_features() { "getTaskDocumentsRoute": false, "compositeEmbedders": false, "chatCompletions": false, - "multimodal": false + "multimodal": false, + "vectorStoreSetting": false } "###); @@ -43,7 +44,8 @@ async fn experimental_features() { "getTaskDocumentsRoute": false, "compositeEmbedders": false, 
"chatCompletions": false, - "multimodal": false + "multimodal": false, + "vectorStoreSetting": false } "###); @@ -60,7 +62,8 @@ async fn experimental_features() { "getTaskDocumentsRoute": false, "compositeEmbedders": false, "chatCompletions": false, - "multimodal": false + "multimodal": false, + "vectorStoreSetting": false } "###); @@ -78,7 +81,8 @@ async fn experimental_features() { "getTaskDocumentsRoute": false, "compositeEmbedders": false, "chatCompletions": false, - "multimodal": false + "multimodal": false, + "vectorStoreSetting": false } "###); @@ -96,7 +100,8 @@ async fn experimental_features() { "getTaskDocumentsRoute": false, "compositeEmbedders": false, "chatCompletions": false, - "multimodal": false + "multimodal": false, + "vectorStoreSetting": false } "###); } @@ -121,7 +126,8 @@ async fn experimental_feature_metrics() { "getTaskDocumentsRoute": false, "compositeEmbedders": false, "chatCompletions": false, - "multimodal": false + "multimodal": false, + "vectorStoreSetting": false } "###); @@ -168,7 +174,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`, `compositeEmbedders`, `chatCompletions`, `multimodal`", + "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`, `compositeEmbedders`, `chatCompletions`, `multimodal`, `vectorStoreSetting`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" diff --git a/crates/meilisearch/tests/settings/get_settings.rs b/crates/meilisearch/tests/settings/get_settings.rs index 8419f640d..ae0c37048 100644 --- a/crates/meilisearch/tests/settings/get_settings.rs +++ 
b/crates/meilisearch/tests/settings/get_settings.rs @@ -194,6 +194,11 @@ test_setting_routes!( "searchParameters": {} } }, + { + setting: vector_store, + update_verb: patch, + default_value: null + }, ); #[actix_rt::test] diff --git a/crates/meilisearch/tests/vector/binary_quantized.rs b/crates/meilisearch/tests/vector/binary_quantized.rs index adb0da441..ff3fc470e 100644 --- a/crates/meilisearch/tests/vector/binary_quantized.rs +++ b/crates/meilisearch/tests/vector/binary_quantized.rs @@ -320,7 +320,7 @@ async fn binary_quantize_clear_documents() { } "###); - // Make sure the arroy DB has been cleared + // Make sure the vector DB has been cleared let (documents, _code) = index.search_post(json!({ "hybrid": { "embedder": "manual" }, "vector": [1, 1, 1] })).await; snapshot!(documents, @r#" @@ -335,3 +335,191 @@ async fn binary_quantize_clear_documents() { } "#); } + +#[actix_rt::test] +async fn binary_quantize_change_backend() { + let server = Server::new().await; + let index = server.unique_index(); + server.set_features(json!({"vectorStoreSetting": true})).await; + let (response, code) = index + .update_settings(json!({ + "vectorStore": "stable" + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + "binaryQuantized": true, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [-1.2, -2.3, 3.2] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [2.5, 1.5, -130] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + server.wait_task(value.uid()).await.succeeded(); + + // Make sure the documents are binary quantized + let (documents, _code) = index + 
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + -1.0, + -1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let (response, code) = index + .update_settings(json!({ + "vectorStore": "experimental" + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let (response, code) = index + .update_settings(json!({ + "vectorStore": "stable" + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + -1.0, + -1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 
0, + "limit": 20, + "total": 2 + } + "###); +} diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 3c08b9e03..0ed4f4d3c 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -6,15 +6,17 @@ mod openai; mod rest; mod settings; +use std::collections::HashMap; use std::str::FromStr; use meili_snap::{json_string, snapshot}; use meilisearch::option::MaxThreads; +pub use rest::create_mock; +use serde_json::Value; use crate::common::index::Index; use crate::common::{default_settings, GetAllDocumentsOptions, Server}; use crate::json; -pub use rest::create_mock; pub async fn get_server_vector() -> Server { Server::new().await @@ -684,7 +686,7 @@ async fn clear_documents() { } "###); - // Make sure the arroy DB has been cleared + // Make sure the vector DB has been cleared let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "manual"} })).await; snapshot!(documents, @r#" @@ -781,3 +783,90 @@ async fn add_remove_one_vector_4588() { } "#); } + +#[actix_rt::test] +async fn change_backend() { + let server = Server::new().await; + let index = server.unique_index(); + server.set_features(json!({"vectorStoreSetting": true})).await; + let (response, code) = index + .update_settings(json!({ + "vectorStore": "stable" + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + "binaryQuantized": false, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [-1.2, -2.3, 3.2] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [2.5, 1.5, -130] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + 
snapshot!(code, @"202 Accepted"); + server.wait_task(value.uid()).await.succeeded(); + + // Capture the embeddings produced by the stable backend (note: this embedder is NOT binary quantized) + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + let stable_embeddings: HashMap = documents["results"] + .as_array() + .unwrap() + .iter() + .map(|o| (o["id"].clone(), o["_vectors"]["manual"]["embeddings"].clone())) + .collect(); + + let (response, code) = index + .update_settings(json!({ + "vectorStore": "experimental" + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + + let experimental_embeddings: HashMap = documents["results"] + .as_array() + .unwrap() + .iter() + .map(|o| (o["id"].clone().clone(), o["_vectors"]["manual"]["embeddings"].clone())) + .collect(); + + let (response, code) = index + .update_settings(json!({ + "vectorStore": "stable" + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + let back_to_stable_embeddings: HashMap = documents["results"] + .as_array() + .unwrap() + .iter() + .map(|o| (o["id"].clone(), o["_vectors"]["manual"]["embeddings"].clone())) + .collect(); + + assert_eq!(stable_embeddings, experimental_embeddings); + assert_eq!(experimental_embeddings, back_to_stable_embeddings); +} diff --git a/crates/meilisearch/tests/vector/settings.rs b/crates/meilisearch/tests/vector/settings.rs index d26174faf..292451ec3 100644 --- a/crates/meilisearch/tests/vector/settings.rs +++ b/crates/meilisearch/tests/vector/settings.rs @@ -236,7 +236,7 @@ async fn reset_embedder_documents() { } "###); - // Make sure the arroy DB has been cleared + // 
Make sure the vector DB has been cleared let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1], "hybrid": {"embedder": "default"} })).await; snapshot!(json_string!(documents), @r###" diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index 170bbdcc8..831bcf209 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -142,8 +142,8 @@ enum Command { #[derive(Clone, ValueEnum)] enum IndexPart { - /// Will make the arroy index hot. - Arroy, + /// Will make the vector index hot. + Hannoy, } fn main() -> anyhow::Result<()> { @@ -658,12 +658,12 @@ fn hair_dryer( let rtxn = index.read_txn()?; for part in index_parts { match part { - IndexPart::Arroy => { + IndexPart::Hannoy => { let mut count = 0; - let total = index.vector_arroy.len(&rtxn)?; - eprintln!("Hair drying arroy for {uid}..."); + let total = index.vector_store.len(&rtxn)?; + eprintln!("Hair drying hannoy for {uid}..."); for (i, result) in index - .vector_arroy + .vector_store .remap_types::() .iter(&rtxn)? 
.enumerate() diff --git a/crates/meilitool/src/upgrade/v1_11.rs b/crates/meilitool/src/upgrade/v1_11.rs index 76d2fc24f..f1d5c1959 100644 --- a/crates/meilitool/src/upgrade/v1_11.rs +++ b/crates/meilitool/src/upgrade/v1_11.rs @@ -68,7 +68,7 @@ pub fn v1_10_to_v1_11( ) })?; let index_read_database = - try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_ARROY) + try_opening_poly_database(&index_env, &index_rtxn, db_name::VECTOR_STORE) .with_context(|| format!("while updating date format for index `{uid}`"))?; let mut index_wtxn = index_env.write_txn().with_context(|| { @@ -79,7 +79,7 @@ pub fn v1_10_to_v1_11( })?; let index_write_database = - try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_ARROY) + try_opening_poly_database(&index_env, &index_wtxn, db_name::VECTOR_STORE) .with_context(|| format!("while updating date format for index `{uid}`"))?; meilisearch_types::milli::arroy::upgrade::cosine_from_0_4_to_0_5( diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 68e894ca9..29a6b86cf 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -87,7 +87,8 @@ rhai = { version = "1.22.2", features = [ "no_time", "sync", ] } -arroy = "0.6.2" +arroy = "0.6.3" +hannoy = "0.0.5" rand = "0.8.5" tracing = "0.1.41" ureq = { version = "2.12.1", features = ["json"] } @@ -95,6 +96,7 @@ url = "2.5.4" hashbrown = "0.15.4" bumpalo = "3.18.1" bumparaw-collections = "0.1.4" +steppe = { version = "0.4.0", default-features = false } thread_local = "1.1.9" allocator-api2 = "0.3.0" rustc-hash = "2.1.1" @@ -109,7 +111,11 @@ utoipa = { version = "5.4.0", features = [ "openapi_extensions", ] } lru = "0.14.0" -twox-hash = { version = "2.1.1", default-features = false, features = ["std", "xxhash3_64", "xxhash64"] } +twox-hash = { version = "2.1.1", default-features = false, features = [ + "std", + "xxhash3_64", + "xxhash64", +] } [dev-dependencies] mimalloc = { version = "0.1.47", default-features = false } diff --git 
a/crates/milli/src/documents/geo_sort.rs b/crates/milli/src/documents/geo_sort.rs index 0750dfe5c..8e574ec7c 100644 --- a/crates/milli/src/documents/geo_sort.rs +++ b/crates/milli/src/documents/geo_sort.rs @@ -1,17 +1,13 @@ -use crate::{ - distance_between_two_points, - heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec}, - lat_lng_to_xyz, - search::new::{facet_string_values, facet_values_prefix_key}, - GeoPoint, Index, -}; -use heed::{ - types::{Bytes, Unit}, - RoPrefix, RoTxn, -}; +use std::collections::VecDeque; + +use heed::types::{Bytes, Unit}; +use heed::{RoPrefix, RoTxn}; use roaring::RoaringBitmap; use rstar::RTree; -use std::collections::VecDeque; + +use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec}; +use crate::search::new::{facet_string_values, facet_values_prefix_key}; +use crate::{distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index}; #[derive(Debug, Clone, Copy)] pub struct GeoSortParameter { diff --git a/crates/milli/src/documents/sort.rs b/crates/milli/src/documents/sort.rs index 3866d9e27..f76081847 100644 --- a/crates/milli/src/documents/sort.rs +++ b/crates/milli/src/documents/sort.rs @@ -1,19 +1,16 @@ use std::collections::{BTreeSet, VecDeque}; -use crate::{ - constants::RESERVED_GEO_FIELD_NAME, - documents::{geo_sort::next_bucket, GeoSortParameter}, - heed_codec::{ - facet::{FacetGroupKeyCodec, FacetGroupValueCodec}, - BytesRefCodec, - }, - is_faceted, - search::facet::{ascending_facet_sort, descending_facet_sort}, - AscDesc, DocumentId, Member, UserError, -}; use heed::Database; use roaring::RoaringBitmap; +use crate::constants::RESERVED_GEO_FIELD_NAME; +use crate::documents::geo_sort::next_bucket; +use crate::documents::GeoSortParameter; +use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::BytesRefCodec; +use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; +use crate::{is_faceted, AscDesc, DocumentId, Member, UserError}; + #[derive(Debug, 
Clone, Copy)] enum AscDescId { Facet { field_id: u16, ascending: bool }, diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 76ad3fda0..11d7756c1 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -78,6 +78,8 @@ pub enum InternalError { #[error(transparent)] ArroyError(#[from] arroy::Error), #[error(transparent)] + HannoyError(#[from] hannoy::Error), + #[error(transparent)] VectorEmbeddingError(#[from] crate::vector::Error), } @@ -353,7 +355,7 @@ and can not be more than 511 bytes.", .document_id.to_string() context: crate::vector::settings::NestingContext, field: crate::vector::settings::MetaEmbeddingSetting, }, - #[error("`.embedders.{embedder_name}.model`: Invalid model `{model}` for OpenAI. Supported models: {:?}", crate::vector::openai::EmbeddingModel::supported_models())] + #[error("`.embedders.{embedder_name}.model`: Invalid model `{model}` for OpenAI. Supported models: {:?}", crate::vector::embedder::openai::EmbeddingModel::supported_models())] InvalidOpenAiModel { embedder_name: String, model: String }, #[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source `{source_}`)")] MissingFieldForSource { @@ -441,6 +443,29 @@ impl From for Error { } } +impl From for Error { + fn from(value: hannoy::Error) -> Self { + match value { + hannoy::Error::Heed(heed) => heed.into(), + hannoy::Error::Io(io) => io.into(), + hannoy::Error::InvalidVecDimension { expected, received } => { + Error::UserError(UserError::InvalidVectorDimensions { expected, found: received }) + } + hannoy::Error::BuildCancelled => Error::InternalError(InternalError::AbortedIndexation), + hannoy::Error::DatabaseFull + | hannoy::Error::InvalidItemAppend + | hannoy::Error::UnmatchingDistance { .. } + | hannoy::Error::NeedBuild(_) + | hannoy::Error::MissingKey { .. } + | hannoy::Error::MissingMetadata(_) + | hannoy::Error::UnknownVersion { .. } + | hannoy::Error::CannotDecodeKeyMode { .. 
} => { + Error::InternalError(InternalError::HannoyError(value)) + } + } + } +} + #[derive(Error, Debug)] pub enum GeoError { #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")] diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 6429dabbc..4f018e56f 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -31,7 +31,7 @@ use crate::prompt::PromptData; use crate::proximity::ProximityPrecision; use crate::update::new::StdResult; use crate::vector::db::IndexEmbeddingConfigs; -use crate::vector::{ArroyStats, ArroyWrapper, Embedding}; +use crate::vector::{Embedding, VectorStore, VectorStoreBackend, VectorStoreStats}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, @@ -87,6 +87,7 @@ pub mod main_key { pub const DOCUMENTS_STATS: &str = "documents_stats"; pub const DISABLED_TYPOS_TERMS: &str = "disabled_typos_terms"; pub const CHAT: &str = "chat"; + pub const VECTOR_STORE_BACKEND: &str = "vector_store_backend"; } pub mod db_name { @@ -113,7 +114,7 @@ pub mod db_name { pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; pub const VECTOR_EMBEDDER_CATEGORY_ID: &str = "vector-embedder-category-id"; - pub const VECTOR_ARROY: &str = "vector-arroy"; + pub const VECTOR_STORE: &str = "vector-arroy"; pub const DOCUMENTS: &str = "documents"; } const NUMBER_OF_DBS: u32 = 25; @@ -177,10 +178,10 @@ pub struct Index { /// Maps the document id, the facet field id and the strings. pub field_id_docid_facet_strings: Database, - /// Maps an embedder name to its id in the arroy store. + /// Maps an embedder name to its id in the vector store. 
pub(crate) embedder_category_id: Database, - /// Vector store based on arroyâ„¢. - pub vector_arroy: arroy::Database, + /// Vector store based on hannoyâ„¢. + pub vector_store: hannoy::Database, /// Maps the document id to the document as an obkv store. pub(crate) documents: Database, @@ -237,7 +238,7 @@ impl Index { // vector stuff let embedder_category_id = env.create_database(&mut wtxn, Some(VECTOR_EMBEDDER_CATEGORY_ID))?; - let vector_arroy = env.create_database(&mut wtxn, Some(VECTOR_ARROY))?; + let vector_store = env.create_database(&mut wtxn, Some(VECTOR_STORE))?; let documents = env.create_database(&mut wtxn, Some(DOCUMENTS))?; @@ -264,7 +265,7 @@ impl Index { facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, - vector_arroy, + vector_store, embedder_category_id, documents, }; @@ -454,6 +455,34 @@ impl Index { self.main.remap_types::().get(rtxn, main_key::VERSION_KEY) } + /* vector store */ + /// Writes the vector store + pub(crate) fn put_vector_store( + &self, + wtxn: &mut RwTxn<'_>, + backend: VectorStoreBackend, + ) -> Result<()> { + Ok(self.main.remap_types::>().put( + wtxn, + main_key::VECTOR_STORE_BACKEND, + &backend, + )?) + } + + pub fn get_vector_store(&self, rtxn: &RoTxn<'_>) -> Result> { + Ok(self + .main + .remap_types::>() + .get(rtxn, main_key::VECTOR_STORE_BACKEND)?) + } + + pub(crate) fn delete_vector_store(&self, wtxn: &mut RwTxn<'_>) -> Result { + Ok(self + .main + .remap_types::>() + .delete(wtxn, main_key::VECTOR_STORE_BACKEND)?) + } + /* documents ids */ /// Writes the documents ids that corresponds to the user-ids-documents-ids FST. @@ -1769,11 +1798,14 @@ impl Index { ) -> Result> { let mut res = BTreeMap::new(); let embedders = self.embedding_configs(); + let backend = self.get_vector_store(rtxn)?.unwrap_or_default(); + for config in embedders.embedding_configs(rtxn)? 
{ let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap(); let has_fragments = config.config.embedder_options.has_fragments(); - let reader = ArroyWrapper::new( - self.vector_arroy, + let reader = VectorStore::new( + backend, + self.vector_store, embedder_info.embedder_id, config.config.quantized(), ); @@ -1792,13 +1824,19 @@ impl Index { Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 }) } - pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result { - let mut stats = ArroyStats::default(); + pub fn vector_store_stats(&self, rtxn: &RoTxn<'_>) -> Result { + let mut stats = VectorStoreStats::default(); let embedding_configs = self.embedding_configs(); + let backend = self.get_vector_store(rtxn)?.unwrap_or_default(); + for config in embedding_configs.embedding_configs(rtxn)? { let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap(); - let reader = - ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); + let reader = VectorStore::new( + backend, + self.vector_store, + embedder_id, + config.config.quantized(), + ); reader.aggregate_stats(rtxn, &mut stats)?; } Ok(stats) @@ -1842,7 +1880,7 @@ impl Index { facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, - vector_arroy, + vector_store, embedder_category_id, documents, } = self; @@ -1913,7 +1951,7 @@ impl Index { "field_id_docid_facet_strings", field_id_docid_facet_strings.stat(rtxn).map(compute_size)?, ); - sizes.insert("vector_arroy", vector_arroy.stat(rtxn).map(compute_size)?); + sizes.insert("vector_store", vector_store.stat(rtxn).map(compute_size)?); sizes.insert("embedder_category_id", embedder_category_id.stat(rtxn).map(compute_size)?); sizes.insert("documents", documents.stat(rtxn).map(compute_size)?); diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 6fdae86b3..ca867d6e0 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -53,7 +53,7 @@ pub use 
search::new::{ }; use serde_json::Value; pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; -pub use {arroy, charabia as tokenizer, heed, rhai}; +pub use {arroy, charabia as tokenizer, hannoy, heed, rhai}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; pub use self::attribute_patterns::{AttributePatterns, PatternMatch}; diff --git a/crates/milli/src/progress.rs b/crates/milli/src/progress.rs index 6b1c46d11..68f9d6661 100644 --- a/crates/milli/src/progress.rs +++ b/crates/milli/src/progress.rs @@ -5,7 +5,7 @@ use std::sync::atomic::{AtomicU32, AtomicUsize, Ordering}; use std::sync::{Arc, RwLock}; use std::time::{Duration, Instant}; -use enum_iterator::Sequence; +use enum_iterator::Sequence as _; use indexmap::IndexMap; use itertools::Itertools; use serde::Serialize; @@ -278,6 +278,30 @@ impl Step for VariableNameStep { } } +// Integration with steppe + +impl steppe::Progress for Progress { + fn update(&self, sub_progress: impl steppe::Step) { + self.update_progress(Compat(sub_progress)); + } +} + +struct Compat(T); + +impl Step for Compat { + fn name(&self) -> Cow<'static, str> { + self.0.name() + } + + fn current(&self) -> u32 { + self.0.current().try_into().unwrap_or(u32::MAX) + } + + fn total(&self) -> u32 { + self.0.total().try_into().unwrap_or(u32::MAX) + } +} + impl Step for arroy::MainStep { fn name(&self) -> Cow<'static, str> { match self { @@ -292,6 +316,7 @@ impl Step for arroy::MainStep { arroy::MainStep::WritingNodesToDatabase => "writing nodes to database", arroy::MainStep::DeleteExtraneousTrees => "delete extraneous trees", arroy::MainStep::WriteTheMetadata => "write the metadata", + arroy::MainStep::ConvertingHannoyToArroy => "converting hannoy to arroy", } .into() } diff --git a/crates/milli/src/search/facet/filter_vector.rs b/crates/milli/src/search/facet/filter_vector.rs index 1ef4b8e3d..278179586 100644 --- a/crates/milli/src/search/facet/filter_vector.rs +++ 
b/crates/milli/src/search/facet/filter_vector.rs @@ -3,7 +3,7 @@ use roaring::{MultiOps, RoaringBitmap}; use crate::error::{DidYouMean, Error}; use crate::vector::db::IndexEmbeddingConfig; -use crate::vector::{ArroyStats, ArroyWrapper}; +use crate::vector::{VectorStore, VectorStoreStats}; use crate::Index; #[derive(Debug, thiserror::Error)] @@ -82,6 +82,7 @@ fn evaluate_inner( embedding_configs: &[IndexEmbeddingConfig], filter: &VectorFilter<'_>, ) -> crate::Result { + let backend = index.get_vector_store(rtxn)?.unwrap_or_default(); let embedder_name = embedder.value(); let available_embedders = || embedding_configs.iter().map(|c| c.name.clone()).collect::>(); @@ -96,8 +97,9 @@ fn evaluate_inner( .embedder_info(rtxn, embedder_name)? .ok_or_else(|| EmbedderDoesNotExist { embedder, available: available_embedders() })?; - let arroy_wrapper = ArroyWrapper::new( - index.vector_arroy, + let vector_store = VectorStore::new( + backend, + index.vector_store, embedder_info.embedder_id, embedding_config.config.quantized(), ); @@ -122,7 +124,7 @@ fn evaluate_inner( })?; let user_provided_docids = embedder_info.embedding_status.user_provided_docids(); - arroy_wrapper.items_in_store(rtxn, fragment_config.id, |bitmap| { + vector_store.items_in_store(rtxn, fragment_config.id, |bitmap| { bitmap.clone() - user_provided_docids })? 
} @@ -132,8 +134,8 @@ fn evaluate_inner( } let user_provided_docids = embedder_info.embedding_status.user_provided_docids(); - let mut stats = ArroyStats::default(); - arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + let mut stats = VectorStoreStats::default(); + vector_store.aggregate_stats(rtxn, &mut stats)?; stats.documents - user_provided_docids.clone() } VectorFilter::UserProvided => { @@ -141,14 +143,14 @@ fn evaluate_inner( user_provided_docids.clone() } VectorFilter::Regenerate => { - let mut stats = ArroyStats::default(); - arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + let mut stats = VectorStoreStats::default(); + vector_store.aggregate_stats(rtxn, &mut stats)?; let skip_regenerate = embedder_info.embedding_status.skip_regenerate_docids(); stats.documents - skip_regenerate } VectorFilter::None => { - let mut stats = ArroyStats::default(); - arroy_wrapper.aggregate_stats(rtxn, &mut stats)?; + let mut stats = VectorStoreStats::default(); + vector_store.aggregate_stats(rtxn, &mut stats)?; stats.documents } }; diff --git a/crates/milli/src/search/new/vector_sort.rs b/crates/milli/src/search/new/vector_sort.rs index 2c201e899..5da4c7145 100644 --- a/crates/milli/src/search/new/vector_sort.rs +++ b/crates/milli/src/search/new/vector_sort.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; use super::VectorStoreStats; use crate::score_details::{self, ScoreDetails}; -use crate::vector::{ArroyWrapper, DistributionShift, Embedder}; +use crate::vector::{DistributionShift, Embedder, VectorStore}; use crate::{DocumentId, Result, SearchContext, SearchLogger}; pub struct VectorSort { @@ -54,9 +54,11 @@ impl VectorSort { vector_candidates: &RoaringBitmap, ) -> Result<()> { let target = &self.target; + let backend = ctx.index.get_vector_store(ctx.txn)?.unwrap_or_default(); let before = Instant::now(); - let reader = ArroyWrapper::new(ctx.index.vector_arroy, self.embedder_index, 
self.quantized); + let reader = + VectorStore::new(backend, ctx.index.vector_store, self.embedder_index, self.quantized); let results = reader.nns_by_vector(ctx.txn, target, self.limit, Some(vector_candidates))?; self.cached_sorted_docids = results.into_iter(); *ctx.vector_store_stats.get_or_insert_default() += VectorStoreStats { diff --git a/crates/milli/src/search/similar.rs b/crates/milli/src/search/similar.rs index 2235f6436..b4933e1d0 100644 --- a/crates/milli/src/search/similar.rs +++ b/crates/milli/src/search/similar.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use roaring::RoaringBitmap; use crate::score_details::{self, ScoreDetails}; -use crate::vector::{ArroyWrapper, Embedder}; +use crate::vector::{Embedder, VectorStore}; use crate::{filtered_universe, DocumentId, Filter, Index, Result, SearchResult}; pub struct Similar<'a> { @@ -72,7 +72,10 @@ impl<'a> Similar<'a> { crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()) })?; - let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized); + let backend = self.index.get_vector_store(self.rtxn)?.unwrap_or_default(); + + let reader = + VectorStore::new(backend, self.index.vector_store, embedder_index, self.quantized); let results = reader.nns_by_item( self.rtxn, self.id, diff --git a/crates/milli/src/update/clear_documents.rs b/crates/milli/src/update/clear_documents.rs index 84eeca7f9..6cd389d42 100644 --- a/crates/milli/src/update/clear_documents.rs +++ b/crates/milli/src/update/clear_documents.rs @@ -2,7 +2,8 @@ use heed::RwTxn; use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::{database_stats::DatabaseStats, FieldDistribution, Index, Result}; +use crate::database_stats::DatabaseStats; +use crate::{FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'i> { wtxn: &'t mut RwTxn<'i>, @@ -45,7 +46,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { facet_id_is_empty_docids, field_id_docid_facet_f64s, field_id_docid_facet_strings, - vector_arroy, + 
vector_store, embedder_category_id: _, documents, } = self.index; @@ -88,7 +89,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { field_id_docid_facet_f64s.clear(self.wtxn)?; field_id_docid_facet_strings.clear(self.wtxn)?; // vector - vector_arroy.clear(self.wtxn)?; + vector_store.clear(self.wtxn)?; documents.clear(self.wtxn)?; diff --git a/crates/milli/src/update/index_documents/extract/extract_word_docids.rs b/crates/milli/src/update/index_documents/extract/extract_word_docids.rs index a964c0bbe..6d28adb2b 100644 --- a/crates/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/crates/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -2,9 +2,8 @@ use std::collections::BTreeSet; use std::fs::File; use std::io::{self, BufReader}; -use heed::{BytesDecode, BytesEncode}; +use heed::BytesDecode; use obkv::KvReaderU16; -use roaring::RoaringBitmap; use super::helpers::{ create_sorter, create_writer, try_split_array_at, writer_into_reader, GrenadParameters, @@ -16,7 +15,7 @@ use crate::index::db_name::DOCID_WORD_POSITIONS; use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::sorter_into_reader; use crate::update::settings::InnerIndexSettingsDiff; -use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result}; +use crate::{DocumentId, FieldId, Result}; /// Extracts the word and the documents ids where this word appear. /// @@ -201,45 +200,3 @@ fn words_into_sorter( Ok(()) } - -#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] -fn docids_into_writers( - word: &str, - deletions: &RoaringBitmap, - additions: &RoaringBitmap, - writer: &mut grenad::Writer, -) -> Result<()> -where - W: std::io::Write, -{ - if deletions == additions { - // if the same value is deleted and added, do nothing. - return Ok(()); - } - - // Write each value in the same KvDelAdd before inserting it in the final writer. 
- let mut obkv = KvWriterDelAdd::memory(); - // deletions: - if !deletions.is_empty() && !deletions.is_subset(additions) { - obkv.insert( - DelAdd::Deletion, - CboRoaringBitmapCodec::bytes_encode(deletions).map_err(|_| { - SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) } - })?, - )?; - } - // additions: - if !additions.is_empty() { - obkv.insert( - DelAdd::Addition, - CboRoaringBitmapCodec::bytes_encode(additions).map_err(|_| { - SerializationError::Encoding { db_name: Some(DOCID_WORD_POSITIONS) } - })?, - )?; - } - - // insert everything in the same writer. - writer.insert(word.as_bytes(), obkv.into_inner().unwrap())?; - - Ok(()) -} diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 099879382..205f8ef5d 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -39,7 +39,7 @@ use crate::update::{ IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; use crate::vector::db::EmbedderInfo; -use crate::vector::{ArroyWrapper, RuntimeEmbedders}; +use crate::vector::{RuntimeEmbedders, VectorStore}; use crate::{CboRoaringBitmapCodec, Index, Result, UserError}; static MERGED_DATABASE_COUNT: usize = 7; @@ -485,6 +485,7 @@ where // If an embedder wasn't used in the typedchunk but must be binary quantized // we should insert it in `dimension` + let backend = self.index.get_vector_store(self.wtxn)?.unwrap_or_default(); for (name, action) in settings_diff.embedding_config_updates.iter() { if action.is_being_quantized && !dimension.contains_key(name.as_str()) { let index = self.index.embedding_configs().embedder_id(self.wtxn, name)?.ok_or( @@ -494,7 +495,7 @@ where }, )?; let reader = - ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized); + VectorStore::new(backend, self.index.vector_store, index, action.was_quantized); let Some(dim) = reader.dimensions(self.wtxn)? 
else { continue; }; @@ -504,7 +505,7 @@ where for (embedder_name, dimension) in dimension { let wtxn = &mut *self.wtxn; - let vector_arroy = self.index.vector_arroy; + let vector_store = self.index.vector_store; let cancel = &self.should_abort; let embedder_index = @@ -523,11 +524,12 @@ where let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized); pool.install(|| { - let mut writer = ArroyWrapper::new(vector_arroy, embedder_index, was_quantized); + let mut writer = + VectorStore::new(backend, vector_store, embedder_index, was_quantized); writer.build_and_quantize( wtxn, // In the settings we don't have any progress to share - &Progress::default(), + Progress::default(), &mut rng, dimension, is_quantizing, diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index e07483aff..aa100f2d5 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -32,7 +32,7 @@ use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableIds, UpdateIndexingStep}; use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use crate::vector::settings::{RemoveFragments, WriteBackToDocuments}; -use crate::vector::ArroyWrapper; +use crate::vector::VectorStore; use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result}; pub struct TransformOutput { @@ -834,15 +834,17 @@ impl<'a, 'i> Transform<'a, 'i> { None }; - let readers: BTreeMap<&str, (ArroyWrapper, &RoaringBitmap)> = settings_diff + let backend = self.index.get_vector_store(wtxn)?.unwrap_or_default(); + let readers: BTreeMap<&str, (VectorStore, &RoaringBitmap)> = settings_diff .embedding_config_updates .iter() .filter_map(|(name, action)| { if let Some(WriteBackToDocuments { embedder_id, user_provided }) = action.write_back() { - let reader = ArroyWrapper::new( - self.index.vector_arroy, + let 
reader = VectorStore::new( + backend, + self.index.vector_store, *embedder_id, action.was_quantized, ); @@ -882,10 +884,7 @@ impl<'a, 'i> Transform<'a, 'i> { InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, )?; - let injected_vectors: std::result::Result< - serde_json::Map, - arroy::Error, - > = readers + let injected_vectors: crate::Result<_> = readers .iter() .filter_map(|(name, (reader, user_provided))| { if !user_provided.contains(docid) { @@ -949,9 +948,13 @@ impl<'a, 'i> Transform<'a, 'i> { else { continue; }; - let arroy = - ArroyWrapper::new(self.index.vector_arroy, infos.embedder_id, was_quantized); - let Some(dimensions) = arroy.dimensions(wtxn)? else { + let vector_store = VectorStore::new( + backend, + self.index.vector_store, + infos.embedder_id, + was_quantized, + ); + let Some(dimensions) = vector_store.dimensions(wtxn)? else { continue; }; for fragment_id in fragment_ids { @@ -959,17 +962,17 @@ impl<'a, 'i> Transform<'a, 'i> { if infos.embedding_status.user_provided_docids().is_empty() { // no user provided: clear store - arroy.clear_store(wtxn, *fragment_id, dimensions)?; + vector_store.clear_store(wtxn, *fragment_id, dimensions)?; continue; } // some user provided, remove only the ids that are not user provided - let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| { + let to_delete = vector_store.items_in_store(wtxn, *fragment_id, |items| { items - infos.embedding_status.user_provided_docids() })?; for to_delete in to_delete { - arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?; + vector_store.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?; } } } diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index c93e3e0f7..eb2a0799b 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -27,7 +27,7 @@ use 
crate::update::index_documents::helpers::{ }; use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig}; -use crate::vector::ArroyWrapper; +use crate::vector::VectorStore; use crate::{ lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError, Result, SerializationError, U8StrStrCodec, @@ -619,6 +619,7 @@ pub(crate) fn write_typed_chunk_into_index( let _entered = span.enter(); let embedders = index.embedding_configs(); + let backend = index.get_vector_store(wtxn)?.unwrap_or_default(); let mut remove_vectors_builder = MergerBuilder::new(KeepFirst); let mut manual_vectors_builder = MergerBuilder::new(KeepFirst); @@ -677,7 +678,8 @@ pub(crate) fn write_typed_chunk_into_index( .get(&embedder_name) .is_some_and(|conf| conf.is_quantized); // FIXME: allow customizing distance - let writer = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, binary_quantized); + let writer = + VectorStore::new(backend, index.vector_store, infos.embedder_id, binary_quantized); // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); diff --git a/crates/milli/src/update/indexer_config.rs b/crates/milli/src/update/indexer_config.rs index 845da5a51..39a013d13 100644 --- a/crates/milli/src/update/indexer_config.rs +++ b/crates/milli/src/update/indexer_config.rs @@ -1,7 +1,8 @@ use grenad::CompressionType; use super::GrenadParameters; -use crate::{thread_pool_no_abort::ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; +use crate::thread_pool_no_abort::ThreadPoolNoAbort; +use crate::ThreadPoolNoAbortBuilder; #[derive(Debug)] pub struct IndexerConfig { diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index aec192ace..884f133d6 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -255,9 +255,9 @@ impl<'a> From> for FrameWithHeader<'a> { #[repr(u8)] pub enum EntryHeader { 
DbOperation(DbOperation), - ArroyDeleteVector(ArroyDeleteVector), - ArroySetVectors(ArroySetVectors), - ArroySetVector(ArroySetVector), + DeleteVector(DeleteVector), + SetVectors(SetVectors), + SetVector(SetVector), } impl EntryHeader { @@ -268,9 +268,9 @@ impl EntryHeader { const fn variant_id(&self) -> u8 { match self { EntryHeader::DbOperation(_) => 0, - EntryHeader::ArroyDeleteVector(_) => 1, - EntryHeader::ArroySetVectors(_) => 2, - EntryHeader::ArroySetVector(_) => 3, + EntryHeader::DeleteVector(_) => 1, + EntryHeader::SetVectors(_) => 2, + EntryHeader::SetVector(_) => 3, } } @@ -286,26 +286,26 @@ impl EntryHeader { } const fn total_delete_vector_size() -> usize { - Self::variant_size() + mem::size_of::() + Self::variant_size() + mem::size_of::() } /// The `dimensions` corresponds to the number of `f32` in the embedding. fn total_set_vectors_size(count: usize, dimensions: usize) -> usize { let embedding_size = dimensions * mem::size_of::(); - Self::variant_size() + mem::size_of::() + embedding_size * count + Self::variant_size() + mem::size_of::() + embedding_size * count } fn total_set_vector_size(dimensions: usize) -> usize { let embedding_size = dimensions * mem::size_of::(); - Self::variant_size() + mem::size_of::() + embedding_size + Self::variant_size() + mem::size_of::() + embedding_size } fn header_size(&self) -> usize { let payload_size = match self { EntryHeader::DbOperation(op) => mem::size_of_val(op), - EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), - EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), - EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv), + EntryHeader::DeleteVector(adv) => mem::size_of_val(adv), + EntryHeader::SetVectors(asvs) => mem::size_of_val(asvs), + EntryHeader::SetVector(asv) => mem::size_of_val(asv), }; Self::variant_size() + payload_size } @@ -319,19 +319,19 @@ impl EntryHeader { EntryHeader::DbOperation(header) } 1 => { - let header_bytes = &remaining[..mem::size_of::()]; + let 
header_bytes = &remaining[..mem::size_of::()]; let header = checked::pod_read_unaligned(header_bytes); - EntryHeader::ArroyDeleteVector(header) + EntryHeader::DeleteVector(header) } 2 => { - let header_bytes = &remaining[..mem::size_of::()]; + let header_bytes = &remaining[..mem::size_of::()]; let header = checked::pod_read_unaligned(header_bytes); - EntryHeader::ArroySetVectors(header) + EntryHeader::SetVectors(header) } 3 => { - let header_bytes = &remaining[..mem::size_of::()]; + let header_bytes = &remaining[..mem::size_of::()]; let header = checked::pod_read_unaligned(header_bytes); - EntryHeader::ArroySetVector(header) + EntryHeader::SetVector(header) } id => panic!("invalid variant id: {id}"), } @@ -341,9 +341,9 @@ impl EntryHeader { let (first, remaining) = header_bytes.split_first_mut().unwrap(); let payload_bytes = match self { EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), - EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), - EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), - EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv), + EntryHeader::DeleteVector(adv) => bytemuck::bytes_of(adv), + EntryHeader::SetVectors(asvs) => bytemuck::bytes_of(asvs), + EntryHeader::SetVector(asv) => bytemuck::bytes_of(asv), }; *first = self.variant_id(); remaining.copy_from_slice(payload_bytes); @@ -378,7 +378,7 @@ impl DbOperation { #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] #[repr(transparent)] -pub struct ArroyDeleteVector { +pub struct DeleteVector { pub docid: DocumentId, } @@ -386,13 +386,13 @@ pub struct ArroyDeleteVector { #[repr(C)] /// The embeddings are in the remaining space and represents /// non-aligned [f32] each with dimensions f32s. 
-pub struct ArroySetVectors { +pub struct SetVectors { pub docid: DocumentId, pub embedder_id: u8, _padding: [u8; 3], } -impl ArroySetVectors { +impl SetVectors { fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { let skip = EntryHeader::variant_size() + mem::size_of::(); &frame[skip..] @@ -416,14 +416,14 @@ impl ArroySetVectors { #[repr(C)] /// The embeddings are in the remaining space and represents /// non-aligned [f32] each with dimensions f32s. -pub struct ArroySetVector { +pub struct SetVector { pub docid: DocumentId, pub embedder_id: u8, pub extractor_id: u8, _padding: [u8; 2], } -impl ArroySetVector { +impl SetVector { fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { let skip = EntryHeader::variant_size() + mem::size_of::(); &frame[skip..] @@ -553,7 +553,7 @@ impl<'b> ExtractorBbqueueSender<'b> { let refcell = self.producers.get().unwrap(); let mut producer = refcell.0.borrow_mut_or_yield(); - let payload_header = EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }); + let payload_header = EntryHeader::DeleteVector(DeleteVector { docid }); let total_length = EntryHeader::total_delete_vector_size(); if total_length > max_grant { panic!("The entry is larger ({total_length} bytes) than the BBQueue max grant ({max_grant} bytes)"); @@ -589,8 +589,8 @@ impl<'b> ExtractorBbqueueSender<'b> { // to zero to allocate no extra space at all let dimensions = embeddings.first().map_or(0, |emb| emb.len()); - let arroy_set_vector = ArroySetVectors { docid, embedder_id, _padding: [0; 3] }; - let payload_header = EntryHeader::ArroySetVectors(arroy_set_vector); + let set_vectors = SetVectors { docid, embedder_id, _padding: [0; 3] }; + let payload_header = EntryHeader::SetVectors(set_vectors); let total_length = EntryHeader::total_set_vectors_size(embeddings.len(), dimensions); if total_length > max_grant { let mut value_file = tempfile::tempfile().map(BufWriter::new)?; @@ -650,9 +650,8 @@ impl<'b> ExtractorBbqueueSender<'b> { // to zero 
to allocate no extra space at all let dimensions = embedding.as_ref().map_or(0, |emb| emb.len()); - let arroy_set_vector = - ArroySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] }; - let payload_header = EntryHeader::ArroySetVector(arroy_set_vector); + let set_vector = SetVector { docid, embedder_id, extractor_id, _padding: [0; 2] }; + let payload_header = EntryHeader::SetVector(set_vector); let total_length = EntryHeader::total_set_vector_size(dimensions); if total_length > max_grant { let mut value_file = tempfile::tempfile().map(BufWriter::new)?; diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 31d2ada0f..5f287851a 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -240,12 +240,12 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeDocumentE /// modifies them by adding or removing vector fields based on embedder actions, /// and then updates the database. 
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents::extract")] -pub fn update_database_documents<'indexer, 'extractor, MSP, SD>( +pub fn update_database_documents<'indexer, MSP, SD>( documents: &'indexer DocumentsIndentifiers<'indexer>, indexing_context: IndexingContext, extractor_sender: &ExtractorBbqueueSender, settings_delta: &SD, - extractor_allocs: &'extractor mut ThreadLocal>, + extractor_allocs: &mut ThreadLocal>, ) -> Result<()> where MSP: Fn() -> bool + Sync, diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 71fa9bf09..f147de360 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -475,7 +475,7 @@ impl<'doc> OnEmbed<'doc> for OnEmbeddingDocumentUpdates<'doc, '_> { } fn process_embedding_error( &mut self, - error: crate::vector::hf::EmbedError, + error: crate::vector::error::EmbedError, embedder_name: &'doc str, unused_vectors_distribution: &UnusedVectorsDistributionBump, metadata: BVec<'doc, Metadata<'doc>>, diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index e18337623..f613ad0b6 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -8,7 +8,7 @@ use document_changes::{DocumentChanges, IndexingContext}; pub use document_deletion::DocumentDeletion; pub use document_operation::{DocumentOperation, PayloadStats}; use hashbrown::HashMap; -use heed::RwTxn; +use heed::{RoTxn, RwTxn}; pub use partial_dump::PartialDump; pub use post_processing::recompute_word_fst_from_word_docids_database; pub use update_by_function::UpdateByFunction; @@ -24,7 +24,7 @@ use crate::progress::{EmbedderStats, Progress}; use crate::update::settings::SettingsDelta; use crate::update::GrenadParameters; use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments}; -use 
crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders}; +use crate::vector::{Embedder, RuntimeEmbedders, VectorStore}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort}; pub(crate) mod de; @@ -67,7 +67,7 @@ where let mut bbbuffers = Vec::new(); let finished_extraction = AtomicBool::new(false); - let arroy_memory = grenad_parameters.max_memory; + let vector_memory = grenad_parameters.max_memory; let (grenad_parameters, total_bbbuffer_capacity) = indexer_memory_settings(pool.current_num_threads(), grenad_parameters); @@ -130,8 +130,9 @@ where let global_fields_ids_map = GlobalFieldsIdsMap::new(&new_fields_ids_map); - let vector_arroy = index.vector_arroy; - let arroy_writers: Result> = embedders + let vector_arroy = index.vector_store; + let backend = index.get_vector_store(wtxn)?.unwrap_or_default(); + let vector_stores: Result> = embedders .inner_as_ref() .iter() .map(|(embedder_name, runtime)| { @@ -144,7 +145,8 @@ where })?; let dimensions = runtime.embedder.dimensions(); - let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized); + let writer = + VectorStore::new(backend, vector_arroy, embedder_index, runtime.is_quantized); Ok(( embedder_index, @@ -153,10 +155,10 @@ where }) .collect(); - let mut arroy_writers = arroy_writers?; + let mut vector_stores = vector_stores?; let congestion = - write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?; + write_to_db(writer_receiver, finished_extraction, index, wtxn, &vector_stores)?; indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors); @@ -170,8 +172,8 @@ where wtxn, indexing_context.progress, index_embeddings, - arroy_memory, - &mut arroy_writers, + vector_memory, + &mut vector_stores, None, &indexing_context.must_stop_processing, ) @@ -227,7 +229,7 @@ where let mut bbbuffers = Vec::new(); let finished_extraction = AtomicBool::new(false); - let arroy_memory = grenad_parameters.max_memory; + let 
vector_memory = grenad_parameters.max_memory; let (grenad_parameters, total_bbbuffer_capacity) = indexer_memory_settings(pool.current_num_threads(), grenad_parameters); @@ -284,15 +286,16 @@ where let new_embedders = settings_delta.new_embedders(); let embedder_actions = settings_delta.embedder_actions(); let index_embedder_category_ids = settings_delta.new_embedder_category_id(); - let mut arroy_writers = arroy_writers_from_embedder_actions( + let mut vector_stores = vector_stores_from_embedder_actions( index, + wtxn, embedder_actions, new_embedders, index_embedder_category_ids, )?; let congestion = - write_to_db(writer_receiver, finished_extraction, index, wtxn, &arroy_writers)?; + write_to_db(writer_receiver, finished_extraction, index, wtxn, &vector_stores)?; indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors); @@ -306,8 +309,8 @@ where wtxn, indexing_context.progress, index_embeddings, - arroy_memory, - &mut arroy_writers, + vector_memory, + &mut vector_stores, Some(embedder_actions), &indexing_context.must_stop_processing, ) @@ -337,13 +340,15 @@ where Ok(congestion) } -fn arroy_writers_from_embedder_actions<'indexer>( +fn vector_stores_from_embedder_actions<'indexer>( index: &Index, + rtxn: &RoTxn, embedder_actions: &'indexer BTreeMap, embedders: &'indexer RuntimeEmbedders, index_embedder_category_ids: &'indexer std::collections::HashMap, -) -> Result> { - let vector_arroy = index.vector_arroy; +) -> Result> { + let vector_arroy = index.vector_store; + let backend = index.get_vector_store(rtxn)?.unwrap_or_default(); embedders .inner_as_ref() @@ -361,8 +366,12 @@ fn arroy_writers_from_embedder_actions<'indexer>( }, ))); }; - let writer = - ArroyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized); + let writer = VectorStore::new( + backend, + vector_arroy, + embedder_category_id, + action.was_quantized, + ); let dimensions = runtime.embedder.dimensions(); Some(Ok(( embedder_category_id, @@ -381,11 +390,13 @@ fn 
delete_old_embedders_and_fragments( where SD: SettingsDelta, { + let backend = index.get_vector_store(wtxn)?.unwrap_or_default(); for action in settings_delta.embedder_actions().values() { let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else { continue; }; - let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized); + let reader = + VectorStore::new(backend, index.vector_store, *embedder_id, action.was_quantized); let Some(dimensions) = reader.dimensions(wtxn)? else { continue; }; @@ -401,7 +412,7 @@ where let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else { continue; }; - let arroy = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, was_quantized); + let arroy = VectorStore::new(backend, index.vector_store, infos.embedder_id, was_quantized); let Some(dimensions) = arroy.dimensions(wtxn)? else { continue; }; diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index b8e3685f8..55ccfdf35 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -15,7 +15,7 @@ use crate::progress::Progress; use crate::update::settings::InnerIndexSettings; use crate::vector::db::IndexEmbeddingConfig; use crate::vector::settings::EmbedderAction; -use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders}; +use crate::vector::{Embedder, Embeddings, RuntimeEmbedders, VectorStore}; use crate::{Error, Index, InternalError, Result, UserError}; pub fn write_to_db( @@ -23,9 +23,9 @@ pub fn write_to_db( finished_extraction: &AtomicBool, index: &Index, wtxn: &mut RwTxn<'_>, - arroy_writers: &HashMap, + vector_stores: &HashMap, ) -> Result { - // Used by by the ArroySetVector to copy the embedding into an + // Used by by the HannoySetVector to copy the embedding into an // aligned memory area, required by arroy to accept a new vector. 
let mut aligned_embedding = Vec::new(); let span = tracing::trace_span!(target: "indexing::write_db", "all"); @@ -56,7 +56,7 @@ pub fn write_to_db( ReceiverAction::LargeVectors(large_vectors) => { let LargeVectors { docid, embedder_id, .. } = large_vectors; let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + vector_stores.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); for embedding in large_vectors.read_embeddings(*dimensions) { embeddings.push(embedding.to_vec()).unwrap(); @@ -68,7 +68,7 @@ pub fn write_to_db( large_vector @ LargeVector { docid, embedder_id, extractor_id, .. }, ) => { let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + vector_stores.get(&embedder_id).expect("requested a missing embedder"); let embedding = large_vector.read_embedding(*dimensions); writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?; } @@ -80,12 +80,12 @@ pub fn write_to_db( &mut writer_receiver, index, wtxn, - arroy_writers, + vector_stores, &mut aligned_embedding, )?; } - write_from_bbqueue(&mut writer_receiver, index, wtxn, arroy_writers, &mut aligned_embedding)?; + write_from_bbqueue(&mut writer_receiver, index, wtxn, vector_stores, &mut aligned_embedding)?; Ok(ChannelCongestion { attempts: writer_receiver.sent_messages_attempts(), @@ -115,8 +115,8 @@ pub fn build_vectors( wtxn: &mut RwTxn<'_>, progress: &Progress, index_embeddings: Vec, - arroy_memory: Option, - arroy_writers: &mut HashMap, + vector_memory: Option, + vector_stores: &mut HashMap, embeder_actions: Option<&BTreeMap>, must_stop_processing: &MSP, ) -> Result<()> @@ -129,18 +129,18 @@ where let seed = rand::random(); let mut rng = rand::rngs::StdRng::seed_from_u64(seed); - for (_index, (embedder_name, _embedder, writer, dimensions)) in arroy_writers { + for (_index, (embedder_name, _embedder, writer, dimensions)) in vector_stores { 
let dimensions = *dimensions; let is_being_quantized = embeder_actions .and_then(|actions| actions.get(*embedder_name).map(|action| action.is_being_quantized)) .unwrap_or(false); writer.build_and_quantize( wtxn, - progress, + progress.clone(), &mut rng, dimensions, is_being_quantized, - arroy_memory, + vector_memory, must_stop_processing, )?; } @@ -181,7 +181,7 @@ pub fn write_from_bbqueue( writer_receiver: &mut WriterBbqueueReceiver<'_>, index: &Index, wtxn: &mut RwTxn<'_>, - arroy_writers: &HashMap, + vector_stores: &HashMap, aligned_embedding: &mut Vec, ) -> crate::Result<()> { while let Some(frame_with_header) = writer_receiver.recv_frame() { @@ -221,17 +221,17 @@ pub fn write_from_bbqueue( }, } } - EntryHeader::ArroyDeleteVector(ArroyDeleteVector { docid }) => { - for (_index, (_name, _embedder, writer, dimensions)) in arroy_writers { + EntryHeader::DeleteVector(DeleteVector { docid }) => { + for (_index, (_name, _embedder, writer, dimensions)) in vector_stores { let dimensions = *dimensions; writer.del_items(wtxn, dimensions, docid)?; } } - EntryHeader::ArroySetVectors(asvs) => { - let ArroySetVectors { docid, embedder_id, .. } = asvs; + EntryHeader::SetVectors(asvs) => { + let SetVectors { docid, embedder_id, .. } = asvs; let frame = frame_with_header.frame(); let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + vector_stores.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); writer.del_items(wtxn, *dimensions, docid)?; @@ -245,12 +245,10 @@ pub fn write_from_bbqueue( writer.add_items(wtxn, docid, &embeddings)?; } } - EntryHeader::ArroySetVector( - asv @ ArroySetVector { docid, embedder_id, extractor_id, .. }, - ) => { + EntryHeader::SetVector(asv @ SetVector { docid, embedder_id, extractor_id, .. 
}) => { let frame = frame_with_header.frame(); let (_, _, writer, dimensions) = - arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + vector_stores.get(&embedder_id).expect("requested a missing embedder"); let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding); if embedding.is_empty() { diff --git a/crates/milli/src/update/new/merger.rs b/crates/milli/src/update/new/merger.rs index 15f06c67d..44ba8e301 100644 --- a/crates/milli/src/update/new/merger.rs +++ b/crates/milli/src/update/new/merger.rs @@ -63,8 +63,8 @@ where } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -pub fn merge_and_send_docids<'extractor, MSP, D>( - mut caches: Vec>, +pub fn merge_and_send_docids( + mut caches: Vec>, database: Database, index: &Index, docids_sender: WordDocidsSender, @@ -91,8 +91,8 @@ where } #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] -pub fn merge_and_send_facet_docids<'extractor>( - mut caches: Vec>, +pub fn merge_and_send_facet_docids( + mut caches: Vec>, database: FacetDatabases, index: &Index, rtxn: &RoTxn, diff --git a/crates/milli/src/update/new/steps.rs b/crates/milli/src/update/new/steps.rs index eabf9104e..438354a26 100644 --- a/crates/milli/src/update/new/steps.rs +++ b/crates/milli/src/update/new/steps.rs @@ -21,6 +21,14 @@ make_enum_progress! { } } +make_enum_progress! { + pub enum SettingsIndexerStep { + ChangingVectorStore, + UsingStableIndexer, + UsingExperimentalIndexer, + } +} + make_enum_progress! 
{ pub enum PostProcessingFacets { StringsBulk, diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index b59984248..f74d79d47 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -14,7 +14,7 @@ use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::documents::FieldIdMapper; use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig}; use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors}; -use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders}; +use crate::vector::{Embedding, RuntimeEmbedders, VectorStore}; use crate::{DocumentId, Index, InternalError, Result, UserError}; #[derive(Serialize)] @@ -120,8 +120,13 @@ impl<'t> VectorDocumentFromDb<'t> { config: &IndexEmbeddingConfig, status: &EmbeddingStatus, ) -> Result> { - let reader = - ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized()); + let backend = self.index.get_vector_store(self.rtxn)?.unwrap_or_default(); + let reader = VectorStore::new( + backend, + self.index.vector_store, + embedder_id, + config.config.quantized(), + ); let vectors = reader.item_vectors(self.rtxn, self.docid)?; Ok(VectorEntry { @@ -149,7 +154,7 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { name, entry_from_raw_value(value, false).map_err(|_| { InternalError::Serialization(crate::SerializationError::Decoding { - db_name: Some(crate::index::db_name::VECTOR_ARROY), + db_name: Some(crate::index::db_name::VECTOR_STORE), }) })?, )) @@ -167,7 +172,7 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { Some(embedding_from_doc) => { Some(entry_from_raw_value(embedding_from_doc, false).map_err(|_| { InternalError::Serialization(crate::SerializationError::Decoding { - db_name: Some(crate::index::db_name::VECTOR_ARROY), + db_name: Some(crate::index::db_name::VECTOR_STORE), }) })?) 
} diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index bca8fbc59..b8a8fed1c 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -26,13 +26,15 @@ use crate::index::{ DEFAULT_MIN_WORD_LEN_TWO_TYPOS, }; use crate::order_by_map::OrderByMap; -use crate::progress::{EmbedderStats, Progress}; +use crate::progress::{EmbedderStats, Progress, VariableNameStep}; use crate::prompt::{default_max_bytes, default_template_text, PromptData}; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::new::indexer::reindex; +use crate::update::new::steps::SettingsIndexerStep; use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::vector::db::{FragmentConfigs, IndexEmbeddingConfig}; +use crate::vector::embedder::{openai, rest}; use crate::vector::json_template::JsonTemplate; use crate::vector::settings::{ EmbedderAction, EmbedderSource, EmbeddingSettings, EmbeddingValidationContext, NestingContext, @@ -40,6 +42,7 @@ use crate::vector::settings::{ }; use crate::vector::{ Embedder, EmbeddingConfig, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment, + VectorStoreBackend, }; use crate::{ ChannelCongestion, FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result, @@ -198,6 +201,7 @@ pub struct Settings<'a, 't, 'i> { prefix_search: Setting, facet_search: Setting, chat: Setting, + vector_store: Setting, } impl<'a, 't, 'i> Settings<'a, 't, 'i> { @@ -237,6 +241,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { prefix_search: Setting::NotSet, facet_search: Setting::NotSet, chat: Setting::NotSet, + vector_store: Setting::NotSet, indexer_config, } } @@ -475,6 +480,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.chat = Setting::Reset; } + pub fn set_vector_store(&mut self, value: VectorStoreBackend) { + self.vector_store = Setting::Set(value); + } + + pub fn reset_vector_store(&mut self) { + self.vector_store = Setting::Reset; + 
} + #[tracing::instrument( level = "trace" skip(self, progress_callback, should_abort, settings_diff, embedder_stats), @@ -1416,7 +1429,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } } - pub fn legacy_execute( + fn legacy_execute( mut self, progress_callback: FP, should_abort: FA, @@ -1485,6 +1498,70 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(()) } + fn execute_vector_backend<'indexer, MSP>( + &mut self, + must_stop_processing: &'indexer MSP, + progress: &'indexer Progress, + ) -> Result<()> + where + MSP: Fn() -> bool + Sync, + { + let old_backend = self.index.get_vector_store(self.wtxn)?.unwrap_or_default(); + + let new_backend = match self.vector_store { + Setting::Set(new_backend) => { + self.index.put_vector_store(self.wtxn, new_backend)?; + new_backend + } + Setting::Reset => { + self.index.delete_vector_store(self.wtxn)?; + VectorStoreBackend::default() + } + Setting::NotSet => return Ok(()), + }; + + if old_backend == new_backend { + return Ok(()); + } + + let embedders = self.index.embedding_configs(); + let embedding_configs = embedders.embedding_configs(self.wtxn)?; + enum VectorStoreBackendChangeIndex {} + let embedder_count = embedding_configs.len(); + + let rtxn = self.index.read_txn()?; + + for (i, config) in embedding_configs.into_iter().enumerate() { + if must_stop_processing() { + return Err(crate::InternalError::AbortedIndexation.into()); + } + let embedder_name = &config.name; + progress.update_progress(VariableNameStep::::new( + format!("Changing vector store backend for embedder `{embedder_name}`"), + i as u32, + embedder_count as u32, + )); + let quantized = config.config.quantized(); + let embedder_id = embedders.embedder_id(self.wtxn, &config.name)?.unwrap(); + let vector_store = crate::vector::VectorStore::new( + old_backend, + self.index.vector_store, + embedder_id, + quantized, + ); + + vector_store.change_backend( + &rtxn, + self.wtxn, + progress.clone(), + must_stop_processing, + self.indexer_config.max_memory, + )?; + } + + Ok(()) 
+ } + pub fn execute<'indexer, MSP>( mut self, must_stop_processing: &'indexer MSP, @@ -1494,8 +1571,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { where MSP: Fn() -> bool + Sync, { + progress.update_progress(SettingsIndexerStep::ChangingVectorStore); + // execute any pending vector store backend change + self.execute_vector_backend(must_stop_processing, progress)?; + // force the old indexer if the environment says so if self.indexer_config.experimental_no_edition_2024_for_settings { + progress.update_progress(SettingsIndexerStep::UsingStableIndexer); return self .legacy_execute( |indexing_step| tracing::debug!(update = ?indexing_step), @@ -1535,11 +1617,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { facet_search: Setting::NotSet, disable_on_numbers: Setting::NotSet, chat: Setting::NotSet, + vector_store: Setting::NotSet, wtxn: _, index: _, indexer_config: _, } = &self { + progress.update_progress(SettingsIndexerStep::UsingExperimentalIndexer); + self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; let old_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?; @@ -1578,6 +1663,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(None) } } else { + progress.update_progress(SettingsIndexerStep::UsingStableIndexer); + self.legacy_execute( |indexing_step| tracing::debug!(update = ?indexing_step), must_stop_processing, @@ -2208,39 +2295,29 @@ pub fn validate_embedding_settings( if let Some(request) = request.as_ref().set() { let request = match with_fragments { WithFragments::Yes { indexing_fragments, search_fragments } => { - crate::vector::rest::RequestData::new( - request.to_owned(), - indexing_fragments, - search_fragments, - ) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) + rest::RequestData::new(request.to_owned(), indexing_fragments, search_fragments) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) + } + WithFragments::No => { + rest::RequestData::new(request.to_owned(), 
Default::default(), Default::default()) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) } - WithFragments::No => crate::vector::rest::RequestData::new( - request.to_owned(), - Default::default(), - Default::default(), - ) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())), WithFragments::Maybe => { let mut indexing_fragments = BTreeMap::new(); indexing_fragments.insert("test".to_string(), serde_json::json!("test")); - crate::vector::rest::RequestData::new( - request.to_owned(), - indexing_fragments, - Default::default(), - ) - .or_else(|_| { - crate::vector::rest::RequestData::new( - request.to_owned(), - Default::default(), - Default::default(), - ) - }) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) + rest::RequestData::new(request.to_owned(), indexing_fragments, Default::default()) + .or_else(|_| { + rest::RequestData::new( + request.to_owned(), + Default::default(), + Default::default(), + ) + }) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) } }?; if let Some(response) = response.as_ref().set() { - crate::vector::rest::Response::new(response.to_owned(), &request) + rest::Response::new(response.to_owned(), &request) .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; } } @@ -2293,11 +2370,12 @@ pub fn validate_embedding_settings( match inferred_source { EmbedderSource::OpenAi => { if let Setting::Set(model) = &model { - let model = crate::vector::openai::EmbeddingModel::from_name(model.as_str()) - .ok_or(crate::error::UserError::InvalidOpenAiModel { + let model = openai::EmbeddingModel::from_name(model.as_str()).ok_or( + crate::error::UserError::InvalidOpenAiModel { embedder_name: name.to_owned(), model: model.clone(), - })?; + }, + )?; if let Setting::Set(dimensions) = dimensions { if !model.supports_overriding_dimensions() && dimensions != model.default_dimensions() diff --git a/crates/milli/src/update/test_settings.rs 
b/crates/milli/src/update/test_settings.rs index 59e8d9ff1..9e4579667 100644 --- a/crates/milli/src/update/test_settings.rs +++ b/crates/milli/src/update/test_settings.rs @@ -898,6 +898,7 @@ fn test_correct_settings_init() { facet_search, disable_on_numbers, chat, + vector_store, } = settings; assert!(matches!(searchable_fields, Setting::NotSet)); assert!(matches!(displayed_fields, Setting::NotSet)); @@ -927,6 +928,7 @@ fn test_correct_settings_init() { assert!(matches!(facet_search, Setting::NotSet)); assert!(matches!(disable_on_numbers, Setting::NotSet)); assert!(matches!(chat, Setting::NotSet)); + assert!(matches!(vector_store, Setting::NotSet)); }) .unwrap(); } diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs index c41d47732..88c067e79 100644 --- a/crates/milli/src/update/upgrade/mod.rs +++ b/crates/milli/src/update/upgrade/mod.rs @@ -3,15 +3,16 @@ mod v1_13; mod v1_14; mod v1_15; mod v1_16; + use heed::RwTxn; use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13}; use v1_14::Latest_V1_13_To_Latest_V1_14; use v1_15::Latest_V1_14_To_Latest_V1_15; +use v1_16::Latest_V1_15_To_V1_16_0; use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; use crate::progress::{Progress, VariableNameStep}; -use crate::update::upgrade::v1_16::Latest_V1_15_To_V1_16_0; use crate::{Index, InternalError, Result}; trait UpgradeIndex { @@ -34,6 +35,9 @@ const UPGRADE_FUNCTIONS: &[&dyn UpgradeIndex] = &[ &Latest_V1_13_To_Latest_V1_14 {}, &Latest_V1_14_To_Latest_V1_15 {}, &Latest_V1_15_To_V1_16_0 {}, + &ToTargetNoOp { target: (1, 18, 0) }, + &ToTargetNoOp { target: (1, 19, 0) }, + &ToTargetNoOp { target: (1, 20, 0) }, // This is the last upgrade function, it will be called when the index is up to date. // any other upgrade function should be added before this one. 
&ToCurrentNoOp {}, @@ -61,11 +65,10 @@ const fn start(from: (u32, u32, u32)) -> Option { (1, 14, _) => function_index!(5), // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. (1, 15, _) => function_index!(6), - (1, 16, _) => function_index!(7), - (1, 17, _) => function_index!(7), - (1, 18, _) => function_index!(7), - (1, 19, _) => function_index!(7), - (1, 20, _) => function_index!(7), + (1, 16, _) | (1, 17, _) => function_index!(7), + (1, 18, _) => function_index!(8), + (1, 19, _) => function_index!(9), + (1, 20, _) => function_index!(10), // We deliberately don't add a placeholder with (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) here to force manually // considering dumpless upgrade. (_major, _minor, _patch) => return None, @@ -148,3 +151,25 @@ impl UpgradeIndex for ToCurrentNoOp { (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) } } + +/// Perform no operation during the upgrade except changing to the specified target version. 
+#[allow(non_camel_case_types)] +struct ToTargetNoOp { + pub target: (u32, u32, u32), +} + +impl UpgradeIndex for ToTargetNoOp { + fn upgrade( + &self, + _wtxn: &mut RwTxn, + _index: &Index, + _original: (u32, u32, u32), + _progress: Progress, + ) -> Result { + Ok(false) + } + + fn target_version(&self) -> (u32, u32, u32) { + self.target + } +} diff --git a/crates/milli/src/update/upgrade/v1_14.rs b/crates/milli/src/update/upgrade/v1_14.rs index 039734b75..9950be706 100644 --- a/crates/milli/src/update/upgrade/v1_14.rs +++ b/crates/milli/src/update/upgrade/v1_14.rs @@ -27,9 +27,9 @@ impl UpgradeIndex for Latest_V1_13_To_Latest_V1_14 { let rtxn = index.read_txn()?; arroy::upgrade::from_0_5_to_0_6::( &rtxn, - index.vector_arroy.remap_data_type(), + index.vector_store.remap_types(), wtxn, - index.vector_arroy.remap_data_type(), + index.vector_store.remap_types(), )?; Ok(false) diff --git a/crates/milli/src/vector/distribution.rs b/crates/milli/src/vector/distribution.rs new file mode 100644 index 000000000..b17ad9204 --- /dev/null +++ b/crates/milli/src/vector/distribution.rs @@ -0,0 +1,128 @@ +use deserr::{DeserializeError, Deserr}; +use ordered_float::OrderedFloat; +use serde::{Deserialize, Serialize}; +use utoipa::ToSchema; + +/// Describes the mean and sigma of distribution of embedding similarity in the embedding space. +/// +/// The intended use is to make the similarity score more comparable to the regular ranking score. +/// This allows to correct effects where results are too "packed" around a certain value. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Deserialize, Serialize, ToSchema)] +#[serde(from = "DistributionShiftSerializable")] +#[serde(into = "DistributionShiftSerializable")] +pub struct DistributionShift { + /// Value where the results are "packed". 
+ /// + /// Similarity scores are translated so that they are packed around 0.5 instead + #[schema(value_type = f32)] + pub current_mean: OrderedFloat, + + /// standard deviation of a similarity score. + /// + /// Set below 0.4 to make the results less packed around the mean, and above 0.4 to make them more packed. + #[schema(value_type = f32)] + pub current_sigma: OrderedFloat, +} + +impl Deserr for DistributionShift +where + E: DeserializeError, +{ + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef<'_>, + ) -> Result { + let value = DistributionShiftSerializable::deserialize_from_value(value, location)?; + if value.mean < 0. || value.mean > 1. { + return Err(deserr::take_cf_content(E::error::( + None, + deserr::ErrorKind::Unexpected { + msg: format!( + "the distribution mean must be in the range [0, 1], got {}", + value.mean + ), + }, + location, + ))); + } + if value.sigma <= 0. || value.sigma > 1. { + return Err(deserr::take_cf_content(E::error::( + None, + deserr::ErrorKind::Unexpected { + msg: format!( + "the distribution sigma must be in the range ]0, 1], got {}", + value.sigma + ), + }, + location, + ))); + } + + Ok(value.into()) + } +} + +#[derive(Serialize, Deserialize, Deserr)] +#[serde(deny_unknown_fields)] +#[deserr(deny_unknown_fields)] +struct DistributionShiftSerializable { + mean: f32, + sigma: f32, +} + +impl From for DistributionShiftSerializable { + fn from( + DistributionShift { + current_mean: OrderedFloat(current_mean), + current_sigma: OrderedFloat(current_sigma), + }: DistributionShift, + ) -> Self { + Self { mean: current_mean, sigma: current_sigma } + } +} + +impl From for DistributionShift { + fn from(DistributionShiftSerializable { mean, sigma }: DistributionShiftSerializable) -> Self { + Self { current_mean: OrderedFloat(mean), current_sigma: OrderedFloat(sigma) } + } +} + +impl DistributionShift { + /// `None` if sigma <= 0. 
+ pub fn new(mean: f32, sigma: f32) -> Option { + if sigma <= 0.0 { + None + } else { + Some(Self { current_mean: OrderedFloat(mean), current_sigma: OrderedFloat(sigma) }) + } + } + + pub fn shift(&self, score: f32) -> f32 { + let current_mean = self.current_mean.0; + let current_sigma = self.current_sigma.0; + // + // We're somewhat abusively mapping the distribution of distances to a gaussian. + // The parameters we're given is the mean and sigma of the native result distribution. + // We're using them to retarget the distribution to a gaussian centered on 0.5 with a sigma of 0.4. + + let target_mean = 0.5; + let target_sigma = 0.4; + + // a^2 sig1^2 = sig2^2 => a^2 = sig2^2 / sig1^2 => a = sig2 / sig1, assuming a, sig1, and sig2 positive. + let factor = target_sigma / current_sigma; + // a*mu1 + b = mu2 => b = mu2 - a*mu1 + let offset = target_mean - (factor * current_mean); + + let mut score = factor * score + offset; + + // clamp the final score in the ]0, 1] interval. + if score <= 0.0 { + score = f32::EPSILON; + } + if score > 1.0 { + score = 1.0; + } + + score + } +} diff --git a/crates/milli/src/vector/composite.rs b/crates/milli/src/vector/embedder/composite.rs similarity index 94% rename from crates/milli/src/vector/composite.rs rename to crates/milli/src/vector/embedder/composite.rs index 2e31da094..c34c31b41 100644 --- a/crates/milli/src/vector/composite.rs +++ b/crates/milli/src/vector/embedder/composite.rs @@ -1,15 +1,15 @@ use std::time::Instant; -use arroy::Distance; +use hannoy::Distance; -use super::error::CompositeEmbedderContainsHuggingFace; -use super::{ - hf, manual, ollama, openai, rest, DistributionShift, EmbedError, Embedding, EmbeddingCache, - NewEmbedderError, -}; +use super::{hf, manual, ollama, openai, rest, Embedding, EmbeddingCache}; use crate::progress::EmbedderStats; +use crate::vector::error::{CompositeEmbedderContainsHuggingFace, EmbedError, NewEmbedderError}; +use crate::vector::DistributionShift; use crate::ThreadPoolNoAbort; 
+pub(in crate::vector) const MAX_COMPOSITE_DISTANCE: f32 = 0.01; + #[derive(Debug)] pub enum SubEmbedder { /// An embedder based on running local models, fetched from the Hugging Face Hub. @@ -324,20 +324,19 @@ fn check_similarity( } for (left, right) in left.into_iter().zip(right) { - let left = arroy::internals::UnalignedVector::from_slice(&left); - let right = arroy::internals::UnalignedVector::from_slice(&right); - let left = arroy::internals::Leaf { - header: arroy::distances::Cosine::new_header(&left), + let left = hannoy::internals::UnalignedVector::from_slice(&left); + let right = hannoy::internals::UnalignedVector::from_slice(&right); + let left = hannoy::internals::Item { + header: hannoy::distances::Cosine::new_header(&left), vector: left, }; - let right = arroy::internals::Leaf { - header: arroy::distances::Cosine::new_header(&right), + let right = hannoy::internals::Item { + header: hannoy::distances::Cosine::new_header(&right), vector: right, }; - let distance = arroy::distances::Cosine::built_distance(&left, &right); - - if distance > super::MAX_COMPOSITE_DISTANCE { + let distance = hannoy::distances::Cosine::distance(&left, &right); + if distance > crate::vector::embedder::composite::MAX_COMPOSITE_DISTANCE { return Err(NewEmbedderError::composite_embedding_value_mismatch(distance, hint)); } } diff --git a/crates/milli/src/vector/hf.rs b/crates/milli/src/vector/embedder/hf.rs similarity index 98% rename from crates/milli/src/vector/hf.rs rename to crates/milli/src/vector/embedder/hf.rs index 1e5c7bd1c..18f80dec1 100644 --- a/crates/milli/src/vector/hf.rs +++ b/crates/milli/src/vector/embedder/hf.rs @@ -6,8 +6,9 @@ use hf_hub::api::sync::Api; use hf_hub::{Repo, RepoType}; use tokenizers::{PaddingParams, Tokenizer}; -pub use super::error::{EmbedError, Error, NewEmbedderError}; -use super::{DistributionShift, Embedding, EmbeddingCache}; +use super::EmbeddingCache; +use crate::vector::error::{EmbedError, NewEmbedderError}; +use 
crate::vector::{DistributionShift, Embedding}; #[derive( Debug, diff --git a/crates/milli/src/vector/manual.rs b/crates/milli/src/vector/embedder/manual.rs similarity index 93% rename from crates/milli/src/vector/manual.rs rename to crates/milli/src/vector/embedder/manual.rs index b95bf0ea2..132aab0bf 100644 --- a/crates/milli/src/vector/manual.rs +++ b/crates/milli/src/vector/embedder/manual.rs @@ -1,6 +1,5 @@ -use super::error::EmbedError; -use super::DistributionShift; -use crate::vector::Embedding; +use crate::vector::error::EmbedError; +use crate::vector::{DistributionShift, Embedding}; #[derive(Debug, Clone, Copy)] pub struct Embedder { diff --git a/crates/milli/src/vector/embedder/mod.rs b/crates/milli/src/vector/embedder/mod.rs new file mode 100644 index 000000000..b7f7b1de4 --- /dev/null +++ b/crates/milli/src/vector/embedder/mod.rs @@ -0,0 +1,381 @@ +pub mod composite; +pub mod hf; +pub mod manual; +pub mod ollama; +pub mod openai; +pub mod rest; + +use std::num::NonZeroUsize; +use std::sync::Mutex; +use std::time::Instant; + +use composite::SubEmbedderOptions; + +use crate::progress::EmbedderStats; +use crate::prompt::PromptData; +use crate::vector::error::{EmbedError, NewEmbedderError}; +use crate::vector::{DistributionShift, Embedding}; +use crate::ThreadPoolNoAbort; + +/// An embedder can be used to transform text into embeddings. +#[derive(Debug)] +pub enum Embedder { + /// An embedder based on running local models, fetched from the Hugging Face Hub. + HuggingFace(hf::Embedder), + /// An embedder based on making embedding queries against the OpenAI API. + OpenAi(openai::Embedder), + /// An embedder based on the user providing the embeddings in the documents and queries. + UserProvided(manual::Embedder), + /// An embedder based on making embedding queries against an embedding server. + Ollama(ollama::Embedder), + /// An embedder based on making embedding queries against a generic JSON/REST embedding server. 
+ Rest(rest::Embedder), + /// An embedder composed of an embedder at search time and an embedder at indexing time. + Composite(composite::Embedder), +} + +/// Configuration for an embedder. +#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)] +pub struct EmbeddingConfig { + /// Options of the embedder, specific to each kind of embedder + pub embedder_options: EmbedderOptions, + /// Document template + pub prompt: PromptData, + /// If this embedder is binary quantized + pub quantized: Option, + // TODO: add metrics and anything needed +} + +impl EmbeddingConfig { + pub fn quantized(&self) -> bool { + self.quantized.unwrap_or_default() + } +} + +/// Options of an embedder, specific to each kind of embedder. +#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] +pub enum EmbedderOptions { + HuggingFace(hf::EmbedderOptions), + OpenAi(openai::EmbedderOptions), + Ollama(ollama::EmbedderOptions), + UserProvided(manual::EmbedderOptions), + Rest(rest::EmbedderOptions), + Composite(composite::EmbedderOptions), +} + +impl EmbedderOptions { + pub fn fragment(&self, name: &str) -> Option<&serde_json::Value> { + match &self { + EmbedderOptions::HuggingFace(_) + | EmbedderOptions::OpenAi(_) + | EmbedderOptions::Ollama(_) + | EmbedderOptions::UserProvided(_) => None, + EmbedderOptions::Rest(embedder_options) => { + embedder_options.indexing_fragments.get(name) + } + EmbedderOptions::Composite(embedder_options) => { + if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index { + embedder_options.indexing_fragments.get(name) + } else { + None + } + } + } + } + + pub fn has_fragments(&self) -> bool { + match &self { + EmbedderOptions::HuggingFace(_) + | EmbedderOptions::OpenAi(_) + | EmbedderOptions::Ollama(_) + | EmbedderOptions::UserProvided(_) => false, + EmbedderOptions::Rest(embedder_options) => { + !embedder_options.indexing_fragments.is_empty() + } + EmbedderOptions::Composite(embedder_options) => { + if let 
SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index { + !embedder_options.indexing_fragments.is_empty() + } else { + false + } + } + } + } +} + +impl Default for EmbedderOptions { + fn default() -> Self { + Self::HuggingFace(Default::default()) + } +} + +impl Embedder { + /// Spawns a new embedder built from its options. + pub fn new( + options: EmbedderOptions, + cache_cap: usize, + ) -> std::result::Result { + Ok(match options { + EmbedderOptions::HuggingFace(options) => { + Self::HuggingFace(hf::Embedder::new(options, cache_cap)?) + } + EmbedderOptions::OpenAi(options) => { + Self::OpenAi(openai::Embedder::new(options, cache_cap)?) + } + EmbedderOptions::Ollama(options) => { + Self::Ollama(ollama::Embedder::new(options, cache_cap)?) + } + EmbedderOptions::UserProvided(options) => { + Self::UserProvided(manual::Embedder::new(options)) + } + EmbedderOptions::Rest(options) => Self::Rest(rest::Embedder::new( + options, + cache_cap, + rest::ConfigurationSource::User, + )?), + EmbedderOptions::Composite(options) => { + Self::Composite(composite::Embedder::new(options, cache_cap)?) + } + }) + } + + /// Embed in search context + + #[tracing::instrument(level = "debug", skip_all, target = "search")] + pub fn embed_search( + &self, + query: SearchQuery<'_>, + deadline: Option, + ) -> std::result::Result { + match query { + SearchQuery::Text(text) => self.embed_search_text(text, deadline), + SearchQuery::Media { q, media } => self.embed_search_media(q, media, deadline), + } + } + + pub fn embed_search_text( + &self, + text: &str, + deadline: Option, + ) -> std::result::Result { + if let Some(cache) = self.cache() { + if let Some(embedding) = cache.get(text) { + tracing::trace!(text, "embedding found in cache"); + return Ok(embedding); + } + } + let embedding = match self { + Embedder::HuggingFace(embedder) => embedder.embed_one(text), + Embedder::OpenAi(embedder) => embedder + .embed(&[text], deadline, None)? 
+ .pop() + .ok_or_else(EmbedError::missing_embedding), + Embedder::Ollama(embedder) => embedder + .embed(&[text], deadline, None)? + .pop() + .ok_or_else(EmbedError::missing_embedding), + Embedder::UserProvided(embedder) => embedder.embed_one(text), + Embedder::Rest(embedder) => embedder.embed_one(SearchQuery::Text(text), deadline, None), + Embedder::Composite(embedder) => embedder.search.embed_one(text, deadline, None), + }?; + + if let Some(cache) = self.cache() { + cache.put(text.to_owned(), embedding.clone()); + } + + Ok(embedding) + } + + pub fn embed_search_media( + &self, + q: Option<&str>, + media: Option<&serde_json::Value>, + deadline: Option, + ) -> std::result::Result { + let Embedder::Rest(embedder) = self else { + return Err(EmbedError::rest_media_not_a_rest()); + }; + embedder.embed_one(SearchQuery::Media { q, media }, deadline, None) + } + + /// Embed multiple chunks of texts. + /// + /// Each chunk is composed of one or multiple texts. + pub fn embed_index( + &self, + text_chunks: Vec>, + threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, + ) -> std::result::Result>, EmbedError> { + match self { + Embedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), + Embedder::OpenAi(embedder) => { + embedder.embed_index(text_chunks, threads, embedder_stats) + } + Embedder::Ollama(embedder) => { + embedder.embed_index(text_chunks, threads, embedder_stats) + } + Embedder::UserProvided(embedder) => embedder.embed_index(text_chunks), + Embedder::Rest(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats), + Embedder::Composite(embedder) => { + embedder.index.embed_index(text_chunks, threads, embedder_stats) + } + } + } + + /// Non-owning variant of [`Self::embed_index`]. 
+ pub fn embed_index_ref( + &self, + texts: &[&str], + threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, + ) -> std::result::Result, EmbedError> { + match self { + Embedder::HuggingFace(embedder) => embedder.embed_index_ref(texts), + Embedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), + Embedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), + Embedder::UserProvided(embedder) => embedder.embed_index_ref(texts), + Embedder::Rest(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), + Embedder::Composite(embedder) => { + embedder.index.embed_index_ref(texts, threads, embedder_stats) + } + } + } + + pub fn embed_index_ref_fragments( + &self, + fragments: &[serde_json::Value], + threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, + ) -> std::result::Result, EmbedError> { + if let Embedder::Rest(embedder) = self { + embedder.embed_index_ref(fragments, threads, embedder_stats) + } else { + let Embedder::Composite(embedder) = self else { + unimplemented!("embedding fragments is only available for rest embedders") + }; + let crate::vector::embedder::composite::SubEmbedder::Rest(embedder) = &embedder.index + else { + unimplemented!("embedding fragments is only available for rest embedders") + }; + + embedder.embed_index_ref(fragments, threads, embedder_stats) + } + } + + /// Indicates the preferred number of chunks to pass to [`Self::embed_chunks`] + pub fn chunk_count_hint(&self) -> usize { + match self { + Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(), + Embedder::OpenAi(embedder) => embedder.chunk_count_hint(), + Embedder::Ollama(embedder) => embedder.chunk_count_hint(), + Embedder::UserProvided(_) => 100, + Embedder::Rest(embedder) => embedder.chunk_count_hint(), + Embedder::Composite(embedder) => embedder.index.chunk_count_hint(), + } + } + + /// Indicates the preferred number of texts in a single chunk passed to [`Self::embed`] + pub fn 
prompt_count_in_chunk_hint(&self) -> usize { + match self { + Embedder::HuggingFace(embedder) => embedder.prompt_count_in_chunk_hint(), + Embedder::OpenAi(embedder) => embedder.prompt_count_in_chunk_hint(), + Embedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(), + Embedder::UserProvided(_) => 1, + Embedder::Rest(embedder) => embedder.prompt_count_in_chunk_hint(), + Embedder::Composite(embedder) => embedder.index.prompt_count_in_chunk_hint(), + } + } + + /// Indicates the dimensions of a single embedding produced by the embedder. + pub fn dimensions(&self) -> usize { + match self { + Embedder::HuggingFace(embedder) => embedder.dimensions(), + Embedder::OpenAi(embedder) => embedder.dimensions(), + Embedder::Ollama(embedder) => embedder.dimensions(), + Embedder::UserProvided(embedder) => embedder.dimensions(), + Embedder::Rest(embedder) => embedder.dimensions(), + Embedder::Composite(embedder) => embedder.dimensions(), + } + } + + /// An optional distribution used to apply an affine transformation to the similarity score of a document. 
+ pub fn distribution(&self) -> Option { + match self { + Embedder::HuggingFace(embedder) => embedder.distribution(), + Embedder::OpenAi(embedder) => embedder.distribution(), + Embedder::Ollama(embedder) => embedder.distribution(), + Embedder::UserProvided(embedder) => embedder.distribution(), + Embedder::Rest(embedder) => embedder.distribution(), + Embedder::Composite(embedder) => embedder.distribution(), + } + } + + pub fn uses_document_template(&self) -> bool { + match self { + Embedder::HuggingFace(_) + | Embedder::OpenAi(_) + | Embedder::Ollama(_) + | Embedder::Rest(_) => true, + Embedder::UserProvided(_) => false, + Embedder::Composite(embedder) => embedder.index.uses_document_template(), + } + } + + fn cache(&self) -> Option<&EmbeddingCache> { + match self { + Embedder::HuggingFace(embedder) => Some(embedder.cache()), + Embedder::OpenAi(embedder) => Some(embedder.cache()), + Embedder::UserProvided(_) => None, + Embedder::Ollama(embedder) => Some(embedder.cache()), + Embedder::Rest(embedder) => Some(embedder.cache()), + Embedder::Composite(embedder) => embedder.search.cache(), + } + } +} + +#[derive(Clone, Copy)] +pub enum SearchQuery<'a> { + Text(&'a str), + Media { q: Option<&'a str>, media: Option<&'a serde_json::Value> }, +} + +#[derive(Debug)] +struct EmbeddingCache { + data: Option>>, +} + +impl EmbeddingCache { + const MAX_TEXT_LEN: usize = 2000; + + pub fn new(cap: usize) -> Self { + let data = NonZeroUsize::new(cap).map(lru::LruCache::new).map(Mutex::new); + Self { data } + } + + /// Get the embedding corresponding to `text`, if any is present in the cache. 
+ pub fn get(&self, text: &str) -> Option { + let data = self.data.as_ref()?; + if text.len() > Self::MAX_TEXT_LEN { + return None; + } + let mut cache = data.lock().unwrap(); + + cache.get(text).cloned() + } + + /// Puts a new embedding for the specified `text` + pub fn put(&self, text: String, embedding: Embedding) { + let Some(data) = self.data.as_ref() else { + return; + }; + if text.len() > Self::MAX_TEXT_LEN { + return; + } + tracing::trace!(text, "embedding added to cache"); + + let mut cache = data.lock().unwrap(); + + cache.put(text, embedding); + } +} diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/embedder/ollama.rs similarity index 96% rename from crates/milli/src/vector/ollama.rs rename to crates/milli/src/vector/embedder/ollama.rs index feec92cc0..6e2dc185f 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/embedder/ollama.rs @@ -3,12 +3,12 @@ use std::time::Instant; use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; use rayon::slice::ParallelSlice as _; -use super::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; -use super::{DistributionShift, EmbeddingCache, REQUEST_PARALLELISM}; +use super::EmbeddingCache; use crate::error::FaultSource; use crate::progress::EmbedderStats; -use crate::vector::Embedding; +use crate::vector::error::{EmbedError, EmbedErrorKind, NewEmbedderError, NewEmbedderErrorKind}; +use crate::vector::{DistributionShift, Embedding, REQUEST_PARALLELISM}; use crate::ThreadPoolNoAbort; #[derive(Debug)] @@ -88,7 +88,7 @@ impl Embedder { Err(NewEmbedderError { kind: NewEmbedderErrorKind::CouldNotDetermineDimension(EmbedError { - kind: super::error::EmbedErrorKind::RestOtherStatusCode(404, error), + kind: EmbedErrorKind::RestOtherStatusCode(404, error), fault: _, }), fault: _, diff --git a/crates/milli/src/vector/openai.rs 
b/crates/milli/src/vector/embedder/openai.rs similarity index 98% rename from crates/milli/src/vector/openai.rs rename to crates/milli/src/vector/embedder/openai.rs index bf6c92978..4fec228e4 100644 --- a/crates/milli/src/vector/openai.rs +++ b/crates/milli/src/vector/embedder/openai.rs @@ -5,13 +5,12 @@ use ordered_float::OrderedFloat; use rayon::iter::{IntoParallelIterator, ParallelIterator as _}; use rayon::slice::ParallelSlice as _; -use super::error::{EmbedError, NewEmbedderError}; use super::rest::{Embedder as RestEmbedder, EmbedderOptions as RestEmbedderOptions}; -use super::{DistributionShift, EmbeddingCache, REQUEST_PARALLELISM}; +use super::{DistributionShift, EmbeddingCache}; use crate::error::FaultSource; use crate::progress::EmbedderStats; -use crate::vector::error::EmbedErrorKind; -use crate::vector::Embedding; +use crate::vector::error::{EmbedError, EmbedErrorKind, NewEmbedderError}; +use crate::vector::{Embedding, REQUEST_PARALLELISM}; use crate::ThreadPoolNoAbort; #[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/embedder/rest.rs similarity index 98% rename from crates/milli/src/vector/rest.rs rename to crates/milli/src/vector/embedder/rest.rs index 7a16f1a1e..7c0213c76 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/embedder/rest.rs @@ -8,14 +8,12 @@ use rayon::slice::ParallelSlice as _; use serde::{Deserialize, Serialize}; use serde_json::Value; -use super::error::EmbedErrorKind; -use super::json_template::{InjectableValue, JsonTemplate}; -use super::{ - DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, SearchQuery, - REQUEST_PARALLELISM, -}; +use super::EmbeddingCache; use crate::error::FaultSource; use crate::progress::EmbedderStats; +use crate::vector::error::{EmbedError, EmbedErrorKind, NewEmbedderError}; +use crate::vector::json_template::{InjectableValue, JsonTemplate}; +use 
crate::vector::{DistributionShift, Embedding, SearchQuery, REQUEST_PARALLELISM}; use crate::ThreadPoolNoAbort; // retrying in case of failure @@ -315,7 +313,7 @@ impl Embedder { } pub fn chunk_count_hint(&self) -> usize { - super::REQUEST_PARALLELISM + crate::vector::REQUEST_PARALLELISM } pub fn prompt_count_in_chunk_hint(&self) -> usize { diff --git a/crates/milli/src/vector/embeddings.rs b/crates/milli/src/vector/embeddings.rs new file mode 100644 index 000000000..467ebc81e --- /dev/null +++ b/crates/milli/src/vector/embeddings.rs @@ -0,0 +1,76 @@ +/// One or multiple embeddings stored consecutively in a flat vector. +#[derive(Debug, PartialEq)] +pub struct Embeddings { + data: Vec, + dimension: usize, +} + +impl Embeddings { + /// Declares an empty vector of embeddings of the specified dimensions. + pub fn new(dimension: usize) -> Self { + Self { data: Default::default(), dimension } + } + + /// Declares a vector of embeddings containing a single element. + /// + /// The dimension is inferred from the length of the passed embedding. + pub fn from_single_embedding(embedding: Vec) -> Self { + Self { dimension: embedding.len(), data: embedding } + } + + /// Declares a vector of embeddings from its components. + /// + /// `data.len()` must be a multiple of `dimension`, otherwise an error is returned. + pub fn from_inner(data: Vec, dimension: usize) -> Result> { + let mut this = Self::new(dimension); + this.append(data)?; + Ok(this) + } + + /// Returns the number of embeddings in this vector of embeddings. + pub fn embedding_count(&self) -> usize { + self.data.len() / self.dimension + } + + /// Dimension of a single embedding. + pub fn dimension(&self) -> usize { + self.dimension + } + + /// Deconstructs self into the inner flat vector. + pub fn into_inner(self) -> Vec { + self.data + } + + /// A reference to the inner flat vector. + pub fn as_inner(&self) -> &[F] { + &self.data + } + + /// Iterates over the embeddings contained in the flat vector. 
+ pub fn iter(&self) -> impl Iterator + '_ { + self.data.as_slice().chunks_exact(self.dimension) + } + + /// Push an embedding at the end of the embeddings. + /// + /// If `embedding.len() != self.dimension`, then the push operation fails. + pub fn push(&mut self, mut embedding: Vec) -> Result<(), Vec> { + if embedding.len() != self.dimension { + return Err(embedding); + } + self.data.append(&mut embedding); + Ok(()) + } + + /// Append a flat vector of embeddings at the end of the embeddings. + /// + /// If `embeddings.len() % self.dimension != 0`, then the append operation fails. + pub fn append(&mut self, mut embeddings: Vec) -> Result<(), Vec> { + if embeddings.len() % self.dimension != 0 { + return Err(embeddings); + } + self.data.append(&mut embeddings); + Ok(()) + } +} diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index 0d737cbfc..b4b90b24b 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -6,10 +6,10 @@ use hf_hub::api::sync::ApiError; use itertools::Itertools as _; use super::parsed_vectors::ParsedVectorsDiff; -use super::rest::ConfigurationSource; -use super::MAX_COMPOSITE_DISTANCE; use crate::error::FaultSource; use crate::update::new::vector_document::VectorDocument; +use crate::vector::embedder::composite::MAX_COMPOSITE_DISTANCE; +use crate::vector::embedder::rest::ConfigurationSource; use crate::{FieldDistribution, PanicCatched}; #[derive(Debug, thiserror::Error)] diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 1f07f6c4f..fa8b2dba0 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -1,1231 +1,29 @@ -use std::collections::HashMap; -use std::num::NonZeroUsize; -use std::sync::{Arc, Mutex}; -use std::time::Instant; - -use arroy::distances::{BinaryQuantizedCosine, Cosine}; -use arroy::ItemId; -use deserr::{DeserializeError, Deserr}; -use heed::{RoTxn, RwTxn, Unspecified}; -use ordered_float::OrderedFloat; -use 
roaring::RoaringBitmap; -use serde::{Deserialize, Serialize}; -use utoipa::ToSchema; - -use self::error::{EmbedError, NewEmbedderError}; -use crate::progress::{EmbedderStats, Progress}; -use crate::prompt::{Prompt, PromptData}; -use crate::vector::composite::SubEmbedderOptions; -use crate::vector::json_template::JsonTemplate; -use crate::ThreadPoolNoAbort; - -pub mod composite; pub mod db; +mod distribution; +pub mod embedder; +mod embeddings; pub mod error; pub mod extractor; -pub mod hf; pub mod json_template; -pub mod manual; -pub mod openai; pub mod parsed_vectors; +mod runtime; pub mod session; pub mod settings; - -pub mod ollama; -pub mod rest; +mod store; pub use self::error::Error; pub type Embedding = Vec; +pub use distribution::DistributionShift; +pub use embedder::{Embedder, EmbedderOptions, EmbeddingConfig, SearchQuery}; +pub use embeddings::Embeddings; +pub use runtime::{RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment}; +pub use store::{VectorStore, VectorStoreBackend, VectorStoreStats}; + pub const REQUEST_PARALLELISM: usize = 40; -pub const MAX_COMPOSITE_DISTANCE: f32 = 0.01; - -pub struct ArroyWrapper { - quantized: bool, - embedder_index: u8, - database: arroy::Database, -} - -impl ArroyWrapper { - pub fn new( - database: arroy::Database, - embedder_index: u8, - quantized: bool, - ) -> Self { - Self { database, embedder_index, quantized } - } - - pub fn embedder_index(&self) -> u8 { - self.embedder_index - } - - fn readers<'a, D: arroy::Distance>( - &'a self, - rtxn: &'a RoTxn<'a>, - db: arroy::Database, - ) -> impl Iterator, arroy::Error>> + 'a { - arroy_store_range_for_embedder(self.embedder_index).filter_map(move |index| { - match arroy::Reader::open(rtxn, index, db) { - Ok(reader) => match reader.is_empty(rtxn) { - Ok(false) => Some(Ok(reader)), - Ok(true) => None, - Err(e) => Some(Err(e)), - }, - Err(arroy::Error::MissingMetadata(_)) => None, - Err(e) => Some(Err(e)), - } - }) - } - - /// The item ids that are present in the store specified 
by its id. - /// - /// The ids are accessed via a lambda to avoid lifetime shenanigans. - pub fn items_in_store( - &self, - rtxn: &RoTxn, - store_id: u8, - with_items: F, - ) -> Result - where - F: FnOnce(&RoaringBitmap) -> O, - { - if self.quantized { - self._items_in_store(rtxn, self.quantized_db(), store_id, with_items) - } else { - self._items_in_store(rtxn, self.angular_db(), store_id, with_items) - } - } - - fn _items_in_store( - &self, - rtxn: &RoTxn, - db: arroy::Database, - store_id: u8, - with_items: F, - ) -> Result - where - F: FnOnce(&RoaringBitmap) -> O, - { - let index = arroy_store_for_embedder(self.embedder_index, store_id); - let reader = arroy::Reader::open(rtxn, index, db); - match reader { - Ok(reader) => Ok(with_items(reader.item_ids())), - Err(arroy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())), - Err(err) => Err(err), - } - } - - pub fn dimensions(&self, rtxn: &RoTxn) -> Result, arroy::Error> { - if self.quantized { - Ok(self - .readers(rtxn, self.quantized_db()) - .next() - .transpose()? - .map(|reader| reader.dimensions())) - } else { - Ok(self - .readers(rtxn, self.angular_db()) - .next() - .transpose()? - .map(|reader| reader.dimensions())) - } - } - - #[allow(clippy::too_many_arguments)] - pub fn build_and_quantize( - &mut self, - wtxn: &mut RwTxn, - progress: &Progress, - rng: &mut R, - dimension: usize, - quantizing: bool, - arroy_memory: Option, - cancel: &(impl Fn() -> bool + Sync + Send), - ) -> Result<(), arroy::Error> { - for index in arroy_store_range_for_embedder(self.embedder_index) { - if self.quantized { - let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - if writer.need_build(wtxn)? { - writer.builder(rng).build(wtxn)? - } else if writer.is_empty(wtxn)? 
{ - continue; - } - } else { - let writer = arroy::Writer::new(self.angular_db(), index, dimension); - // If we are quantizing the databases, we can't know from meilisearch - // if the db was empty but still contained the wrong metadata, thus we need - // to quantize everything and can't stop early. Since this operation can - // only happens once in the life of an embedder, it's not very performances - // sensitive. - if quantizing && !self.quantized { - let writer = writer.prepare_changing_distance::(wtxn)?; - writer - .builder(rng) - .available_memory(arroy_memory.unwrap_or(usize::MAX)) - .progress(|step| progress.update_progress_from_arroy(step)) - .cancel(cancel) - .build(wtxn)?; - } else if writer.need_build(wtxn)? { - writer - .builder(rng) - .available_memory(arroy_memory.unwrap_or(usize::MAX)) - .progress(|step| progress.update_progress_from_arroy(step)) - .cancel(cancel) - .build(wtxn)?; - } else if writer.is_empty(wtxn)? { - continue; - } - } - } - Ok(()) - } - - /// Overwrite all the embeddings associated with the index and item ID. - /// /!\ It won't remove embeddings after the last passed embedding, which can leave stale embeddings. - /// You should call `del_items` on the `item_id` before calling this method. - /// /!\ Cannot insert more than u8::MAX embeddings; after inserting u8::MAX embeddings, all the remaining ones will be silently ignored. - pub fn add_items( - &self, - wtxn: &mut RwTxn, - item_id: arroy::ItemId, - embeddings: &Embeddings, - ) -> Result<(), arroy::Error> { - let dimension = embeddings.dimension(); - for (index, vector) in - arroy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter()) - { - if self.quantized { - arroy::Writer::new(self.quantized_db(), index, dimension) - .add_item(wtxn, item_id, vector)? - } else { - arroy::Writer::new(self.angular_db(), index, dimension) - .add_item(wtxn, item_id, vector)? - } - } - Ok(()) - } - - /// Add one document int for this index where we can find an empty spot. 
- pub fn add_item( - &self, - wtxn: &mut RwTxn, - item_id: arroy::ItemId, - vector: &[f32], - ) -> Result<(), arroy::Error> { - if self.quantized { - self._add_item(wtxn, self.quantized_db(), item_id, vector) - } else { - self._add_item(wtxn, self.angular_db(), item_id, vector) - } - } - - fn _add_item( - &self, - wtxn: &mut RwTxn, - db: arroy::Database, - item_id: arroy::ItemId, - vector: &[f32], - ) -> Result<(), arroy::Error> { - let dimension = vector.len(); - - for index in arroy_store_range_for_embedder(self.embedder_index) { - let writer = arroy::Writer::new(db, index, dimension); - if !writer.contains_item(wtxn, item_id)? { - writer.add_item(wtxn, item_id, vector)?; - break; - } - } - Ok(()) - } - - /// Add a vector associated with a document in store specified by its id. - /// - /// Any existing vector associated with the document in the store will be replaced by the new vector. - pub fn add_item_in_store( - &self, - wtxn: &mut RwTxn, - item_id: arroy::ItemId, - store_id: u8, - vector: &[f32], - ) -> Result<(), arroy::Error> { - if self.quantized { - self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector) - } else { - self._add_item_in_store(wtxn, self.angular_db(), item_id, store_id, vector) - } - } - - fn _add_item_in_store( - &self, - wtxn: &mut RwTxn, - db: arroy::Database, - item_id: arroy::ItemId, - store_id: u8, - vector: &[f32], - ) -> Result<(), arroy::Error> { - let dimension = vector.len(); - - let index = arroy_store_for_embedder(self.embedder_index, store_id); - let writer = arroy::Writer::new(db, index, dimension); - writer.add_item(wtxn, item_id, vector) - } - - /// Delete all embeddings from a specific `item_id` - pub fn del_items( - &self, - wtxn: &mut RwTxn, - dimension: usize, - item_id: arroy::ItemId, - ) -> Result<(), arroy::Error> { - for index in arroy_store_range_for_embedder(self.embedder_index) { - if self.quantized { - let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - 
writer.del_item(wtxn, item_id)?; - } else { - let writer = arroy::Writer::new(self.angular_db(), index, dimension); - writer.del_item(wtxn, item_id)?; - } - } - - Ok(()) - } - - /// Removes the item specified by its id from the store specified by its id. - /// - /// Returns whether the item was removed. - /// - /// # Warning - /// - /// - This function will silently fail to remove the item if used against an arroy database that was never built. - pub fn del_item_in_store( - &self, - wtxn: &mut RwTxn, - item_id: arroy::ItemId, - store_id: u8, - dimensions: usize, - ) -> Result { - if self.quantized { - self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions) - } else { - self._del_item_in_store(wtxn, self.angular_db(), item_id, store_id, dimensions) - } - } - - fn _del_item_in_store( - &self, - wtxn: &mut RwTxn, - db: arroy::Database, - item_id: arroy::ItemId, - store_id: u8, - dimensions: usize, - ) -> Result { - let index = arroy_store_for_embedder(self.embedder_index, store_id); - let writer = arroy::Writer::new(db, index, dimensions); - writer.del_item(wtxn, item_id) - } - - /// Removes all items from the store specified by its id. - /// - /// # Warning - /// - /// - This function will silently fail to remove the items if used against an arroy database that was never built. - pub fn clear_store( - &self, - wtxn: &mut RwTxn, - store_id: u8, - dimensions: usize, - ) -> Result<(), arroy::Error> { - if self.quantized { - self._clear_store(wtxn, self.quantized_db(), store_id, dimensions) - } else { - self._clear_store(wtxn, self.angular_db(), store_id, dimensions) - } - } - - fn _clear_store( - &self, - wtxn: &mut RwTxn, - db: arroy::Database, - store_id: u8, - dimensions: usize, - ) -> Result<(), arroy::Error> { - let index = arroy_store_for_embedder(self.embedder_index, store_id); - let writer = arroy::Writer::new(db, index, dimensions); - writer.clear(wtxn) - } - - /// Delete one item from its value. 
- pub fn del_item( - &self, - wtxn: &mut RwTxn, - item_id: arroy::ItemId, - vector: &[f32], - ) -> Result { - if self.quantized { - self._del_item(wtxn, self.quantized_db(), item_id, vector) - } else { - self._del_item(wtxn, self.angular_db(), item_id, vector) - } - } - - fn _del_item( - &self, - wtxn: &mut RwTxn, - db: arroy::Database, - item_id: arroy::ItemId, - vector: &[f32], - ) -> Result { - let dimension = vector.len(); - - for index in arroy_store_range_for_embedder(self.embedder_index) { - let writer = arroy::Writer::new(db, index, dimension); - let Some(candidate) = writer.item_vector(wtxn, item_id)? else { - continue; - }; - if candidate == vector { - return writer.del_item(wtxn, item_id); - } - } - Ok(false) - } - - pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { - for index in arroy_store_range_for_embedder(self.embedder_index) { - if self.quantized { - let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - if writer.is_empty(wtxn)? { - continue; - } - writer.clear(wtxn)?; - } else { - let writer = arroy::Writer::new(self.angular_db(), index, dimension); - if writer.is_empty(wtxn)? { - continue; - } - writer.clear(wtxn)?; - } - } - Ok(()) - } - - pub fn contains_item( - &self, - rtxn: &RoTxn, - dimension: usize, - item: arroy::ItemId, - ) -> Result { - for index in arroy_store_range_for_embedder(self.embedder_index) { - let contains = if self.quantized { - let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - if writer.is_empty(rtxn)? { - continue; - } - writer.contains_item(rtxn, item)? - } else { - let writer = arroy::Writer::new(self.angular_db(), index, dimension); - if writer.is_empty(rtxn)? { - continue; - } - writer.contains_item(rtxn, item)? 
- }; - if contains { - return Ok(contains); - } - } - Ok(false) - } - - pub fn nns_by_item( - &self, - rtxn: &RoTxn, - item: ItemId, - limit: usize, - filter: Option<&RoaringBitmap>, - ) -> Result, arroy::Error> { - if self.quantized { - self._nns_by_item(rtxn, self.quantized_db(), item, limit, filter) - } else { - self._nns_by_item(rtxn, self.angular_db(), item, limit, filter) - } - } - - fn _nns_by_item( - &self, - rtxn: &RoTxn, - db: arroy::Database, - item: ItemId, - limit: usize, - filter: Option<&RoaringBitmap>, - ) -> Result, arroy::Error> { - let mut results = Vec::new(); - - for reader in self.readers(rtxn, db) { - let reader = reader?; - let mut searcher = reader.nns(limit); - if let Some(filter) = filter { - if reader.item_ids().is_disjoint(filter) { - continue; - } - searcher.candidates(filter); - } - - if let Some(mut ret) = searcher.by_item(rtxn, item)? { - results.append(&mut ret); - } - } - results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); - Ok(results) - } - - pub fn nns_by_vector( - &self, - rtxn: &RoTxn, - vector: &[f32], - limit: usize, - filter: Option<&RoaringBitmap>, - ) -> Result, arroy::Error> { - if self.quantized { - self._nns_by_vector(rtxn, self.quantized_db(), vector, limit, filter) - } else { - self._nns_by_vector(rtxn, self.angular_db(), vector, limit, filter) - } - } - - fn _nns_by_vector( - &self, - rtxn: &RoTxn, - db: arroy::Database, - vector: &[f32], - limit: usize, - filter: Option<&RoaringBitmap>, - ) -> Result, arroy::Error> { - let mut results = Vec::new(); - - for reader in self.readers(rtxn, db) { - let reader = reader?; - let mut searcher = reader.nns(limit); - if let Some(filter) = filter { - if reader.item_ids().is_disjoint(filter) { - continue; - } - searcher.candidates(filter); - } - - results.append(&mut searcher.by_vector(rtxn, vector)?); - } - - results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); - - Ok(results) - } - - pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) 
-> Result>, arroy::Error> { - let mut vectors = Vec::new(); - - if self.quantized { - for reader in self.readers(rtxn, self.quantized_db()) { - if let Some(vec) = reader?.item_vector(rtxn, item_id)? { - vectors.push(vec); - } - } - } else { - for reader in self.readers(rtxn, self.angular_db()) { - if let Some(vec) = reader?.item_vector(rtxn, item_id)? { - vectors.push(vec); - } - } - } - Ok(vectors) - } - - fn angular_db(&self) -> arroy::Database { - self.database.remap_data_type() - } - - fn quantized_db(&self) -> arroy::Database { - self.database.remap_data_type() - } - - pub fn aggregate_stats( - &self, - rtxn: &RoTxn, - stats: &mut ArroyStats, - ) -> Result<(), arroy::Error> { - if self.quantized { - for reader in self.readers(rtxn, self.quantized_db()) { - let reader = reader?; - let documents = reader.item_ids(); - stats.documents |= documents; - stats.number_of_embeddings += documents.len(); - } - } else { - for reader in self.readers(rtxn, self.angular_db()) { - let reader = reader?; - let documents = reader.item_ids(); - stats.documents |= documents; - stats.number_of_embeddings += documents.len(); - } - } - - Ok(()) - } -} - -#[derive(Debug, Default, Clone)] -pub struct ArroyStats { - pub number_of_embeddings: u64, - pub documents: RoaringBitmap, -} -/// One or multiple embeddings stored consecutively in a flat vector. -#[derive(Debug, PartialEq)] -pub struct Embeddings { - data: Vec, - dimension: usize, -} - -impl Embeddings { - /// Declares an empty vector of embeddings of the specified dimensions. - pub fn new(dimension: usize) -> Self { - Self { data: Default::default(), dimension } - } - - /// Declares a vector of embeddings containing a single element. - /// - /// The dimension is inferred from the length of the passed embedding. - pub fn from_single_embedding(embedding: Vec) -> Self { - Self { dimension: embedding.len(), data: embedding } - } - - /// Declares a vector of embeddings from its components. 
- /// - /// `data.len()` must be a multiple of `dimension`, otherwise an error is returned. - pub fn from_inner(data: Vec, dimension: usize) -> Result> { - let mut this = Self::new(dimension); - this.append(data)?; - Ok(this) - } - - /// Returns the number of embeddings in this vector of embeddings. - pub fn embedding_count(&self) -> usize { - self.data.len() / self.dimension - } - - /// Dimension of a single embedding. - pub fn dimension(&self) -> usize { - self.dimension - } - - /// Deconstructs self into the inner flat vector. - pub fn into_inner(self) -> Vec { - self.data - } - - /// A reference to the inner flat vector. - pub fn as_inner(&self) -> &[F] { - &self.data - } - - /// Iterates over the embeddings contained in the flat vector. - pub fn iter(&self) -> impl Iterator + '_ { - self.data.as_slice().chunks_exact(self.dimension) - } - - /// Push an embedding at the end of the embeddings. - /// - /// If `embedding.len() != self.dimension`, then the push operation fails. - pub fn push(&mut self, mut embedding: Vec) -> Result<(), Vec> { - if embedding.len() != self.dimension { - return Err(embedding); - } - self.data.append(&mut embedding); - Ok(()) - } - - /// Append a flat vector of embeddings at the end of the embeddings. - /// - /// If `embeddings.len() % self.dimension != 0`, then the append operation fails. - pub fn append(&mut self, mut embeddings: Vec) -> Result<(), Vec> { - if embeddings.len() % self.dimension != 0 { - return Err(embeddings); - } - self.data.append(&mut embeddings); - Ok(()) - } -} - -/// An embedder can be used to transform text into embeddings. -#[derive(Debug)] -pub enum Embedder { - /// An embedder based on running local models, fetched from the Hugging Face Hub. - HuggingFace(hf::Embedder), - /// An embedder based on making embedding queries against the OpenAI API. - OpenAi(openai::Embedder), - /// An embedder based on the user providing the embeddings in the documents and queries. 
- UserProvided(manual::Embedder), - /// An embedder based on making embedding queries against an embedding server. - Ollama(ollama::Embedder), - /// An embedder based on making embedding queries against a generic JSON/REST embedding server. - Rest(rest::Embedder), - /// An embedder composed of an embedder at search time and an embedder at indexing time. - Composite(composite::Embedder), -} - -#[derive(Debug)] -struct EmbeddingCache { - data: Option>>, -} - -impl EmbeddingCache { - const MAX_TEXT_LEN: usize = 2000; - - pub fn new(cap: usize) -> Self { - let data = NonZeroUsize::new(cap).map(lru::LruCache::new).map(Mutex::new); - Self { data } - } - - /// Get the embedding corresponding to `text`, if any is present in the cache. - pub fn get(&self, text: &str) -> Option { - let data = self.data.as_ref()?; - if text.len() > Self::MAX_TEXT_LEN { - return None; - } - let mut cache = data.lock().unwrap(); - - cache.get(text).cloned() - } - - /// Puts a new embedding for the specified `text` - pub fn put(&self, text: String, embedding: Embedding) { - let Some(data) = self.data.as_ref() else { - return; - }; - if text.len() > Self::MAX_TEXT_LEN { - return; - } - tracing::trace!(text, "embedding added to cache"); - - let mut cache = data.lock().unwrap(); - - cache.put(text, embedding); - } -} - -/// Configuration for an embedder. -#[derive(Debug, Clone, Default, serde::Deserialize, serde::Serialize)] -pub struct EmbeddingConfig { - /// Options of the embedder, specific to each kind of embedder - pub embedder_options: EmbedderOptions, - /// Document template - pub prompt: PromptData, - /// If this embedder is binary quantized - pub quantized: Option, - // TODO: add metrics and anything needed -} - -impl EmbeddingConfig { - pub fn quantized(&self) -> bool { - self.quantized.unwrap_or_default() - } -} - -/// Map of runtime embedder data. 
-#[derive(Clone, Default)] -pub struct RuntimeEmbedders(HashMap>); - -pub struct RuntimeEmbedder { - pub embedder: Arc, - pub document_template: Prompt, - fragments: Vec, - pub is_quantized: bool, -} - -impl RuntimeEmbedder { - pub fn new( - embedder: Arc, - document_template: Prompt, - mut fragments: Vec, - is_quantized: bool, - ) -> Self { - fragments.sort_unstable_by(|left, right| left.name.cmp(&right.name)); - Self { embedder, document_template, fragments, is_quantized } - } - - /// The runtime fragments sorted by name. - pub fn fragments(&self) -> &[RuntimeFragment] { - self.fragments.as_slice() - } -} - -pub struct RuntimeFragment { - pub name: String, - pub id: u8, - pub template: JsonTemplate, -} - -impl RuntimeEmbedders { - /// Create the map from its internal component.s - pub fn new(data: HashMap>) -> Self { - Self(data) - } - - pub fn contains(&self, name: &str) -> bool { - self.0.contains_key(name) - } - - /// Get an embedder configuration and template from its name. - pub fn get(&self, name: &str) -> Option<&Arc> { - self.0.get(name) - } - - pub fn inner_as_ref(&self) -> &HashMap> { - &self.0 - } - - pub fn into_inner(self) -> HashMap> { - self.0 - } - - pub fn len(&self) -> usize { - self.0.len() - } - - pub fn is_empty(&self) -> bool { - self.0.is_empty() - } -} - -impl IntoIterator for RuntimeEmbedders { - type Item = (String, Arc); - - type IntoIter = std::collections::hash_map::IntoIter>; - - fn into_iter(self) -> Self::IntoIter { - self.0.into_iter() - } -} - -/// Options of an embedder, specific to each kind of embedder. 
-#[derive(Debug, Clone, Hash, PartialEq, Eq, serde::Deserialize, serde::Serialize)] -pub enum EmbedderOptions { - HuggingFace(hf::EmbedderOptions), - OpenAi(openai::EmbedderOptions), - Ollama(ollama::EmbedderOptions), - UserProvided(manual::EmbedderOptions), - Rest(rest::EmbedderOptions), - Composite(composite::EmbedderOptions), -} - -impl EmbedderOptions { - pub fn fragment(&self, name: &str) -> Option<&serde_json::Value> { - match &self { - EmbedderOptions::HuggingFace(_) - | EmbedderOptions::OpenAi(_) - | EmbedderOptions::Ollama(_) - | EmbedderOptions::UserProvided(_) => None, - EmbedderOptions::Rest(embedder_options) => { - embedder_options.indexing_fragments.get(name) - } - EmbedderOptions::Composite(embedder_options) => { - if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index { - embedder_options.indexing_fragments.get(name) - } else { - None - } - } - } - } - - pub fn has_fragments(&self) -> bool { - match &self { - EmbedderOptions::HuggingFace(_) - | EmbedderOptions::OpenAi(_) - | EmbedderOptions::Ollama(_) - | EmbedderOptions::UserProvided(_) => false, - EmbedderOptions::Rest(embedder_options) => { - !embedder_options.indexing_fragments.is_empty() - } - EmbedderOptions::Composite(embedder_options) => { - if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index { - !embedder_options.indexing_fragments.is_empty() - } else { - false - } - } - } - } -} - -impl Default for EmbedderOptions { - fn default() -> Self { - Self::HuggingFace(Default::default()) - } -} - -impl Embedder { - /// Spawns a new embedder built from its options. - pub fn new( - options: EmbedderOptions, - cache_cap: usize, - ) -> std::result::Result { - Ok(match options { - EmbedderOptions::HuggingFace(options) => { - Self::HuggingFace(hf::Embedder::new(options, cache_cap)?) - } - EmbedderOptions::OpenAi(options) => { - Self::OpenAi(openai::Embedder::new(options, cache_cap)?) 
- } - EmbedderOptions::Ollama(options) => { - Self::Ollama(ollama::Embedder::new(options, cache_cap)?) - } - EmbedderOptions::UserProvided(options) => { - Self::UserProvided(manual::Embedder::new(options)) - } - EmbedderOptions::Rest(options) => Self::Rest(rest::Embedder::new( - options, - cache_cap, - rest::ConfigurationSource::User, - )?), - EmbedderOptions::Composite(options) => { - Self::Composite(composite::Embedder::new(options, cache_cap)?) - } - }) - } - - /// Embed in search context - - #[tracing::instrument(level = "debug", skip_all, target = "search")] - pub fn embed_search( - &self, - query: SearchQuery<'_>, - deadline: Option, - ) -> std::result::Result { - match query { - SearchQuery::Text(text) => self.embed_search_text(text, deadline), - SearchQuery::Media { q, media } => self.embed_search_media(q, media, deadline), - } - } - - pub fn embed_search_text( - &self, - text: &str, - deadline: Option, - ) -> std::result::Result { - if let Some(cache) = self.cache() { - if let Some(embedding) = cache.get(text) { - tracing::trace!(text, "embedding found in cache"); - return Ok(embedding); - } - } - let embedding = match self { - Embedder::HuggingFace(embedder) => embedder.embed_one(text), - Embedder::OpenAi(embedder) => embedder - .embed(&[text], deadline, None)? - .pop() - .ok_or_else(EmbedError::missing_embedding), - Embedder::Ollama(embedder) => embedder - .embed(&[text], deadline, None)? 
- .pop() - .ok_or_else(EmbedError::missing_embedding), - Embedder::UserProvided(embedder) => embedder.embed_one(text), - Embedder::Rest(embedder) => embedder.embed_one(SearchQuery::Text(text), deadline, None), - Embedder::Composite(embedder) => embedder.search.embed_one(text, deadline, None), - }?; - - if let Some(cache) = self.cache() { - cache.put(text.to_owned(), embedding.clone()); - } - - Ok(embedding) - } - - pub fn embed_search_media( - &self, - q: Option<&str>, - media: Option<&serde_json::Value>, - deadline: Option, - ) -> std::result::Result { - let Embedder::Rest(embedder) = self else { - return Err(EmbedError::rest_media_not_a_rest()); - }; - embedder.embed_one(SearchQuery::Media { q, media }, deadline, None) - } - - /// Embed multiple chunks of texts. - /// - /// Each chunk is composed of one or multiple texts. - pub fn embed_index( - &self, - text_chunks: Vec>, - threads: &ThreadPoolNoAbort, - embedder_stats: &EmbedderStats, - ) -> std::result::Result>, EmbedError> { - match self { - Embedder::HuggingFace(embedder) => embedder.embed_index(text_chunks), - Embedder::OpenAi(embedder) => { - embedder.embed_index(text_chunks, threads, embedder_stats) - } - Embedder::Ollama(embedder) => { - embedder.embed_index(text_chunks, threads, embedder_stats) - } - Embedder::UserProvided(embedder) => embedder.embed_index(text_chunks), - Embedder::Rest(embedder) => embedder.embed_index(text_chunks, threads, embedder_stats), - Embedder::Composite(embedder) => { - embedder.index.embed_index(text_chunks, threads, embedder_stats) - } - } - } - - /// Non-owning variant of [`Self::embed_index`]. 
- pub fn embed_index_ref( - &self, - texts: &[&str], - threads: &ThreadPoolNoAbort, - embedder_stats: &EmbedderStats, - ) -> std::result::Result, EmbedError> { - match self { - Embedder::HuggingFace(embedder) => embedder.embed_index_ref(texts), - Embedder::OpenAi(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), - Embedder::Ollama(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), - Embedder::UserProvided(embedder) => embedder.embed_index_ref(texts), - Embedder::Rest(embedder) => embedder.embed_index_ref(texts, threads, embedder_stats), - Embedder::Composite(embedder) => { - embedder.index.embed_index_ref(texts, threads, embedder_stats) - } - } - } - - pub fn embed_index_ref_fragments( - &self, - fragments: &[serde_json::Value], - threads: &ThreadPoolNoAbort, - embedder_stats: &EmbedderStats, - ) -> std::result::Result, EmbedError> { - if let Embedder::Rest(embedder) = self { - embedder.embed_index_ref(fragments, threads, embedder_stats) - } else { - let Embedder::Composite(embedder) = self else { - unimplemented!("embedding fragments is only available for rest embedders") - }; - let crate::vector::composite::SubEmbedder::Rest(embedder) = &embedder.index else { - unimplemented!("embedding fragments is only available for rest embedders") - }; - - embedder.embed_index_ref(fragments, threads, embedder_stats) - } - } - - /// Indicates the preferred number of chunks to pass to [`Self::embed_chunks`] - pub fn chunk_count_hint(&self) -> usize { - match self { - Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(), - Embedder::OpenAi(embedder) => embedder.chunk_count_hint(), - Embedder::Ollama(embedder) => embedder.chunk_count_hint(), - Embedder::UserProvided(_) => 100, - Embedder::Rest(embedder) => embedder.chunk_count_hint(), - Embedder::Composite(embedder) => embedder.index.chunk_count_hint(), - } - } - - /// Indicates the preferred number of texts in a single chunk passed to [`Self::embed`] - pub fn 
prompt_count_in_chunk_hint(&self) -> usize { - match self { - Embedder::HuggingFace(embedder) => embedder.prompt_count_in_chunk_hint(), - Embedder::OpenAi(embedder) => embedder.prompt_count_in_chunk_hint(), - Embedder::Ollama(embedder) => embedder.prompt_count_in_chunk_hint(), - Embedder::UserProvided(_) => 1, - Embedder::Rest(embedder) => embedder.prompt_count_in_chunk_hint(), - Embedder::Composite(embedder) => embedder.index.prompt_count_in_chunk_hint(), - } - } - - /// Indicates the dimensions of a single embedding produced by the embedder. - pub fn dimensions(&self) -> usize { - match self { - Embedder::HuggingFace(embedder) => embedder.dimensions(), - Embedder::OpenAi(embedder) => embedder.dimensions(), - Embedder::Ollama(embedder) => embedder.dimensions(), - Embedder::UserProvided(embedder) => embedder.dimensions(), - Embedder::Rest(embedder) => embedder.dimensions(), - Embedder::Composite(embedder) => embedder.dimensions(), - } - } - - /// An optional distribution used to apply an affine transformation to the similarity score of a document. 
- pub fn distribution(&self) -> Option { - match self { - Embedder::HuggingFace(embedder) => embedder.distribution(), - Embedder::OpenAi(embedder) => embedder.distribution(), - Embedder::Ollama(embedder) => embedder.distribution(), - Embedder::UserProvided(embedder) => embedder.distribution(), - Embedder::Rest(embedder) => embedder.distribution(), - Embedder::Composite(embedder) => embedder.distribution(), - } - } - - pub fn uses_document_template(&self) -> bool { - match self { - Embedder::HuggingFace(_) - | Embedder::OpenAi(_) - | Embedder::Ollama(_) - | Embedder::Rest(_) => true, - Embedder::UserProvided(_) => false, - Embedder::Composite(embedder) => embedder.index.uses_document_template(), - } - } - - fn cache(&self) -> Option<&EmbeddingCache> { - match self { - Embedder::HuggingFace(embedder) => Some(embedder.cache()), - Embedder::OpenAi(embedder) => Some(embedder.cache()), - Embedder::UserProvided(_) => None, - Embedder::Ollama(embedder) => Some(embedder.cache()), - Embedder::Rest(embedder) => Some(embedder.cache()), - Embedder::Composite(embedder) => embedder.search.cache(), - } - } -} - -#[derive(Clone, Copy)] -pub enum SearchQuery<'a> { - Text(&'a str), - Media { q: Option<&'a str>, media: Option<&'a serde_json::Value> }, -} - -/// Describes the mean and sigma of distribution of embedding similarity in the embedding space. -/// -/// The intended use is to make the similarity score more comparable to the regular ranking score. -/// This allows to correct effects where results are too "packed" around a certain value. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Deserialize, Serialize, ToSchema)] -#[serde(from = "DistributionShiftSerializable")] -#[serde(into = "DistributionShiftSerializable")] -pub struct DistributionShift { - /// Value where the results are "packed". 
- /// - /// Similarity scores are translated so that they are packed around 0.5 instead - #[schema(value_type = f32)] - pub current_mean: OrderedFloat, - - /// standard deviation of a similarity score. - /// - /// Set below 0.4 to make the results less packed around the mean, and above 0.4 to make them more packed. - #[schema(value_type = f32)] - pub current_sigma: OrderedFloat, -} - -impl Deserr for DistributionShift -where - E: DeserializeError, -{ - fn deserialize_from_value( - value: deserr::Value, - location: deserr::ValuePointerRef<'_>, - ) -> Result { - let value = DistributionShiftSerializable::deserialize_from_value(value, location)?; - if value.mean < 0. || value.mean > 1. { - return Err(deserr::take_cf_content(E::error::( - None, - deserr::ErrorKind::Unexpected { - msg: format!( - "the distribution mean must be in the range [0, 1], got {}", - value.mean - ), - }, - location, - ))); - } - if value.sigma <= 0. || value.sigma > 1. { - return Err(deserr::take_cf_content(E::error::( - None, - deserr::ErrorKind::Unexpected { - msg: format!( - "the distribution sigma must be in the range ]0, 1], got {}", - value.sigma - ), - }, - location, - ))); - } - - Ok(value.into()) - } -} - -#[derive(Serialize, Deserialize, Deserr)] -#[serde(deny_unknown_fields)] -#[deserr(deny_unknown_fields)] -struct DistributionShiftSerializable { - mean: f32, - sigma: f32, -} - -impl From for DistributionShiftSerializable { - fn from( - DistributionShift { - current_mean: OrderedFloat(current_mean), - current_sigma: OrderedFloat(current_sigma), - }: DistributionShift, - ) -> Self { - Self { mean: current_mean, sigma: current_sigma } - } -} - -impl From for DistributionShift { - fn from(DistributionShiftSerializable { mean, sigma }: DistributionShiftSerializable) -> Self { - Self { current_mean: OrderedFloat(mean), current_sigma: OrderedFloat(sigma) } - } -} - -impl DistributionShift { - /// `None` if sigma <= 0. 
- pub fn new(mean: f32, sigma: f32) -> Option { - if sigma <= 0.0 { - None - } else { - Some(Self { current_mean: OrderedFloat(mean), current_sigma: OrderedFloat(sigma) }) - } - } - - pub fn shift(&self, score: f32) -> f32 { - let current_mean = self.current_mean.0; - let current_sigma = self.current_sigma.0; - // - // We're somewhat abusively mapping the distribution of distances to a gaussian. - // The parameters we're given is the mean and sigma of the native result distribution. - // We're using them to retarget the distribution to a gaussian centered on 0.5 with a sigma of 0.4. - - let target_mean = 0.5; - let target_sigma = 0.4; - - // a^2 sig1^2 = sig2^2 => a^2 = sig2^2 / sig1^2 => a = sig2 / sig1, assuming a, sig1, and sig2 positive. - let factor = target_sigma / current_sigma; - // a*mu1 + b = mu2 => b = mu2 - a*mu1 - let offset = target_mean - (factor * current_mean); - - let mut score = factor * score + offset; - - // clamp the final score in the ]0, 1] interval. - if score <= 0.0 { - score = f32::EPSILON; - } - if score > 1.0 { - score = 1.0; - } - - score - } -} /// Whether CUDA is supported in this version of Meilisearch. pub const fn is_cuda_enabled() -> bool { cfg!(feature = "cuda") } - -fn arroy_store_range_for_embedder(embedder_id: u8) -> impl Iterator { - (0..=u8::MAX).map(move |store_id| arroy_store_for_embedder(embedder_id, store_id)) -} - -fn arroy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 { - let embedder_id = (embedder_id as u16) << 8; - embedder_id | (store_id as u16) -} diff --git a/crates/milli/src/vector/runtime.rs b/crates/milli/src/vector/runtime.rs new file mode 100644 index 000000000..5a653f1b1 --- /dev/null +++ b/crates/milli/src/vector/runtime.rs @@ -0,0 +1,81 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use super::Embedder; +use crate::prompt::Prompt; +use crate::vector::json_template::JsonTemplate; + +/// Map of runtime embedder data. 
+#[derive(Clone, Default)] +pub struct RuntimeEmbedders(HashMap>); + +pub struct RuntimeEmbedder { + pub embedder: Arc, + pub document_template: Prompt, + fragments: Vec, + pub is_quantized: bool, +} + +impl RuntimeEmbedder { + pub fn new( + embedder: Arc, + document_template: Prompt, + mut fragments: Vec, + is_quantized: bool, + ) -> Self { + fragments.sort_unstable_by(|left, right| left.name.cmp(&right.name)); + Self { embedder, document_template, fragments, is_quantized } + } + + /// The runtime fragments sorted by name. + pub fn fragments(&self) -> &[RuntimeFragment] { + self.fragments.as_slice() + } +} +pub struct RuntimeFragment { + pub name: String, + pub id: u8, + pub template: JsonTemplate, +} + +impl RuntimeEmbedders { + /// Create the map from its internal component.s + pub fn new(data: HashMap>) -> Self { + Self(data) + } + + pub fn contains(&self, name: &str) -> bool { + self.0.contains_key(name) + } + + /// Get an embedder configuration and template from its name. + pub fn get(&self, name: &str) -> Option<&Arc> { + self.0.get(name) + } + + pub fn inner_as_ref(&self) -> &HashMap> { + &self.0 + } + + pub fn into_inner(self) -> HashMap> { + self.0 + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } +} + +impl IntoIterator for RuntimeEmbedders { + type Item = (String, Arc); + + type IntoIter = std::collections::hash_map::IntoIter>; + + fn into_iter(self) -> Self::IntoIter { + self.0.into_iter() + } +} diff --git a/crates/milli/src/vector/session.rs b/crates/milli/src/vector/session.rs index b582bd840..b7ee7262b 100644 --- a/crates/milli/src/vector/session.rs +++ b/crates/milli/src/vector/session.rs @@ -2,7 +2,8 @@ use bumpalo::collections::Vec as BVec; use bumpalo::Bump; use serde_json::Value; -use super::{EmbedError, Embedder, Embedding}; +use super::error::EmbedError; +use super::{Embedder, Embedding}; use crate::progress::EmbedderStats; use crate::{DocumentId, Result, ThreadPoolNoAbort}; 
type ExtractorId = u8; diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 1b85dd503..499ab3955 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -8,12 +8,12 @@ use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; -use super::composite::SubEmbedderOptions; -use super::hf::OverridePooling; -use super::{ollama, openai, DistributionShift, EmbedderOptions}; use crate::prompt::{default_max_bytes, PromptData}; use crate::update::Setting; -use crate::vector::EmbeddingConfig; +use crate::vector::embedder::composite::{self, SubEmbedderOptions}; +use crate::vector::embedder::hf::{self, OverridePooling}; +use crate::vector::embedder::{manual, ollama, openai, rest, EmbedderOptions}; +use crate::vector::{DistributionShift, EmbeddingConfig}; use crate::UserError; #[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] @@ -1789,12 +1789,7 @@ pub struct Fragment { impl EmbeddingSettings { fn from_hugging_face( - super::hf::EmbedderOptions { - model, - revision, - distribution, - pooling, - }: super::hf::EmbedderOptions, + hf::EmbedderOptions { model, revision, distribution, pooling }: hf::EmbedderOptions, document_template: Setting, document_template_max_bytes: Setting, quantized: Option, @@ -1822,13 +1817,13 @@ impl EmbeddingSettings { } fn from_openai( - super::openai::EmbedderOptions { + openai::EmbedderOptions { url, api_key, embedding_model, dimensions, distribution, - }: super::openai::EmbedderOptions, + }: openai::EmbedderOptions, document_template: Setting, document_template_max_bytes: Setting, quantized: Option, @@ -1856,13 +1851,13 @@ impl EmbeddingSettings { } fn from_ollama( - super::ollama::EmbedderOptions { - embedding_model, - url, - api_key, - distribution, - dimensions, - }: super::ollama::EmbedderOptions, + ollama::EmbedderOptions { + embedding_model, + url, + api_key, + distribution, + dimensions, + }: 
ollama::EmbedderOptions, document_template: Setting, document_template_max_bytes: Setting, quantized: Option, @@ -1890,7 +1885,7 @@ impl EmbeddingSettings { } fn from_user_provided( - super::manual::EmbedderOptions { dimensions, distribution }: super::manual::EmbedderOptions, + manual::EmbedderOptions { dimensions, distribution }: manual::EmbedderOptions, quantized: Option, ) -> Self { Self { @@ -1916,7 +1911,7 @@ impl EmbeddingSettings { } fn from_rest( - super::rest::EmbedderOptions { + rest::EmbedderOptions { api_key, dimensions, url, @@ -1926,7 +1921,7 @@ impl EmbeddingSettings { response, distribution, headers, - }: super::rest::EmbedderOptions, + }: rest::EmbedderOptions, document_template: Setting, document_template_max_bytes: Setting, quantized: Option, @@ -2015,37 +2010,36 @@ impl From for EmbeddingSettings { document_template_max_bytes, quantized, ), - super::EmbedderOptions::Composite(super::composite::EmbedderOptions { - search, - index, - }) => Self { - source: Setting::Set(EmbedderSource::Composite), - model: Setting::NotSet, - revision: Setting::NotSet, - pooling: Setting::NotSet, - api_key: Setting::NotSet, - dimensions: Setting::NotSet, - binary_quantized: Setting::some_or_not_set(quantized), - document_template: Setting::NotSet, - document_template_max_bytes: Setting::NotSet, - url: Setting::NotSet, - indexing_fragments: Setting::NotSet, - search_fragments: Setting::NotSet, - request: Setting::NotSet, - response: Setting::NotSet, - headers: Setting::NotSet, - distribution: Setting::some_or_not_set(search.distribution()), - search_embedder: Setting::Set(SubEmbeddingSettings::from_options( - search, - Setting::NotSet, - Setting::NotSet, - )), - indexing_embedder: Setting::Set(SubEmbeddingSettings::from_options( - index, - Setting::Set(prompt.template), - document_template_max_bytes, - )), - }, + super::EmbedderOptions::Composite(composite::EmbedderOptions { search, index }) => { + Self { + source: Setting::Set(EmbedderSource::Composite), + model: 
Setting::NotSet, + revision: Setting::NotSet, + pooling: Setting::NotSet, + api_key: Setting::NotSet, + dimensions: Setting::NotSet, + binary_quantized: Setting::some_or_not_set(quantized), + document_template: Setting::NotSet, + document_template_max_bytes: Setting::NotSet, + url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, + request: Setting::NotSet, + response: Setting::NotSet, + headers: Setting::NotSet, + distribution: Setting::some_or_not_set(search.distribution()), + search_embedder: Setting::Set(SubEmbeddingSettings::from_options( + search, + Setting::NotSet, + Setting::NotSet, + )), + indexing_embedder: Setting::Set(SubEmbeddingSettings::from_options( + index, + Setting::Set(prompt.template), + document_template_max_bytes, + )), + } + } } } } @@ -2212,7 +2206,7 @@ impl From for EmbeddingConfig { ) .into(), EmbedderSource::Composite => { - super::EmbedderOptions::Composite(super::composite::EmbedderOptions { + super::EmbedderOptions::Composite(composite::EmbedderOptions { // it is important to give the distribution to the search here, as this is from where we'll retrieve it search: SubEmbedderOptions::from_settings( search_embedder.set().unwrap(), @@ -2290,9 +2284,9 @@ impl SubEmbedderOptions { dimensions: Setting, distribution: Setting, ) -> Self { - let mut options = super::openai::EmbedderOptions::with_default_model(None); + let mut options = openai::EmbedderOptions::with_default_model(None); if let Some(model) = model.set() { - if let Some(model) = super::openai::EmbeddingModel::from_name(&model) { + if let Some(model) = openai::EmbeddingModel::from_name(&model) { options.embedding_model = model; } } @@ -2314,7 +2308,7 @@ impl SubEmbedderOptions { pooling: Setting, distribution: Setting, ) -> Self { - let mut options = super::hf::EmbedderOptions::default(); + let mut options = hf::EmbedderOptions::default(); if let Some(model) = model.set() { options.model = model; // Reset the revision if we are setting 
the model. @@ -2334,10 +2328,7 @@ impl SubEmbedderOptions { SubEmbedderOptions::HuggingFace(options) } fn user_provided(dimensions: usize, distribution: Setting) -> Self { - Self::UserProvided(super::manual::EmbedderOptions { - dimensions, - distribution: distribution.set(), - }) + Self::UserProvided(manual::EmbedderOptions { dimensions, distribution: distribution.set() }) } #[allow(clippy::too_many_arguments)] @@ -2352,7 +2343,7 @@ impl SubEmbedderOptions { dimensions: Setting, distribution: Setting, ) -> Self { - Self::Rest(super::rest::EmbedderOptions { + Self::Rest(rest::EmbedderOptions { api_key: api_key.set(), dimensions: dimensions.set(), url, @@ -2386,11 +2377,7 @@ impl SubEmbedderOptions { distribution: Setting, ) -> Self { let mut options: ollama::EmbedderOptions = - super::ollama::EmbedderOptions::with_default_model( - api_key.set(), - url.set(), - dimensions.set(), - ); + ollama::EmbedderOptions::with_default_model(api_key.set(), url.set(), dimensions.set()); if let Some(model) = model.set() { options.embedding_model = model; } diff --git a/crates/milli/src/vector/store.rs b/crates/milli/src/vector/store.rs new file mode 100644 index 000000000..d4d3f26cb --- /dev/null +++ b/crates/milli/src/vector/store.rs @@ -0,0 +1,1178 @@ +use hannoy::distances::{Cosine, Hamming}; +use hannoy::ItemId; +use heed::{RoTxn, RwTxn, Unspecified}; +use ordered_float::OrderedFloat; +use rand::SeedableRng as _; +use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; + +use crate::progress::Progress; +use crate::vector::Embeddings; + +const HANNOY_EF_CONSTRUCTION: usize = 125; +const HANNOY_M: usize = 16; +const HANNOY_M0: usize = 32; + +#[derive( + Debug, + Clone, + Copy, + PartialEq, + Eq, + Default, + Serialize, + Deserialize, + deserr::Deserr, + utoipa::ToSchema, +)] +pub enum VectorStoreBackend { + #[default] + #[deserr(rename = "stable")] + #[serde(rename = "stable")] + Arroy, + #[deserr(rename = "experimental")] + #[serde(rename = "experimental")] + Hannoy, 
+} + +pub struct VectorStore { + backend: VectorStoreBackend, + database: hannoy::Database, + embedder_index: u8, + quantized: bool, +} + +impl VectorStore { + // backend-independent public functions + + pub fn new( + backend: VectorStoreBackend, + database: hannoy::Database, + embedder_index: u8, + quantized: bool, + ) -> Self { + Self { backend, database, embedder_index, quantized } + } + + pub fn embedder_index(&self) -> u8 { + self.embedder_index + } + + // backend-dependent public functions + + /// The item ids that are present in the store specified by its id. + /// + /// The ids are accessed via a lambda to avoid lifetime shenanigans. + pub fn items_in_store( + &self, + rtxn: &RoTxn, + store_id: u8, + with_items: F, + ) -> crate::Result + where + F: FnOnce(&RoaringBitmap) -> O, + { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + self._arroy_items_in_store(rtxn, self._arroy_quantized_db(), store_id, with_items) + .map_err(Into::into) + } else { + self._arroy_items_in_store(rtxn, self._arroy_angular_db(), store_id, with_items) + .map_err(Into::into) + } + } else if self.quantized { + self._hannoy_items_in_store(rtxn, self._hannoy_quantized_db(), store_id, with_items) + .map_err(Into::into) + } else { + self._hannoy_items_in_store(rtxn, self._hannoy_angular_db(), store_id, with_items) + .map_err(Into::into) + } + } + + pub fn dimensions(&self, rtxn: &RoTxn) -> crate::Result> { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + Ok(self + ._arroy_readers(rtxn, self._arroy_quantized_db()) + .next() + .transpose()? + .map(|reader| reader.dimensions())) + } else { + Ok(self + ._arroy_readers(rtxn, self._arroy_angular_db()) + .next() + .transpose()? + .map(|reader| reader.dimensions())) + } + } else if self.quantized { + Ok(self + ._hannoy_readers(rtxn, self._hannoy_quantized_db()) + .next() + .transpose()? 
+ .map(|reader| reader.dimensions())) + } else { + Ok(self + ._hannoy_readers(rtxn, self._hannoy_angular_db()) + .next() + .transpose()? + .map(|reader| reader.dimensions())) + } + } + + pub fn change_backend( + self, + rtxn: &RoTxn, + wtxn: &mut RwTxn, + progress: Progress, + must_stop_processing: &MSP, + available_memory: Option, + ) -> crate::Result<()> + where + MSP: Fn() -> bool + Sync, + { + let mut rng = rand::rngs::StdRng::from_entropy(); + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + self._arroy_to_hannoy_bq::(rtxn, wtxn, &progress, &mut rng, &must_stop_processing) + } else { + let dimensions = self + ._arroy_readers(wtxn, self._arroy_angular_db()) + .next() + .transpose()? + .map(|reader| reader.dimensions()); + + let Some(dimensions) = dimensions else { return Ok(()) }; + + for index in vector_store_range_for_embedder(self.embedder_index) { + let writer = hannoy::Writer::new(self._hannoy_angular_db(), index, dimensions); + let mut builder = writer.builder(&mut rng).progress(progress.clone()); + builder.cancel(must_stop_processing); + builder.prepare_arroy_conversion(wtxn)?; + builder.build::(wtxn)?; + } + + Ok(()) + } + } else if self.quantized { + self._hannoy_to_arroy_bq::< + hannoy::distances::Hamming, + arroy::distances::BinaryQuantizedCosine, + _>(rtxn, wtxn, &progress, &mut rng, available_memory, &must_stop_processing) + } else { + let dimensions = self + ._hannoy_readers(wtxn, self._hannoy_angular_db()) + .next() + .transpose()? 
+ .map(|reader| reader.dimensions()); + + let Some(dimensions) = dimensions else { return Ok(()) }; + + for index in vector_store_range_for_embedder(self.embedder_index) { + let writer = arroy::Writer::new(self._arroy_angular_db(), index, dimensions); + let mut builder = writer.builder(&mut rng); + let builder = builder.progress(|step| progress.update_progress_from_arroy(step)); + builder.prepare_hannoy_conversion(wtxn)?; + builder.build(wtxn)?; + } + + Ok(()) + } + } + + #[allow(clippy::too_many_arguments)] + pub fn build_and_quantize( + &mut self, + wtxn: &mut RwTxn, + progress: Progress, + rng: &mut R, + dimension: usize, + quantizing: bool, + available_memory: Option, + cancel: &(impl Fn() -> bool + Sync + Send), + ) -> Result<(), crate::Error> { + for index in vector_store_range_for_embedder(self.embedder_index) { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + let writer = arroy::Writer::new(self._arroy_quantized_db(), index, dimension); + if writer.need_build(wtxn)? { + arroy_build(wtxn, &progress, rng, available_memory, cancel, &writer)?; + } else if writer.is_empty(wtxn)? { + continue; + } + } else { + let writer = arroy::Writer::new(self._arroy_angular_db(), index, dimension); + // If we are quantizing the databases, we can't know from meilisearch + // if the db was empty but still contained the wrong metadata, thus we need + // to quantize everything and can't stop early. Since this operation can + // only happens once in the life of an embedder, it's not very performance + // sensitive. + if quantizing && !self.quantized { + let writer = writer + .prepare_changing_distance::( + wtxn, + )?; + arroy_build(wtxn, &progress, rng, available_memory, cancel, &writer)?; + } else if writer.need_build(wtxn)? { + arroy_build(wtxn, &progress, rng, available_memory, cancel, &writer)?; + } else if writer.is_empty(wtxn)? 
{ + continue; + } + } + } else if self.quantized { + let writer = hannoy::Writer::new(self._hannoy_quantized_db(), index, dimension); + if writer.need_build(wtxn)? { + hannoy_build(wtxn, &progress, rng, cancel, &writer)?; + } else if writer.is_empty(wtxn)? { + continue; + } + } else { + let writer = hannoy::Writer::new(self._hannoy_angular_db(), index, dimension); + // If we are quantizing the databases, we can't know from meilisearch + // if the db was empty but still contained the wrong metadata, thus we need + // to quantize everything and can't stop early. Since this operation can + // only happens once in the life of an embedder, it's not very performance + // sensitive. + if quantizing && !self.quantized { + let writer = writer.prepare_changing_distance::(wtxn)?; + hannoy_build(wtxn, &progress, rng, cancel, &writer)?; + } else if writer.need_build(wtxn)? { + hannoy_build(wtxn, &progress, rng, cancel, &writer)?; + } else if writer.is_empty(wtxn)? { + continue; + } + } + } + Ok(()) + } + + /// Overwrite all the embeddings associated with the index and item ID. + /// /!\ It won't remove embeddings after the last passed embedding, which can leave stale embeddings. + /// You should call `del_items` on the `item_id` before calling this method. + /// /!\ Cannot insert more than u8::MAX embeddings; after inserting u8::MAX embeddings, all the remaining ones will be silently ignored. + pub fn add_items( + &self, + wtxn: &mut RwTxn, + item_id: hannoy::ItemId, + embeddings: &Embeddings, + ) -> Result<(), crate::Error> { + let dimension = embeddings.dimension(); + for (index, vector) in + vector_store_range_for_embedder(self.embedder_index).zip(embeddings.iter()) + { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + arroy::Writer::new(self._arroy_quantized_db(), index, dimension) + .add_item(wtxn, item_id, vector)? + } else { + arroy::Writer::new(self._arroy_angular_db(), index, dimension) + .add_item(wtxn, item_id, vector)? 
+ } + } else if self.quantized { + hannoy::Writer::new(self._hannoy_quantized_db(), index, dimension) + .add_item(wtxn, item_id, vector)? + } else { + hannoy::Writer::new(self._hannoy_angular_db(), index, dimension) + .add_item(wtxn, item_id, vector)? + } + } + Ok(()) + } + + /// Add one document int for this index where we can find an empty spot. + pub fn add_item( + &self, + wtxn: &mut RwTxn, + item_id: hannoy::ItemId, + vector: &[f32], + ) -> Result<(), crate::Error> { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + self._arroy_add_item(wtxn, self._arroy_quantized_db(), item_id, vector) + .map_err(Into::into) + } else { + self._arroy_add_item(wtxn, self._arroy_angular_db(), item_id, vector) + .map_err(Into::into) + } + } else if self.quantized { + self._hannoy_add_item(wtxn, self._hannoy_quantized_db(), item_id, vector) + .map_err(Into::into) + } else { + self._hannoy_add_item(wtxn, self._hannoy_angular_db(), item_id, vector) + .map_err(Into::into) + } + } + + /// Add a vector associated with a document in store specified by its id. + /// + /// Any existing vector associated with the document in the store will be replaced by the new vector. 
+ pub fn add_item_in_store( + &self, + wtxn: &mut RwTxn, + item_id: hannoy::ItemId, + store_id: u8, + vector: &[f32], + ) -> Result<(), crate::Error> { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + self._arroy_add_item_in_store( + wtxn, + self._arroy_quantized_db(), + item_id, + store_id, + vector, + ) + .map_err(Into::into) + } else { + self._arroy_add_item_in_store( + wtxn, + self._arroy_angular_db(), + item_id, + store_id, + vector, + ) + .map_err(Into::into) + } + } else if self.quantized { + self._hannoy_add_item_in_store( + wtxn, + self._hannoy_quantized_db(), + item_id, + store_id, + vector, + ) + .map_err(Into::into) + } else { + self._hannoy_add_item_in_store( + wtxn, + self._hannoy_angular_db(), + item_id, + store_id, + vector, + ) + .map_err(Into::into) + } + } + + /// Delete one item from its value. + pub fn del_item( + &self, + wtxn: &mut RwTxn, + item_id: hannoy::ItemId, + vector: &[f32], + ) -> Result { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + self._arroy_del_item(wtxn, self._arroy_quantized_db(), item_id, vector) + .map_err(Into::into) + } else { + self._arroy_del_item(wtxn, self._arroy_angular_db(), item_id, vector) + .map_err(Into::into) + } + } else if self.quantized { + self._hannoy_del_item(wtxn, self._hannoy_quantized_db(), item_id, vector) + .map_err(Into::into) + } else { + self._hannoy_del_item(wtxn, self._hannoy_angular_db(), item_id, vector) + .map_err(Into::into) + } + } + + /// Delete all embeddings from a specific `item_id` + pub fn del_items( + &self, + wtxn: &mut RwTxn, + dimension: usize, + item_id: hannoy::ItemId, + ) -> Result<(), crate::Error> { + for index in vector_store_range_for_embedder(self.embedder_index) { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + let writer = arroy::Writer::new(self._arroy_quantized_db(), index, dimension); + writer.del_item(wtxn, item_id)?; + } else { + let writer = arroy::Writer::new(self._arroy_angular_db(), 
index, dimension); + writer.del_item(wtxn, item_id)?; + } + } else if self.quantized { + let writer = hannoy::Writer::new(self._hannoy_quantized_db(), index, dimension); + writer.del_item(wtxn, item_id)?; + } else { + let writer = hannoy::Writer::new(self._hannoy_angular_db(), index, dimension); + writer.del_item(wtxn, item_id)?; + } + } + + Ok(()) + } + + /// Removes the item specified by its id from the store specified by its id. + /// + /// Returns whether the item was removed. + /// + /// # Warning + /// + /// - This function will silently fail to remove the item if used against an arroy database that was never built. + pub fn del_item_in_store( + &self, + wtxn: &mut RwTxn, + item_id: hannoy::ItemId, + store_id: u8, + dimensions: usize, + ) -> Result { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + self._arroy_del_item_in_store( + wtxn, + self._arroy_quantized_db(), + item_id, + store_id, + dimensions, + ) + .map_err(Into::into) + } else { + self._arroy_del_item_in_store( + wtxn, + self._arroy_angular_db(), + item_id, + store_id, + dimensions, + ) + .map_err(Into::into) + } + } else if self.quantized { + self._hannoy_del_item_in_store( + wtxn, + self._hannoy_quantized_db(), + item_id, + store_id, + dimensions, + ) + .map_err(Into::into) + } else { + self._hannoy_del_item_in_store( + wtxn, + self._hannoy_angular_db(), + item_id, + store_id, + dimensions, + ) + .map_err(Into::into) + } + } + + /// Removes all items from the store specified by its id. + /// + /// # Warning + /// + /// - This function will silently fail to remove the items if used against an arroy database that was never built. 
+ pub fn clear_store( + &self, + wtxn: &mut RwTxn, + store_id: u8, + dimensions: usize, + ) -> Result<(), crate::Error> { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + self._arroy_clear_store(wtxn, self._arroy_quantized_db(), store_id, dimensions) + .map_err(Into::into) + } else { + self._arroy_clear_store(wtxn, self._arroy_angular_db(), store_id, dimensions) + .map_err(Into::into) + } + } else if self.quantized { + self._hannoy_clear_store(wtxn, self._hannoy_quantized_db(), store_id, dimensions) + .map_err(Into::into) + } else { + self._hannoy_clear_store(wtxn, self._hannoy_angular_db(), store_id, dimensions) + .map_err(Into::into) + } + } + + pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), crate::Error> { + for index in vector_store_range_for_embedder(self.embedder_index) { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + let writer = arroy::Writer::new(self._arroy_quantized_db(), index, dimension); + if writer.is_empty(wtxn)? { + continue; + } + writer.clear(wtxn)?; + } else { + let writer = arroy::Writer::new(self._arroy_angular_db(), index, dimension); + if writer.is_empty(wtxn)? { + continue; + } + writer.clear(wtxn)?; + } + } else if self.quantized { + let writer = hannoy::Writer::new(self._hannoy_quantized_db(), index, dimension); + if writer.is_empty(wtxn)? { + continue; + } + writer.clear(wtxn)?; + } else { + let writer = hannoy::Writer::new(self._hannoy_angular_db(), index, dimension); + if writer.is_empty(wtxn)? { + continue; + } + writer.clear(wtxn)?; + } + } + Ok(()) + } + + pub fn contains_item( + &self, + rtxn: &RoTxn, + dimension: usize, + item: hannoy::ItemId, + ) -> crate::Result { + for index in vector_store_range_for_embedder(self.embedder_index) { + let contains = if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + let writer = arroy::Writer::new(self._arroy_quantized_db(), index, dimension); + if writer.is_empty(rtxn)? 
{ + continue; + } + writer.contains_item(rtxn, item)? + } else { + let writer = arroy::Writer::new(self._arroy_angular_db(), index, dimension); + if writer.is_empty(rtxn)? { + continue; + } + writer.contains_item(rtxn, item)? + } + } else if self.quantized { + let writer = hannoy::Writer::new(self._hannoy_quantized_db(), index, dimension); + if writer.is_empty(rtxn)? { + continue; + } + writer.contains_item(rtxn, item)? + } else { + let writer = hannoy::Writer::new(self._hannoy_angular_db(), index, dimension); + if writer.is_empty(rtxn)? { + continue; + } + writer.contains_item(rtxn, item)? + }; + if contains { + return Ok(contains); + } + } + Ok(false) + } + + pub fn nns_by_item( + &self, + rtxn: &RoTxn, + item: ItemId, + limit: usize, + filter: Option<&RoaringBitmap>, + ) -> crate::Result> { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + self._arroy_nns_by_item(rtxn, self._arroy_quantized_db(), item, limit, filter) + .map_err(Into::into) + } else { + self._arroy_nns_by_item(rtxn, self._arroy_angular_db(), item, limit, filter) + .map_err(Into::into) + } + } else if self.quantized { + self._hannoy_nns_by_item(rtxn, self._hannoy_quantized_db(), item, limit, filter) + .map_err(Into::into) + } else { + self._hannoy_nns_by_item(rtxn, self._hannoy_angular_db(), item, limit, filter) + .map_err(Into::into) + } + } + pub fn nns_by_vector( + &self, + rtxn: &RoTxn, + vector: &[f32], + limit: usize, + filter: Option<&RoaringBitmap>, + ) -> crate::Result> { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + self._arroy_nns_by_vector(rtxn, self._arroy_quantized_db(), vector, limit, filter) + .map_err(Into::into) + } else { + self._arroy_nns_by_vector(rtxn, self._arroy_angular_db(), vector, limit, filter) + .map_err(Into::into) + } + } else if self.quantized { + self._hannoy_nns_by_vector(rtxn, self._hannoy_quantized_db(), vector, limit, filter) + .map_err(Into::into) + } else { + self._hannoy_nns_by_vector(rtxn, 
self._hannoy_angular_db(), vector, limit, filter) + .map_err(Into::into) + } + } + pub fn item_vectors(&self, rtxn: &RoTxn, item_id: u32) -> crate::Result>> { + let mut vectors = Vec::new(); + + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + for reader in self._arroy_readers(rtxn, self._arroy_quantized_db()) { + if let Some(vec) = reader?.item_vector(rtxn, item_id)? { + vectors.push(vec); + } + } + } else { + for reader in self._arroy_readers(rtxn, self._arroy_angular_db()) { + if let Some(vec) = reader?.item_vector(rtxn, item_id)? { + vectors.push(vec); + } + } + } + } else if self.quantized { + for reader in self._hannoy_readers(rtxn, self._hannoy_quantized_db()) { + if let Some(vec) = reader?.item_vector(rtxn, item_id)? { + vectors.push(vec); + } + } + } else { + for reader in self._hannoy_readers(rtxn, self._hannoy_angular_db()) { + if let Some(vec) = reader?.item_vector(rtxn, item_id)? { + vectors.push(vec); + } + } + } + + Ok(vectors) + } + + pub fn aggregate_stats( + &self, + rtxn: &RoTxn, + stats: &mut VectorStoreStats, + ) -> Result<(), crate::Error> { + if self.backend == VectorStoreBackend::Arroy { + if self.quantized { + for reader in self._arroy_readers(rtxn, self._arroy_quantized_db()) { + let reader = reader?; + let documents = reader.item_ids(); + stats.documents |= documents; + stats.number_of_embeddings += documents.len(); + } + } else { + for reader in self._arroy_readers(rtxn, self._arroy_angular_db()) { + let reader = reader?; + let documents = reader.item_ids(); + stats.documents |= documents; + stats.number_of_embeddings += documents.len(); + } + } + } else if self.quantized { + for reader in self._hannoy_readers(rtxn, self._hannoy_quantized_db()) { + let reader = reader?; + let documents = reader.item_ids(); + stats.documents |= documents; + stats.number_of_embeddings += documents.len(); + } + } else { + for reader in self._hannoy_readers(rtxn, self._hannoy_angular_db()) { + let reader = reader?; + let documents = 
reader.item_ids(); + stats.documents |= documents; + stats.number_of_embeddings += documents.len(); + } + } + + Ok(()) + } + + // private functions + fn _arroy_readers<'a, D: arroy::Distance>( + &'a self, + rtxn: &'a RoTxn<'a>, + db: arroy::Database, + ) -> impl Iterator, arroy::Error>> + 'a { + vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| { + match arroy::Reader::open(rtxn, index, db) { + Ok(reader) => match reader.is_empty(rtxn) { + Ok(false) => Some(Ok(reader)), + Ok(true) => None, + Err(e) => Some(Err(e)), + }, + Err(arroy::Error::MissingMetadata(_)) => None, + Err(e) => Some(Err(e)), + } + }) + } + + fn _hannoy_readers<'a, D: hannoy::Distance>( + &'a self, + rtxn: &'a RoTxn<'a>, + db: hannoy::Database, + ) -> impl Iterator, hannoy::Error>> + 'a { + vector_store_range_for_embedder(self.embedder_index).filter_map(move |index| { + match hannoy::Reader::open(rtxn, index, db) { + Ok(reader) => match reader.is_empty(rtxn) { + Ok(false) => Some(Ok(reader)), + Ok(true) => None, + Err(e) => Some(Err(e)), + }, + Err(hannoy::Error::MissingMetadata(_)) => None, + Err(e) => Some(Err(e)), + } + }) + } + + fn _arroy_items_in_store( + &self, + rtxn: &RoTxn, + db: arroy::Database, + store_id: u8, + with_items: F, + ) -> Result + where + F: FnOnce(&RoaringBitmap) -> O, + { + let index = vector_store_for_embedder(self.embedder_index, store_id); + let reader = arroy::Reader::open(rtxn, index, db); + match reader { + Ok(reader) => Ok(with_items(reader.item_ids())), + Err(arroy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())), + Err(err) => Err(err), + } + } + + fn _hannoy_items_in_store( + &self, + rtxn: &RoTxn, + db: hannoy::Database, + store_id: u8, + with_items: F, + ) -> Result + where + F: FnOnce(&RoaringBitmap) -> O, + { + let index = vector_store_for_embedder(self.embedder_index, store_id); + let reader = hannoy::Reader::open(rtxn, index, db); + match reader { + Ok(reader) => Ok(with_items(reader.item_ids())), + 
Err(hannoy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())), + Err(err) => Err(err), + } + } + + fn _arroy_add_item( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + vector: &[f32], + ) -> Result<(), arroy::Error> { + let dimension = vector.len(); + + for index in vector_store_range_for_embedder(self.embedder_index) { + let writer = arroy::Writer::new(db, index, dimension); + if !writer.contains_item(wtxn, item_id)? { + writer.add_item(wtxn, item_id, vector)?; + break; + } + } + Ok(()) + } + + fn _hannoy_add_item( + &self, + wtxn: &mut RwTxn, + db: hannoy::Database, + item_id: hannoy::ItemId, + vector: &[f32], + ) -> Result<(), hannoy::Error> { + let dimension = vector.len(); + + for index in vector_store_range_for_embedder(self.embedder_index) { + let writer = hannoy::Writer::new(db, index, dimension); + if !writer.contains_item(wtxn, item_id)? { + writer.add_item(wtxn, item_id, vector)?; + break; + } + } + Ok(()) + } + + fn _arroy_add_item_in_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + store_id: u8, + vector: &[f32], + ) -> Result<(), arroy::Error> { + let dimension = vector.len(); + + let index = vector_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimension); + writer.add_item(wtxn, item_id, vector) + } + + fn _hannoy_add_item_in_store( + &self, + wtxn: &mut RwTxn, + db: hannoy::Database, + item_id: hannoy::ItemId, + store_id: u8, + vector: &[f32], + ) -> Result<(), hannoy::Error> { + let dimension = vector.len(); + + let index = vector_store_for_embedder(self.embedder_index, store_id); + let writer = hannoy::Writer::new(db, index, dimension); + writer.add_item(wtxn, item_id, vector) + } + + fn _arroy_del_item_in_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + store_id: u8, + dimensions: usize, + ) -> Result { + let index = vector_store_for_embedder(self.embedder_index, store_id); 
+ let writer = arroy::Writer::new(db, index, dimensions); + writer.del_item(wtxn, item_id) + } + + fn _hannoy_del_item_in_store( + &self, + wtxn: &mut RwTxn, + db: hannoy::Database, + item_id: hannoy::ItemId, + store_id: u8, + dimensions: usize, + ) -> Result { + let index = vector_store_for_embedder(self.embedder_index, store_id); + let writer = hannoy::Writer::new(db, index, dimensions); + writer.del_item(wtxn, item_id) + } + + fn _arroy_clear_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + store_id: u8, + dimensions: usize, + ) -> Result<(), arroy::Error> { + let index = vector_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimensions); + writer.clear(wtxn) + } + + fn _hannoy_clear_store( + &self, + wtxn: &mut RwTxn, + db: hannoy::Database, + store_id: u8, + dimensions: usize, + ) -> Result<(), hannoy::Error> { + let index = vector_store_for_embedder(self.embedder_index, store_id); + let writer = hannoy::Writer::new(db, index, dimensions); + writer.clear(wtxn) + } + + fn _arroy_del_item( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + vector: &[f32], + ) -> Result { + let dimension = vector.len(); + + for index in vector_store_range_for_embedder(self.embedder_index) { + let writer = arroy::Writer::new(db, index, dimension); + if writer.contains_item(wtxn, item_id)? { + return writer.del_item(wtxn, item_id); + } + } + Ok(false) + } + + fn _hannoy_del_item( + &self, + wtxn: &mut RwTxn, + db: hannoy::Database, + item_id: hannoy::ItemId, + vector: &[f32], + ) -> Result { + let dimension = vector.len(); + + for index in vector_store_range_for_embedder(self.embedder_index) { + let writer = hannoy::Writer::new(db, index, dimension); + if writer.contains_item(wtxn, item_id)? 
{ + return writer.del_item(wtxn, item_id); + } + } + Ok(false) + } + + fn _arroy_nns_by_item( + &self, + rtxn: &RoTxn, + db: arroy::Database, + item: ItemId, + limit: usize, + filter: Option<&RoaringBitmap>, + ) -> Result, arroy::Error> { + let mut results = Vec::new(); + + for reader in self._arroy_readers(rtxn, db) { + let reader = reader?; + let mut searcher = reader.nns(limit); + if let Some(filter) = filter { + if reader.item_ids().is_disjoint(filter) { + continue; + } + searcher.candidates(filter); + } + + if let Some(mut ret) = searcher.by_item(rtxn, item)? { + results.append(&mut ret); + } + } + results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + Ok(results) + } + + fn _hannoy_nns_by_item( + &self, + rtxn: &RoTxn, + db: hannoy::Database, + item: ItemId, + limit: usize, + filter: Option<&RoaringBitmap>, + ) -> Result, hannoy::Error> { + let mut results = Vec::new(); + + for reader in self._hannoy_readers(rtxn, db) { + let reader = reader?; + let mut searcher = reader.nns(limit); + searcher.ef_search((limit * 10).max(100)); // TODO find better ef + if let Some(filter) = filter { + searcher.candidates(filter); + } + + if let Some(mut ret) = searcher.by_item(rtxn, item)? 
{ + results.append(&mut ret); + } + } + results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + Ok(results) + } + + fn _arroy_nns_by_vector( + &self, + rtxn: &RoTxn, + db: arroy::Database, + vector: &[f32], + limit: usize, + filter: Option<&RoaringBitmap>, + ) -> Result, arroy::Error> { + let mut results = Vec::new(); + + for reader in self._arroy_readers(rtxn, db) { + let reader = reader?; + let mut searcher = reader.nns(limit); + if let Some(filter) = filter { + if reader.item_ids().is_disjoint(filter) { + continue; + } + searcher.candidates(filter); + } + + results.append(&mut searcher.by_vector(rtxn, vector)?); + } + + results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + + Ok(results) + } + + fn _hannoy_nns_by_vector( + &self, + rtxn: &RoTxn, + db: hannoy::Database, + vector: &[f32], + limit: usize, + filter: Option<&RoaringBitmap>, + ) -> Result, hannoy::Error> { + let mut results = Vec::new(); + + for reader in self._hannoy_readers(rtxn, db) { + let reader = reader?; + let mut searcher = reader.nns(limit); + searcher.ef_search((limit * 10).max(100)); // TODO find better ef + if let Some(filter) = filter { + searcher.candidates(filter); + } + + results.append(&mut searcher.by_vector(rtxn, vector)?); + } + + results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); + + Ok(results) + } + + fn _arroy_angular_db(&self) -> arroy::Database { + self.database.remap_types() + } + + fn _arroy_quantized_db(&self) -> arroy::Database { + self.database.remap_types() + } + + fn _hannoy_angular_db(&self) -> hannoy::Database { + self.database.remap_data_type() + } + + fn _hannoy_quantized_db(&self) -> hannoy::Database { + self.database.remap_data_type() + } + + fn _arroy_to_hannoy_bq( + self, + arroy_rtxn: &RoTxn, + hannoy_wtxn: &mut RwTxn, + progress: &Progress, + rng: &mut R, + cancel: &(impl Fn() -> bool + Sync + Send), + ) -> crate::Result<()> + where + R: rand::Rng + rand::SeedableRng, + { + for index in 
vector_store_range_for_embedder(self.embedder_index) { + let arroy_reader: arroy::Reader = + match arroy::Reader::open(arroy_rtxn, index, self.database.remap_types()) { + Ok(reader) => reader, + Err(arroy::Error::MissingMetadata(_)) => continue, + Err(err) => return Err(err.into()), + }; + let dimensions = arroy_reader.dimensions(); + let hannoy_writer: hannoy::Writer = + hannoy::Writer::new(self.database.remap_types(), index, dimensions); + // Since the bq mode of arroy and hannoy are not compatible, we have to clear and re-insert everything + hannoy_writer.clear(hannoy_wtxn)?; + for entry in arroy_reader.iter(arroy_rtxn)? { + let (item, mut vector) = entry?; + // arroy bug? the `vector` here can be longer than `dimensions`. + // workaround: truncating. + if vector.len() > dimensions { + vector.truncate(dimensions); + } + hannoy_writer.add_item(hannoy_wtxn, item, &vector)?; + } + hannoy_build(hannoy_wtxn, progress, rng, cancel, &hannoy_writer)?; + } + Ok(()) + } + + fn _hannoy_to_arroy_bq( + self, + hannoy_rtxn: &RoTxn, + arroy_wtxn: &mut RwTxn, + progress: &Progress, + rng: &mut R, + available_memory: Option, + cancel: &(impl Fn() -> bool + Sync + Send), + ) -> crate::Result<()> + where + R: rand::Rng + rand::SeedableRng, + { + for index in vector_store_range_for_embedder(self.embedder_index) { + let hannoy_reader: hannoy::Reader = + match hannoy::Reader::open(hannoy_rtxn, index, self.database.remap_types()) { + Ok(reader) => reader, + Err(hannoy::Error::MissingMetadata(_)) => continue, + Err(err) => return Err(err.into()), + }; + let dimensions = hannoy_reader.dimensions(); + let arroy_writer: arroy::Writer = + arroy::Writer::new(self.database.remap_types(), index, dimensions); + // Since the bq mode of arroy and hannoy are not compatible, we have to clear and re-insert everything + arroy_writer.clear(arroy_wtxn)?; + for entry in hannoy_reader.iter(hannoy_rtxn)? { + let (item, mut vector) = entry?; + // hannoy bug? 
the `vector` here can be longer than `dimensions`. + // workaround: truncating. + if vector.len() > dimensions { + vector.truncate(dimensions); + } + // arroy and hannoy disagreement over the 0 value + // - arroy does: + // - if x >= 0 => 1 + // - if x < 0 => -1 + // - hannoy does: + // - if x > 0 => 1 + // - if x <= 0 => 0 + // because of this, a 0 from a bq hannoy will be converted to a 1 in arroy, destroying the information. + // to fix that, we subtract 0.5 from the hannoy vector, so that any zero value is translated to a strictly + // negative value. + for x in &mut vector { + *x -= 0.5; + } + + arroy_writer.add_item(arroy_wtxn, item, &vector)?; + } + arroy_build(arroy_wtxn, progress, rng, available_memory, cancel, &arroy_writer)?; + } + Ok(()) + } +} + +fn arroy_build( + wtxn: &mut RwTxn<'_>, + progress: &Progress, + rng: &mut R, + available_memory: Option, + cancel: &(impl Fn() -> bool + Sync + Send), + writer: &arroy::Writer, +) -> Result<(), crate::Error> +where + R: rand::Rng + rand::SeedableRng, + D: arroy::Distance, +{ + let mut builder = writer.builder(rng); + let builder = builder.progress(|step| progress.update_progress_from_arroy(step)); + builder.available_memory(available_memory.unwrap_or(usize::MAX)).cancel(cancel).build(wtxn)?; + Ok(()) +} + +fn hannoy_build( + wtxn: &mut RwTxn<'_>, + progress: &Progress, + rng: &mut R, + cancel: &(impl Fn() -> bool + Sync + Send), + writer: &hannoy::Writer, +) -> Result<(), crate::Error> +where + R: rand::Rng + rand::SeedableRng, + D: hannoy::Distance, +{ + let mut builder = writer.builder(rng).progress(progress.clone()); + builder + .cancel(cancel) + .ef_construction(HANNOY_EF_CONSTRUCTION) + .build::(wtxn)?; + Ok(()) +} + +#[derive(Debug, Default, Clone)] +pub struct VectorStoreStats { + pub number_of_embeddings: u64, + pub documents: RoaringBitmap, +} + +fn vector_store_range_for_embedder(embedder_id: u8) -> impl Iterator { + (0..=u8::MAX).map(move |store_id| vector_store_for_embedder(embedder_id, store_id)) 
+} + +fn vector_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 { + let embedder_id = (embedder_id as u16) << 8; + embedder_id | (store_id as u16) +} diff --git a/crates/tracing-trace/src/main.rs b/crates/tracing-trace/src/main.rs index 4a3d26923..22c96ec78 100644 --- a/crates/tracing-trace/src/main.rs +++ b/crates/tracing-trace/src/main.rs @@ -59,7 +59,7 @@ fn fibo_recursive(n: u32) -> u32 { if n == 1 { return 2; } - return fibo_recursive(n - 1) - fibo_recursive(n - 2); + fibo_recursive(n - 1) - fibo_recursive(n - 2) } use tracing_error::ExtractSpanTrace as _;