mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-19 21:10:34 +00:00
Compare commits
218 Commits
prototype-
...
v1.6.2
Author | SHA1 | Date | |
---|---|---|---|
1a083d54fc | |||
3a97d30cd9 | |||
3bdb2b06be | |||
64afed4dbe | |||
b26ddfcc3d | |||
c59f3f2f95 | |||
049bd45849 | |||
2491db8746 | |||
425bc92ce6 | |||
cbd065ed46 | |||
b9f365a965 | |||
3f21daf2e7 | |||
d77df4ecdb | |||
fdac97e3c8 | |||
bbdfbd8ea1 | |||
da7c796be1 | |||
014eaea428 | |||
a6fa0b97ec | |||
38abfec611 | |||
84a5c304fc | |||
e93d36d5b9 | |||
95f8e21533 | |||
68f197624e | |||
b79b03d4e2 | |||
86270e6878 | |||
81b6128b29 | |||
5f5a486895 | |||
5f4fc6c955 | |||
1f5e8fc072 | |||
3f3462ab62 | |||
93363b0201 | |||
97bb1ff9e2 | |||
5ee1378856 | |||
e27b850b09 | |||
f75f22e026 | |||
6203f4acef | |||
12edc2c20a | |||
94b9f3b310 | |||
da99a04eb3 | |||
54ae6951eb | |||
658ec6e0a4 | |||
43e822e802 | |||
ee54d3171e | |||
a0e713c4e7 | |||
d4cb0a885b | |||
f52dee2b3b | |||
0bf879fb88 | |||
6ff81de401 | |||
2e4c9651df | |||
ec9649c922 | |||
9123370e90 | |||
14b396d302 | |||
393216bf30 | |||
e249e4db7b | |||
de2ca7006e | |||
333ce12eb2 | |||
fb9db1eba6 | |||
fa2b96b9a5 | |||
19736cefe8 | |||
4fb25b8782 | |||
c83a33017e | |||
be72326c0a | |||
547379abb0 | |||
0b2fff27f2 | |||
3adbc2b942 | |||
fbea721378 | |||
391eb72137 | |||
d78ad51082 | |||
1956045a06 | |||
b2193e612f | |||
942d49314c | |||
9a846e82bc | |||
9df8cfc013 | |||
248aaa6d45 | |||
50d6317ec0 | |||
b734bd9891 | |||
9800d5a103 | |||
7c4ed07617 | |||
3a99a555a2 | |||
9e1b458010 | |||
2aede03bc2 | |||
e741bc1c62 | |||
6425996e36 | |||
eb5cb91da2 | |||
87bba98bd8 | |||
217105b7da | |||
1b7c164a55 | |||
f3f3944469 | |||
93dcbf598d | |||
ac68f33194 | |||
9991152bbe | |||
a4536b1381 | |||
5b51cb04af | |||
3c1a14f1cd | |||
b8e4709dfa | |||
806e5b6899 | |||
61bd2fb7a9 | |||
e0cc775dc4 | |||
12940d79a9 | |||
922a640188 | |||
abbe131084 | |||
d4715e0c4d | |||
11e2a2c1aa | |||
65e49b7092 | |||
e56f160032 | |||
687d92f217 | |||
fb539f61fe | |||
cb4ebe163e | |||
dde3a04679 | |||
13c2c6c16b | |||
21bcf32109 | |||
35e1981488 | |||
e0f712b9d3 | |||
56571f762a | |||
005800634d | |||
976af4fa8f | |||
99fec27788 | |||
afa8f273a8 | |||
4b644f6bc0 | |||
7e259cb0d2 | |||
0fbc1511d7 | |||
c9860c7913 | |||
03ffabe889 | |||
1f4fc9c229 | |||
8cc3c54117 | |||
467b49153d | |||
0c3fa8cbc4 | |||
bddc168d83 | |||
84a36002d7 | |||
c95d68e244 | |||
3b3fa38f27 | |||
170e063b80 | |||
d6c2ee15a9 | |||
6376c342c1 | |||
5b563f872b | |||
ec9b52d608 | |||
34c67ac389 | |||
d050c9b4ae | |||
7dd1226faf | |||
1575456594 | |||
add2ceef67 | |||
548c8247c2 | |||
181ca48482 | |||
5751f5c640 | |||
d32eb11329 | |||
dc07790133 | |||
3d23b388bc | |||
85626cff8e | |||
58dac8af42 | |||
0dbf1a16ff | |||
462b4c0080 | |||
0d4482625a | |||
56a0d91ecd | |||
b366acdae6 | |||
7cb7e37ba8 | |||
33b7c574ea | |||
d3575fb028 | |||
39cbb499c2 | |||
ebef6bc24d | |||
d59b7db8d0 | |||
263e825619 | |||
69354a6144 | |||
b0adc73ce6 | |||
2b5d9042d1 | |||
5b57fbab08 | |||
72d3fa4898 | |||
772964125d | |||
378deb0bef | |||
1f36410541 | |||
b11f85a635 | |||
a2d6dc8571 | |||
ee1701157f | |||
8c649d8061 | |||
492fc086f0 | |||
a2d0c73b41 | |||
264b10ec20 | |||
825257da76 | |||
f8289cd974 | |||
3053e01c05 | |||
b11c2afac0 | |||
9cef800b2a | |||
db2fb86b8b | |||
882ab9cc85 | |||
5a9c96e1db | |||
70ce40828c | |||
688266c83e | |||
6dab826908 | |||
1e2fbc6a42 | |||
523519fdbf | |||
ef6fa10f7a | |||
48865470d7 | |||
c810df4d9f | |||
5e3df76699 | |||
02765fb267 | |||
841165d529 | |||
ea4a266f08 | |||
49f069ed97 | |||
be16b99d40 | |||
ec0c09d17c | |||
a9230f6e6c | |||
54f0ee1ed2 | |||
ce5647e730 | |||
b57b818b67 | |||
f7ea94e5f4 | |||
53382bb1b8 | |||
5b004a2583 | |||
13416ccbf7 | |||
2614e7d9ca | |||
e7244aa485 | |||
9cacc82307 | |||
4c6fddb1cb | |||
ca52021079 | |||
ee6f79d60b | |||
e4c24ca6a3 | |||
2bae9550c8 | |||
32c78ac8b1 | |||
5fe7c4545a | |||
2042229927 |
3
.github/workflows/benchmarks-pr.yml
vendored
3
.github/workflows/benchmarks-pr.yml
vendored
@ -90,7 +90,8 @@ jobs:
|
||||
set -x
|
||||
export base_ref=$(git merge-base origin/main ${{ steps.comment-branch.outputs.head_ref }} | head -c8)
|
||||
export base_filename=$(echo ${{ steps.command.outputs.command-arguments }}_main_${base_ref}.json)
|
||||
echo 'Here are your benchmarks diff 👊' >> body.txt
|
||||
export bench_name=$(echo ${{ steps.command.outputs.command-arguments }})
|
||||
echo "Here are your $bench_name benchmarks diff 👊" >> body.txt
|
||||
echo '```' >> body.txt
|
||||
./benchmarks/scripts/compare.sh $base_filename ${{ steps.file.outputs.basename }}.json >> body.txt
|
||||
echo '```' >> body.txt
|
||||
|
2
.github/workflows/publish-apt-brew-pkg.yml
vendored
2
.github/workflows/publish-apt-brew-pkg.yml
vendored
@ -50,7 +50,7 @@ jobs:
|
||||
needs: check-version
|
||||
steps:
|
||||
- name: Create PR to Homebrew
|
||||
uses: mislav/bump-homebrew-formula-action@v2
|
||||
uses: mislav/bump-homebrew-formula-action@v3
|
||||
with:
|
||||
formula-name: meilisearch
|
||||
formula-path: Formula/m/meilisearch.rb
|
||||
|
2
.github/workflows/publish-docker-images.yml
vendored
2
.github/workflows/publish-docker-images.yml
vendored
@ -63,7 +63,7 @@ jobs:
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@v2
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
username: ${{ secrets.DOCKERHUB_USERNAME }}
|
||||
password: ${{ secrets.DOCKERHUB_TOKEN }}
|
||||
|
4
.github/workflows/sdks-tests.yml
vendored
4
.github/workflows/sdks-tests.yml
vendored
@ -160,7 +160,7 @@ jobs:
|
||||
with:
|
||||
repository: meilisearch/meilisearch-js
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v3
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
cache: 'yarn'
|
||||
- name: Install dependencies
|
||||
@ -318,7 +318,7 @@ jobs:
|
||||
with:
|
||||
repository: meilisearch/meilisearch-js-plugins
|
||||
- name: Setup node
|
||||
uses: actions/setup-node@v3
|
||||
uses: actions/setup-node@v4
|
||||
with:
|
||||
cache: yarn
|
||||
- name: Install dependencies
|
||||
|
10
.github/workflows/test-suite.yml
vendored
10
.github/workflows/test-suite.yml
vendored
@ -43,7 +43,7 @@ jobs:
|
||||
toolchain: nightly
|
||||
override: true
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo check without any default features
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -65,7 +65,7 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo check without any default features
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -149,7 +149,7 @@ jobs:
|
||||
toolchain: stable
|
||||
override: true
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run tests in debug
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -168,7 +168,7 @@ jobs:
|
||||
override: true
|
||||
components: clippy
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo clippy
|
||||
uses: actions-rs/cargo@v1
|
||||
with:
|
||||
@ -187,7 +187,7 @@ jobs:
|
||||
override: true
|
||||
components: rustfmt
|
||||
- name: Cache dependencies
|
||||
uses: Swatinem/rust-cache@v2.6.2
|
||||
uses: Swatinem/rust-cache@v2.7.1
|
||||
- name: Run cargo fmt
|
||||
# Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file.
|
||||
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate
|
||||
|
2074
Cargo.lock
generated
2074
Cargo.lock
generated
File diff suppressed because it is too large
Load Diff
@ -2,6 +2,7 @@
|
||||
resolver = "2"
|
||||
members = [
|
||||
"meilisearch",
|
||||
"meilitool",
|
||||
"meilisearch-types",
|
||||
"meilisearch-auth",
|
||||
"meili-snap",
|
||||
@ -18,7 +19,7 @@ members = [
|
||||
]
|
||||
|
||||
[workspace.package]
|
||||
version = "1.4.1"
|
||||
version = "1.6.2"
|
||||
authors = ["Quentin de Quelen <quentin@dequelen.me>", "Clément Renault <clement@meilisearch.com>"]
|
||||
description = "Meilisearch HTTP server"
|
||||
homepage = "https://meilisearch.com"
|
||||
|
13
Dockerfile
13
Dockerfile
@ -1,9 +1,9 @@
|
||||
# Compile
|
||||
FROM rust:alpine3.16 AS compiler
|
||||
FROM rust:1.71.1-alpine3.18 AS compiler
|
||||
|
||||
RUN apk add -q --update-cache --no-cache build-base openssl-dev
|
||||
|
||||
WORKDIR /meilisearch
|
||||
WORKDIR /
|
||||
|
||||
ARG COMMIT_SHA
|
||||
ARG COMMIT_DATE
|
||||
@ -17,7 +17,7 @@ RUN set -eux; \
|
||||
if [ "$apkArch" = "aarch64" ]; then \
|
||||
export JEMALLOC_SYS_WITH_LG_PAGE=16; \
|
||||
fi && \
|
||||
cargo build --release
|
||||
cargo build --release -p meilisearch -p meilitool
|
||||
|
||||
# Run
|
||||
FROM alpine:3.16
|
||||
@ -28,9 +28,10 @@ ENV MEILI_SERVER_PROVIDER docker
|
||||
RUN apk update --quiet \
|
||||
&& apk add -q --no-cache libgcc tini curl
|
||||
|
||||
# add meilisearch to the `/bin` so you can run it from anywhere and it's easy
|
||||
# to find.
|
||||
COPY --from=compiler /meilisearch/target/release/meilisearch /bin/meilisearch
|
||||
# add meilisearch and meilitool to the `/bin` so you can run it from anywhere
|
||||
# and it's easy to find.
|
||||
COPY --from=compiler /target/release/meilisearch /bin/meilisearch
|
||||
COPY --from=compiler /target/release/meilitool /bin/meilitool
|
||||
# To stay compatible with the older version of the container (pre v0.27.0) we're
|
||||
# going to symlink the meilisearch binary in the path to `/meilisearch`
|
||||
RUN ln -s /bin/meilisearch /meilisearch
|
||||
|
@ -25,12 +25,6 @@
|
||||
|
||||
<p align="center">⚡ A lightning-fast search engine that fits effortlessly into your apps, websites, and workflow 🔍</p>
|
||||
|
||||
---
|
||||
|
||||
### 🔥 On November 2nd, we are hosting our first-ever live demo and product updates for [Meilisearch Cloud](https://www.meilisearch.com/cloud?utm_campaign=oss&utm_source=github&utm_medium=meilisearch). Make sure to [register here](https://us06web.zoom.us/meeting/register/tZMlc-mqrjIsH912-HTRe-AaT-pp41bDe81a#/registration) and bring your questions for live Q&A!
|
||||
|
||||
---
|
||||
|
||||
Meilisearch helps you shape a delightful search experience in a snap, offering features that work out-of-the-box to speed up your workflow.
|
||||
|
||||
<p align="center" name="demo">
|
||||
|
@ -36,7 +36,7 @@ fn setup_index() -> Index {
|
||||
}
|
||||
|
||||
fn setup_settings<'t>(
|
||||
wtxn: &mut RwTxn<'t, '_>,
|
||||
wtxn: &mut RwTxn<'t>,
|
||||
index: &'t Index,
|
||||
primary_key: &str,
|
||||
searchable_fields: &[&str],
|
||||
@ -864,22 +864,12 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBi
|
||||
|
||||
let indexer_config = IndexerConfig::default();
|
||||
for ids in document_ids_to_delete {
|
||||
let external_documents_ids = index.external_documents_ids();
|
||||
// FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
|
||||
// Since what we have is an iterator, it would be better to delete in chunks
|
||||
let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
|
||||
external_documents_ids
|
||||
.find_external_id_of(&wtxn, ids)
|
||||
.unwrap()
|
||||
.only_external_ids()
|
||||
.collect();
|
||||
let ids = external_to_internal.unwrap();
|
||||
let config = IndexDocumentsConfig::default();
|
||||
|
||||
let mut builder =
|
||||
IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
|
||||
.unwrap();
|
||||
(builder, _) = builder.remove_documents(ids).unwrap();
|
||||
(builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap();
|
||||
builder.execute().unwrap();
|
||||
}
|
||||
|
||||
|
@ -129,3 +129,6 @@ experimental_enable_metrics = false
|
||||
|
||||
# Experimental RAM reduction during indexing, do not use in production, see: <https://github.com/meilisearch/product/discussions/652>
|
||||
experimental_reduce_indexing_memory_usage = false
|
||||
|
||||
# Experimentally reduces the maximum number of tasks that will be processed at once, see: <https://github.com/orgs/meilisearch/discussions/713>
|
||||
# experimental_max_number_of_batched_tasks = 100
|
||||
|
@ -267,6 +267,7 @@ pub(crate) mod test {
|
||||
dictionary: Setting::NotSet,
|
||||
synonyms: Setting::NotSet,
|
||||
distinct_attribute: Setting::NotSet,
|
||||
proximity_precision: Setting::NotSet,
|
||||
typo_tolerance: Setting::NotSet,
|
||||
faceting: Setting::Set(FacetingSettings {
|
||||
max_values_per_facet: Setting::Set(111),
|
||||
@ -275,6 +276,7 @@ pub(crate) mod test {
|
||||
),
|
||||
}),
|
||||
pagination: Setting::NotSet,
|
||||
embedders: Setting::NotSet,
|
||||
_kind: std::marker::PhantomData,
|
||||
};
|
||||
settings.check()
|
||||
|
@ -345,6 +345,7 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
|
||||
dictionary: v6::Setting::NotSet,
|
||||
synonyms: settings.synonyms.into(),
|
||||
distinct_attribute: settings.distinct_attribute.into(),
|
||||
proximity_precision: v6::Setting::NotSet,
|
||||
typo_tolerance: match settings.typo_tolerance {
|
||||
v5::Setting::Set(typo) => v6::Setting::Set(v6::TypoTolerance {
|
||||
enabled: typo.enabled.into(),
|
||||
@ -377,6 +378,7 @@ impl<T> From<v5::Settings<T>> for v6::Settings<v6::Unchecked> {
|
||||
v5::Setting::Reset => v6::Setting::Reset,
|
||||
v5::Setting::NotSet => v6::Setting::NotSet,
|
||||
},
|
||||
embedders: v6::Setting::NotSet,
|
||||
_kind: std::marker::PhantomData,
|
||||
}
|
||||
}
|
||||
|
@ -13,12 +13,12 @@ use crate::{Result, Version};
|
||||
|
||||
mod compat;
|
||||
|
||||
pub(self) mod v1;
|
||||
pub(self) mod v2;
|
||||
pub(self) mod v3;
|
||||
pub(self) mod v4;
|
||||
pub(self) mod v5;
|
||||
pub(self) mod v6;
|
||||
mod v1;
|
||||
mod v2;
|
||||
mod v3;
|
||||
mod v4;
|
||||
mod v5;
|
||||
mod v6;
|
||||
|
||||
pub type Document = serde_json::Map<String, serde_json::Value>;
|
||||
pub type UpdateFile = dyn Iterator<Item = Result<Document>>;
|
||||
|
@ -56,8 +56,7 @@ pub enum RankingRule {
|
||||
Desc(String),
|
||||
}
|
||||
|
||||
static ASC_DESC_REGEX: Lazy<Regex> =
|
||||
Lazy::new(|| Regex::new(r#"(asc|desc)\(([\w_-]+)\)"#).unwrap());
|
||||
static ASC_DESC_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r"(asc|desc)\(([\w_-]+)\)").unwrap());
|
||||
|
||||
impl FromStr for RankingRule {
|
||||
type Err = ();
|
||||
|
@ -564,10 +564,10 @@ pub mod tests {
|
||||
|
||||
#[test]
|
||||
fn parse_escaped() {
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\\'"#), @r#"{title} = {foo\}"#);
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\\\\'"#), @r#"{title} = {foo\\}"#);
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\'"#), @r#"{title} = {foo\\\}"#);
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\\\\\\\\'"#), @r#"{title} = {foo\\\\}"#);
|
||||
insta::assert_display_snapshot!(p(r"title = 'foo\\'"), @r#"{title} = {foo\}"#);
|
||||
insta::assert_display_snapshot!(p(r"title = 'foo\\\\'"), @r#"{title} = {foo\\}"#);
|
||||
insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\'"), @r#"{title} = {foo\\\}"#);
|
||||
insta::assert_display_snapshot!(p(r"title = 'foo\\\\\\\\'"), @r#"{title} = {foo\\\\}"#);
|
||||
// but it also works with other sequencies
|
||||
insta::assert_display_snapshot!(p(r#"title = 'foo\x20\n\t\"\'"'"#), @"{title} = {foo \n\t\"\'\"}");
|
||||
}
|
||||
|
@ -270,8 +270,8 @@ pub mod test {
|
||||
("aaaa", "", rtok("", "aaaa"), "aaaa"),
|
||||
(r#"aa"aa"#, r#""aa"#, rtok("", "aa"), "aa"),
|
||||
(r#"aa\"aa"#, r#""#, rtok("", r#"aa\"aa"#), r#"aa"aa"#),
|
||||
(r#"aa\\\aa"#, r#""#, rtok("", r#"aa\\\aa"#), r#"aa\\\aa"#),
|
||||
(r#"aa\\"\aa"#, r#""\aa"#, rtok("", r#"aa\\"#), r#"aa\\"#),
|
||||
(r"aa\\\aa", r#""#, rtok("", r"aa\\\aa"), r"aa\\\aa"),
|
||||
(r#"aa\\"\aa"#, r#""\aa"#, rtok("", r"aa\\"), r"aa\\"),
|
||||
(r#"aa\\\"\aa"#, r#""#, rtok("", r#"aa\\\"\aa"#), r#"aa\\"\aa"#),
|
||||
(r#"\"\""#, r#""#, rtok("", r#"\"\""#), r#""""#),
|
||||
];
|
||||
@ -301,12 +301,12 @@ pub mod test {
|
||||
);
|
||||
// simple quote
|
||||
assert_eq!(
|
||||
unescape(Span::new_extra(r#"Hello \'World\'"#, ""), '\''),
|
||||
unescape(Span::new_extra(r"Hello \'World\'", ""), '\''),
|
||||
r#"Hello 'World'"#.to_string()
|
||||
);
|
||||
assert_eq!(
|
||||
unescape(Span::new_extra(r#"Hello \\\'World\\\'"#, ""), '\''),
|
||||
r#"Hello \\'World\\'"#.to_string()
|
||||
unescape(Span::new_extra(r"Hello \\\'World\\\'", ""), '\''),
|
||||
r"Hello \\'World\\'".to_string()
|
||||
);
|
||||
}
|
||||
|
||||
@ -335,19 +335,19 @@ pub mod test {
|
||||
("\"cha'nnel\"", "cha'nnel", false),
|
||||
("I'm tamo", "I", false),
|
||||
// escaped thing but not quote
|
||||
(r#""\\""#, r#"\"#, true),
|
||||
(r#""\\\\\\""#, r#"\\\"#, true),
|
||||
(r#""aa\\aa""#, r#"aa\aa"#, true),
|
||||
(r#""\\""#, r"\", true),
|
||||
(r#""\\\\\\""#, r"\\\", true),
|
||||
(r#""aa\\aa""#, r"aa\aa", true),
|
||||
// with double quote
|
||||
(r#""Hello \"world\"""#, r#"Hello "world""#, true),
|
||||
(r#""Hello \\\"world\\\"""#, r#"Hello \"world\""#, true),
|
||||
(r#""I'm \"super\" tamo""#, r#"I'm "super" tamo"#, true),
|
||||
(r#""\"\"""#, r#""""#, true),
|
||||
// with simple quote
|
||||
(r#"'Hello \'world\''"#, r#"Hello 'world'"#, true),
|
||||
(r#"'Hello \\\'world\\\''"#, r#"Hello \'world\'"#, true),
|
||||
(r"'Hello \'world\''", r#"Hello 'world'"#, true),
|
||||
(r"'Hello \\\'world\\\''", r"Hello \'world\'", true),
|
||||
(r#"'I\'m "super" tamo'"#, r#"I'm "super" tamo"#, true),
|
||||
(r#"'\'\''"#, r#"''"#, true),
|
||||
(r"'\'\''", r#"''"#, true),
|
||||
];
|
||||
|
||||
for (input, expected, escaped) in test_case {
|
||||
|
@ -113,7 +113,7 @@ fn main() {
|
||||
index.documents(&wtxn, res.documents_ids).unwrap();
|
||||
progression.fetch_add(1, Ordering::Relaxed);
|
||||
}
|
||||
wtxn.abort().unwrap();
|
||||
wtxn.abort();
|
||||
});
|
||||
if let err @ Err(_) = handle.join() {
|
||||
stop.store(true, Ordering::Relaxed);
|
||||
|
@ -18,11 +18,12 @@ derive_builder = "0.12.0"
|
||||
dump = { path = "../dump" }
|
||||
enum-iterator = "1.4.0"
|
||||
file-store = { path = "../file-store" }
|
||||
flate2 = "1.0.28"
|
||||
log = "0.4.17"
|
||||
meilisearch-auth = { path = "../meilisearch-auth" }
|
||||
meilisearch-types = { path = "../meilisearch-types" }
|
||||
page_size = "0.5.0"
|
||||
puffin = "0.16.0"
|
||||
puffin = { version = "0.16.0", features = ["serialization"] }
|
||||
roaring = { version = "0.10.1", features = ["serde"] }
|
||||
serde = { version = "1.0.160", features = ["derive"] }
|
||||
serde_json = { version = "1.0.95", features = ["preserve_order"] }
|
||||
@ -30,6 +31,7 @@ synchronoise = "1.0.1"
|
||||
tempfile = "3.5.0"
|
||||
thiserror = "1.0.40"
|
||||
time = { version = "0.3.20", features = ["serde-well-known", "formatting", "parsing", "macros"] }
|
||||
ureq = "2.9.1"
|
||||
uuid = { version = "1.3.1", features = ["serde", "v4"] }
|
||||
|
||||
[dev-dependencies]
|
||||
|
@ -32,7 +32,7 @@ use meilisearch_types::milli::heed::CompactionOption;
|
||||
use meilisearch_types::milli::update::{
|
||||
IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
|
||||
};
|
||||
use meilisearch_types::milli::{self, Filter, BEU32};
|
||||
use meilisearch_types::milli::{self, Filter};
|
||||
use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
|
||||
use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status, Task};
|
||||
use meilisearch_types::{compression, Index, VERSION_FILE_NAME};
|
||||
@ -104,12 +104,6 @@ pub(crate) enum IndexOperation {
|
||||
operations: Vec<DocumentOperation>,
|
||||
tasks: Vec<Task>,
|
||||
},
|
||||
DocumentDeletion {
|
||||
index_uid: String,
|
||||
// The vec associated with each document deletion tasks.
|
||||
documents: Vec<Vec<String>>,
|
||||
tasks: Vec<Task>,
|
||||
},
|
||||
IndexDocumentDeletionByFilter {
|
||||
index_uid: String,
|
||||
task: Task,
|
||||
@ -161,7 +155,6 @@ impl Batch {
|
||||
}
|
||||
Batch::IndexOperation { op, .. } => match op {
|
||||
IndexOperation::DocumentOperation { tasks, .. }
|
||||
| IndexOperation::DocumentDeletion { tasks, .. }
|
||||
| IndexOperation::Settings { tasks, .. }
|
||||
| IndexOperation::DocumentClear { tasks, .. } => {
|
||||
tasks.iter().map(|task| task.uid).collect()
|
||||
@ -226,7 +219,6 @@ impl IndexOperation {
|
||||
pub fn index_uid(&self) -> &str {
|
||||
match self {
|
||||
IndexOperation::DocumentOperation { index_uid, .. }
|
||||
| IndexOperation::DocumentDeletion { index_uid, .. }
|
||||
| IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
|
||||
| IndexOperation::DocumentClear { index_uid, .. }
|
||||
| IndexOperation::Settings { index_uid, .. }
|
||||
@ -242,9 +234,6 @@ impl fmt::Display for IndexOperation {
|
||||
IndexOperation::DocumentOperation { .. } => {
|
||||
f.write_str("IndexOperation::DocumentOperation")
|
||||
}
|
||||
IndexOperation::DocumentDeletion { .. } => {
|
||||
f.write_str("IndexOperation::DocumentDeletion")
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { .. } => {
|
||||
f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
|
||||
}
|
||||
@ -347,18 +336,27 @@ impl IndexScheduler {
|
||||
BatchKind::DocumentDeletion { deletion_ids } => {
|
||||
let tasks = self.get_existing_tasks(rtxn, deletion_ids)?;
|
||||
|
||||
let mut documents = Vec::new();
|
||||
let mut operations = Vec::with_capacity(tasks.len());
|
||||
let mut documents_counts = Vec::with_capacity(tasks.len());
|
||||
for task in &tasks {
|
||||
match task.kind {
|
||||
KindWithContent::DocumentDeletion { ref documents_ids, .. } => {
|
||||
documents.push(documents_ids.clone())
|
||||
operations.push(DocumentOperation::Delete(documents_ids.clone()));
|
||||
documents_counts.push(documents_ids.len() as u64);
|
||||
}
|
||||
_ => unreachable!(),
|
||||
}
|
||||
}
|
||||
|
||||
Ok(Some(Batch::IndexOperation {
|
||||
op: IndexOperation::DocumentDeletion { index_uid, documents, tasks },
|
||||
op: IndexOperation::DocumentOperation {
|
||||
index_uid,
|
||||
primary_key: None,
|
||||
method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
documents_counts,
|
||||
operations,
|
||||
tasks,
|
||||
},
|
||||
must_create_index,
|
||||
}))
|
||||
}
|
||||
@ -586,7 +584,9 @@ impl IndexScheduler {
|
||||
let index_tasks = self.index_tasks(rtxn, index_name)? & enqueued;
|
||||
|
||||
// If autobatching is disabled we only take one task at a time.
|
||||
let tasks_limit = if self.autobatching_enabled { usize::MAX } else { 1 };
|
||||
// Otherwise, we take only a maximum of tasks to create batches.
|
||||
let tasks_limit =
|
||||
if self.autobatching_enabled { self.max_number_of_batched_tasks } else { 1 };
|
||||
|
||||
let enqueued = index_tasks
|
||||
.into_iter()
|
||||
@ -717,7 +717,7 @@ impl IndexScheduler {
|
||||
|
||||
// 2. Snapshot the index-scheduler LMDB env
|
||||
//
|
||||
// When we call copy_to_path, LMDB opens a read transaction by itself,
|
||||
// When we call copy_to_file, LMDB opens a read transaction by itself,
|
||||
// we can't provide our own. It is an issue as we would like to know
|
||||
// the update files to copy but new ones can be enqueued between the copy
|
||||
// of the env and the new transaction we open to retrieve the enqueued tasks.
|
||||
@ -730,7 +730,7 @@ impl IndexScheduler {
|
||||
// 2.1 First copy the LMDB env of the index-scheduler
|
||||
let dst = temp_snapshot_dir.path().join("tasks");
|
||||
fs::create_dir_all(&dst)?;
|
||||
self.env.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?;
|
||||
self.env.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
|
||||
|
||||
// 2.2 Create a read transaction on the index-scheduler
|
||||
let rtxn = self.env.read_txn()?;
|
||||
@ -755,7 +755,7 @@ impl IndexScheduler {
|
||||
let index = self.index_mapper.index(&rtxn, name)?;
|
||||
let dst = temp_snapshot_dir.path().join("indexes").join(uuid.to_string());
|
||||
fs::create_dir_all(&dst)?;
|
||||
index.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?;
|
||||
index.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
|
||||
}
|
||||
|
||||
drop(rtxn);
|
||||
@ -768,7 +768,7 @@ impl IndexScheduler {
|
||||
.map_size(1024 * 1024 * 1024) // 1 GiB
|
||||
.max_dbs(2)
|
||||
.open(&self.auth_path)?;
|
||||
auth.copy_to_path(dst.join("data.mdb"), CompactionOption::Enabled)?;
|
||||
auth.copy_to_file(dst.join("data.mdb"), CompactionOption::Enabled)?;
|
||||
|
||||
// 5. Copy and tarball the flat snapshot
|
||||
// 5.1 Find the original name of the database
|
||||
@ -824,6 +824,10 @@ impl IndexScheduler {
|
||||
// 2. dump the tasks
|
||||
let mut dump_tasks = dump.create_tasks_queue()?;
|
||||
for ret in self.all_tasks.iter(&rtxn)? {
|
||||
if self.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
}
|
||||
|
||||
let (_, mut t) = ret?;
|
||||
let status = t.status;
|
||||
let content_file = t.content_uuid();
|
||||
@ -844,6 +848,9 @@ impl IndexScheduler {
|
||||
|
||||
// 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
|
||||
if let Some(content_file) = content_file {
|
||||
if self.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
}
|
||||
if status == Status::Enqueued {
|
||||
let content_file = self.file_store.get_update(content_file)?;
|
||||
|
||||
@ -883,6 +890,9 @@ impl IndexScheduler {
|
||||
|
||||
// 3.1. Dump the documents
|
||||
for ret in index.all_documents(&rtxn)? {
|
||||
if self.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
}
|
||||
let (_id, doc) = ret?;
|
||||
let document = milli::obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
||||
index_dumper.push_document(&document)?;
|
||||
@ -902,6 +912,9 @@ impl IndexScheduler {
|
||||
"[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
|
||||
)).unwrap();
|
||||
|
||||
if self.must_stop_processing.get() {
|
||||
return Err(Error::AbortedTask);
|
||||
}
|
||||
let path = self.dumps_path.join(format!("{}.dump", dump_uid));
|
||||
let file = File::create(path)?;
|
||||
dump.persist_to(BufWriter::new(file))?;
|
||||
@ -922,6 +935,10 @@ impl IndexScheduler {
|
||||
self.index_mapper.index(&rtxn, &index_uid)?
|
||||
};
|
||||
|
||||
// the index operation can take a long time, so save this handle to make it available to the search for the duration of the tick
|
||||
self.index_mapper
|
||||
.set_currently_updating_index(Some((index_uid.clone(), index.clone())));
|
||||
|
||||
let mut index_wtxn = index.write_txn()?;
|
||||
let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?;
|
||||
index_wtxn.commit()?;
|
||||
@ -1091,7 +1108,7 @@ impl IndexScheduler {
|
||||
for task_id in &index_lhs_task_ids | &index_rhs_task_ids {
|
||||
let mut task = self.get_task(wtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?;
|
||||
swap_index_uid_in_task(&mut task, (lhs, rhs));
|
||||
self.all_tasks.put(wtxn, &BEU32::new(task_id), &task)?;
|
||||
self.all_tasks.put(wtxn, &task_id, &task)?;
|
||||
}
|
||||
|
||||
// 4. remove the task from indexuid = before_name
|
||||
@ -1117,7 +1134,7 @@ impl IndexScheduler {
|
||||
/// The list of processed tasks.
|
||||
fn apply_index_operation<'i>(
|
||||
&self,
|
||||
index_wtxn: &mut RwTxn<'i, '_>,
|
||||
index_wtxn: &mut RwTxn<'i>,
|
||||
index: &'i Index,
|
||||
operation: IndexOperation,
|
||||
) -> Result<Vec<Task>> {
|
||||
@ -1185,6 +1202,10 @@ impl IndexScheduler {
|
||||
|
||||
let config = IndexDocumentsConfig { update_method: method, ..Default::default() };
|
||||
|
||||
let embedder_configs = index.embedding_configs(index_wtxn)?;
|
||||
// TODO: consider Arc'ing the map too (we only need read access + we'll be cloning it multiple times, so really makes sense)
|
||||
let embedders = self.embedders(embedder_configs)?;
|
||||
|
||||
let mut builder = milli::update::IndexDocuments::new(
|
||||
index_wtxn,
|
||||
index,
|
||||
@ -1203,6 +1224,8 @@ impl IndexScheduler {
|
||||
let (new_builder, user_result) = builder.add_documents(reader)?;
|
||||
builder = new_builder;
|
||||
|
||||
builder = builder.with_embedders(embedders.clone());
|
||||
|
||||
let received_documents =
|
||||
if let Some(Details::DocumentAdditionOrUpdate {
|
||||
received_documents,
|
||||
@ -1275,45 +1298,6 @@ impl IndexScheduler {
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
|
||||
let indexer_config = self.index_mapper.indexer_config();
|
||||
let config = IndexDocumentsConfig {
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
..Default::default()
|
||||
};
|
||||
let must_stop_processing = self.must_stop_processing.clone();
|
||||
|
||||
let mut builder = milli::update::IndexDocuments::new(
|
||||
index_wtxn,
|
||||
index,
|
||||
indexer_config,
|
||||
config,
|
||||
|indexing_step| trace!("update: {:?}", indexing_step),
|
||||
|| must_stop_processing.get(),
|
||||
)?;
|
||||
|
||||
let document_ids = documents.iter().flatten().cloned().collect();
|
||||
|
||||
let (new_builder, user_result) = builder.remove_documents(document_ids)?;
|
||||
builder = new_builder;
|
||||
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
||||
let count = user_result.unwrap();
|
||||
|
||||
for (task, documents) in tasks.iter_mut().zip(documents) {
|
||||
task.status = Status::Succeeded;
|
||||
task.details = Some(Details::DocumentDeletion {
|
||||
provided_ids: documents.len(),
|
||||
deleted_documents: Some(count.min(documents.len() as u64)),
|
||||
});
|
||||
}
|
||||
|
||||
if !tasks.iter().all(|res| res.error.is_some()) {
|
||||
let addition = builder.execute()?;
|
||||
info!("document deletion done: {:?}", addition);
|
||||
}
|
||||
|
||||
Ok(tasks)
|
||||
}
|
||||
IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
|
||||
let filter =
|
||||
if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =
|
||||
@ -1503,10 +1487,9 @@ impl IndexScheduler {
|
||||
}
|
||||
|
||||
for task in to_delete_tasks.iter() {
|
||||
self.all_tasks.delete(wtxn, &BEU32::new(task))?;
|
||||
self.all_tasks.delete(wtxn, &task)?;
|
||||
}
|
||||
for canceled_by in affected_canceled_by {
|
||||
let canceled_by = BEU32::new(canceled_by);
|
||||
if let Some(mut tasks) = self.canceled_by.get(wtxn, &canceled_by)? {
|
||||
tasks -= &to_delete_tasks;
|
||||
if tasks.is_empty() {
|
||||
@ -1554,14 +1537,14 @@ impl IndexScheduler {
|
||||
task.details = task.details.map(|d| d.to_failed());
|
||||
self.update_task(wtxn, &task)?;
|
||||
}
|
||||
self.canceled_by.put(wtxn, &BEU32::new(cancel_task_id), &tasks_to_cancel)?;
|
||||
self.canceled_by.put(wtxn, &cancel_task_id, &tasks_to_cancel)?;
|
||||
|
||||
Ok(content_files_to_delete)
|
||||
}
|
||||
}
|
||||
|
||||
fn delete_document_by_filter<'a>(
|
||||
wtxn: &mut RwTxn<'a, '_>,
|
||||
wtxn: &mut RwTxn<'a>,
|
||||
filter: &serde_json::Value,
|
||||
indexer_config: &IndexerConfig,
|
||||
must_stop_processing: MustStopProcessing,
|
||||
@ -1575,18 +1558,6 @@ fn delete_document_by_filter<'a>(
|
||||
}
|
||||
e => e.into(),
|
||||
})?;
|
||||
let external_documents_ids = index.external_documents_ids();
|
||||
// FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
|
||||
// Since what we have is an iterator, it would be better to delete in chunks
|
||||
let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
|
||||
external_documents_ids
|
||||
.find_external_id_of(wtxn, candidates)?
|
||||
.only_external_ids()
|
||||
.collect();
|
||||
let document_ids = match external_to_internal {
|
||||
Ok(external_ids) => external_ids,
|
||||
Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids),
|
||||
};
|
||||
|
||||
let config = IndexDocumentsConfig {
|
||||
update_method: IndexDocumentsMethod::ReplaceDocuments,
|
||||
@ -1602,13 +1573,10 @@ fn delete_document_by_filter<'a>(
|
||||
|| must_stop_processing.get(),
|
||||
)?;
|
||||
|
||||
let (new_builder, user_result) = builder.remove_documents(document_ids)?;
|
||||
let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?;
|
||||
builder = new_builder;
|
||||
// Uses Invariant: remove documents actually always returns Ok for the inner result
|
||||
let count = user_result.unwrap();
|
||||
|
||||
let _ = builder.execute()?;
|
||||
|
||||
count
|
||||
} else {
|
||||
0
|
||||
|
@ -108,6 +108,8 @@ pub enum Error {
|
||||
TaskDeletionWithEmptyQuery,
|
||||
#[error("Query parameters to filter the tasks to cancel are missing. Available query parameters are: `uids`, `indexUids`, `statuses`, `types`, `canceledBy`, `beforeEnqueuedAt`, `afterEnqueuedAt`, `beforeStartedAt`, `afterStartedAt`, `beforeFinishedAt`, `afterFinishedAt`.")]
|
||||
TaskCancelationWithEmptyQuery,
|
||||
#[error("Aborted task")]
|
||||
AbortedTask,
|
||||
|
||||
#[error(transparent)]
|
||||
Dump(#[from] dump::Error),
|
||||
@ -175,6 +177,7 @@ impl Error {
|
||||
| Error::TaskNotFound(_)
|
||||
| Error::TaskDeletionWithEmptyQuery
|
||||
| Error::TaskCancelationWithEmptyQuery
|
||||
| Error::AbortedTask
|
||||
| Error::Dump(_)
|
||||
| Error::Heed(_)
|
||||
| Error::Milli(_)
|
||||
@ -236,6 +239,9 @@ impl ErrorCode for Error {
|
||||
Error::TaskDatabaseUpdate(_) => Code::Internal,
|
||||
Error::CreateBatch(_) => Code::Internal,
|
||||
|
||||
// This one should never be seen by the end user
|
||||
Error::AbortedTask => Code::Internal,
|
||||
|
||||
#[cfg(test)]
|
||||
Error::PlannedFailure => Code::Internal,
|
||||
}
|
||||
|
@ -56,12 +56,12 @@ impl RoFeatures {
|
||||
}
|
||||
}
|
||||
|
||||
pub fn check_vector(&self) -> Result<()> {
|
||||
pub fn check_vector(&self, disabled_action: &'static str) -> Result<()> {
|
||||
if self.runtime.vector_store {
|
||||
Ok(())
|
||||
} else {
|
||||
Err(FeatureNotEnabledError {
|
||||
disabled_action: "Passing `vector` as a query parameter",
|
||||
disabled_action,
|
||||
feature: "vector store",
|
||||
issue_link: "https://github.com/meilisearch/product/discussions/677",
|
||||
}
|
||||
|
@ -1,12 +1,8 @@
|
||||
/// the map size to use when we don't succeed in reading it in indexes.
|
||||
const DEFAULT_MAP_SIZE: usize = 10 * 1024 * 1024 * 1024; // 10 GiB
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::Path;
|
||||
use std::time::Duration;
|
||||
|
||||
use meilisearch_types::heed::flags::Flags;
|
||||
use meilisearch_types::heed::{EnvClosingEvent, EnvOpenOptions};
|
||||
use meilisearch_types::heed::{EnvClosingEvent, EnvFlags, EnvOpenOptions};
|
||||
use meilisearch_types::milli::Index;
|
||||
use time::OffsetDateTime;
|
||||
use uuid::Uuid;
|
||||
@ -236,7 +232,7 @@ impl IndexMap {
|
||||
enable_mdb_writemap: bool,
|
||||
map_size_growth: usize,
|
||||
) {
|
||||
let map_size = index.map_size().unwrap_or(DEFAULT_MAP_SIZE) + map_size_growth;
|
||||
let map_size = index.map_size() + map_size_growth;
|
||||
let closing_event = index.prepare_for_closing();
|
||||
let generation = self.next_generation();
|
||||
self.unavailable.insert(
|
||||
@ -309,7 +305,7 @@ fn create_or_open_index(
|
||||
options.map_size(clamp_to_page_size(map_size));
|
||||
options.max_readers(1024);
|
||||
if enable_mdb_writemap {
|
||||
unsafe { options.flag(Flags::MdbWriteMap) };
|
||||
unsafe { options.flags(EnvFlags::WRITE_MAP) };
|
||||
}
|
||||
|
||||
if let Some((created, updated)) = date {
|
||||
@ -388,7 +384,7 @@ mod tests {
|
||||
|
||||
fn assert_index_size(index: Index, expected: usize) {
|
||||
let expected = clamp_to_page_size(expected);
|
||||
let index_map_size = index.map_size().unwrap();
|
||||
let index_map_size = index.map_size();
|
||||
assert_eq!(index_map_size, expected);
|
||||
}
|
||||
}
|
||||
|
@ -69,6 +69,10 @@ pub struct IndexMapper {
|
||||
/// Whether we open a meilisearch index with the MDB_WRITEMAP option or not.
|
||||
enable_mdb_writemap: bool,
|
||||
pub indexer_config: Arc<IndexerConfig>,
|
||||
|
||||
/// A few types of long running batches of tasks that act on a single index set this field
|
||||
/// so that a handle to the index is available from other threads (search) in an optimized manner.
|
||||
currently_updating_index: Arc<RwLock<Option<(String, Index)>>>,
|
||||
}
|
||||
|
||||
/// Whether the index is available for use or is forbidden to be inserted back in the index map
|
||||
@ -151,6 +155,7 @@ impl IndexMapper {
|
||||
index_growth_amount,
|
||||
enable_mdb_writemap,
|
||||
indexer_config: Arc::new(indexer_config),
|
||||
currently_updating_index: Default::default(),
|
||||
})
|
||||
}
|
||||
|
||||
@ -303,6 +308,14 @@ impl IndexMapper {
|
||||
|
||||
/// Return an index, may open it if it wasn't already opened.
|
||||
pub fn index(&self, rtxn: &RoTxn, name: &str) -> Result<Index> {
|
||||
if let Some((current_name, current_index)) =
|
||||
self.currently_updating_index.read().unwrap().as_ref()
|
||||
{
|
||||
if current_name == name {
|
||||
return Ok(current_index.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let uuid = self
|
||||
.index_mapping
|
||||
.get(rtxn, name)?
|
||||
@ -474,4 +487,8 @@ impl IndexMapper {
|
||||
pub fn indexer_config(&self) -> &IndexerConfig {
|
||||
&self.indexer_config
|
||||
}
|
||||
|
||||
pub fn set_currently_updating_index(&self, index: Option<(String, Index)>) {
|
||||
*self.currently_updating_index.write().unwrap() = index;
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::fmt::Write;
|
||||
|
||||
use meilisearch_types::heed::types::{OwnedType, SerdeBincode, SerdeJson, Str};
|
||||
use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str};
|
||||
use meilisearch_types::heed::{Database, RoTxn};
|
||||
use meilisearch_types::milli::{CboRoaringBitmapCodec, RoaringBitmapCodec, BEU32};
|
||||
use meilisearch_types::tasks::{Details, Task};
|
||||
@ -30,15 +30,19 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String {
|
||||
index_mapper,
|
||||
features: _,
|
||||
max_number_of_tasks: _,
|
||||
max_number_of_batched_tasks: _,
|
||||
puffin_frame: _,
|
||||
wake_up: _,
|
||||
dumps_path: _,
|
||||
snapshots_path: _,
|
||||
auth_path: _,
|
||||
version_file_path: _,
|
||||
webhook_url: _,
|
||||
webhook_authorization_header: _,
|
||||
test_breakpoint_sdr: _,
|
||||
planned_failures: _,
|
||||
run_loop_iteration: _,
|
||||
embedders: _,
|
||||
} = scheduler;
|
||||
|
||||
let rtxn = env.read_txn().unwrap();
|
||||
@ -114,7 +118,7 @@ pub fn snapshot_bitmap(r: &RoaringBitmap) -> String {
|
||||
snap
|
||||
}
|
||||
|
||||
pub fn snapshot_all_tasks(rtxn: &RoTxn, db: Database<OwnedType<BEU32>, SerdeJson<Task>>) -> String {
|
||||
pub fn snapshot_all_tasks(rtxn: &RoTxn, db: Database<BEU32, SerdeJson<Task>>) -> String {
|
||||
let mut snap = String::new();
|
||||
let iter = db.iter(rtxn).unwrap();
|
||||
for next in iter {
|
||||
@ -124,10 +128,7 @@ pub fn snapshot_all_tasks(rtxn: &RoTxn, db: Database<OwnedType<BEU32>, SerdeJson
|
||||
snap
|
||||
}
|
||||
|
||||
pub fn snapshot_date_db(
|
||||
rtxn: &RoTxn,
|
||||
db: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
|
||||
) -> String {
|
||||
pub fn snapshot_date_db(rtxn: &RoTxn, db: Database<BEI128, CboRoaringBitmapCodec>) -> String {
|
||||
let mut snap = String::new();
|
||||
let iter = db.iter(rtxn).unwrap();
|
||||
for next in iter {
|
||||
@ -247,10 +248,7 @@ pub fn snapshot_index_tasks(rtxn: &RoTxn, db: Database<Str, RoaringBitmapCodec>)
|
||||
}
|
||||
snap
|
||||
}
|
||||
pub fn snapshot_canceled_by(
|
||||
rtxn: &RoTxn,
|
||||
db: Database<OwnedType<BEU32>, RoaringBitmapCodec>,
|
||||
) -> String {
|
||||
pub fn snapshot_canceled_by(rtxn: &RoTxn, db: Database<BEU32, RoaringBitmapCodec>) -> String {
|
||||
let mut snap = String::new();
|
||||
let iter = db.iter(rtxn).unwrap();
|
||||
for next in iter {
|
||||
|
@ -27,13 +27,14 @@ mod index_mapper;
|
||||
mod insta_snapshot;
|
||||
mod lru;
|
||||
mod utils;
|
||||
mod uuid_codec;
|
||||
pub mod uuid_codec;
|
||||
|
||||
pub type Result<T> = std::result::Result<T, Error>;
|
||||
pub type TaskId = u32;
|
||||
|
||||
use std::collections::{BTreeMap, HashMap};
|
||||
use std::fs::File;
|
||||
use std::io::{self, BufReader, Read};
|
||||
use std::ops::{Bound, RangeBounds};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::atomic::AtomicBool;
|
||||
@ -45,13 +46,18 @@ use dump::{KindDump, TaskDump, UpdateFile};
|
||||
pub use error::Error;
|
||||
pub use features::RoFeatures;
|
||||
use file_store::FileStore;
|
||||
use flate2::bufread::GzEncoder;
|
||||
use flate2::Compression;
|
||||
use meilisearch_types::error::ResponseError;
|
||||
use meilisearch_types::features::{InstanceTogglableFeatures, RuntimeTogglableFeatures};
|
||||
use meilisearch_types::heed::types::{OwnedType, SerdeBincode, SerdeJson, Str};
|
||||
use meilisearch_types::heed::{self, Database, Env, RoTxn, RwTxn};
|
||||
use meilisearch_types::heed::byteorder::BE;
|
||||
use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128};
|
||||
use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn};
|
||||
use meilisearch_types::milli::documents::DocumentsBatchBuilder;
|
||||
use meilisearch_types::milli::update::IndexerConfig;
|
||||
use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs};
|
||||
use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32};
|
||||
use meilisearch_types::task_view::TaskView;
|
||||
use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task};
|
||||
use puffin::FrameView;
|
||||
use roaring::RoaringBitmap;
|
||||
@ -64,8 +70,7 @@ use uuid::Uuid;
|
||||
use crate::index_mapper::IndexMapper;
|
||||
use crate::utils::{check_index_swap_validity, clamp_to_page_size};
|
||||
|
||||
pub(crate) type BEI128 =
|
||||
meilisearch_types::heed::zerocopy::I128<meilisearch_types::heed::byteorder::BE>;
|
||||
pub(crate) type BEI128 = I128<BE>;
|
||||
|
||||
/// Defines a subset of tasks to be retrieved from the [`IndexScheduler`].
|
||||
///
|
||||
@ -169,8 +174,8 @@ impl ProcessingTasks {
|
||||
}
|
||||
|
||||
/// Set the processing tasks to an empty list
|
||||
fn stop_processing(&mut self) {
|
||||
self.processing = RoaringBitmap::new();
|
||||
fn stop_processing(&mut self) -> RoaringBitmap {
|
||||
std::mem::take(&mut self.processing)
|
||||
}
|
||||
|
||||
/// Returns `true` if there, at least, is one task that is currently processing that we must stop.
|
||||
@ -240,6 +245,10 @@ pub struct IndexSchedulerOptions {
|
||||
pub snapshots_path: PathBuf,
|
||||
/// The path to the folder containing the dumps.
|
||||
pub dumps_path: PathBuf,
|
||||
/// The URL on which we must send the tasks statuses
|
||||
pub webhook_url: Option<String>,
|
||||
/// The value we will send into the Authorization HTTP header on the webhook URL
|
||||
pub webhook_authorization_header: Option<String>,
|
||||
/// The maximum size, in bytes, of the task index.
|
||||
pub task_db_size: usize,
|
||||
/// The size, in bytes, with which a meilisearch index is opened the first time of each meilisearch index.
|
||||
@ -258,6 +267,9 @@ pub struct IndexSchedulerOptions {
|
||||
/// The maximum number of tasks stored in the task queue before starting
|
||||
/// to auto schedule task deletions.
|
||||
pub max_number_of_tasks: usize,
|
||||
/// If the autobatcher is allowed to automatically batch tasks
|
||||
/// it will only batch this defined number of tasks at once.
|
||||
pub max_number_of_batched_tasks: usize,
|
||||
/// The experimental features enabled for this instance.
|
||||
pub instance_features: InstanceTogglableFeatures,
|
||||
}
|
||||
@ -278,7 +290,7 @@ pub struct IndexScheduler {
|
||||
pub(crate) file_store: FileStore,
|
||||
|
||||
// The main database, it contains all the tasks accessible by their Id.
|
||||
pub(crate) all_tasks: Database<OwnedType<BEU32>, SerdeJson<Task>>,
|
||||
pub(crate) all_tasks: Database<BEU32, SerdeJson<Task>>,
|
||||
|
||||
/// All the tasks ids grouped by their status.
|
||||
// TODO we should not be able to serialize a `Status::Processing` in this database.
|
||||
@ -289,16 +301,16 @@ pub struct IndexScheduler {
|
||||
pub(crate) index_tasks: Database<Str, RoaringBitmapCodec>,
|
||||
|
||||
/// Store the tasks that were canceled by a task uid
|
||||
pub(crate) canceled_by: Database<OwnedType<BEU32>, RoaringBitmapCodec>,
|
||||
pub(crate) canceled_by: Database<BEU32, RoaringBitmapCodec>,
|
||||
|
||||
/// Store the task ids of tasks which were enqueued at a specific date
|
||||
pub(crate) enqueued_at: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
|
||||
pub(crate) enqueued_at: Database<BEI128, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Store the task ids of finished tasks which started being processed at a specific date
|
||||
pub(crate) started_at: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
|
||||
pub(crate) started_at: Database<BEI128, CboRoaringBitmapCodec>,
|
||||
|
||||
/// Store the task ids of tasks which finished at a specific date
|
||||
pub(crate) finished_at: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
|
||||
pub(crate) finished_at: Database<BEI128, CboRoaringBitmapCodec>,
|
||||
|
||||
/// In charge of creating, opening, storing and returning indexes.
|
||||
pub(crate) index_mapper: IndexMapper,
|
||||
@ -316,6 +328,14 @@ pub struct IndexScheduler {
|
||||
/// the finished tasks automatically.
|
||||
pub(crate) max_number_of_tasks: usize,
|
||||
|
||||
/// The maximum number of tasks that will be batched together.
|
||||
pub(crate) max_number_of_batched_tasks: usize,
|
||||
|
||||
/// The webhook url we should send tasks to after processing every batches.
|
||||
pub(crate) webhook_url: Option<String>,
|
||||
/// The Authorization header to send to the webhook URL.
|
||||
pub(crate) webhook_authorization_header: Option<String>,
|
||||
|
||||
/// A frame to output the indexation profiling files to disk.
|
||||
pub(crate) puffin_frame: Arc<puffin::GlobalFrameView>,
|
||||
|
||||
@ -331,6 +351,8 @@ pub struct IndexScheduler {
|
||||
/// The path to the version file of Meilisearch.
|
||||
pub(crate) version_file_path: PathBuf,
|
||||
|
||||
embedders: Arc<RwLock<HashMap<EmbedderOptions, Arc<Embedder>>>>,
|
||||
|
||||
// ================= test
|
||||
// The next entry is dedicated to the tests.
|
||||
/// Provide a way to set a breakpoint in multiple part of the scheduler.
|
||||
@ -369,11 +391,15 @@ impl IndexScheduler {
|
||||
wake_up: self.wake_up.clone(),
|
||||
autobatching_enabled: self.autobatching_enabled,
|
||||
max_number_of_tasks: self.max_number_of_tasks,
|
||||
max_number_of_batched_tasks: self.max_number_of_batched_tasks,
|
||||
puffin_frame: self.puffin_frame.clone(),
|
||||
snapshots_path: self.snapshots_path.clone(),
|
||||
dumps_path: self.dumps_path.clone(),
|
||||
auth_path: self.auth_path.clone(),
|
||||
version_file_path: self.version_file_path.clone(),
|
||||
webhook_url: self.webhook_url.clone(),
|
||||
webhook_authorization_header: self.webhook_authorization_header.clone(),
|
||||
embedders: self.embedders.clone(),
|
||||
#[cfg(test)]
|
||||
test_breakpoint_sdr: self.test_breakpoint_sdr.clone(),
|
||||
#[cfg(test)]
|
||||
@ -466,10 +492,14 @@ impl IndexScheduler {
|
||||
puffin_frame: Arc::new(puffin::GlobalFrameView::default()),
|
||||
autobatching_enabled: options.autobatching_enabled,
|
||||
max_number_of_tasks: options.max_number_of_tasks,
|
||||
max_number_of_batched_tasks: options.max_number_of_batched_tasks,
|
||||
dumps_path: options.dumps_path,
|
||||
snapshots_path: options.snapshots_path,
|
||||
auth_path: options.auth_path,
|
||||
version_file_path: options.version_file_path,
|
||||
webhook_url: options.webhook_url,
|
||||
webhook_authorization_header: options.webhook_authorization_header,
|
||||
embedders: Default::default(),
|
||||
|
||||
#[cfg(test)]
|
||||
test_breakpoint_sdr,
|
||||
@ -717,9 +747,7 @@ impl IndexScheduler {
|
||||
if let Some(canceled_by) = &query.canceled_by {
|
||||
let mut all_canceled_tasks = RoaringBitmap::new();
|
||||
for cancel_task_uid in canceled_by {
|
||||
if let Some(canceled_by_uid) =
|
||||
self.canceled_by.get(rtxn, &BEU32::new(*cancel_task_uid))?
|
||||
{
|
||||
if let Some(canceled_by_uid) = self.canceled_by.get(rtxn, cancel_task_uid)? {
|
||||
all_canceled_tasks |= canceled_by_uid;
|
||||
}
|
||||
}
|
||||
@ -970,7 +998,7 @@ impl IndexScheduler {
|
||||
|
||||
// if the task doesn't delete anything and 50% of the task queue is full, we must refuse to enqueue the incomming task
|
||||
if !matches!(&kind, KindWithContent::TaskDeletion { tasks, .. } if !tasks.is_empty())
|
||||
&& (self.env.non_free_pages_size()? * 100) / self.env.map_size()? as u64 > 50
|
||||
&& (self.env.non_free_pages_size()? * 100) / self.env.info().map_size as u64 > 50
|
||||
{
|
||||
return Err(Error::NoSpaceLeftInTaskQueue);
|
||||
}
|
||||
@ -996,7 +1024,7 @@ impl IndexScheduler {
|
||||
// Get rid of the mutability.
|
||||
let task = task;
|
||||
|
||||
self.all_tasks.append(&mut wtxn, &BEU32::new(task.uid), &task)?;
|
||||
self.all_tasks.put_with_flags(&mut wtxn, PutFlags::APPEND, &task.uid, &task)?;
|
||||
|
||||
for index in task.indexes() {
|
||||
self.update_index(&mut wtxn, index, |bitmap| {
|
||||
@ -1133,6 +1161,9 @@ impl IndexScheduler {
|
||||
handle.join().unwrap_or(Err(Error::ProcessBatchPanicked))
|
||||
};
|
||||
|
||||
// Reset the currently updating index to relinquish the index handle
|
||||
self.index_mapper.set_currently_updating_index(None);
|
||||
|
||||
#[cfg(test)]
|
||||
self.maybe_fail(tests::FailureLocation::AcquiringWtxn)?;
|
||||
|
||||
@ -1167,10 +1198,11 @@ impl IndexScheduler {
|
||||
// If we have an abortion error we must stop the tick here and re-schedule tasks.
|
||||
Err(Error::Milli(milli::Error::InternalError(
|
||||
milli::InternalError::AbortedIndexation,
|
||||
))) => {
|
||||
)))
|
||||
| Err(Error::AbortedTask) => {
|
||||
#[cfg(test)]
|
||||
self.breakpoint(Breakpoint::AbortedIndexation);
|
||||
wtxn.abort().map_err(Error::HeedTransaction)?;
|
||||
wtxn.abort();
|
||||
|
||||
// We make sure that we don't call `stop_processing` on the `processing_tasks`,
|
||||
// this is because we want to let the next tick call `create_next_batch` and keep
|
||||
@ -1191,7 +1223,7 @@ impl IndexScheduler {
|
||||
let index_uid = index_uid.unwrap();
|
||||
// fixme: handle error more gracefully? not sure when this could happen
|
||||
self.index_mapper.resize_index(&wtxn, &index_uid)?;
|
||||
wtxn.abort().map_err(Error::HeedTransaction)?;
|
||||
wtxn.abort();
|
||||
|
||||
return Ok(TickOutcome::TickAgain(0));
|
||||
}
|
||||
@ -1223,19 +1255,99 @@ impl IndexScheduler {
|
||||
}
|
||||
}
|
||||
|
||||
self.processing_tasks.write().unwrap().stop_processing();
|
||||
let processed = self.processing_tasks.write().unwrap().stop_processing();
|
||||
|
||||
#[cfg(test)]
|
||||
self.maybe_fail(tests::FailureLocation::CommittingWtxn)?;
|
||||
|
||||
wtxn.commit().map_err(Error::HeedTransaction)?;
|
||||
|
||||
// We shouldn't crash the tick function if we can't send data to the webhook.
|
||||
let _ = self.notify_webhook(&processed);
|
||||
|
||||
#[cfg(test)]
|
||||
self.breakpoint(Breakpoint::AfterProcessing);
|
||||
|
||||
Ok(TickOutcome::TickAgain(processed_tasks))
|
||||
}
|
||||
|
||||
/// Once the tasks changes have been commited we must send all the tasks that were updated to our webhook if there is one.
|
||||
fn notify_webhook(&self, updated: &RoaringBitmap) -> Result<()> {
|
||||
if let Some(ref url) = self.webhook_url {
|
||||
struct TaskReader<'a, 'b> {
|
||||
rtxn: &'a RoTxn<'a>,
|
||||
index_scheduler: &'a IndexScheduler,
|
||||
tasks: &'b mut roaring::bitmap::Iter<'b>,
|
||||
buffer: Vec<u8>,
|
||||
written: usize,
|
||||
}
|
||||
|
||||
impl<'a, 'b> Read for TaskReader<'a, 'b> {
|
||||
fn read(&mut self, mut buf: &mut [u8]) -> std::io::Result<usize> {
|
||||
if self.buffer.is_empty() {
|
||||
match self.tasks.next() {
|
||||
None => return Ok(0),
|
||||
Some(task_id) => {
|
||||
let task = self
|
||||
.index_scheduler
|
||||
.get_task(self.rtxn, task_id)
|
||||
.map_err(|err| io::Error::new(io::ErrorKind::Other, err))?
|
||||
.ok_or_else(|| {
|
||||
io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
Error::CorruptedTaskQueue,
|
||||
)
|
||||
})?;
|
||||
|
||||
serde_json::to_writer(
|
||||
&mut self.buffer,
|
||||
&TaskView::from_task(&task),
|
||||
)?;
|
||||
self.buffer.push(b'\n');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let mut to_write = &self.buffer[self.written..];
|
||||
let wrote = io::copy(&mut to_write, &mut buf)?;
|
||||
self.written += wrote as usize;
|
||||
|
||||
// we wrote everything and must refresh our buffer on the next call
|
||||
if self.written == self.buffer.len() {
|
||||
self.written = 0;
|
||||
self.buffer.clear();
|
||||
}
|
||||
|
||||
Ok(wrote as usize)
|
||||
}
|
||||
}
|
||||
|
||||
let rtxn = self.env.read_txn()?;
|
||||
|
||||
let task_reader = TaskReader {
|
||||
rtxn: &rtxn,
|
||||
index_scheduler: self,
|
||||
tasks: &mut updated.into_iter(),
|
||||
buffer: Vec::with_capacity(50), // on average a task is around ~100 bytes
|
||||
written: 0,
|
||||
};
|
||||
|
||||
// let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default());
|
||||
let reader = GzEncoder::new(BufReader::new(task_reader), Compression::default());
|
||||
let request = ureq::post(url).set("Content-Encoding", "gzip");
|
||||
let request = match &self.webhook_authorization_header {
|
||||
Some(header) => request.set("Authorization", header),
|
||||
None => request,
|
||||
};
|
||||
|
||||
if let Err(e) = request.send(reader) {
|
||||
log::error!("While sending data to the webhook: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Register a task to cleanup the task queue if needed
|
||||
fn cleanup_task_queue(&self) -> Result<()> {
|
||||
let rtxn = self.env.read_txn().map_err(Error::HeedTransaction)?;
|
||||
@ -1310,6 +1422,40 @@ impl IndexScheduler {
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: consider using a type alias or a struct embedder/template
|
||||
pub fn embedders(
|
||||
&self,
|
||||
embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>,
|
||||
) -> Result<EmbeddingConfigs> {
|
||||
let res: Result<_> = embedding_configs
|
||||
.into_iter()
|
||||
.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| {
|
||||
let prompt =
|
||||
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
|
||||
// optimistically return existing embedder
|
||||
{
|
||||
let embedders = self.embedders.read().unwrap();
|
||||
if let Some(embedder) = embedders.get(&embedder_options) {
|
||||
return Ok((name, (embedder.clone(), prompt)));
|
||||
}
|
||||
}
|
||||
|
||||
// add missing embedder
|
||||
let embedder = Arc::new(
|
||||
Embedder::new(embedder_options.clone())
|
||||
.map_err(meilisearch_types::milli::vector::Error::from)
|
||||
.map_err(meilisearch_types::milli::Error::from)?,
|
||||
);
|
||||
{
|
||||
let mut embedders = self.embedders.write().unwrap();
|
||||
embedders.insert(embedder_options, embedder.clone());
|
||||
}
|
||||
Ok((name, (embedder, prompt)))
|
||||
})
|
||||
.collect();
|
||||
res.map(EmbeddingConfigs::new)
|
||||
}
|
||||
|
||||
/// Blocks the thread until the test handle asks to progress to/through this breakpoint.
|
||||
///
|
||||
/// Two messages are sent through the channel for each breakpoint.
|
||||
@ -1337,7 +1483,7 @@ impl IndexScheduler {
|
||||
|
||||
pub struct Dump<'a> {
|
||||
index_scheduler: &'a IndexScheduler,
|
||||
wtxn: RwTxn<'a, 'a>,
|
||||
wtxn: RwTxn<'a>,
|
||||
|
||||
indexes: HashMap<String, RoaringBitmap>,
|
||||
statuses: HashMap<Status, RoaringBitmap>,
|
||||
@ -1452,7 +1598,7 @@ impl<'a> Dump<'a> {
|
||||
},
|
||||
};
|
||||
|
||||
self.index_scheduler.all_tasks.put(&mut self.wtxn, &BEU32::new(task.uid), &task)?;
|
||||
self.index_scheduler.all_tasks.put(&mut self.wtxn, &task.uid, &task)?;
|
||||
|
||||
for index in task.indexes() {
|
||||
match self.indexes.get_mut(index) {
|
||||
@ -1494,8 +1640,8 @@ impl<'a> Dump<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
self.statuses.entry(task.status).or_insert(RoaringBitmap::new()).insert(task.uid);
|
||||
self.kinds.entry(task.kind.as_kind()).or_insert(RoaringBitmap::new()).insert(task.uid);
|
||||
self.statuses.entry(task.status).or_default().insert(task.uid);
|
||||
self.kinds.entry(task.kind.as_kind()).or_default().insert(task.uid);
|
||||
|
||||
Ok(task)
|
||||
}
|
||||
@ -1615,6 +1761,8 @@ mod tests {
|
||||
indexes_path: tempdir.path().join("indexes"),
|
||||
snapshots_path: tempdir.path().join("snapshots"),
|
||||
dumps_path: tempdir.path().join("dumps"),
|
||||
webhook_url: None,
|
||||
webhook_authorization_header: None,
|
||||
task_db_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose.
|
||||
index_base_map_size: 1000 * 1000, // 1 MB, we don't use MiB on purpose.
|
||||
enable_mdb_writemap: false,
|
||||
@ -1623,6 +1771,7 @@ mod tests {
|
||||
indexer_config,
|
||||
autobatching_enabled: true,
|
||||
max_number_of_tasks: 1_000_000,
|
||||
max_number_of_batched_tasks: usize::MAX,
|
||||
instance_features: Default::default(),
|
||||
};
|
||||
configuration(&mut options);
|
||||
@ -4323,4 +4472,26 @@ mod tests {
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cancel_processing_dump() {
|
||||
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
||||
|
||||
let dump_creation = KindWithContent::DumpCreation { keys: Vec::new(), instance_uid: None };
|
||||
let dump_cancellation = KindWithContent::TaskCancelation {
|
||||
query: "cancel dump".to_owned(),
|
||||
tasks: RoaringBitmap::from_iter([0]),
|
||||
};
|
||||
let _ = index_scheduler.register(dump_creation).unwrap();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_dump_register");
|
||||
handle.advance_till([Start, BatchCreated, InsideProcessBatch]);
|
||||
|
||||
let _ = index_scheduler.register(dump_cancellation).unwrap();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_registered");
|
||||
|
||||
snapshot!(format!("{:?}", handle.advance()), @"AbortedIndexation");
|
||||
|
||||
handle.advance_one_successful_batch();
|
||||
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "cancel_processed");
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,35 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"dumpCreation" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
@ -0,0 +1,45 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: canceled, canceled_by: 1, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
|
||||
1 {uid: 1, status: succeeded, details: { matched_tasks: 1, canceled_tasks: Some(0), original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued []
|
||||
succeeded [1,]
|
||||
canceled [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"taskCancelation" [1,]
|
||||
"dumpCreation" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
1 [0,]
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
@ -0,0 +1,38 @@
|
||||
---
|
||||
source: index-scheduler/src/lib.rs
|
||||
---
|
||||
### Autobatching Enabled = true
|
||||
### Processing Tasks:
|
||||
[0,]
|
||||
----------------------------------------------------------------------
|
||||
### All Tasks:
|
||||
0 {uid: 0, status: enqueued, details: { dump_uid: None }, kind: DumpCreation { keys: [], instance_uid: None }}
|
||||
1 {uid: 1, status: enqueued, details: { matched_tasks: 1, canceled_tasks: None, original_filter: "cancel dump" }, kind: TaskCancelation { query: "cancel dump", tasks: RoaringBitmap<[0]> }}
|
||||
----------------------------------------------------------------------
|
||||
### Status:
|
||||
enqueued [0,1,]
|
||||
----------------------------------------------------------------------
|
||||
### Kind:
|
||||
"taskCancelation" [1,]
|
||||
"dumpCreation" [0,]
|
||||
----------------------------------------------------------------------
|
||||
### Index Tasks:
|
||||
----------------------------------------------------------------------
|
||||
### Index Mapper:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Canceled By:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
### Enqueued At:
|
||||
[timestamp] [0,]
|
||||
[timestamp] [1,]
|
||||
----------------------------------------------------------------------
|
||||
### Started At:
|
||||
----------------------------------------------------------------------
|
||||
### Finished At:
|
||||
----------------------------------------------------------------------
|
||||
### File Store:
|
||||
|
||||
----------------------------------------------------------------------
|
||||
|
@ -3,9 +3,9 @@
|
||||
use std::collections::{BTreeSet, HashSet};
|
||||
use std::ops::Bound;
|
||||
|
||||
use meilisearch_types::heed::types::{DecodeIgnore, OwnedType};
|
||||
use meilisearch_types::heed::types::DecodeIgnore;
|
||||
use meilisearch_types::heed::{Database, RoTxn, RwTxn};
|
||||
use meilisearch_types::milli::{CboRoaringBitmapCodec, BEU32};
|
||||
use meilisearch_types::milli::CboRoaringBitmapCodec;
|
||||
use meilisearch_types::tasks::{Details, IndexSwap, Kind, KindWithContent, Status};
|
||||
use roaring::{MultiOps, RoaringBitmap};
|
||||
use time::OffsetDateTime;
|
||||
@ -18,7 +18,7 @@ impl IndexScheduler {
|
||||
}
|
||||
|
||||
pub(crate) fn last_task_id(&self, rtxn: &RoTxn) -> Result<Option<TaskId>> {
|
||||
Ok(self.all_tasks.remap_data_type::<DecodeIgnore>().last(rtxn)?.map(|(k, _)| k.get() + 1))
|
||||
Ok(self.all_tasks.remap_data_type::<DecodeIgnore>().last(rtxn)?.map(|(k, _)| k + 1))
|
||||
}
|
||||
|
||||
pub(crate) fn next_task_id(&self, rtxn: &RoTxn) -> Result<TaskId> {
|
||||
@ -26,7 +26,7 @@ impl IndexScheduler {
|
||||
}
|
||||
|
||||
pub(crate) fn get_task(&self, rtxn: &RoTxn, task_id: TaskId) -> Result<Option<Task>> {
|
||||
Ok(self.all_tasks.get(rtxn, &BEU32::new(task_id))?)
|
||||
Ok(self.all_tasks.get(rtxn, &task_id)?)
|
||||
}
|
||||
|
||||
/// Convert an iterator to a `Vec` of tasks. The tasks MUST exist or a
|
||||
@ -88,7 +88,7 @@ impl IndexScheduler {
|
||||
}
|
||||
}
|
||||
|
||||
self.all_tasks.put(wtxn, &BEU32::new(task.uid), task)?;
|
||||
self.all_tasks.put(wtxn, &task.uid, task)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@ -169,11 +169,11 @@ impl IndexScheduler {
|
||||
|
||||
pub(crate) fn insert_task_datetime(
|
||||
wtxn: &mut RwTxn,
|
||||
database: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
|
||||
database: Database<BEI128, CboRoaringBitmapCodec>,
|
||||
time: OffsetDateTime,
|
||||
task_id: TaskId,
|
||||
) -> Result<()> {
|
||||
let timestamp = BEI128::new(time.unix_timestamp_nanos());
|
||||
let timestamp = time.unix_timestamp_nanos();
|
||||
let mut task_ids = database.get(wtxn, ×tamp)?.unwrap_or_default();
|
||||
task_ids.insert(task_id);
|
||||
database.put(wtxn, ×tamp, &RoaringBitmap::from_iter(task_ids))?;
|
||||
@ -182,11 +182,11 @@ pub(crate) fn insert_task_datetime(
|
||||
|
||||
pub(crate) fn remove_task_datetime(
|
||||
wtxn: &mut RwTxn,
|
||||
database: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
|
||||
database: Database<BEI128, CboRoaringBitmapCodec>,
|
||||
time: OffsetDateTime,
|
||||
task_id: TaskId,
|
||||
) -> Result<()> {
|
||||
let timestamp = BEI128::new(time.unix_timestamp_nanos());
|
||||
let timestamp = time.unix_timestamp_nanos();
|
||||
if let Some(mut existing) = database.get(wtxn, ×tamp)? {
|
||||
existing.remove(task_id);
|
||||
if existing.is_empty() {
|
||||
@ -202,7 +202,7 @@ pub(crate) fn remove_task_datetime(
|
||||
pub(crate) fn keep_tasks_within_datetimes(
|
||||
rtxn: &RoTxn,
|
||||
tasks: &mut RoaringBitmap,
|
||||
database: Database<OwnedType<BEI128>, CboRoaringBitmapCodec>,
|
||||
database: Database<BEI128, CboRoaringBitmapCodec>,
|
||||
after: Option<OffsetDateTime>,
|
||||
before: Option<OffsetDateTime>,
|
||||
) -> Result<()> {
|
||||
@ -213,8 +213,8 @@ pub(crate) fn keep_tasks_within_datetimes(
|
||||
(Some(after), Some(before)) => (Bound::Excluded(*after), Bound::Excluded(*before)),
|
||||
};
|
||||
let mut collected_task_ids = RoaringBitmap::new();
|
||||
let start = map_bound(start, |b| BEI128::new(b.unix_timestamp_nanos()));
|
||||
let end = map_bound(end, |b| BEI128::new(b.unix_timestamp_nanos()));
|
||||
let start = map_bound(start, |b| b.unix_timestamp_nanos());
|
||||
let end = map_bound(end, |b| b.unix_timestamp_nanos());
|
||||
let iter = database.range(rtxn, &(start, end))?;
|
||||
for r in iter {
|
||||
let (_timestamp, task_ids) = r?;
|
||||
@ -337,8 +337,6 @@ impl IndexScheduler {
|
||||
let rtxn = self.env.read_txn().unwrap();
|
||||
for task in self.all_tasks.iter(&rtxn).unwrap() {
|
||||
let (task_id, task) = task.unwrap();
|
||||
let task_id = task_id.get();
|
||||
|
||||
let task_index_uid = task.index_uid().map(ToOwned::to_owned);
|
||||
|
||||
let Task {
|
||||
@ -361,16 +359,13 @@ impl IndexScheduler {
|
||||
.unwrap()
|
||||
.contains(task.uid));
|
||||
}
|
||||
let db_enqueued_at = self
|
||||
.enqueued_at
|
||||
.get(&rtxn, &BEI128::new(enqueued_at.unix_timestamp_nanos()))
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
let db_enqueued_at =
|
||||
self.enqueued_at.get(&rtxn, &enqueued_at.unix_timestamp_nanos()).unwrap().unwrap();
|
||||
assert!(db_enqueued_at.contains(task_id));
|
||||
if let Some(started_at) = started_at {
|
||||
let db_started_at = self
|
||||
.started_at
|
||||
.get(&rtxn, &BEI128::new(started_at.unix_timestamp_nanos()))
|
||||
.get(&rtxn, &started_at.unix_timestamp_nanos())
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(db_started_at.contains(task_id));
|
||||
@ -378,7 +373,7 @@ impl IndexScheduler {
|
||||
if let Some(finished_at) = finished_at {
|
||||
let db_finished_at = self
|
||||
.finished_at
|
||||
.get(&rtxn, &BEI128::new(finished_at.unix_timestamp_nanos()))
|
||||
.get(&rtxn, &finished_at.unix_timestamp_nanos())
|
||||
.unwrap()
|
||||
.unwrap();
|
||||
assert!(db_finished_at.contains(task_id));
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use meilisearch_types::heed::{BytesDecode, BytesEncode};
|
||||
use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// A heed codec for value of struct Uuid.
|
||||
@ -10,15 +10,15 @@ pub struct UuidCodec;
|
||||
impl<'a> BytesDecode<'a> for UuidCodec {
|
||||
type DItem = Uuid;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
bytes.try_into().ok().map(Uuid::from_bytes)
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
bytes.try_into().map(Uuid::from_bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesEncode<'_> for UuidCodec {
|
||||
type EItem = Uuid;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
Some(Cow::Borrowed(item.as_bytes()))
|
||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
Ok(Cow::Borrowed(item.as_bytes()))
|
||||
}
|
||||
}
|
||||
|
@ -4,17 +4,20 @@ use std::collections::HashSet;
|
||||
use std::convert::{TryFrom, TryInto};
|
||||
use std::fs::create_dir_all;
|
||||
use std::path::Path;
|
||||
use std::result::Result as StdResult;
|
||||
use std::str;
|
||||
use std::str::FromStr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use hmac::{Hmac, Mac};
|
||||
use meilisearch_types::heed::BoxedError;
|
||||
use meilisearch_types::index_uid_pattern::IndexUidPattern;
|
||||
use meilisearch_types::keys::KeyId;
|
||||
use meilisearch_types::milli;
|
||||
use meilisearch_types::milli::heed::types::{ByteSlice, DecodeIgnore, SerdeJson};
|
||||
use meilisearch_types::milli::heed::types::{Bytes, DecodeIgnore, SerdeJson};
|
||||
use meilisearch_types::milli::heed::{Database, Env, EnvOpenOptions, RwTxn};
|
||||
use sha2::Sha256;
|
||||
use thiserror::Error;
|
||||
use time::OffsetDateTime;
|
||||
use uuid::fmt::Hyphenated;
|
||||
use uuid::Uuid;
|
||||
@ -30,7 +33,7 @@ const KEY_ID_ACTION_INDEX_EXPIRATION_DB_NAME: &str = "keyid-action-index-expirat
|
||||
#[derive(Clone)]
|
||||
pub struct HeedAuthStore {
|
||||
env: Arc<Env>,
|
||||
keys: Database<ByteSlice, SerdeJson<Key>>,
|
||||
keys: Database<Bytes, SerdeJson<Key>>,
|
||||
action_keyid_index_expiration: Database<KeyIdActionCodec, SerdeJson<Option<OffsetDateTime>>>,
|
||||
should_close_on_drop: bool,
|
||||
}
|
||||
@ -276,7 +279,7 @@ impl HeedAuthStore {
|
||||
fn delete_key_from_inverted_db(&self, wtxn: &mut RwTxn, key: &KeyId) -> Result<()> {
|
||||
let mut iter = self
|
||||
.action_keyid_index_expiration
|
||||
.remap_types::<ByteSlice, DecodeIgnore>()
|
||||
.remap_types::<Bytes, DecodeIgnore>()
|
||||
.prefix_iter_mut(wtxn, key.as_bytes())?;
|
||||
while iter.next().transpose()?.is_some() {
|
||||
// safety: we don't keep references from inside the LMDB database.
|
||||
@ -294,23 +297,24 @@ pub struct KeyIdActionCodec;
|
||||
impl<'a> milli::heed::BytesDecode<'a> for KeyIdActionCodec {
|
||||
type DItem = (KeyId, Action, Option<&'a [u8]>);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (key_id_bytes, action_bytes) = try_split_array_at(bytes)?;
|
||||
let (action_bytes, index) = match try_split_array_at(action_bytes)? {
|
||||
(action, []) => (action, None),
|
||||
(action, index) => (action, Some(index)),
|
||||
};
|
||||
fn bytes_decode(bytes: &'a [u8]) -> StdResult<Self::DItem, BoxedError> {
|
||||
let (key_id_bytes, action_bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
|
||||
let (&action_byte, index) =
|
||||
match try_split_array_at(action_bytes).ok_or(SliceTooShortError)? {
|
||||
([action], []) => (action, None),
|
||||
([action], index) => (action, Some(index)),
|
||||
};
|
||||
let key_id = Uuid::from_bytes(*key_id_bytes);
|
||||
let action = Action::from_repr(u8::from_be_bytes(*action_bytes))?;
|
||||
let action = Action::from_repr(action_byte).ok_or(InvalidActionError { action_byte })?;
|
||||
|
||||
Some((key_id, action, index))
|
||||
Ok((key_id, action, index))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> milli::heed::BytesEncode<'a> for KeyIdActionCodec {
|
||||
type EItem = (&'a KeyId, &'a Action, Option<&'a [u8]>);
|
||||
|
||||
fn bytes_encode((key_id, action, index): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode((key_id, action, index): &Self::EItem) -> StdResult<Cow<[u8]>, BoxedError> {
|
||||
let mut bytes = Vec::new();
|
||||
|
||||
bytes.extend_from_slice(key_id.as_bytes());
|
||||
@ -320,10 +324,20 @@ impl<'a> milli::heed::BytesEncode<'a> for KeyIdActionCodec {
|
||||
bytes.extend_from_slice(index);
|
||||
}
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("the slice is too short")]
|
||||
pub struct SliceTooShortError;
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("cannot construct a valid Action from {action_byte}")]
|
||||
pub struct InvalidActionError {
|
||||
pub action_byte: u8,
|
||||
}
|
||||
|
||||
pub fn generate_key_as_hexa(uid: Uuid, master_key: &[u8]) -> String {
|
||||
// format uid as hyphenated allowing user to generate their own keys.
|
||||
let mut uid_buffer = [0; Hyphenated::LENGTH];
|
||||
|
@ -15,7 +15,7 @@ actix-web = { version = "4.3.1", default-features = false }
|
||||
anyhow = "1.0.70"
|
||||
convert_case = "0.6.0"
|
||||
csv = "1.2.1"
|
||||
deserr = { version = "0.6.0", features = ["actix-web"]}
|
||||
deserr = { version = "0.6.0", features = ["actix-web"] }
|
||||
either = { version = "1.8.1", features = ["serde"] }
|
||||
enum-iterator = "1.4.0"
|
||||
file-store = { path = "../file-store" }
|
||||
@ -50,6 +50,7 @@ hebrew = ["milli/hebrew"]
|
||||
japanese = ["milli/japanese"]
|
||||
# thai specialized tokenization
|
||||
thai = ["milli/thai"]
|
||||
|
||||
# allow greek specialized tokenization
|
||||
greek = ["milli/greek"]
|
||||
# allow khmer specialized tokenization
|
||||
khmer = ["milli/khmer"]
|
||||
|
@ -188,3 +188,4 @@ merge_with_error_impl_take_error_message!(ParseOffsetDateTimeError);
|
||||
merge_with_error_impl_take_error_message!(ParseTaskKindError);
|
||||
merge_with_error_impl_take_error_message!(ParseTaskStatusError);
|
||||
merge_with_error_impl_take_error_message!(IndexUidFormatError);
|
||||
merge_with_error_impl_take_error_message!(InvalidSearchSemanticRatio);
|
||||
|
@ -222,6 +222,8 @@ InvalidVectorsType , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidDocumentId , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidDocumentLimit , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidDocumentOffset , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidEmbedder , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidHybridQuery , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidIndexLimit , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidIndexOffset , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidIndexPrimaryKey , InvalidRequest , BAD_REQUEST ;
|
||||
@ -233,6 +235,7 @@ InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSearchFacets , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSearchSemanticRatio , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidFacetSearchFacetName , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSearchFilter , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSearchHighlightPostTag , InvalidRequest , BAD_REQUEST ;
|
||||
@ -252,9 +255,11 @@ InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSearchSort , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSettingsFaceting , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSettingsFilterableAttributes , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSettingsPagination , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSettingsEmbedders , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSettingsRankingRules , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSettingsSearchableAttributes , InvalidRequest , BAD_REQUEST ;
|
||||
InvalidSettingsSortableAttributes , InvalidRequest , BAD_REQUEST ;
|
||||
@ -294,15 +299,18 @@ MissingFacetSearchFacetName , InvalidRequest , BAD_REQUEST ;
|
||||
MissingIndexUid , InvalidRequest , BAD_REQUEST ;
|
||||
MissingMasterKey , Auth , UNAUTHORIZED ;
|
||||
MissingPayload , InvalidRequest , BAD_REQUEST ;
|
||||
MissingSearchHybrid , InvalidRequest , BAD_REQUEST ;
|
||||
MissingSwapIndexes , InvalidRequest , BAD_REQUEST ;
|
||||
MissingTaskFilters , InvalidRequest , BAD_REQUEST ;
|
||||
NoSpaceLeftOnDevice , System , UNPROCESSABLE_ENTITY;
|
||||
PayloadTooLarge , InvalidRequest , PAYLOAD_TOO_LARGE ;
|
||||
TaskNotFound , InvalidRequest , NOT_FOUND ;
|
||||
TooManyOpenFiles , System , UNPROCESSABLE_ENTITY ;
|
||||
TooManyVectors , InvalidRequest , BAD_REQUEST ;
|
||||
UnretrievableDocument , Internal , BAD_REQUEST ;
|
||||
UnretrievableErrorCode , InvalidRequest , BAD_REQUEST ;
|
||||
UnsupportedMediaType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE
|
||||
UnsupportedMediaType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ;
|
||||
VectorEmbeddingError , InvalidRequest , BAD_REQUEST
|
||||
}
|
||||
|
||||
impl ErrorCode for JoinError {
|
||||
@ -335,6 +343,13 @@ impl ErrorCode for milli::Error {
|
||||
UserError::InvalidDocumentId { .. } | UserError::TooManyDocumentIds { .. } => {
|
||||
Code::InvalidDocumentId
|
||||
}
|
||||
UserError::MissingDocumentField(_) => Code::InvalidDocumentFields,
|
||||
UserError::InvalidFieldForSource { .. }
|
||||
| UserError::MissingFieldForSource { .. }
|
||||
| UserError::InvalidOpenAiModel { .. }
|
||||
| UserError::InvalidPrompt(_) => Code::InvalidSettingsEmbedders,
|
||||
UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders,
|
||||
UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders,
|
||||
UserError::NoPrimaryKeyCandidateFound => Code::IndexPrimaryKeyNoCandidateFound,
|
||||
UserError::MultiplePrimaryKeyCandidatesFound { .. } => {
|
||||
Code::IndexPrimaryKeyMultipleCandidatesFound
|
||||
@ -352,11 +367,15 @@ impl ErrorCode for milli::Error {
|
||||
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
|
||||
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
|
||||
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
|
||||
UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType,
|
||||
UserError::InvalidVectorsType { .. } => Code::InvalidVectorsType,
|
||||
UserError::TooManyVectors(_, _) => Code::TooManyVectors,
|
||||
UserError::SortError(_) => Code::InvalidSearchSort,
|
||||
UserError::InvalidMinTypoWordLenSetting(_, _) => {
|
||||
Code::InvalidSettingsTypoTolerance
|
||||
}
|
||||
UserError::InvalidEmbedder(_) => Code::InvalidEmbedder,
|
||||
UserError::VectorEmbeddingError(_) => Code::VectorEmbeddingError,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -386,11 +405,11 @@ impl ErrorCode for HeedError {
|
||||
HeedError::Mdb(MdbError::Invalid) => Code::InvalidStoreFile,
|
||||
HeedError::Io(e) => e.error_code(),
|
||||
HeedError::Mdb(_)
|
||||
| HeedError::Encoding
|
||||
| HeedError::Decoding
|
||||
| HeedError::Encoding(_)
|
||||
| HeedError::Decoding(_)
|
||||
| HeedError::InvalidDatabaseTyping
|
||||
| HeedError::DatabaseClosing
|
||||
| HeedError::BadOpenOptions => Code::Internal,
|
||||
| HeedError::BadOpenOptions { .. } => Code::Internal,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -444,6 +463,15 @@ impl fmt::Display for DeserrParseIntError {
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for deserr_codes::InvalidSearchSemanticRatio {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`."
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! internal_error {
|
||||
($target:ty : $($other:path), *) => {
|
||||
|
@ -9,6 +9,7 @@ pub mod index_uid_pattern;
|
||||
pub mod keys;
|
||||
pub mod settings;
|
||||
pub mod star_or;
|
||||
pub mod task_view;
|
||||
pub mod tasks;
|
||||
pub mod versioning;
|
||||
pub use milli::{heed, Index};
|
||||
|
@ -8,6 +8,7 @@ use std::str::FromStr;
|
||||
|
||||
use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef};
|
||||
use fst::IntoStreamer;
|
||||
use milli::proximity::ProximityPrecision;
|
||||
use milli::update::Setting;
|
||||
use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET};
|
||||
use serde::{Deserialize, Serialize, Serializer};
|
||||
@ -186,6 +187,9 @@ pub struct Settings<T> {
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSettingsDistinctAttribute>)]
|
||||
pub distinct_attribute: Setting<String>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSettingsProximityPrecision>)]
|
||||
pub proximity_precision: Setting<ProximityPrecisionView>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSettingsTypoTolerance>)]
|
||||
pub typo_tolerance: Setting<TypoSettings>,
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
@ -195,6 +199,10 @@ pub struct Settings<T> {
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSettingsPagination>)]
|
||||
pub pagination: Setting<PaginationSettings>,
|
||||
|
||||
#[serde(default, skip_serializing_if = "Setting::is_not_set")]
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSettingsEmbedders>)]
|
||||
pub embedders: Setting<BTreeMap<String, Setting<milli::vector::settings::EmbeddingSettings>>>,
|
||||
|
||||
#[serde(skip)]
|
||||
#[deserr(skip)]
|
||||
pub _kind: PhantomData<T>,
|
||||
@ -214,9 +222,11 @@ impl Settings<Checked> {
|
||||
separator_tokens: Setting::Reset,
|
||||
dictionary: Setting::Reset,
|
||||
distinct_attribute: Setting::Reset,
|
||||
proximity_precision: Setting::Reset,
|
||||
typo_tolerance: Setting::Reset,
|
||||
faceting: Setting::Reset,
|
||||
pagination: Setting::Reset,
|
||||
embedders: Setting::Reset,
|
||||
_kind: PhantomData,
|
||||
}
|
||||
}
|
||||
@ -234,9 +244,11 @@ impl Settings<Checked> {
|
||||
dictionary,
|
||||
synonyms,
|
||||
distinct_attribute,
|
||||
proximity_precision,
|
||||
typo_tolerance,
|
||||
faceting,
|
||||
pagination,
|
||||
embedders,
|
||||
..
|
||||
} = self;
|
||||
|
||||
@ -252,9 +264,11 @@ impl Settings<Checked> {
|
||||
dictionary,
|
||||
synonyms,
|
||||
distinct_attribute,
|
||||
proximity_precision,
|
||||
typo_tolerance,
|
||||
faceting,
|
||||
pagination,
|
||||
embedders,
|
||||
_kind: PhantomData,
|
||||
}
|
||||
}
|
||||
@ -296,12 +310,29 @@ impl Settings<Unchecked> {
|
||||
separator_tokens: self.separator_tokens,
|
||||
dictionary: self.dictionary,
|
||||
distinct_attribute: self.distinct_attribute,
|
||||
proximity_precision: self.proximity_precision,
|
||||
typo_tolerance: self.typo_tolerance,
|
||||
faceting: self.faceting,
|
||||
pagination: self.pagination,
|
||||
embedders: self.embedders,
|
||||
_kind: PhantomData,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate(self) -> Result<Self, milli::Error> {
|
||||
self.validate_embedding_settings()
|
||||
}
|
||||
|
||||
fn validate_embedding_settings(mut self) -> Result<Self, milli::Error> {
|
||||
let Setting::Set(mut configs) = self.embedders else { return Ok(self) };
|
||||
for (name, config) in configs.iter_mut() {
|
||||
let config_to_check = std::mem::take(config);
|
||||
let checked_config = milli::update::validate_embedding_settings(config_to_check, name)?;
|
||||
*config = checked_config
|
||||
}
|
||||
self.embedders = Setting::Set(configs);
|
||||
Ok(self)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
@ -390,6 +421,12 @@ pub fn apply_settings_to_builder(
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
match settings.proximity_precision {
|
||||
Setting::Set(ref precision) => builder.set_proximity_precision((*precision).into()),
|
||||
Setting::Reset => builder.reset_proximity_precision(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
match settings.typo_tolerance {
|
||||
Setting::Set(ref value) => {
|
||||
match value.enabled {
|
||||
@ -476,6 +513,12 @@ pub fn apply_settings_to_builder(
|
||||
Setting::Reset => builder.reset_pagination_max_total_hits(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
|
||||
match settings.embedders.clone() {
|
||||
Setting::Set(value) => builder.set_embedder_settings(value),
|
||||
Setting::Reset => builder.reset_embedder_settings(),
|
||||
Setting::NotSet => (),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn settings(
|
||||
@ -509,6 +552,8 @@ pub fn settings(
|
||||
|
||||
let distinct_field = index.distinct_field(rtxn)?.map(String::from);
|
||||
|
||||
let proximity_precision = index.proximity_precision(rtxn)?.map(ProximityPrecisionView::from);
|
||||
|
||||
let synonyms = index.user_defined_synonyms(rtxn)?;
|
||||
|
||||
let min_typo_word_len = MinWordSizeTyposSetting {
|
||||
@ -532,7 +577,10 @@ pub fn settings(
|
||||
|
||||
let faceting = FacetingSettings {
|
||||
max_values_per_facet: Setting::Set(
|
||||
index.max_values_per_facet(rtxn)?.unwrap_or(DEFAULT_VALUES_PER_FACET),
|
||||
index
|
||||
.max_values_per_facet(rtxn)?
|
||||
.map(|x| x as usize)
|
||||
.unwrap_or(DEFAULT_VALUES_PER_FACET),
|
||||
),
|
||||
sort_facet_values_by: Setting::Set(
|
||||
index
|
||||
@ -545,10 +593,20 @@ pub fn settings(
|
||||
|
||||
let pagination = PaginationSettings {
|
||||
max_total_hits: Setting::Set(
|
||||
index.pagination_max_total_hits(rtxn)?.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS),
|
||||
index
|
||||
.pagination_max_total_hits(rtxn)?
|
||||
.map(|x| x as usize)
|
||||
.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS),
|
||||
),
|
||||
};
|
||||
|
||||
let embedders: BTreeMap<_, _> = index
|
||||
.embedding_configs(rtxn)?
|
||||
.into_iter()
|
||||
.map(|(name, config)| (name, Setting::Set(config.into())))
|
||||
.collect();
|
||||
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };
|
||||
|
||||
Ok(Settings {
|
||||
displayed_attributes: match displayed_attributes {
|
||||
Some(attrs) => Setting::Set(attrs),
|
||||
@ -569,10 +627,12 @@ pub fn settings(
|
||||
Some(field) => Setting::Set(field),
|
||||
None => Setting::Reset,
|
||||
},
|
||||
proximity_precision: Setting::Set(proximity_precision.unwrap_or_default()),
|
||||
synonyms: Setting::Set(synonyms),
|
||||
typo_tolerance: Setting::Set(typo_tolerance),
|
||||
faceting: Setting::Set(faceting),
|
||||
pagination: Setting::Set(pagination),
|
||||
embedders,
|
||||
_kind: PhantomData,
|
||||
})
|
||||
}
|
||||
@ -673,6 +733,32 @@ impl From<RankingRuleView> for Criterion {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Deserr, Serialize, Deserialize)]
|
||||
#[serde(deny_unknown_fields, rename_all = "camelCase")]
|
||||
#[deserr(error = DeserrJsonError<InvalidSettingsProximityPrecision>, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub enum ProximityPrecisionView {
|
||||
#[default]
|
||||
ByWord,
|
||||
ByAttribute,
|
||||
}
|
||||
|
||||
impl From<ProximityPrecision> for ProximityPrecisionView {
|
||||
fn from(value: ProximityPrecision) -> Self {
|
||||
match value {
|
||||
ProximityPrecision::ByWord => ProximityPrecisionView::ByWord,
|
||||
ProximityPrecision::ByAttribute => ProximityPrecisionView::ByAttribute,
|
||||
}
|
||||
}
|
||||
}
|
||||
impl From<ProximityPrecisionView> for ProximityPrecision {
|
||||
fn from(value: ProximityPrecisionView) -> Self {
|
||||
match value {
|
||||
ProximityPrecisionView::ByWord => ProximityPrecision::ByWord,
|
||||
ProximityPrecisionView::ByAttribute => ProximityPrecision::ByAttribute,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod test {
|
||||
use super::*;
|
||||
@ -692,9 +778,11 @@ pub(crate) mod test {
|
||||
dictionary: Setting::NotSet,
|
||||
synonyms: Setting::NotSet,
|
||||
distinct_attribute: Setting::NotSet,
|
||||
proximity_precision: Setting::NotSet,
|
||||
typo_tolerance: Setting::NotSet,
|
||||
faceting: Setting::NotSet,
|
||||
pagination: Setting::NotSet,
|
||||
embedders: Setting::NotSet,
|
||||
_kind: PhantomData::<Unchecked>,
|
||||
};
|
||||
|
||||
@ -716,9 +804,11 @@ pub(crate) mod test {
|
||||
dictionary: Setting::NotSet,
|
||||
synonyms: Setting::NotSet,
|
||||
distinct_attribute: Setting::NotSet,
|
||||
proximity_precision: Setting::NotSet,
|
||||
typo_tolerance: Setting::NotSet,
|
||||
faceting: Setting::NotSet,
|
||||
pagination: Setting::NotSet,
|
||||
embedders: Setting::NotSet,
|
||||
_kind: PhantomData::<Unchecked>,
|
||||
};
|
||||
|
||||
|
139
meilisearch-types/src/task_view.rs
Normal file
139
meilisearch-types/src/task_view.rs
Normal file
@ -0,0 +1,139 @@
|
||||
use serde::Serialize;
|
||||
use time::{Duration, OffsetDateTime};
|
||||
|
||||
use crate::error::ResponseError;
|
||||
use crate::settings::{Settings, Unchecked};
|
||||
use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId};
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct TaskView {
|
||||
pub uid: TaskId,
|
||||
#[serde(default)]
|
||||
pub index_uid: Option<String>,
|
||||
pub status: Status,
|
||||
#[serde(rename = "type")]
|
||||
pub kind: Kind,
|
||||
pub canceled_by: Option<TaskId>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub details: Option<DetailsView>,
|
||||
pub error: Option<ResponseError>,
|
||||
#[serde(serialize_with = "serialize_duration", default)]
|
||||
pub duration: Option<Duration>,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub enqueued_at: OffsetDateTime,
|
||||
#[serde(with = "time::serde::rfc3339::option", default)]
|
||||
pub started_at: Option<OffsetDateTime>,
|
||||
#[serde(with = "time::serde::rfc3339::option", default)]
|
||||
pub finished_at: Option<OffsetDateTime>,
|
||||
}
|
||||
|
||||
impl TaskView {
|
||||
pub fn from_task(task: &Task) -> TaskView {
|
||||
TaskView {
|
||||
uid: task.uid,
|
||||
index_uid: task.index_uid().map(ToOwned::to_owned),
|
||||
status: task.status,
|
||||
kind: task.kind.as_kind(),
|
||||
canceled_by: task.canceled_by,
|
||||
details: task.details.clone().map(DetailsView::from),
|
||||
error: task.error.clone(),
|
||||
duration: task.started_at.zip(task.finished_at).map(|(start, end)| end - start),
|
||||
enqueued_at: task.enqueued_at,
|
||||
started_at: task.started_at,
|
||||
finished_at: task.finished_at,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct DetailsView {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub received_documents: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub indexed_documents: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub primary_key: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub provided_ids: Option<usize>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_documents: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub matched_tasks: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub canceled_tasks: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_tasks: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub original_filter: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub dump_uid: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(flatten)]
|
||||
pub settings: Option<Box<Settings<Unchecked>>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub swaps: Option<Vec<IndexSwap>>,
|
||||
}
|
||||
|
||||
impl From<Details> for DetailsView {
|
||||
fn from(details: Details) -> Self {
|
||||
match details {
|
||||
Details::DocumentAdditionOrUpdate { received_documents, indexed_documents } => {
|
||||
DetailsView {
|
||||
received_documents: Some(received_documents),
|
||||
indexed_documents: Some(indexed_documents),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::SettingsUpdate { settings } => {
|
||||
DetailsView { settings: Some(settings), ..DetailsView::default() }
|
||||
}
|
||||
Details::IndexInfo { primary_key } => {
|
||||
DetailsView { primary_key: Some(primary_key), ..DetailsView::default() }
|
||||
}
|
||||
Details::DocumentDeletion {
|
||||
provided_ids: received_document_ids,
|
||||
deleted_documents,
|
||||
} => DetailsView {
|
||||
provided_ids: Some(received_document_ids),
|
||||
deleted_documents: Some(deleted_documents),
|
||||
original_filter: Some(None),
|
||||
..DetailsView::default()
|
||||
},
|
||||
Details::DocumentDeletionByFilter { original_filter, deleted_documents } => {
|
||||
DetailsView {
|
||||
provided_ids: Some(0),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
deleted_documents: Some(deleted_documents),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::ClearAll { deleted_documents } => {
|
||||
DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() }
|
||||
}
|
||||
Details::TaskCancelation { matched_tasks, canceled_tasks, original_filter } => {
|
||||
DetailsView {
|
||||
matched_tasks: Some(matched_tasks),
|
||||
canceled_tasks: Some(canceled_tasks),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::TaskDeletion { matched_tasks, deleted_tasks, original_filter } => {
|
||||
DetailsView {
|
||||
matched_tasks: Some(matched_tasks),
|
||||
deleted_tasks: Some(deleted_tasks),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::Dump { dump_uid } => {
|
||||
DetailsView { dump_uid: Some(dump_uid), ..DetailsView::default() }
|
||||
}
|
||||
Details::IndexSwap { swaps } => {
|
||||
DetailsView { swaps: Some(swaps), ..Default::default() }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -39,7 +39,7 @@ byte-unit = { version = "4.0.19", default-features = false, features = [
|
||||
bytes = "1.4.0"
|
||||
clap = { version = "4.2.1", features = ["derive", "env"] }
|
||||
crossbeam-channel = "0.5.8"
|
||||
deserr = { version = "0.6.0", features = ["actix-web"]}
|
||||
deserr = { version = "0.6.0", features = ["actix-web"] }
|
||||
dump = { path = "../dump" }
|
||||
either = "1.8.1"
|
||||
env_logger = "0.10.0"
|
||||
@ -104,6 +104,7 @@ walkdir = "2.3.3"
|
||||
yaup = "0.2.1"
|
||||
serde_urlencoded = "0.7.1"
|
||||
termcolor = "1.2.0"
|
||||
url = { version = "2.5.0", features = ["serde"] }
|
||||
|
||||
[dev-dependencies]
|
||||
actix-rt = "2.8.0"
|
||||
@ -150,7 +151,8 @@ hebrew = ["meilisearch-types/hebrew"]
|
||||
japanese = ["meilisearch-types/japanese"]
|
||||
thai = ["meilisearch-types/thai"]
|
||||
greek = ["meilisearch-types/greek"]
|
||||
khmer = ["meilisearch-types/khmer"]
|
||||
|
||||
[package.metadata.mini-dashboard]
|
||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.11/build.zip"
|
||||
sha1 = "83cd44ed1e5f97ecb581dc9f958a63f4ccc982d9"
|
||||
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"
|
||||
sha1 = "e20cc9b390003c6c844f4b8bcc5c5013191a77ff"
|
||||
|
@ -36,7 +36,7 @@ use crate::routes::{create_all_stats, Stats};
|
||||
use crate::search::{
|
||||
FacetSearchResult, MatchingStrategy, SearchQuery, SearchQueryWithIndex, SearchResult,
|
||||
DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
|
||||
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT,
|
||||
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEMANTIC_RATIO,
|
||||
};
|
||||
use crate::Opt;
|
||||
|
||||
@ -251,6 +251,7 @@ struct Infos {
|
||||
env: String,
|
||||
experimental_enable_metrics: bool,
|
||||
experimental_reduce_indexing_memory_usage: bool,
|
||||
experimental_max_number_of_batched_tasks: usize,
|
||||
db_path: bool,
|
||||
import_dump: bool,
|
||||
dump_dir: bool,
|
||||
@ -263,6 +264,8 @@ struct Infos {
|
||||
ignore_snapshot_if_db_exists: bool,
|
||||
http_addr: bool,
|
||||
http_payload_size_limit: Byte,
|
||||
task_queue_webhook: bool,
|
||||
task_webhook_authorization_header: bool,
|
||||
log_level: String,
|
||||
max_indexing_memory: MaxMemory,
|
||||
max_indexing_threads: MaxThreads,
|
||||
@ -285,9 +288,12 @@ impl From<Opt> for Infos {
|
||||
db_path,
|
||||
experimental_enable_metrics,
|
||||
experimental_reduce_indexing_memory_usage,
|
||||
experimental_max_number_of_batched_tasks,
|
||||
http_addr,
|
||||
master_key: _,
|
||||
env,
|
||||
task_webhook_url,
|
||||
task_webhook_authorization_header,
|
||||
max_index_size: _,
|
||||
max_task_db_size: _,
|
||||
http_payload_size_limit,
|
||||
@ -340,6 +346,9 @@ impl From<Opt> for Infos {
|
||||
ignore_snapshot_if_db_exists,
|
||||
http_addr: http_addr != default_http_addr(),
|
||||
http_payload_size_limit,
|
||||
experimental_max_number_of_batched_tasks,
|
||||
task_queue_webhook: task_webhook_url.is_some(),
|
||||
task_webhook_authorization_header: task_webhook_authorization_header.is_some(),
|
||||
log_level: log_level.to_string(),
|
||||
max_indexing_memory,
|
||||
max_indexing_threads,
|
||||
@ -583,6 +592,11 @@ pub struct SearchAggregator {
|
||||
// vector
|
||||
// The maximum number of floats in a vector request
|
||||
max_vector_size: usize,
|
||||
// Whether the semantic ratio passed to a hybrid search equals the default ratio.
|
||||
semantic_ratio: bool,
|
||||
// Whether a non-default embedder was specified
|
||||
embedder: bool,
|
||||
hybrid: bool,
|
||||
|
||||
// every time a search is done, we increment the counter linked to the used settings
|
||||
matching_strategy: HashMap<String, usize>,
|
||||
@ -636,6 +650,7 @@ impl SearchAggregator {
|
||||
crop_marker,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
hybrid,
|
||||
} = query;
|
||||
|
||||
let mut ret = Self::default();
|
||||
@ -709,6 +724,12 @@ impl SearchAggregator {
|
||||
ret.show_ranking_score = *show_ranking_score;
|
||||
ret.show_ranking_score_details = *show_ranking_score_details;
|
||||
|
||||
if let Some(hybrid) = hybrid {
|
||||
ret.semantic_ratio = hybrid.semantic_ratio != DEFAULT_SEMANTIC_RATIO();
|
||||
ret.embedder = hybrid.embedder.is_some();
|
||||
ret.hybrid = true;
|
||||
}
|
||||
|
||||
ret
|
||||
}
|
||||
|
||||
@ -762,6 +783,9 @@ impl SearchAggregator {
|
||||
facets_total_number_of_facets,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
semantic_ratio,
|
||||
embedder,
|
||||
hybrid,
|
||||
} = other;
|
||||
|
||||
if self.timestamp.is_none() {
|
||||
@ -807,6 +831,9 @@ impl SearchAggregator {
|
||||
|
||||
// vector
|
||||
self.max_vector_size = self.max_vector_size.max(max_vector_size);
|
||||
self.semantic_ratio |= semantic_ratio;
|
||||
self.hybrid |= hybrid;
|
||||
self.embedder |= embedder;
|
||||
|
||||
// pagination
|
||||
self.max_limit = self.max_limit.max(max_limit);
|
||||
@ -875,6 +902,9 @@ impl SearchAggregator {
|
||||
facets_total_number_of_facets,
|
||||
show_ranking_score,
|
||||
show_ranking_score_details,
|
||||
semantic_ratio,
|
||||
embedder,
|
||||
hybrid,
|
||||
} = self;
|
||||
|
||||
if total_received == 0 {
|
||||
@ -914,6 +944,11 @@ impl SearchAggregator {
|
||||
"vector": {
|
||||
"max_vector_size": max_vector_size,
|
||||
},
|
||||
"hybrid": {
|
||||
"enabled": hybrid,
|
||||
"semantic_ratio": semantic_ratio,
|
||||
"embedder": embedder,
|
||||
},
|
||||
"pagination": {
|
||||
"max_limit": max_limit,
|
||||
"max_offset": max_offset,
|
||||
@ -1009,6 +1044,7 @@ impl MultiSearchAggregator {
|
||||
crop_marker: _,
|
||||
matching_strategy: _,
|
||||
attributes_to_search_on: _,
|
||||
hybrid: _,
|
||||
} = query;
|
||||
|
||||
index_uid.as_str()
|
||||
@ -1155,6 +1191,7 @@ impl FacetSearchAggregator {
|
||||
filter,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
hybrid,
|
||||
} = query;
|
||||
|
||||
let mut ret = Self::default();
|
||||
@ -1168,7 +1205,8 @@ impl FacetSearchAggregator {
|
||||
|| vector.is_some()
|
||||
|| filter.is_some()
|
||||
|| *matching_strategy != MatchingStrategy::default()
|
||||
|| attributes_to_search_on.is_some();
|
||||
|| attributes_to_search_on.is_some()
|
||||
|| hybrid.is_some();
|
||||
|
||||
ret
|
||||
}
|
||||
|
@ -51,6 +51,8 @@ pub enum MeilisearchHttpError {
|
||||
DocumentFormat(#[from] DocumentFormatError),
|
||||
#[error(transparent)]
|
||||
Join(#[from] JoinError),
|
||||
#[error("Invalid request: missing `hybrid` parameter when both `q` and `vector` are present.")]
|
||||
MissingSearchHybrid,
|
||||
}
|
||||
|
||||
impl ErrorCode for MeilisearchHttpError {
|
||||
@ -74,6 +76,7 @@ impl ErrorCode for MeilisearchHttpError {
|
||||
MeilisearchHttpError::FileStore(_) => Code::Internal,
|
||||
MeilisearchHttpError::DocumentFormat(e) => e.error_code(),
|
||||
MeilisearchHttpError::Join(_) => Code::Internal,
|
||||
MeilisearchHttpError::MissingSearchHybrid => Code::MissingSearchHybrid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -228,12 +228,15 @@ fn open_or_create_database_unchecked(
|
||||
indexes_path: opt.db_path.join("indexes"),
|
||||
snapshots_path: opt.snapshot_dir.clone(),
|
||||
dumps_path: opt.dump_dir.clone(),
|
||||
webhook_url: opt.task_webhook_url.as_ref().map(|url| url.to_string()),
|
||||
webhook_authorization_header: opt.task_webhook_authorization_header.clone(),
|
||||
task_db_size: opt.max_task_db_size.get_bytes() as usize,
|
||||
index_base_map_size: opt.max_index_size.get_bytes() as usize,
|
||||
enable_mdb_writemap: opt.experimental_reduce_indexing_memory_usage,
|
||||
indexer_config: (&opt.indexer_options).try_into()?,
|
||||
autobatching_enabled: true,
|
||||
max_number_of_tasks: 1_000_000,
|
||||
max_number_of_batched_tasks: opt.experimental_max_number_of_batched_tasks,
|
||||
index_growth_amount: byte_unit::Byte::from_str("10GiB").unwrap().get_bytes() as usize,
|
||||
index_count: DEFAULT_INDEX_COUNT,
|
||||
instance_features,
|
||||
@ -397,6 +400,7 @@ pub fn configure_data(
|
||||
.app_data(web::Data::from(analytics))
|
||||
.app_data(
|
||||
web::JsonConfig::default()
|
||||
.limit(http_payload_size_limit)
|
||||
.content_type(|mime| mime == mime::APPLICATION_JSON)
|
||||
.error_handler(|err, req: &HttpRequest| match err {
|
||||
JsonPayloadError::ContentType => match req.headers().get(CONTENT_TYPE) {
|
||||
|
@ -19,7 +19,11 @@ static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
|
||||
/// does all the setup before meilisearch is launched
|
||||
fn setup(opt: &Opt) -> anyhow::Result<()> {
|
||||
let mut log_builder = env_logger::Builder::new();
|
||||
log_builder.parse_filters(&opt.log_level.to_string());
|
||||
let log_filters = format!(
|
||||
"{},h2=warn,hyper=warn,tokio_util=warn,tracing=warn,rustls=warn,mio=warn,reqwest=warn",
|
||||
opt.log_level
|
||||
);
|
||||
log_builder.parse_filters(&log_filters);
|
||||
|
||||
log_builder.init();
|
||||
|
||||
|
@ -21,6 +21,7 @@ use rustls::RootCertStore;
|
||||
use rustls_pemfile::{certs, pkcs8_private_keys, rsa_private_keys};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sysinfo::{RefreshKind, System, SystemExt};
|
||||
use url::Url;
|
||||
|
||||
const POSSIBLE_ENV: [&str; 2] = ["development", "production"];
|
||||
|
||||
@ -28,6 +29,8 @@ const MEILI_DB_PATH: &str = "MEILI_DB_PATH";
|
||||
const MEILI_HTTP_ADDR: &str = "MEILI_HTTP_ADDR";
|
||||
const MEILI_MASTER_KEY: &str = "MEILI_MASTER_KEY";
|
||||
const MEILI_ENV: &str = "MEILI_ENV";
|
||||
const MEILI_TASK_WEBHOOK_URL: &str = "MEILI_TASK_WEBHOOK_URL";
|
||||
const MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER: &str = "MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER";
|
||||
#[cfg(feature = "analytics")]
|
||||
const MEILI_NO_ANALYTICS: &str = "MEILI_NO_ANALYTICS";
|
||||
const MEILI_HTTP_PAYLOAD_SIZE_LIMIT: &str = "MEILI_HTTP_PAYLOAD_SIZE_LIMIT";
|
||||
@ -51,6 +54,8 @@ const MEILI_LOG_LEVEL: &str = "MEILI_LOG_LEVEL";
|
||||
const MEILI_EXPERIMENTAL_ENABLE_METRICS: &str = "MEILI_EXPERIMENTAL_ENABLE_METRICS";
|
||||
const MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE: &str =
|
||||
"MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE";
|
||||
const MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS: &str =
|
||||
"MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS";
|
||||
|
||||
const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml";
|
||||
const DEFAULT_DB_PATH: &str = "./data.ms";
|
||||
@ -154,6 +159,14 @@ pub struct Opt {
|
||||
#[serde(default = "default_env")]
|
||||
pub env: String,
|
||||
|
||||
/// Called whenever a task finishes so a third party can be notified.
|
||||
#[clap(long, env = MEILI_TASK_WEBHOOK_URL)]
|
||||
pub task_webhook_url: Option<Url>,
|
||||
|
||||
/// The Authorization header to send on the webhook URL whenever a task finishes so a third party can be notified.
|
||||
#[clap(long, env = MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER)]
|
||||
pub task_webhook_authorization_header: Option<String>,
|
||||
|
||||
/// Deactivates Meilisearch's built-in telemetry when provided.
|
||||
///
|
||||
/// Meilisearch automatically collects data from all instances that do not opt out using this flag.
|
||||
@ -301,6 +314,11 @@ pub struct Opt {
|
||||
#[serde(default)]
|
||||
pub experimental_reduce_indexing_memory_usage: bool,
|
||||
|
||||
/// Experimentally reduces the maximum number of tasks that will be processed at once, see: <https://github.com/orgs/meilisearch/discussions/713>
|
||||
#[clap(long, env = MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS, default_value_t = default_limit_batched_tasks())]
|
||||
#[serde(default = "default_limit_batched_tasks")]
|
||||
pub experimental_max_number_of_batched_tasks: usize,
|
||||
|
||||
#[serde(flatten)]
|
||||
#[clap(flatten)]
|
||||
pub indexer_options: IndexerOpts,
|
||||
@ -368,9 +386,12 @@ impl Opt {
|
||||
http_addr,
|
||||
master_key,
|
||||
env,
|
||||
task_webhook_url,
|
||||
task_webhook_authorization_header,
|
||||
max_index_size: _,
|
||||
max_task_db_size: _,
|
||||
http_payload_size_limit,
|
||||
experimental_max_number_of_batched_tasks,
|
||||
ssl_cert_path,
|
||||
ssl_key_path,
|
||||
ssl_auth_path,
|
||||
@ -392,8 +413,8 @@ impl Opt {
|
||||
config_file_path: _,
|
||||
#[cfg(feature = "analytics")]
|
||||
no_analytics,
|
||||
experimental_enable_metrics: enable_metrics_route,
|
||||
experimental_reduce_indexing_memory_usage: reduce_indexing_memory_usage,
|
||||
experimental_enable_metrics,
|
||||
experimental_reduce_indexing_memory_usage,
|
||||
} = self;
|
||||
export_to_env_if_not_present(MEILI_DB_PATH, db_path);
|
||||
export_to_env_if_not_present(MEILI_HTTP_ADDR, http_addr);
|
||||
@ -401,6 +422,16 @@ impl Opt {
|
||||
export_to_env_if_not_present(MEILI_MASTER_KEY, master_key);
|
||||
}
|
||||
export_to_env_if_not_present(MEILI_ENV, env);
|
||||
if let Some(task_webhook_url) = task_webhook_url {
|
||||
export_to_env_if_not_present(MEILI_TASK_WEBHOOK_URL, task_webhook_url.to_string());
|
||||
}
|
||||
if let Some(task_webhook_authorization_header) = task_webhook_authorization_header {
|
||||
export_to_env_if_not_present(
|
||||
MEILI_TASK_WEBHOOK_AUTHORIZATION_HEADER,
|
||||
task_webhook_authorization_header,
|
||||
);
|
||||
}
|
||||
|
||||
#[cfg(feature = "analytics")]
|
||||
{
|
||||
export_to_env_if_not_present(MEILI_NO_ANALYTICS, no_analytics.to_string());
|
||||
@ -409,6 +440,10 @@ impl Opt {
|
||||
MEILI_HTTP_PAYLOAD_SIZE_LIMIT,
|
||||
http_payload_size_limit.to_string(),
|
||||
);
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_MAX_NUMBER_OF_BATCHED_TASKS,
|
||||
experimental_max_number_of_batched_tasks.to_string(),
|
||||
);
|
||||
if let Some(ssl_cert_path) = ssl_cert_path {
|
||||
export_to_env_if_not_present(MEILI_SSL_CERT_PATH, ssl_cert_path);
|
||||
}
|
||||
@ -433,11 +468,11 @@ impl Opt {
|
||||
export_to_env_if_not_present(MEILI_LOG_LEVEL, log_level.to_string());
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_ENABLE_METRICS,
|
||||
enable_metrics_route.to_string(),
|
||||
experimental_enable_metrics.to_string(),
|
||||
);
|
||||
export_to_env_if_not_present(
|
||||
MEILI_EXPERIMENTAL_REDUCE_INDEXING_MEMORY_USAGE,
|
||||
reduce_indexing_memory_usage.to_string(),
|
||||
experimental_reduce_indexing_memory_usage.to_string(),
|
||||
);
|
||||
indexer_options.export_to_env();
|
||||
}
|
||||
@ -727,6 +762,10 @@ fn default_http_payload_size_limit() -> Byte {
|
||||
Byte::from_str(DEFAULT_HTTP_PAYLOAD_SIZE_LIMIT).unwrap()
|
||||
}
|
||||
|
||||
fn default_limit_batched_tasks() -> usize {
|
||||
usize::MAX
|
||||
}
|
||||
|
||||
fn default_snapshot_dir() -> PathBuf {
|
||||
PathBuf::from(DEFAULT_SNAPSHOT_DIR)
|
||||
}
|
||||
|
@ -3,7 +3,7 @@ use std::io::ErrorKind;
|
||||
use actix_web::http::header::CONTENT_TYPE;
|
||||
use actix_web::web::Data;
|
||||
use actix_web::{web, HttpMessage, HttpRequest, HttpResponse};
|
||||
use bstr::ByteSlice;
|
||||
use bstr::ByteSlice as _;
|
||||
use deserr::actix_web::{AwebJson, AwebQueryParameter};
|
||||
use deserr::Deserr;
|
||||
use futures::StreamExt;
|
||||
|
@ -13,9 +13,9 @@ use crate::analytics::{Analytics, FacetSearchAggregator};
|
||||
use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::search::{
|
||||
add_search_rules, perform_facet_search, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH,
|
||||
DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
|
||||
DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
|
||||
add_search_rules, perform_facet_search, HybridQuery, MatchingStrategy, SearchQuery,
|
||||
DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
|
||||
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
|
||||
};
|
||||
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
@ -36,6 +36,8 @@ pub struct FacetSearchQuery {
|
||||
pub q: Option<String>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchVector>)]
|
||||
pub vector: Option<Vec<f32>>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidHybridQuery>)]
|
||||
pub hybrid: Option<HybridQuery>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchFilter>)]
|
||||
pub filter: Option<Value>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchMatchingStrategy>, default)]
|
||||
@ -95,6 +97,7 @@ impl From<FacetSearchQuery> for SearchQuery {
|
||||
filter,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
hybrid,
|
||||
} = value;
|
||||
|
||||
SearchQuery {
|
||||
@ -119,6 +122,7 @@ impl From<FacetSearchQuery> for SearchQuery {
|
||||
matching_strategy,
|
||||
vector,
|
||||
attributes_to_search_on,
|
||||
hybrid,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,12 +2,14 @@ use actix_web::web::Data;
|
||||
use actix_web::{web, HttpRequest, HttpResponse};
|
||||
use deserr::actix_web::{AwebJson, AwebQueryParameter};
|
||||
use index_scheduler::IndexScheduler;
|
||||
use log::debug;
|
||||
use log::{debug, warn};
|
||||
use meilisearch_types::deserr::query_params::Param;
|
||||
use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError};
|
||||
use meilisearch_types::error::deserr_codes::*;
|
||||
use meilisearch_types::error::ResponseError;
|
||||
use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::milli;
|
||||
use meilisearch_types::milli::vector::DistributionShift;
|
||||
use meilisearch_types::serde_cs::vec::CS;
|
||||
use serde_json::Value;
|
||||
|
||||
@ -16,9 +18,9 @@ use crate::extractors::authentication::policies::*;
|
||||
use crate::extractors::authentication::GuardedData;
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::search::{
|
||||
add_search_rules, perform_search, MatchingStrategy, SearchQuery, DEFAULT_CROP_LENGTH,
|
||||
DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG,
|
||||
DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET,
|
||||
add_search_rules, perform_search, HybridQuery, MatchingStrategy, SearchQuery, SemanticRatio,
|
||||
DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG,
|
||||
DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO,
|
||||
};
|
||||
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
@ -74,6 +76,31 @@ pub struct SearchQueryGet {
|
||||
matching_strategy: MatchingStrategy,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSearchAttributesToSearchOn>)]
|
||||
pub attributes_to_search_on: Option<CS<String>>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidEmbedder>)]
|
||||
pub hybrid_embedder: Option<String>,
|
||||
#[deserr(default, error = DeserrQueryParamError<InvalidSearchSemanticRatio>)]
|
||||
pub hybrid_semantic_ratio: Option<SemanticRatioGet>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, Default, PartialEq, deserr::Deserr)]
|
||||
#[deserr(try_from(String) = TryFrom::try_from -> InvalidSearchSemanticRatio)]
|
||||
pub struct SemanticRatioGet(SemanticRatio);
|
||||
|
||||
impl std::convert::TryFrom<String> for SemanticRatioGet {
|
||||
type Error = InvalidSearchSemanticRatio;
|
||||
|
||||
fn try_from(s: String) -> Result<Self, Self::Error> {
|
||||
let f: f32 = s.parse().map_err(|_| InvalidSearchSemanticRatio)?;
|
||||
Ok(SemanticRatioGet(SemanticRatio::try_from(f)?))
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for SemanticRatioGet {
|
||||
type Target = SemanticRatio;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SearchQueryGet> for SearchQuery {
|
||||
@ -86,6 +113,20 @@ impl From<SearchQueryGet> for SearchQuery {
|
||||
None => None,
|
||||
};
|
||||
|
||||
let hybrid = match (other.hybrid_embedder, other.hybrid_semantic_ratio) {
|
||||
(None, None) => None,
|
||||
(None, Some(semantic_ratio)) => {
|
||||
Some(HybridQuery { semantic_ratio: *semantic_ratio, embedder: None })
|
||||
}
|
||||
(Some(embedder), None) => Some(HybridQuery {
|
||||
semantic_ratio: DEFAULT_SEMANTIC_RATIO(),
|
||||
embedder: Some(embedder),
|
||||
}),
|
||||
(Some(embedder), Some(semantic_ratio)) => {
|
||||
Some(HybridQuery { semantic_ratio: *semantic_ratio, embedder: Some(embedder) })
|
||||
}
|
||||
};
|
||||
|
||||
Self {
|
||||
q: other.q,
|
||||
vector: other.vector.map(CS::into_inner),
|
||||
@ -108,6 +149,7 @@ impl From<SearchQueryGet> for SearchQuery {
|
||||
crop_marker: other.crop_marker,
|
||||
matching_strategy: other.matching_strategy,
|
||||
attributes_to_search_on: other.attributes_to_search_on.map(|o| o.into_iter().collect()),
|
||||
hybrid,
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -158,8 +200,12 @@ pub async fn search_with_url_query(
|
||||
|
||||
let index = index_scheduler.index(&index_uid)?;
|
||||
let features = index_scheduler.features();
|
||||
|
||||
let distribution = embed(&mut query, index_scheduler.get_ref(), &index).await?;
|
||||
|
||||
let search_result =
|
||||
tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
|
||||
tokio::task::spawn_blocking(move || perform_search(&index, query, features, distribution))
|
||||
.await?;
|
||||
if let Ok(ref search_result) = search_result {
|
||||
aggregate.succeed(search_result);
|
||||
}
|
||||
@ -193,8 +239,12 @@ pub async fn search_with_post(
|
||||
let index = index_scheduler.index(&index_uid)?;
|
||||
|
||||
let features = index_scheduler.features();
|
||||
|
||||
let distribution = embed(&mut query, index_scheduler.get_ref(), &index).await?;
|
||||
|
||||
let search_result =
|
||||
tokio::task::spawn_blocking(move || perform_search(&index, query, features)).await?;
|
||||
tokio::task::spawn_blocking(move || perform_search(&index, query, features, distribution))
|
||||
.await?;
|
||||
if let Ok(ref search_result) = search_result {
|
||||
aggregate.succeed(search_result);
|
||||
}
|
||||
@ -206,6 +256,80 @@ pub async fn search_with_post(
|
||||
Ok(HttpResponse::Ok().json(search_result))
|
||||
}
|
||||
|
||||
pub async fn embed(
|
||||
query: &mut SearchQuery,
|
||||
index_scheduler: &IndexScheduler,
|
||||
index: &milli::Index,
|
||||
) -> Result<Option<DistributionShift>, ResponseError> {
|
||||
match (&query.hybrid, &query.vector, &query.q) {
|
||||
(Some(HybridQuery { semantic_ratio: _, embedder }), None, Some(q))
|
||||
if !q.trim().is_empty() =>
|
||||
{
|
||||
let embedder_configs = index.embedding_configs(&index.read_txn()?)?;
|
||||
let embedders = index_scheduler.embedders(embedder_configs)?;
|
||||
|
||||
let embedder = if let Some(embedder_name) = embedder {
|
||||
embedders.get(embedder_name)
|
||||
} else {
|
||||
embedders.get_default()
|
||||
};
|
||||
|
||||
let embedder = embedder
|
||||
.ok_or(milli::UserError::InvalidEmbedder("default".to_owned()))
|
||||
.map_err(milli::Error::from)?
|
||||
.0;
|
||||
|
||||
let distribution = embedder.distribution();
|
||||
|
||||
let embeddings = embedder
|
||||
.embed(vec![q.to_owned()])
|
||||
.await
|
||||
.map_err(milli::vector::Error::from)
|
||||
.map_err(milli::Error::from)?
|
||||
.pop()
|
||||
.expect("No vector returned from embedding");
|
||||
|
||||
if embeddings.iter().nth(1).is_some() {
|
||||
warn!("Ignoring embeddings past the first one in long search query");
|
||||
query.vector = Some(embeddings.iter().next().unwrap().to_vec());
|
||||
} else {
|
||||
query.vector = Some(embeddings.into_inner());
|
||||
}
|
||||
Ok(distribution)
|
||||
}
|
||||
(Some(hybrid), vector, _) => {
|
||||
let embedder_configs = index.embedding_configs(&index.read_txn()?)?;
|
||||
let embedders = index_scheduler.embedders(embedder_configs)?;
|
||||
|
||||
let embedder = if let Some(embedder_name) = &hybrid.embedder {
|
||||
embedders.get(embedder_name)
|
||||
} else {
|
||||
embedders.get_default()
|
||||
};
|
||||
|
||||
let embedder = embedder
|
||||
.ok_or(milli::UserError::InvalidEmbedder("default".to_owned()))
|
||||
.map_err(milli::Error::from)?
|
||||
.0;
|
||||
|
||||
if let Some(vector) = vector {
|
||||
if vector.len() != embedder.dimensions() {
|
||||
return Err(meilisearch_types::milli::Error::UserError(
|
||||
meilisearch_types::milli::UserError::InvalidVectorDimensions {
|
||||
expected: embedder.dimensions(),
|
||||
found: vector.len(),
|
||||
},
|
||||
)
|
||||
.into());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(embedder.distribution())
|
||||
}
|
||||
_ => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::*;
|
||||
|
@ -7,6 +7,7 @@ use meilisearch_types::deserr::DeserrJsonError;
|
||||
use meilisearch_types::error::ResponseError;
|
||||
use meilisearch_types::facet_values_sort::FacetValuesSort;
|
||||
use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::milli::update::Setting;
|
||||
use meilisearch_types::settings::{settings, RankingRuleView, Settings, Unchecked};
|
||||
use meilisearch_types::tasks::KindWithContent;
|
||||
use serde_json::json;
|
||||
@ -78,6 +79,7 @@ macro_rules! make_setting_route {
|
||||
|
||||
let body = body.into_inner();
|
||||
|
||||
#[allow(clippy::redundant_closure_call)]
|
||||
$analytics(&body, &req);
|
||||
|
||||
let new_settings = Settings {
|
||||
@ -88,6 +90,11 @@ macro_rules! make_setting_route {
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
let new_settings = $crate::routes::indexes::settings::validate_settings(
|
||||
new_settings,
|
||||
&index_scheduler,
|
||||
)?;
|
||||
|
||||
let allow_index_creation =
|
||||
index_scheduler.filters().allow_index_creation(&index_uid);
|
||||
|
||||
@ -434,6 +441,31 @@ make_setting_route!(
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
"/proximity-precision",
|
||||
put,
|
||||
meilisearch_types::settings::ProximityPrecisionView,
|
||||
meilisearch_types::deserr::DeserrJsonError<
|
||||
meilisearch_types::error::deserr_codes::InvalidSettingsProximityPrecision,
|
||||
>,
|
||||
proximity_precision,
|
||||
"proximityPrecision",
|
||||
analytics,
|
||||
|precision: &Option<meilisearch_types::settings::ProximityPrecisionView>, req: &HttpRequest| {
|
||||
use serde_json::json;
|
||||
analytics.publish(
|
||||
"ProximityPrecision Updated".to_string(),
|
||||
json!({
|
||||
"proximity_precision": {
|
||||
"set": precision.is_some(),
|
||||
"value": precision.unwrap_or_default(),
|
||||
}
|
||||
}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
"/ranking-rules",
|
||||
put,
|
||||
@ -520,6 +552,67 @@ make_setting_route!(
|
||||
}
|
||||
);
|
||||
|
||||
make_setting_route!(
|
||||
"/embedders",
|
||||
patch,
|
||||
std::collections::BTreeMap<String, Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>>,
|
||||
meilisearch_types::deserr::DeserrJsonError<
|
||||
meilisearch_types::error::deserr_codes::InvalidSettingsEmbedders,
|
||||
>,
|
||||
embedders,
|
||||
"embedders",
|
||||
analytics,
|
||||
|setting: &Option<std::collections::BTreeMap<String, Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>>>, req: &HttpRequest| {
|
||||
|
||||
|
||||
analytics.publish(
|
||||
"Embedders Updated".to_string(),
|
||||
serde_json::json!({"embedders": crate::routes::indexes::settings::embedder_analytics(setting.as_ref())}),
|
||||
Some(req),
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
fn embedder_analytics(
|
||||
setting: Option<
|
||||
&std::collections::BTreeMap<
|
||||
String,
|
||||
Setting<meilisearch_types::milli::vector::settings::EmbeddingSettings>,
|
||||
>,
|
||||
>,
|
||||
) -> serde_json::Value {
|
||||
let mut sources = std::collections::HashSet::new();
|
||||
|
||||
if let Some(s) = &setting {
|
||||
for source in s
|
||||
.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.filter_map(|config| config.source.set())
|
||||
{
|
||||
use meilisearch_types::milli::vector::settings::EmbedderSource;
|
||||
match source {
|
||||
EmbedderSource::OpenAi => sources.insert("openAi"),
|
||||
EmbedderSource::HuggingFace => sources.insert("huggingFace"),
|
||||
EmbedderSource::UserProvided => sources.insert("userProvided"),
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
let document_template_used = setting.as_ref().map(|map| {
|
||||
map.values()
|
||||
.filter_map(|config| config.clone().set())
|
||||
.any(|config| config.document_template.set().is_some())
|
||||
});
|
||||
|
||||
json!(
|
||||
{
|
||||
"total": setting.as_ref().map(|s| s.len()),
|
||||
"sources": sources,
|
||||
"document_template_used": document_template_used,
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
macro_rules! generate_configure {
|
||||
($($mod:ident),*) => {
|
||||
pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
@ -540,6 +633,7 @@ generate_configure!(
|
||||
displayed_attributes,
|
||||
searchable_attributes,
|
||||
distinct_attribute,
|
||||
proximity_precision,
|
||||
stop_words,
|
||||
separator_tokens,
|
||||
non_separator_tokens,
|
||||
@ -548,7 +642,8 @@ generate_configure!(
|
||||
ranking_rules,
|
||||
typo_tolerance,
|
||||
pagination,
|
||||
faceting
|
||||
faceting,
|
||||
embedders
|
||||
);
|
||||
|
||||
pub async fn update_all(
|
||||
@ -561,6 +656,7 @@ pub async fn update_all(
|
||||
let index_uid = IndexUid::try_from(index_uid.into_inner())?;
|
||||
|
||||
let new_settings = body.into_inner();
|
||||
let new_settings = validate_settings(new_settings, &index_scheduler)?;
|
||||
|
||||
analytics.publish(
|
||||
"Settings Updated".to_string(),
|
||||
@ -593,6 +689,10 @@ pub async fn update_all(
|
||||
"distinct_attribute": {
|
||||
"set": new_settings.distinct_attribute.as_ref().set().is_some()
|
||||
},
|
||||
"proximity_precision": {
|
||||
"set": new_settings.proximity_precision.as_ref().set().is_some(),
|
||||
"value": new_settings.proximity_precision.as_ref().set().copied().unwrap_or_default()
|
||||
},
|
||||
"typo_tolerance": {
|
||||
"enabled": new_settings.typo_tolerance
|
||||
.as_ref()
|
||||
@ -652,6 +752,7 @@ pub async fn update_all(
|
||||
"synonyms": {
|
||||
"total": new_settings.synonyms.as_ref().set().map(|synonyms| synonyms.len()),
|
||||
},
|
||||
"embedders": crate::routes::indexes::settings::embedder_analytics(new_settings.embedders.as_ref().set())
|
||||
}),
|
||||
Some(&req),
|
||||
);
|
||||
@ -706,3 +807,13 @@ pub async fn delete_all(
|
||||
debug!("returns: {:?}", task);
|
||||
Ok(HttpResponse::Accepted().json(task))
|
||||
}
|
||||
|
||||
fn validate_settings(
|
||||
settings: Settings<Unchecked>,
|
||||
index_scheduler: &IndexScheduler,
|
||||
) -> Result<Settings<Unchecked>, ResponseError> {
|
||||
if matches!(settings.embedders, Setting::Set(_)) {
|
||||
index_scheduler.features().check_vector("Passing `embedders` in settings")?
|
||||
}
|
||||
Ok(settings.validate()?)
|
||||
}
|
||||
|
@ -13,6 +13,7 @@ use crate::analytics::{Analytics, MultiSearchAggregator};
|
||||
use crate::extractors::authentication::policies::ActionPolicy;
|
||||
use crate::extractors::authentication::{AuthenticationError, GuardedData};
|
||||
use crate::extractors::sequential_extractor::SeqHandler;
|
||||
use crate::routes::indexes::search::embed;
|
||||
use crate::search::{
|
||||
add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex,
|
||||
};
|
||||
@ -46,49 +47,51 @@ pub async fn multi_search_with_post(
|
||||
// Explicitly expect a `(ResponseError, usize)` for the error type rather than `ResponseError` only,
|
||||
// so that `?` doesn't work if it doesn't use `with_index`, ensuring that it is not forgotten in case of code
|
||||
// changes.
|
||||
let search_results: Result<_, (ResponseError, usize)> = (|| {
|
||||
async {
|
||||
let mut search_results = Vec::with_capacity(queries.len());
|
||||
for (query_index, (index_uid, mut query)) in
|
||||
queries.into_iter().map(SearchQueryWithIndex::into_index_query).enumerate()
|
||||
{
|
||||
debug!("multi-search #{query_index}: called with params: {:?}", query);
|
||||
let search_results: Result<_, (ResponseError, usize)> = async {
|
||||
let mut search_results = Vec::with_capacity(queries.len());
|
||||
for (query_index, (index_uid, mut query)) in
|
||||
queries.into_iter().map(SearchQueryWithIndex::into_index_query).enumerate()
|
||||
{
|
||||
debug!("multi-search #{query_index}: called with params: {:?}", query);
|
||||
|
||||
// Check index from API key
|
||||
if !index_scheduler.filters().is_index_authorized(&index_uid) {
|
||||
return Err(AuthenticationError::InvalidToken).with_index(query_index);
|
||||
}
|
||||
// Apply search rules from tenant token
|
||||
if let Some(search_rules) =
|
||||
index_scheduler.filters().get_index_search_rules(&index_uid)
|
||||
{
|
||||
add_search_rules(&mut query, search_rules);
|
||||
}
|
||||
|
||||
let index = index_scheduler
|
||||
.index(&index_uid)
|
||||
.map_err(|err| {
|
||||
let mut err = ResponseError::from(err);
|
||||
// Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but
|
||||
// here the resource not found is not part of the URL.
|
||||
err.code = StatusCode::BAD_REQUEST;
|
||||
err
|
||||
})
|
||||
.with_index(query_index)?;
|
||||
|
||||
let search_result =
|
||||
tokio::task::spawn_blocking(move || perform_search(&index, query, features))
|
||||
.await
|
||||
.with_index(query_index)?;
|
||||
|
||||
search_results.push(SearchResultWithIndex {
|
||||
index_uid: index_uid.into_inner(),
|
||||
result: search_result.with_index(query_index)?,
|
||||
});
|
||||
// Check index from API key
|
||||
if !index_scheduler.filters().is_index_authorized(&index_uid) {
|
||||
return Err(AuthenticationError::InvalidToken).with_index(query_index);
|
||||
}
|
||||
Ok(search_results)
|
||||
// Apply search rules from tenant token
|
||||
if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid)
|
||||
{
|
||||
add_search_rules(&mut query, search_rules);
|
||||
}
|
||||
|
||||
let index = index_scheduler
|
||||
.index(&index_uid)
|
||||
.map_err(|err| {
|
||||
let mut err = ResponseError::from(err);
|
||||
// Patch the HTTP status code to 400 as it defaults to 404 for `index_not_found`, but
|
||||
// here the resource not found is not part of the URL.
|
||||
err.code = StatusCode::BAD_REQUEST;
|
||||
err
|
||||
})
|
||||
.with_index(query_index)?;
|
||||
|
||||
let distribution = embed(&mut query, index_scheduler.get_ref(), &index)
|
||||
.await
|
||||
.with_index(query_index)?;
|
||||
|
||||
let search_result = tokio::task::spawn_blocking(move || {
|
||||
perform_search(&index, query, features, distribution)
|
||||
})
|
||||
.await
|
||||
.with_index(query_index)?;
|
||||
|
||||
search_results.push(SearchResultWithIndex {
|
||||
index_uid: index_uid.into_inner(),
|
||||
result: search_result.with_index(query_index)?,
|
||||
});
|
||||
}
|
||||
})()
|
||||
Ok(search_results)
|
||||
}
|
||||
.await;
|
||||
|
||||
if search_results.is_ok() {
|
||||
|
@ -8,11 +8,9 @@ use meilisearch_types::deserr::DeserrQueryParamError;
|
||||
use meilisearch_types::error::deserr_codes::*;
|
||||
use meilisearch_types::error::{InvalidTaskDateError, ResponseError};
|
||||
use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::settings::{Settings, Unchecked};
|
||||
use meilisearch_types::star_or::{OptionStarOr, OptionStarOrList};
|
||||
use meilisearch_types::tasks::{
|
||||
serialize_duration, Details, IndexSwap, Kind, KindWithContent, Status, Task,
|
||||
};
|
||||
use meilisearch_types::task_view::TaskView;
|
||||
use meilisearch_types::tasks::{Kind, KindWithContent, Status};
|
||||
use serde::Serialize;
|
||||
use serde_json::json;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
@ -37,140 +35,6 @@ pub fn configure(cfg: &mut web::ServiceConfig) {
|
||||
.service(web::resource("/cancel").route(web::post().to(SeqHandler(cancel_tasks))))
|
||||
.service(web::resource("/{task_id}").route(web::get().to(SeqHandler(get_task))));
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct TaskView {
|
||||
pub uid: TaskId,
|
||||
#[serde(default)]
|
||||
pub index_uid: Option<String>,
|
||||
pub status: Status,
|
||||
#[serde(rename = "type")]
|
||||
pub kind: Kind,
|
||||
pub canceled_by: Option<TaskId>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub details: Option<DetailsView>,
|
||||
pub error: Option<ResponseError>,
|
||||
#[serde(serialize_with = "serialize_duration", default)]
|
||||
pub duration: Option<Duration>,
|
||||
#[serde(with = "time::serde::rfc3339")]
|
||||
pub enqueued_at: OffsetDateTime,
|
||||
#[serde(with = "time::serde::rfc3339::option", default)]
|
||||
pub started_at: Option<OffsetDateTime>,
|
||||
#[serde(with = "time::serde::rfc3339::option", default)]
|
||||
pub finished_at: Option<OffsetDateTime>,
|
||||
}
|
||||
|
||||
impl TaskView {
|
||||
pub fn from_task(task: &Task) -> TaskView {
|
||||
TaskView {
|
||||
uid: task.uid,
|
||||
index_uid: task.index_uid().map(ToOwned::to_owned),
|
||||
status: task.status,
|
||||
kind: task.kind.as_kind(),
|
||||
canceled_by: task.canceled_by,
|
||||
details: task.details.clone().map(DetailsView::from),
|
||||
error: task.error.clone(),
|
||||
duration: task.started_at.zip(task.finished_at).map(|(start, end)| end - start),
|
||||
enqueued_at: task.enqueued_at,
|
||||
started_at: task.started_at,
|
||||
finished_at: task.finished_at,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Debug, PartialEq, Eq, Clone, Serialize)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct DetailsView {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub received_documents: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub indexed_documents: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub primary_key: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub provided_ids: Option<usize>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_documents: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub matched_tasks: Option<u64>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub canceled_tasks: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub deleted_tasks: Option<Option<u64>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub original_filter: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub dump_uid: Option<Option<String>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
#[serde(flatten)]
|
||||
pub settings: Option<Box<Settings<Unchecked>>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub swaps: Option<Vec<IndexSwap>>,
|
||||
}
|
||||
|
||||
impl From<Details> for DetailsView {
|
||||
fn from(details: Details) -> Self {
|
||||
match details {
|
||||
Details::DocumentAdditionOrUpdate { received_documents, indexed_documents } => {
|
||||
DetailsView {
|
||||
received_documents: Some(received_documents),
|
||||
indexed_documents: Some(indexed_documents),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::SettingsUpdate { settings } => {
|
||||
DetailsView { settings: Some(settings), ..DetailsView::default() }
|
||||
}
|
||||
Details::IndexInfo { primary_key } => {
|
||||
DetailsView { primary_key: Some(primary_key), ..DetailsView::default() }
|
||||
}
|
||||
Details::DocumentDeletion {
|
||||
provided_ids: received_document_ids,
|
||||
deleted_documents,
|
||||
} => DetailsView {
|
||||
provided_ids: Some(received_document_ids),
|
||||
deleted_documents: Some(deleted_documents),
|
||||
original_filter: Some(None),
|
||||
..DetailsView::default()
|
||||
},
|
||||
Details::DocumentDeletionByFilter { original_filter, deleted_documents } => {
|
||||
DetailsView {
|
||||
provided_ids: Some(0),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
deleted_documents: Some(deleted_documents),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::ClearAll { deleted_documents } => {
|
||||
DetailsView { deleted_documents: Some(deleted_documents), ..DetailsView::default() }
|
||||
}
|
||||
Details::TaskCancelation { matched_tasks, canceled_tasks, original_filter } => {
|
||||
DetailsView {
|
||||
matched_tasks: Some(matched_tasks),
|
||||
canceled_tasks: Some(canceled_tasks),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::TaskDeletion { matched_tasks, deleted_tasks, original_filter } => {
|
||||
DetailsView {
|
||||
matched_tasks: Some(matched_tasks),
|
||||
deleted_tasks: Some(deleted_tasks),
|
||||
original_filter: Some(Some(original_filter)),
|
||||
..DetailsView::default()
|
||||
}
|
||||
}
|
||||
Details::Dump { dump_uid } => {
|
||||
DetailsView { dump_uid: Some(dump_uid), ..DetailsView::default() }
|
||||
}
|
||||
Details::IndexSwap { swaps } => {
|
||||
DetailsView { swaps: Some(swaps), ..Default::default() }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserr)]
|
||||
#[deserr(error = DeserrQueryParamError, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct TasksFilterQuery {
|
||||
|
@ -7,24 +7,21 @@ use deserr::Deserr;
|
||||
use either::Either;
|
||||
use index_scheduler::RoFeatures;
|
||||
use indexmap::IndexMap;
|
||||
use log::warn;
|
||||
use meilisearch_auth::IndexSearchRules;
|
||||
use meilisearch_types::deserr::DeserrJsonError;
|
||||
use meilisearch_types::error::deserr_codes::*;
|
||||
use meilisearch_types::heed::RoTxn;
|
||||
use meilisearch_types::index_uid::IndexUid;
|
||||
use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy};
|
||||
use meilisearch_types::milli::{
|
||||
dot_product_similarity, FacetValueHit, InternalError, OrderBy, SearchForFacetValues,
|
||||
};
|
||||
use meilisearch_types::milli::score_details::{self, ScoreDetails, ScoringStrategy};
|
||||
use meilisearch_types::milli::vector::DistributionShift;
|
||||
use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues};
|
||||
use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS;
|
||||
use meilisearch_types::{milli, Document};
|
||||
use milli::tokenizer::TokenizerBuilder;
|
||||
use milli::{
|
||||
AscDesc, FieldId, FieldsIdsMap, Filter, FormatOptions, Index, MatchBounds, MatcherBuilder,
|
||||
SortError, TermsMatchingStrategy, VectorOrArrayOfVectors, DEFAULT_VALUES_PER_FACET,
|
||||
SortError, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
|
||||
};
|
||||
use ordered_float::OrderedFloat;
|
||||
use regex::Regex;
|
||||
use serde::Serialize;
|
||||
use serde_json::{json, Value};
|
||||
@ -39,6 +36,7 @@ pub const DEFAULT_CROP_LENGTH: fn() -> usize = || 10;
|
||||
pub const DEFAULT_CROP_MARKER: fn() -> String = || "…".to_string();
|
||||
pub const DEFAULT_HIGHLIGHT_PRE_TAG: fn() -> String = || "<em>".to_string();
|
||||
pub const DEFAULT_HIGHLIGHT_POST_TAG: fn() -> String = || "</em>".to_string();
|
||||
pub const DEFAULT_SEMANTIC_RATIO: fn() -> SemanticRatio = || SemanticRatio(0.5);
|
||||
|
||||
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
|
||||
#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)]
|
||||
@ -47,6 +45,8 @@ pub struct SearchQuery {
|
||||
pub q: Option<String>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchVector>)]
|
||||
pub vector: Option<Vec<f32>>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidHybridQuery>)]
|
||||
pub hybrid: Option<HybridQuery>,
|
||||
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
|
||||
pub offset: usize,
|
||||
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
|
||||
@ -87,6 +87,48 @@ pub struct SearchQuery {
|
||||
pub attributes_to_search_on: Option<Vec<String>>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Default, PartialEq, Deserr)]
|
||||
#[deserr(error = DeserrJsonError<InvalidHybridQuery>, rename_all = camelCase, deny_unknown_fields)]
|
||||
pub struct HybridQuery {
|
||||
/// TODO validate that sementic ratio is between 0.0 and 1,0
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchSemanticRatio>, default)]
|
||||
pub semantic_ratio: SemanticRatio,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidEmbedder>, default)]
|
||||
pub embedder: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Deserr)]
|
||||
#[deserr(try_from(f32) = TryFrom::try_from -> InvalidSearchSemanticRatio)]
|
||||
pub struct SemanticRatio(f32);
|
||||
|
||||
impl Default for SemanticRatio {
|
||||
fn default() -> Self {
|
||||
DEFAULT_SEMANTIC_RATIO()
|
||||
}
|
||||
}
|
||||
|
||||
impl std::convert::TryFrom<f32> for SemanticRatio {
|
||||
type Error = InvalidSearchSemanticRatio;
|
||||
|
||||
fn try_from(f: f32) -> Result<Self, Self::Error> {
|
||||
// the suggested "fix" is: `!(0.0..=1.0).contains(&f)`` which is allegedly less readable
|
||||
#[allow(clippy::manual_range_contains)]
|
||||
if f > 1.0 || f < 0.0 {
|
||||
Err(InvalidSearchSemanticRatio)
|
||||
} else {
|
||||
Ok(SemanticRatio(f))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::ops::Deref for SemanticRatio {
|
||||
type Target = f32;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl SearchQuery {
|
||||
pub fn is_finite_pagination(&self) -> bool {
|
||||
self.page.or(self.hits_per_page).is_some()
|
||||
@ -106,6 +148,8 @@ pub struct SearchQueryWithIndex {
|
||||
pub q: Option<String>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidSearchQ>)]
|
||||
pub vector: Option<Vec<f32>>,
|
||||
#[deserr(default, error = DeserrJsonError<InvalidHybridQuery>)]
|
||||
pub hybrid: Option<HybridQuery>,
|
||||
#[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError<InvalidSearchOffset>)]
|
||||
pub offset: usize,
|
||||
#[deserr(default = DEFAULT_SEARCH_LIMIT(), error = DeserrJsonError<InvalidSearchLimit>)]
|
||||
@ -171,6 +215,7 @@ impl SearchQueryWithIndex {
|
||||
crop_marker,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
hybrid,
|
||||
} = self;
|
||||
(
|
||||
index_uid,
|
||||
@ -196,6 +241,7 @@ impl SearchQueryWithIndex {
|
||||
crop_marker,
|
||||
matching_strategy,
|
||||
attributes_to_search_on,
|
||||
hybrid,
|
||||
// do not use ..Default::default() here,
|
||||
// rather add any missing field from `SearchQuery` to `SearchQueryWithIndex`
|
||||
},
|
||||
@ -335,19 +381,44 @@ fn prepare_search<'t>(
|
||||
rtxn: &'t RoTxn,
|
||||
query: &'t SearchQuery,
|
||||
features: RoFeatures,
|
||||
distribution: Option<DistributionShift>,
|
||||
) -> Result<(milli::Search<'t>, bool, usize, usize), MeilisearchHttpError> {
|
||||
let mut search = index.search(rtxn);
|
||||
|
||||
if query.vector.is_some() && query.q.is_some() {
|
||||
warn!("Ignoring the query string `q` when used with the `vector` parameter.");
|
||||
if query.vector.is_some() {
|
||||
features.check_vector("Passing `vector` as a query parameter")?;
|
||||
}
|
||||
|
||||
if query.hybrid.is_some() {
|
||||
features.check_vector("Passing `hybrid` as a query parameter")?;
|
||||
}
|
||||
|
||||
if query.hybrid.is_none() && query.q.is_some() && query.vector.is_some() {
|
||||
return Err(MeilisearchHttpError::MissingSearchHybrid);
|
||||
}
|
||||
|
||||
search.distribution_shift(distribution);
|
||||
|
||||
if let Some(ref vector) = query.vector {
|
||||
search.vector(vector.clone());
|
||||
match &query.hybrid {
|
||||
// If semantic ratio is 0.0, only the query search will impact the search results,
|
||||
// skip the vector
|
||||
Some(hybrid) if *hybrid.semantic_ratio == 0.0 => (),
|
||||
_otherwise => {
|
||||
search.vector(vector.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(ref query) = query.q {
|
||||
search.query(query);
|
||||
if let Some(ref q) = query.q {
|
||||
match &query.hybrid {
|
||||
// If semantic ratio is 1.0, only the vector search will impact the search results,
|
||||
// skip the query
|
||||
Some(hybrid) if *hybrid.semantic_ratio == 1.0 => (),
|
||||
_otherwise => {
|
||||
search.query(q);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some(ref searchable) = query.attributes_to_search_on {
|
||||
@ -360,6 +431,7 @@ fn prepare_search<'t>(
|
||||
let max_total_hits = index
|
||||
.pagination_max_total_hits(rtxn)
|
||||
.map_err(milli::Error::from)?
|
||||
.map(|x| x as usize)
|
||||
.unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS);
|
||||
|
||||
search.exhaustive_number_hits(is_finite_pagination);
|
||||
@ -373,8 +445,8 @@ fn prepare_search<'t>(
|
||||
features.check_score_details()?;
|
||||
}
|
||||
|
||||
if query.vector.is_some() {
|
||||
features.check_vector()?;
|
||||
if let Some(HybridQuery { embedder: Some(embedder), .. }) = &query.hybrid {
|
||||
search.embedder_name(embedder);
|
||||
}
|
||||
|
||||
// compute the offset on the limit depending on the pagination mode.
|
||||
@ -420,15 +492,22 @@ pub fn perform_search(
|
||||
index: &Index,
|
||||
query: SearchQuery,
|
||||
features: RoFeatures,
|
||||
distribution: Option<DistributionShift>,
|
||||
) -> Result<SearchResult, MeilisearchHttpError> {
|
||||
let before_search = Instant::now();
|
||||
let rtxn = index.read_txn()?;
|
||||
|
||||
let (search, is_finite_pagination, max_total_hits, offset) =
|
||||
prepare_search(index, &rtxn, &query, features)?;
|
||||
prepare_search(index, &rtxn, &query, features, distribution)?;
|
||||
|
||||
let milli::SearchResult { documents_ids, matching_words, candidates, document_scores, .. } =
|
||||
search.execute()?;
|
||||
match &query.hybrid {
|
||||
Some(hybrid) => match *hybrid.semantic_ratio {
|
||||
ratio if ratio == 0.0 || ratio == 1.0 => search.execute()?,
|
||||
ratio => search.execute_hybrid(ratio)?,
|
||||
},
|
||||
None => search.execute()?,
|
||||
};
|
||||
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||
|
||||
@ -537,13 +616,17 @@ pub fn perform_search(
|
||||
insert_geo_distance(sort, &mut document);
|
||||
}
|
||||
|
||||
let semantic_score = match query.vector.as_ref() {
|
||||
Some(vector) => match extract_field("_vectors", &fields_ids_map, obkv)? {
|
||||
Some(vectors) => compute_semantic_score(vector, vectors)?,
|
||||
None => None,
|
||||
},
|
||||
None => None,
|
||||
};
|
||||
let mut semantic_score = None;
|
||||
for details in &score {
|
||||
if let ScoreDetails::Vector(score_details::Vector {
|
||||
target_vector: _,
|
||||
value_similarity: Some((_matching_vector, similarity)),
|
||||
}) = details
|
||||
{
|
||||
semantic_score = Some(*similarity);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
let ranking_score =
|
||||
query.show_ranking_score.then(|| ScoreDetails::global_score(score.iter()));
|
||||
@ -586,6 +669,7 @@ pub fn perform_search(
|
||||
let max_values_by_facet = index
|
||||
.max_values_per_facet(&rtxn)
|
||||
.map_err(milli::Error::from)?
|
||||
.map(|x| x as usize)
|
||||
.unwrap_or(DEFAULT_VALUES_PER_FACET);
|
||||
facet_distribution.max_values_per_facet(max_values_by_facet);
|
||||
|
||||
@ -645,11 +729,15 @@ pub fn perform_facet_search(
|
||||
let before_search = Instant::now();
|
||||
let rtxn = index.read_txn()?;
|
||||
|
||||
let (search, _, _, _) = prepare_search(index, &rtxn, &search_query, features)?;
|
||||
let mut facet_search = SearchForFacetValues::new(facet_name, search);
|
||||
let (search, _, _, _) = prepare_search(index, &rtxn, &search_query, features, None)?;
|
||||
let mut facet_search =
|
||||
SearchForFacetValues::new(facet_name, search, search_query.hybrid.is_some());
|
||||
if let Some(facet_query) = &facet_query {
|
||||
facet_search.query(facet_query);
|
||||
}
|
||||
if let Some(max_facets) = index.max_values_per_facet(&rtxn)? {
|
||||
facet_search.max_values(max_facets as usize);
|
||||
}
|
||||
|
||||
Ok(FacetSearchResult {
|
||||
facet_hits: facet_search.execute()?,
|
||||
@ -674,18 +762,6 @@ fn insert_geo_distance(sorts: &[String], document: &mut Document) {
|
||||
}
|
||||
}
|
||||
|
||||
fn compute_semantic_score(query: &[f32], vectors: Value) -> milli::Result<Option<f32>> {
|
||||
let vectors = serde_json::from_value(vectors)
|
||||
.map(VectorOrArrayOfVectors::into_array_of_vectors)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
Ok(vectors
|
||||
.into_iter()
|
||||
.flatten()
|
||||
.map(|v| OrderedFloat(dot_product_similarity(query, &v)))
|
||||
.max()
|
||||
.map(OrderedFloat::into_inner))
|
||||
}
|
||||
|
||||
fn compute_formatted_options(
|
||||
attr_to_highlight: &HashSet<String>,
|
||||
attr_to_crop: &[String],
|
||||
@ -813,22 +889,6 @@ fn make_document(
|
||||
Ok(document)
|
||||
}
|
||||
|
||||
/// Extract the JSON value under the field name specified
|
||||
/// but doesn't support nested objects.
|
||||
fn extract_field(
|
||||
field_name: &str,
|
||||
field_ids_map: &FieldsIdsMap,
|
||||
obkv: obkv::KvReaderU16,
|
||||
) -> Result<Option<serde_json::Value>, MeilisearchHttpError> {
|
||||
match field_ids_map.id(field_name) {
|
||||
Some(fid) => match obkv.get(fid) {
|
||||
Some(value) => Ok(serde_json::from_slice(value).map(Some)?),
|
||||
None => Ok(None),
|
||||
},
|
||||
None => Ok(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn format_fields<'a>(
|
||||
document: &Document,
|
||||
field_ids_map: &FieldsIdsMap,
|
||||
@ -840,6 +900,14 @@ fn format_fields<'a>(
|
||||
let mut matches_position = compute_matches.then(BTreeMap::new);
|
||||
let mut document = document.clone();
|
||||
|
||||
// reduce the formatted option list to the attributes that should be formatted,
|
||||
// instead of all the attributes to display.
|
||||
let formatting_fields_options: Vec<_> = formatted_options
|
||||
.iter()
|
||||
.filter(|(_, option)| option.should_format())
|
||||
.map(|(fid, option)| (field_ids_map.name(*fid).unwrap(), option))
|
||||
.collect();
|
||||
|
||||
// select the attributes to retrieve
|
||||
let displayable_names =
|
||||
displayable_ids.iter().map(|&fid| field_ids_map.name(fid).expect("Missing field name"));
|
||||
@ -848,13 +916,15 @@ fn format_fields<'a>(
|
||||
// to the value and merge them together. eg. If a user said he wanted to highlight `doggo`
|
||||
// and crop `doggo.name`. `doggo.name` needs to be highlighted + cropped while `doggo.age` is only
|
||||
// highlighted.
|
||||
let format = formatted_options
|
||||
// Warn: The time to compute the format list scales with the number of fields to format;
|
||||
// cumulated with map_leaf_values that iterates over all the nested fields, it gives a quadratic complexity:
|
||||
// d*f where d is the total number of fields to display and f is the total number of fields to format.
|
||||
let format = formatting_fields_options
|
||||
.iter()
|
||||
.filter(|(field, _option)| {
|
||||
let name = field_ids_map.name(**field).unwrap();
|
||||
.filter(|(name, _option)| {
|
||||
milli::is_faceted_by(name, key) || milli::is_faceted_by(key, name)
|
||||
})
|
||||
.map(|(_, option)| *option)
|
||||
.map(|(_, option)| **option)
|
||||
.reduce(|acc, option| acc.merge(option));
|
||||
let mut infos = Vec::new();
|
||||
|
||||
@ -951,7 +1021,7 @@ fn format_value<'a>(
|
||||
let value = matcher.format(format_options);
|
||||
Value::String(value.into_owned())
|
||||
}
|
||||
None => Value::Number(number),
|
||||
None => Value::String(s),
|
||||
}
|
||||
}
|
||||
value => value,
|
||||
|
Binary file not shown.
@ -5,9 +5,11 @@ pub mod service;
|
||||
|
||||
use std::fmt::{self, Display};
|
||||
|
||||
#[allow(unused)]
|
||||
pub use index::{GetAllDocumentsOptions, GetDocumentOptions};
|
||||
use meili_snap::json_string;
|
||||
use serde::{Deserialize, Serialize};
|
||||
#[allow(unused)]
|
||||
pub use server::{default_settings, Server};
|
||||
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
|
||||
@ -62,7 +64,7 @@ impl Display for Value {
|
||||
write!(
|
||||
f,
|
||||
"{}",
|
||||
json_string!(self, { ".enqueuedAt" => "[date]", ".processedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" })
|
||||
json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" })
|
||||
)
|
||||
}
|
||||
}
|
||||
|
@ -1760,6 +1760,181 @@ async fn add_documents_invalid_geo_field() {
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
// The three next tests are related to #4333
|
||||
|
||||
// _geo has a lat and lng but set to `null`
|
||||
let documents = json!([
|
||||
{
|
||||
"id": "12",
|
||||
"_geo": { "lng": null, "lat": 67}
|
||||
}
|
||||
]);
|
||||
|
||||
let (response, code) = index.add_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
let response = index.wait_task(response.uid()).await;
|
||||
snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
|
||||
@r###"
|
||||
{
|
||||
"uid": 14,
|
||||
"indexUid": "test",
|
||||
"status": "failed",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 0
|
||||
},
|
||||
"error": {
|
||||
"message": "Could not parse longitude in the document with the id: `12`. Was expecting a finite number but instead got `null`.",
|
||||
"code": "invalid_document_geo_field",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
|
||||
},
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
// _geo has a lat and lng but set to `null`
|
||||
let documents = json!([
|
||||
{
|
||||
"id": "12",
|
||||
"_geo": { "lng": 35, "lat": null }
|
||||
}
|
||||
]);
|
||||
|
||||
let (response, code) = index.add_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
let response = index.wait_task(response.uid()).await;
|
||||
snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
|
||||
@r###"
|
||||
{
|
||||
"uid": 15,
|
||||
"indexUid": "test",
|
||||
"status": "failed",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 0
|
||||
},
|
||||
"error": {
|
||||
"message": "Could not parse latitude in the document with the id: `12`. Was expecting a finite number but instead got `null`.",
|
||||
"code": "invalid_document_geo_field",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
|
||||
},
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
// _geo has a lat and lng but set to `null`
|
||||
let documents = json!([
|
||||
{
|
||||
"id": "13",
|
||||
"_geo": { "lng": null, "lat": null }
|
||||
}
|
||||
]);
|
||||
|
||||
let (response, code) = index.add_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
let response = index.wait_task(response.uid()).await;
|
||||
snapshot!(json_string!(response, { ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
|
||||
@r###"
|
||||
{
|
||||
"uid": 16,
|
||||
"indexUid": "test",
|
||||
"status": "failed",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 0
|
||||
},
|
||||
"error": {
|
||||
"message": "Could not parse latitude nor longitude in the document with the id: `13`. Was expecting finite numbers but instead got `null` and `null`.",
|
||||
"code": "invalid_document_geo_field",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
|
||||
},
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
// Related to #4333
|
||||
#[actix_rt::test]
|
||||
async fn add_invalid_geo_and_then_settings() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
index.create(Some("id")).await;
|
||||
|
||||
// _geo is not an object
|
||||
let documents = json!([
|
||||
{
|
||||
"id": "11",
|
||||
"_geo": { "lat": null, "lng": null },
|
||||
}
|
||||
]);
|
||||
let (ret, code) = index.add_documents(documents, None).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
let ret = index.wait_task(ret.uid()).await;
|
||||
snapshot!(ret, @r###"
|
||||
{
|
||||
"uid": 1,
|
||||
"indexUid": "test",
|
||||
"status": "succeeded",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 1
|
||||
},
|
||||
"error": null,
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
|
||||
let (ret, code) = index.update_settings(json!({"sortableAttributes": ["_geo"]})).await;
|
||||
snapshot!(code, @"202 Accepted");
|
||||
let ret = index.wait_task(ret.uid()).await;
|
||||
snapshot!(ret, @r###"
|
||||
{
|
||||
"uid": 2,
|
||||
"indexUid": "test",
|
||||
"status": "failed",
|
||||
"type": "settingsUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"sortableAttributes": [
|
||||
"_geo"
|
||||
]
|
||||
},
|
||||
"error": {
|
||||
"message": "Could not parse latitude in the document with the id: `\"11\"`. Was expecting a finite number but instead got `null`.",
|
||||
"code": "invalid_document_geo_field",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_document_geo_field"
|
||||
},
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
|
@ -20,6 +20,8 @@ pub enum GetDump {
|
||||
RubyGemsWithSettingsV4,
|
||||
|
||||
TestV5,
|
||||
|
||||
TestV6WithExperimental,
|
||||
}
|
||||
|
||||
impl GetDump {
|
||||
@ -68,6 +70,10 @@ impl GetDump {
|
||||
GetDump::TestV5 => {
|
||||
exist_relative_path!("tests/assets/v5_v0.28.0_test_dump.dump").into()
|
||||
}
|
||||
GetDump::TestV6WithExperimental => exist_relative_path!(
|
||||
"tests/assets/v6_v1.6.0_use_deactivated_experimental_setting.dump"
|
||||
)
|
||||
.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -59,6 +59,7 @@ async fn import_dump_v1_movie_raw() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -219,6 +220,7 @@ async fn import_dump_v1_movie_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -365,6 +367,7 @@ async fn import_dump_v1_rubygems_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -497,6 +500,7 @@ async fn import_dump_v2_movie_raw() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -641,6 +645,7 @@ async fn import_dump_v2_movie_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -784,6 +789,7 @@ async fn import_dump_v2_rubygems_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -916,6 +922,7 @@ async fn import_dump_v3_movie_raw() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -1060,6 +1067,7 @@ async fn import_dump_v3_movie_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -1203,6 +1211,7 @@ async fn import_dump_v3_rubygems_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -1335,6 +1344,7 @@ async fn import_dump_v4_movie_raw() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -1479,6 +1489,7 @@ async fn import_dump_v4_movie_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -1622,6 +1633,7 @@ async fn import_dump_v4_rubygems_with_settings() {
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byWord",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
@ -1810,3 +1822,107 @@ async fn import_dump_v5() {
|
||||
json_string!(tasks, { ".results[].details.dumpUid" => "[uid]", ".results[].duration" => "[duration]" , ".results[].startedAt" => "[date]" , ".results[].finishedAt" => "[date]" })
|
||||
);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn import_dump_v6_containing_experimental_features() {
|
||||
let temp = tempfile::tempdir().unwrap();
|
||||
|
||||
let options = Opt {
|
||||
import_dump: Some(GetDump::TestV6WithExperimental.path()),
|
||||
..default_settings(temp.path())
|
||||
};
|
||||
let mut server = Server::new_auth_with_options(options, temp).await;
|
||||
server.use_api_key("MASTER_KEY");
|
||||
|
||||
let (indexes, code) = server.list_indexes(None, None).await;
|
||||
assert_eq!(code, 200, "{indexes}");
|
||||
|
||||
assert_eq!(indexes["results"].as_array().unwrap().len(), 1);
|
||||
assert_eq!(indexes["results"][0]["uid"], json!("movies"));
|
||||
assert_eq!(indexes["results"][0]["primaryKey"], json!("id"));
|
||||
|
||||
let (response, code) = server.get_features().await;
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"scoreDetails": false,
|
||||
"vectorStore": false,
|
||||
"metrics": false,
|
||||
"exportPuffinReports": false
|
||||
}
|
||||
"###);
|
||||
|
||||
let index = server.index("movies");
|
||||
|
||||
let (response, code) = index.settings().await;
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"displayedAttributes": [
|
||||
"*"
|
||||
],
|
||||
"searchableAttributes": [
|
||||
"*"
|
||||
],
|
||||
"filterableAttributes": [],
|
||||
"sortableAttributes": [],
|
||||
"rankingRules": [
|
||||
"words",
|
||||
"typo",
|
||||
"proximity"
|
||||
],
|
||||
"stopWords": [],
|
||||
"nonSeparatorTokens": [],
|
||||
"separatorTokens": [],
|
||||
"dictionary": [],
|
||||
"synonyms": {},
|
||||
"distinctAttribute": null,
|
||||
"proximityPrecision": "byAttribute",
|
||||
"typoTolerance": {
|
||||
"enabled": true,
|
||||
"minWordSizeForTypos": {
|
||||
"oneTypo": 5,
|
||||
"twoTypos": 9
|
||||
},
|
||||
"disableOnWords": [],
|
||||
"disableOnAttributes": []
|
||||
},
|
||||
"faceting": {
|
||||
"maxValuesPerFacet": 100,
|
||||
"sortFacetValuesBy": {
|
||||
"*": "alpha"
|
||||
}
|
||||
},
|
||||
"pagination": {
|
||||
"maxTotalHits": 1000
|
||||
}
|
||||
}
|
||||
"###);
|
||||
|
||||
// the expected order is [1, 3, 2] instead of [3, 1, 2]
|
||||
// because the attribute scale doesn't make the difference between 1 and 3.
|
||||
index
|
||||
.search(json!({"q": "the soup of day"}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"a": "Soup of day",
|
||||
"b": "many the lazy fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
@ -4,23 +4,111 @@ use once_cell::sync::Lazy;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{"productId": 1, "shopId": 1},
|
||||
{"productId": 2, "shopId": 1},
|
||||
{"productId": 3, "shopId": 2},
|
||||
{"productId": 4, "shopId": 2},
|
||||
{"productId": 5, "shopId": 3},
|
||||
{"productId": 6, "shopId": 3},
|
||||
{"productId": 7, "shopId": 4},
|
||||
{"productId": 8, "shopId": 4},
|
||||
{"productId": 9, "shopId": 5},
|
||||
{"productId": 10, "shopId": 5}
|
||||
{
|
||||
"id": 1,
|
||||
"description": "Leather Jacket",
|
||||
"brand": "Lee Jeans",
|
||||
"product_id": "123456",
|
||||
"color": "Brown"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"description": "Leather Jacket",
|
||||
"brand": "Lee Jeans",
|
||||
"product_id": "123456",
|
||||
"color": "Black"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"description": "Leather Jacket",
|
||||
"brand": "Lee Jeans",
|
||||
"product_id": "123456",
|
||||
"color": "Blue"
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"description": "T-Shirt",
|
||||
"brand": "Nike",
|
||||
"product_id": "789012",
|
||||
"color": "Red"
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"description": "T-Shirt",
|
||||
"brand": "Nike",
|
||||
"product_id": "789012",
|
||||
"color": "Blue"
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"description": "Running Shoes",
|
||||
"brand": "Adidas",
|
||||
"product_id": "456789",
|
||||
"color": "Black"
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"description": "Running Shoes",
|
||||
"brand": "Adidas",
|
||||
"product_id": "456789",
|
||||
"color": "White"
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"description": "Hoodie",
|
||||
"brand": "Puma",
|
||||
"product_id": "987654",
|
||||
"color": "Gray"
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"description": "Sweater",
|
||||
"brand": "Gap",
|
||||
"product_id": "234567",
|
||||
"color": "Green"
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"description": "Sweater",
|
||||
"brand": "Gap",
|
||||
"product_id": "234567",
|
||||
"color": "Red"
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"description": "Sweater",
|
||||
"brand": "Gap",
|
||||
"product_id": "234567",
|
||||
"color": "Blue"
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"description": "Jeans",
|
||||
"brand": "Levi's",
|
||||
"product_id": "345678",
|
||||
"color": "Indigo"
|
||||
},
|
||||
{
|
||||
"id": 13,
|
||||
"description": "Jeans",
|
||||
"brand": "Levi's",
|
||||
"product_id": "345678",
|
||||
"color": "Black"
|
||||
},
|
||||
{
|
||||
"id": 14,
|
||||
"description": "Jeans",
|
||||
"brand": "Levi's",
|
||||
"product_id": "345678",
|
||||
"color": "Stone Wash"
|
||||
}
|
||||
])
|
||||
});
|
||||
|
||||
pub(self) static DOCUMENT_PRIMARY_KEY: &str = "productId";
|
||||
pub(self) static DOCUMENT_DISTINCT_KEY: &str = "shopId";
|
||||
static DOCUMENT_PRIMARY_KEY: &str = "id";
|
||||
static DOCUMENT_DISTINCT_KEY: &str = "product_id";
|
||||
|
||||
/// testing: https://github.com/meilisearch/meilisearch/issues/4078
|
||||
#[actix_rt::test]
|
||||
@ -33,31 +121,121 @@ async fn distinct_search_with_offset_no_ranking() {
|
||||
index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
fn get_hits(Value(response): Value) -> Vec<i64> {
|
||||
fn get_hits(response: &Value) -> Vec<&str> {
|
||||
let hits_array = response["hits"].as_array().unwrap();
|
||||
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_i64().unwrap()).collect::<Vec<_>>()
|
||||
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
let (response, code) = index.search_post(json!({"limit": 2, "offset": 0})).await;
|
||||
let hits = get_hits(response);
|
||||
let (response, code) = index.search_post(json!({"offset": 0, "limit": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"2");
|
||||
snapshot!(format!("{:?}", hits), @"[1, 2]");
|
||||
snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#);
|
||||
snapshot!(response["estimatedTotalHits"] , @"11");
|
||||
|
||||
let (response, code) = index.search_post(json!({"limit": 2, "offset": 2})).await;
|
||||
let hits = get_hits(response);
|
||||
let (response, code) = index.search_post(json!({"offset": 2, "limit": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"2");
|
||||
snapshot!(format!("{:?}", hits), @"[3, 4]");
|
||||
snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#);
|
||||
snapshot!(response["estimatedTotalHits"], @"10");
|
||||
|
||||
let (response, code) = index.search_post(json!({"limit": 10, "offset": 4})).await;
|
||||
let hits = get_hits(response);
|
||||
let (response, code) = index.search_post(json!({"offset": 4, "limit": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"2");
|
||||
snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#);
|
||||
snapshot!(response["estimatedTotalHits"], @"6");
|
||||
|
||||
let (response, code) = index.search_post(json!({"offset": 5, "limit": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"1");
|
||||
snapshot!(format!("{:?}", hits), @"[5]");
|
||||
snapshot!(format!("{:?}", hits), @r#"["345678"]"#);
|
||||
snapshot!(response["estimatedTotalHits"], @"6");
|
||||
|
||||
let (response, code) = index.search_post(json!({"limit": 10, "offset": 5})).await;
|
||||
let hits = get_hits(response);
|
||||
let (response, code) = index.search_post(json!({"offset": 6, "limit": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"0");
|
||||
snapshot!(format!("{:?}", hits), @r#"[]"#);
|
||||
snapshot!(response["estimatedTotalHits"], @"6");
|
||||
|
||||
let (response, code) = index.search_post(json!({"offset": 7, "limit": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"0");
|
||||
snapshot!(format!("{:?}", hits), @r#"[]"#);
|
||||
snapshot!(response["estimatedTotalHits"], @"6");
|
||||
}
|
||||
|
||||
/// testing: https://github.com/meilisearch/meilisearch/issues/4130
|
||||
#[actix_rt::test]
|
||||
async fn distinct_search_with_pagination_no_ranking() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let documents = DOCUMENTS.clone();
|
||||
index.add_documents(documents, Some(DOCUMENT_PRIMARY_KEY)).await;
|
||||
index.update_distinct_attribute(json!(DOCUMENT_DISTINCT_KEY)).await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
fn get_hits(response: &Value) -> Vec<&str> {
|
||||
let hits_array = response["hits"].as_array().unwrap();
|
||||
hits_array.iter().map(|h| h[DOCUMENT_DISTINCT_KEY].as_str().unwrap()).collect::<Vec<_>>()
|
||||
}
|
||||
|
||||
let (response, code) = index.search_post(json!({"page": 0, "hitsPerPage": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"0");
|
||||
snapshot!(format!("{:?}", hits), @r#"[]"#);
|
||||
snapshot!(response["page"], @"0");
|
||||
snapshot!(response["totalPages"], @"3");
|
||||
snapshot!(response["totalHits"], @"6");
|
||||
|
||||
let (response, code) = index.search_post(json!({"page": 1, "hitsPerPage": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"2");
|
||||
snapshot!(format!("{:?}", hits), @r#"["123456", "789012"]"#);
|
||||
snapshot!(response["page"], @"1");
|
||||
snapshot!(response["totalPages"], @"3");
|
||||
snapshot!(response["totalHits"], @"6");
|
||||
|
||||
let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"2");
|
||||
snapshot!(format!("{:?}", hits), @r#"["456789", "987654"]"#);
|
||||
snapshot!(response["page"], @"2");
|
||||
snapshot!(response["totalPages"], @"3");
|
||||
snapshot!(response["totalHits"], @"6");
|
||||
|
||||
let (response, code) = index.search_post(json!({"page": 3, "hitsPerPage": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"2");
|
||||
snapshot!(format!("{:?}", hits), @r#"["234567", "345678"]"#);
|
||||
snapshot!(response["page"], @"3");
|
||||
snapshot!(response["totalPages"], @"3");
|
||||
snapshot!(response["totalHits"], @"6");
|
||||
|
||||
let (response, code) = index.search_post(json!({"page": 4, "hitsPerPage": 2})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"0");
|
||||
snapshot!(format!("{:?}", hits), @r#"[]"#);
|
||||
snapshot!(response["page"], @"4");
|
||||
snapshot!(response["totalPages"], @"3");
|
||||
snapshot!(response["totalHits"], @"6");
|
||||
|
||||
let (response, code) = index.search_post(json!({"page": 2, "hitsPerPage": 3})).await;
|
||||
let hits = get_hits(&response);
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(hits.len(), @"3");
|
||||
snapshot!(format!("{:?}", hits), @r#"["987654", "234567", "345678"]"#);
|
||||
snapshot!(response["page"], @"2");
|
||||
snapshot!(response["totalPages"], @"2");
|
||||
snapshot!(response["totalHits"], @"6");
|
||||
}
|
||||
|
@ -4,7 +4,7 @@ use once_cell::sync::Lazy;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{
|
||||
"title": "Shazam!",
|
||||
@ -105,6 +105,24 @@ async fn more_advanced_facet_search() {
|
||||
snapshot!(response["facetHits"].as_array().unwrap().len(), @"1");
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_facet_search_with_max_values() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
let documents = DOCUMENTS.clone();
|
||||
index.update_settings_faceting(json!({ "maxValuesPerFacet": 1 })).await;
|
||||
index.update_settings_filterable_attributes(json!(["genres"])).await;
|
||||
index.add_documents(documents, None).await;
|
||||
index.wait_task(2).await;
|
||||
|
||||
let (response, code) =
|
||||
index.facet_search(json!({"facetName": "genres", "facetQuery": "a"})).await;
|
||||
|
||||
assert_eq!(code, 200, "{}", response);
|
||||
assert_eq!(dbg!(response)["facetHits"].as_array().unwrap().len(), 1);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn non_filterable_facet_search_error() {
|
||||
let server = Server::new().await;
|
||||
|
@ -4,7 +4,7 @@ use once_cell::sync::Lazy;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{
|
||||
"id": 1,
|
||||
|
221
meilisearch/tests/search/hybrid.rs
Normal file
221
meilisearch/tests/search/hybrid.rs
Normal file
@ -0,0 +1,221 @@
|
||||
use meili_snap::snapshot;
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use crate::common::index::Index;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
|
||||
let index = server.index("test");
|
||||
|
||||
let (response, code) = server.set_features(json!({"vectorStore": true})).await;
|
||||
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"scoreDetails": false,
|
||||
"vectorStore": true,
|
||||
"metrics": false,
|
||||
"exportPuffinReports": false
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(json!({ "embedders": {"default": {
|
||||
"source": "userProvided",
|
||||
"dimensions": 2}}} ))
|
||||
.await;
|
||||
assert_eq!(202, code, "{:?}", response);
|
||||
index.wait_task(response.uid()).await;
|
||||
|
||||
let (response, code) = index.add_documents(documents.clone(), None).await;
|
||||
assert_eq!(202, code, "{:?}", response);
|
||||
index.wait_task(response.uid()).await;
|
||||
index
|
||||
}
|
||||
|
||||
static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{
|
||||
"title": "Shazam!",
|
||||
"desc": "a Captain Marvel ersatz",
|
||||
"id": "1",
|
||||
"_vectors": {"default": [1.0, 3.0]},
|
||||
},
|
||||
{
|
||||
"title": "Captain Planet",
|
||||
"desc": "He's not part of the Marvel Cinematic Universe",
|
||||
"id": "2",
|
||||
"_vectors": {"default": [1.0, 2.0]},
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "3",
|
||||
"_vectors": {"default": [2.0, 3.0]},
|
||||
}])
|
||||
});
|
||||
|
||||
static SINGLE_DOCUMENT: Lazy<Value> = Lazy::new(|| {
|
||||
json!([{
|
||||
"title": "Shazam!",
|
||||
"desc": "a Captain Marvel ersatz",
|
||||
"id": "1",
|
||||
"_vectors": {"default": [1.0, 3.0]},
|
||||
}])
|
||||
});
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_search() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_semanticScore":0.9472136}]"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn highlighter() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
|
||||
"hybrid": {"semanticRatio": 0.2},
|
||||
"attributesToHighlight": [
|
||||
"desc"
|
||||
],
|
||||
"highlightPreTag": "**BEGIN**",
|
||||
"highlightPostTag": "**END**"
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
|
||||
"hybrid": {"semanticRatio": 0.8},
|
||||
"attributesToHighlight": [
|
||||
"desc"
|
||||
],
|
||||
"highlightPreTag": "**BEGIN**",
|
||||
"highlightPostTag": "**END**"
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_semanticScore":0.9472136}]"###);
|
||||
|
||||
// no highlighting on full semantic
|
||||
let (response, code) = index
|
||||
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
|
||||
"hybrid": {"semanticRatio": 1.0},
|
||||
"attributesToHighlight": [
|
||||
"desc"
|
||||
],
|
||||
"highlightPreTag": "**BEGIN**",
|
||||
"highlightPostTag": "**END**"
|
||||
}))
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_semanticScore":0.99029034},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_semanticScore":0.97434163},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}}]"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn invalid_semantic_ratio() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 1.2}}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "Invalid value at `.hybrid.semanticRatio`: the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`.",
|
||||
"code": "invalid_search_semantic_ratio",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_search_semantic_ratio"
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": -0.8}}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "Invalid value at `.hybrid.semanticRatio`: the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`.",
|
||||
"code": "invalid_search_semantic_ratio",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_search_semantic_ratio"
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_get(
|
||||
&yaup::to_string(
|
||||
&json!({"q": "Captain", "vector": [1.0, 1.0], "hybridSemanticRatio": 1.2}),
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "Invalid value in parameter `hybridSemanticRatio`: the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`.",
|
||||
"code": "invalid_search_semantic_ratio",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_search_semantic_ratio"
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.search_get(
|
||||
&yaup::to_string(
|
||||
&json!({"q": "Captain", "vector": [1.0, 1.0], "hybridSemanticRatio": -0.2}),
|
||||
)
|
||||
.unwrap(),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"400 Bad Request");
|
||||
snapshot!(response, @r###"
|
||||
{
|
||||
"message": "Invalid value in parameter `hybridSemanticRatio`: the value of `semanticRatio` is invalid, expected a float between `0.0` and `1.0`.",
|
||||
"code": "invalid_search_semantic_ratio",
|
||||
"type": "invalid_request",
|
||||
"link": "https://docs.meilisearch.com/errors#invalid_search_semantic_ratio"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn single_document() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SINGLE_DOCUMENT).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}),
|
||||
)
|
||||
.await;
|
||||
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0,"_semanticScore":1.0}"###);
|
||||
}
|
@ -6,6 +6,7 @@ mod errors;
|
||||
mod facet_search;
|
||||
mod formatted;
|
||||
mod geo;
|
||||
mod hybrid;
|
||||
mod multi;
|
||||
mod pagination;
|
||||
mod restrict_searchable;
|
||||
@ -15,32 +16,37 @@ use once_cell::sync::Lazy;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
pub(self) static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
static DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{
|
||||
"title": "Shazam!",
|
||||
"id": "287947",
|
||||
"_vectors": { "manual": [1, 2, 3]},
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"id": "299537",
|
||||
"_vectors": { "manual": [1, 2, 54] },
|
||||
},
|
||||
{
|
||||
"title": "Escape Room",
|
||||
"id": "522681",
|
||||
"_vectors": { "manual": [10, -23, 32] },
|
||||
},
|
||||
{
|
||||
"title": "How to Train Your Dragon: The Hidden World",
|
||||
"id": "166428",
|
||||
"_vectors": { "manual": [-100, 231, 32] },
|
||||
},
|
||||
{
|
||||
"title": "Gläss",
|
||||
"id": "450465",
|
||||
"_vectors": { "manual": [-100, 340, 90] },
|
||||
}
|
||||
])
|
||||
});
|
||||
|
||||
pub(self) static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{
|
||||
"id": 852,
|
||||
@ -57,6 +63,7 @@ pub(self) static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
},
|
||||
],
|
||||
"cattos": "pésti",
|
||||
"_vectors": { "manual": [1, 2, 3]},
|
||||
},
|
||||
{
|
||||
"id": 654,
|
||||
@ -69,12 +76,14 @@ pub(self) static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
},
|
||||
],
|
||||
"cattos": ["simba", "pestiféré"],
|
||||
"_vectors": { "manual": [1, 2, 54] },
|
||||
},
|
||||
{
|
||||
"id": 750,
|
||||
"father": "romain",
|
||||
"mother": "michelle",
|
||||
"cattos": ["enigma"],
|
||||
"_vectors": { "manual": [10, 23, 32] },
|
||||
},
|
||||
{
|
||||
"id": 951,
|
||||
@ -91,6 +100,7 @@ pub(self) static NESTED_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
},
|
||||
],
|
||||
"cattos": ["moumoute", "gomez"],
|
||||
"_vectors": { "manual": [10, 23, 32] },
|
||||
},
|
||||
])
|
||||
});
|
||||
@ -802,6 +812,13 @@ async fn experimental_feature_score_details() {
|
||||
{
|
||||
"title": "How to Train Your Dragon: The Hidden World",
|
||||
"id": "166428",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
231,
|
||||
32
|
||||
]
|
||||
},
|
||||
"_rankingScoreDetails": {
|
||||
"words": {
|
||||
"order": 0,
|
||||
@ -823,7 +840,7 @@ async fn experimental_feature_score_details() {
|
||||
"order": 3,
|
||||
"attributeRankingOrderScore": 1.0,
|
||||
"queryWordDistanceScore": 0.8095238095238095,
|
||||
"score": 0.9365079365079364
|
||||
"score": 0.9727891156462584
|
||||
},
|
||||
"exactness": {
|
||||
"order": 4,
|
||||
@ -870,13 +887,100 @@ async fn experimental_feature_vector_store() {
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(response["vectorStore"], @"true");
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(json!({"embedders": {
|
||||
"manual": {
|
||||
"source": "userProvided",
|
||||
"dimensions": 3,
|
||||
}
|
||||
}}))
|
||||
.await;
|
||||
|
||||
meili_snap::snapshot!(response, @r###"
|
||||
{
|
||||
"taskUid": 1,
|
||||
"indexUid": "test",
|
||||
"status": "enqueued",
|
||||
"type": "settingsUpdate",
|
||||
"enqueuedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
meili_snap::snapshot!(code, @"202 Accepted");
|
||||
let response = index.wait_task(response.uid()).await;
|
||||
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response["status"]), @"\"succeeded\"");
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({
|
||||
"vector": [1.0, 2.0, 3.0],
|
||||
}))
|
||||
.await;
|
||||
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @"[]");
|
||||
// vector search returns all documents that don't have vectors in the last bucket, like all sorts
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"title": "Shazam!",
|
||||
"id": "287947",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1,
|
||||
2,
|
||||
3
|
||||
]
|
||||
},
|
||||
"_semanticScore": 1.0
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"id": "299537",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1,
|
||||
2,
|
||||
54
|
||||
]
|
||||
},
|
||||
"_semanticScore": 0.9129112
|
||||
},
|
||||
{
|
||||
"title": "Gläss",
|
||||
"id": "450465",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
340,
|
||||
90
|
||||
]
|
||||
},
|
||||
"_semanticScore": 0.8106413
|
||||
},
|
||||
{
|
||||
"title": "How to Train Your Dragon: The Hidden World",
|
||||
"id": "166428",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
231,
|
||||
32
|
||||
]
|
||||
},
|
||||
"_semanticScore": 0.74120104
|
||||
},
|
||||
{
|
||||
"title": "Escape Room",
|
||||
"id": "522681",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
10,
|
||||
-23,
|
||||
32
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
"###);
|
||||
}
|
||||
|
||||
#[cfg(feature = "default")]
|
||||
@ -1126,7 +1230,14 @@ async fn simple_search_with_strange_synonyms() {
|
||||
[
|
||||
{
|
||||
"title": "How to Train Your Dragon: The Hidden World",
|
||||
"id": "166428"
|
||||
"id": "166428",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
231,
|
||||
32
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
"###);
|
||||
@ -1140,7 +1251,14 @@ async fn simple_search_with_strange_synonyms() {
|
||||
[
|
||||
{
|
||||
"title": "How to Train Your Dragon: The Hidden World",
|
||||
"id": "166428"
|
||||
"id": "166428",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
231,
|
||||
32
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
"###);
|
||||
@ -1154,7 +1272,14 @@ async fn simple_search_with_strange_synonyms() {
|
||||
[
|
||||
{
|
||||
"title": "How to Train Your Dragon: The Hidden World",
|
||||
"id": "166428"
|
||||
"id": "166428",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
231,
|
||||
32
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
"###);
|
||||
|
@ -72,7 +72,14 @@ async fn simple_search_single_index() {
|
||||
"hits": [
|
||||
{
|
||||
"title": "Gläss",
|
||||
"id": "450465"
|
||||
"id": "450465",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
340,
|
||||
90
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "glass",
|
||||
@ -86,7 +93,14 @@ async fn simple_search_single_index() {
|
||||
"hits": [
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"id": "299537"
|
||||
"id": "299537",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1,
|
||||
2,
|
||||
54
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "captain",
|
||||
@ -177,7 +191,14 @@ async fn simple_search_two_indexes() {
|
||||
"hits": [
|
||||
{
|
||||
"title": "Gläss",
|
||||
"id": "450465"
|
||||
"id": "450465",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
340,
|
||||
90
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "glass",
|
||||
@ -203,7 +224,14 @@ async fn simple_search_two_indexes() {
|
||||
"age": 4
|
||||
}
|
||||
],
|
||||
"cattos": "pésti"
|
||||
"cattos": "pésti",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1,
|
||||
2,
|
||||
3
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 654,
|
||||
@ -218,7 +246,14 @@ async fn simple_search_two_indexes() {
|
||||
"cattos": [
|
||||
"simba",
|
||||
"pestiféré"
|
||||
]
|
||||
],
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1,
|
||||
2,
|
||||
54
|
||||
]
|
||||
}
|
||||
}
|
||||
],
|
||||
"query": "pésti",
|
||||
|
@ -335,3 +335,35 @@ async fn exactness_ranking_rule_order() {
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn search_on_exact_field() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(
|
||||
&server,
|
||||
&json!([
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"exact": "Captain Marivel",
|
||||
"id": "1",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marivel",
|
||||
"exact": "Captain the Marvel",
|
||||
"id": "2",
|
||||
}]),
|
||||
)
|
||||
.await;
|
||||
|
||||
let (response, code) =
|
||||
index.update_settings_typo_tolerance(json!({ "disableOnAttributes": ["exact"] })).await;
|
||||
assert_eq!(202, code, "{:?}", response);
|
||||
index.wait_task(1).await;
|
||||
// Searching on an exact attribute should only return the document matching without typo.
|
||||
index
|
||||
.search(json!({"q": "Marvel", "attributesToSearchOn": ["exact"]}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"].as_array().unwrap().len(), @"1");
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
@ -54,7 +54,7 @@ async fn get_settings() {
|
||||
let (response, code) = index.settings().await;
|
||||
assert_eq!(code, 200);
|
||||
let settings = response.as_object().unwrap();
|
||||
assert_eq!(settings.keys().len(), 14);
|
||||
assert_eq!(settings.keys().len(), 15);
|
||||
assert_eq!(settings["displayedAttributes"], json!(["*"]));
|
||||
assert_eq!(settings["searchableAttributes"], json!(["*"]));
|
||||
assert_eq!(settings["filterableAttributes"], json!([]));
|
||||
@ -83,6 +83,7 @@ async fn get_settings() {
|
||||
"maxTotalHits": 1000,
|
||||
})
|
||||
);
|
||||
assert_eq!(settings["proximityPrecision"], json!("byWord"));
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
|
@ -1,4 +1,5 @@
|
||||
mod distinct;
|
||||
mod errors;
|
||||
mod get_settings;
|
||||
mod proximity_settings;
|
||||
mod tokenizer_customization;
|
||||
|
352
meilisearch/tests/settings/proximity_settings.rs
Normal file
352
meilisearch/tests/settings/proximity_settings.rs
Normal file
@ -0,0 +1,352 @@
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use once_cell::sync::Lazy;
|
||||
|
||||
use crate::common::Server;
|
||||
use crate::json;
|
||||
|
||||
static DOCUMENTS: Lazy<crate::common::Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish",
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"a": "Soup of day",
|
||||
"b": "many the lazy fish",
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish",
|
||||
},
|
||||
])
|
||||
});
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn attribute_scale_search() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
index.add_documents(DOCUMENTS.clone(), None).await;
|
||||
index.wait_task(0).await;
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(json!({
|
||||
"proximityPrecision": "byAttribute",
|
||||
"rankingRules": ["words", "typo", "proximity"],
|
||||
}))
|
||||
.await;
|
||||
assert_eq!("202", code.as_str(), "{:?}", response);
|
||||
index.wait_task(1).await;
|
||||
|
||||
// the expected order is [1, 3, 2] instead of [3, 1, 2]
|
||||
// because the attribute scale doesn't make the difference between 1 and 3.
|
||||
index
|
||||
.search(json!({"q": "the soup of day"}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"a": "Soup of day",
|
||||
"b": "many the lazy fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
|
||||
// the expected order is [1, 2, 3] instead of [1, 3, 2]
|
||||
// because the attribute scale sees all the word in the same attribute
|
||||
// and so doesn't make the difference between the documents.
|
||||
index
|
||||
.search(json!({"q": "many the fish"}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"a": "Soup of day",
|
||||
"b": "many the lazy fish"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn attribute_scale_phrase_search() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
index.add_documents(DOCUMENTS.clone(), None).await;
|
||||
index.wait_task(0).await;
|
||||
|
||||
let (_response, _code) = index
|
||||
.update_settings(json!({
|
||||
"proximityPrecision": "byAttribute",
|
||||
"rankingRules": ["words", "typo", "proximity"],
|
||||
}))
|
||||
.await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
// the expected order is [1, 3] instead of [3, 1]
|
||||
// because the attribute scale doesn't make the difference between 1 and 3.
|
||||
// But 2 shouldn't be returned because "the" is not in the same attribute.
|
||||
index
|
||||
.search(json!({"q": "\"the soup of day\""}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
|
||||
// the expected order is [1, 2, 3] instead of [1, 3]
|
||||
// because the attribute scale sees all the word in the same attribute
|
||||
// and so doesn't make the difference between the documents.
|
||||
index
|
||||
.search(json!({"q": "\"many the fish\""}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"a": "Soup of day",
|
||||
"b": "many the lazy fish"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn word_scale_set_and_reset() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
index.add_documents(DOCUMENTS.clone(), None).await;
|
||||
index.wait_task(0).await;
|
||||
|
||||
// Set and reset the setting ensuring the swap between the 2 settings is applied.
|
||||
let (_response, _code) = index
|
||||
.update_settings(json!({
|
||||
"proximityPrecision": "byAttribute",
|
||||
"rankingRules": ["words", "typo", "proximity"],
|
||||
}))
|
||||
.await;
|
||||
index.wait_task(1).await;
|
||||
|
||||
let (_response, _code) = index
|
||||
.update_settings(json!({
|
||||
"proximityPrecision": "byWord",
|
||||
"rankingRules": ["words", "typo", "proximity"],
|
||||
}))
|
||||
.await;
|
||||
index.wait_task(2).await;
|
||||
|
||||
// [3, 1, 2]
|
||||
index
|
||||
.search(json!({"q": "the soup of day"}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"a": "Soup of day",
|
||||
"b": "many the lazy fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
|
||||
// [1, 3, 2]
|
||||
index
|
||||
.search(json!({"q": "many the fish"}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"a": "Soup of day",
|
||||
"b": "many the lazy fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
|
||||
// [3]
|
||||
index
|
||||
.search(json!({"q": "\"the soup of day\""}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
|
||||
// [1, 3]
|
||||
index
|
||||
.search(json!({"q": "\"many the fish\""}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn attribute_scale_default_ranking_rules() {
|
||||
let server = Server::new().await;
|
||||
let index = server.index("test");
|
||||
|
||||
index.add_documents(DOCUMENTS.clone(), None).await;
|
||||
index.wait_task(0).await;
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(json!({
|
||||
"proximityPrecision": "byAttribute"
|
||||
}))
|
||||
.await;
|
||||
assert_eq!("202", code.as_str(), "{:?}", response);
|
||||
index.wait_task(1).await;
|
||||
|
||||
// the expected order is [3, 1, 2]
|
||||
index
|
||||
.search(json!({"q": "the soup of day"}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"a": "Soup of day",
|
||||
"b": "many the lazy fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
|
||||
// the expected order is [1, 3, 2] instead of [1, 3]
|
||||
// because the attribute scale sees all the word in the same attribute
|
||||
// and so doesn't remove the document 2.
|
||||
index
|
||||
.search(json!({"q": "\"many the fish\""}), |response, code| {
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(json_string!(response["hits"]), @r###"
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"a": "Soup of the day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"a": "the Soup of day",
|
||||
"b": "many the fish"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"a": "Soup of day",
|
||||
"b": "many the lazy fish"
|
||||
}
|
||||
]
|
||||
"###);
|
||||
})
|
||||
.await;
|
||||
}
|
@ -1,4 +1,5 @@
|
||||
mod errors;
|
||||
mod webhook;
|
||||
|
||||
use meili_snap::insta::assert_json_snapshot;
|
||||
use time::format_description::well_known::Rfc3339;
|
||||
|
123
meilisearch/tests/tasks/webhook.rs
Normal file
123
meilisearch/tests/tasks/webhook.rs
Normal file
@ -0,0 +1,123 @@
|
||||
//! To test the webhook, we need to spawn a new server with a URL listening for
|
||||
//! post requests. The webhook handle starts a server and forwards all the
|
||||
//! received requests into a channel for you to handle.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use actix_http::body::MessageBody;
|
||||
use actix_web::dev::{ServiceFactory, ServiceResponse};
|
||||
use actix_web::web::{Bytes, Data};
|
||||
use actix_web::{post, App, HttpResponse, HttpServer};
|
||||
use meili_snap::{json_string, snapshot};
|
||||
use meilisearch::Opt;
|
||||
use tokio::sync::mpsc;
|
||||
use url::Url;
|
||||
|
||||
use crate::common::{default_settings, Server};
|
||||
use crate::json;
|
||||
|
||||
#[post("/")]
|
||||
async fn forward_body(sender: Data<mpsc::UnboundedSender<Vec<u8>>>, body: Bytes) -> HttpResponse {
|
||||
let body = body.to_vec();
|
||||
sender.send(body).unwrap();
|
||||
HttpResponse::Ok().into()
|
||||
}
|
||||
|
||||
fn create_app(
|
||||
sender: Arc<mpsc::UnboundedSender<Vec<u8>>>,
|
||||
) -> actix_web::App<
|
||||
impl ServiceFactory<
|
||||
actix_web::dev::ServiceRequest,
|
||||
Config = (),
|
||||
Response = ServiceResponse<impl MessageBody>,
|
||||
Error = actix_web::Error,
|
||||
InitError = (),
|
||||
>,
|
||||
> {
|
||||
App::new().service(forward_body).app_data(Data::from(sender))
|
||||
}
|
||||
|
||||
struct WebhookHandle {
|
||||
pub server_handle: tokio::task::JoinHandle<Result<(), std::io::Error>>,
|
||||
pub url: String,
|
||||
pub receiver: mpsc::UnboundedReceiver<Vec<u8>>,
|
||||
}
|
||||
|
||||
async fn create_webhook_server() -> WebhookHandle {
|
||||
let mut log_builder = env_logger::Builder::new();
|
||||
log_builder.parse_filters("info");
|
||||
log_builder.init();
|
||||
|
||||
let (sender, receiver) = mpsc::unbounded_channel();
|
||||
let sender = Arc::new(sender);
|
||||
|
||||
// By listening on the port 0, the system will give us any available port.
|
||||
let server =
|
||||
HttpServer::new(move || create_app(sender.clone())).bind(("127.0.0.1", 0)).unwrap();
|
||||
let (ip, scheme) = server.addrs_with_scheme()[0];
|
||||
let url = format!("{scheme}://{ip}/");
|
||||
|
||||
let server_handle = tokio::spawn(server.run());
|
||||
WebhookHandle { server_handle, url, receiver }
|
||||
}
|
||||
|
||||
#[actix_web::test]
|
||||
async fn test_basic_webhook() {
|
||||
let WebhookHandle { server_handle, url, mut receiver } = create_webhook_server().await;
|
||||
|
||||
let db_path = tempfile::tempdir().unwrap();
|
||||
let server = Server::new_with_options(Opt {
|
||||
task_webhook_url: Some(Url::parse(&url).unwrap()),
|
||||
..default_settings(db_path.path())
|
||||
})
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let index = server.index("tamo");
|
||||
// May be flaky: we're relying on the fact that while the first document addition is processed, the other
|
||||
// operations will be received and will be batched together. If it doesn't happen it's not a problem
|
||||
// the rest of the test won't assume anything about the number of tasks per batch.
|
||||
for i in 0..5 {
|
||||
let (_, _status) = index.add_documents(json!({ "id": i, "doggo": "bone" }), None).await;
|
||||
}
|
||||
|
||||
let mut nb_tasks = 0;
|
||||
while let Some(payload) = receiver.recv().await {
|
||||
let payload = String::from_utf8(payload).unwrap();
|
||||
let jsonl = payload.split('\n');
|
||||
for json in jsonl {
|
||||
if json.is_empty() {
|
||||
break; // we reached EOF
|
||||
}
|
||||
nb_tasks += 1;
|
||||
let json: serde_json::Value = serde_json::from_str(json).unwrap();
|
||||
snapshot!(
|
||||
json_string!(json, { ".uid" => "[uid]", ".duration" => "[duration]", ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]" }),
|
||||
@r###"
|
||||
{
|
||||
"uid": "[uid]",
|
||||
"indexUid": "tamo",
|
||||
"status": "succeeded",
|
||||
"type": "documentAdditionOrUpdate",
|
||||
"canceledBy": null,
|
||||
"details": {
|
||||
"receivedDocuments": 1,
|
||||
"indexedDocuments": 1
|
||||
},
|
||||
"error": null,
|
||||
"duration": "[duration]",
|
||||
"enqueuedAt": "[date]",
|
||||
"startedAt": "[date]",
|
||||
"finishedAt": "[date]"
|
||||
}
|
||||
"###);
|
||||
}
|
||||
if nb_tasks == 5 {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
assert!(nb_tasks == 5, "We should have received the 5 tasks but only received {nb_tasks}");
|
||||
|
||||
server_handle.abort();
|
||||
}
|
19
meilitool/Cargo.toml
Normal file
19
meilitool/Cargo.toml
Normal file
@ -0,0 +1,19 @@
|
||||
[package]
|
||||
name = "meilitool"
|
||||
description = "A CLI to edit a Meilisearch database from the command line"
|
||||
version.workspace = true
|
||||
authors.workspace = true
|
||||
homepage.workspace = true
|
||||
readme.workspace = true
|
||||
edition.workspace = true
|
||||
license.workspace = true
|
||||
|
||||
[dependencies]
|
||||
anyhow = "1.0.75"
|
||||
clap = { version = "4.2.1", features = ["derive"] }
|
||||
dump = { path = "../dump" }
|
||||
file-store = { path = "../file-store" }
|
||||
meilisearch-auth = { path = "../meilisearch-auth" }
|
||||
meilisearch-types = { path = "../meilisearch-types" }
|
||||
time = { version = "0.3.30", features = ["formatting"] }
|
||||
uuid = { version = "1.5.0", features = ["v4"], default-features = false }
|
314
meilitool/src/main.rs
Normal file
314
meilitool/src/main.rs
Normal file
@ -0,0 +1,314 @@
|
||||
use std::fs::{read_dir, read_to_string, remove_file, File};
|
||||
use std::io::BufWriter;
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::Context;
|
||||
use clap::{Parser, Subcommand};
|
||||
use dump::{DumpWriter, IndexMetadata};
|
||||
use file_store::FileStore;
|
||||
use meilisearch_auth::AuthController;
|
||||
use meilisearch_types::heed::types::{SerdeJson, Str};
|
||||
use meilisearch_types::heed::{Database, Env, EnvOpenOptions, RoTxn, RwTxn, Unspecified};
|
||||
use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
|
||||
use meilisearch_types::milli::{obkv_to_json, BEU32};
|
||||
use meilisearch_types::tasks::{Status, Task};
|
||||
use meilisearch_types::versioning::check_version_file;
|
||||
use meilisearch_types::Index;
|
||||
use time::macros::format_description;
|
||||
use time::OffsetDateTime;
|
||||
use uuid_codec::UuidCodec;
|
||||
|
||||
mod uuid_codec;
|
||||
|
||||
#[derive(Parser)]
|
||||
#[command(author, version, about, long_about = None)]
|
||||
struct Cli {
|
||||
/// The database path where the Meilisearch is running.
|
||||
#[arg(long, default_value = "data.ms/")]
|
||||
db_path: PathBuf,
|
||||
|
||||
#[command(subcommand)]
|
||||
command: Command,
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
enum Command {
|
||||
/// Clears the task queue and make it empty.
|
||||
///
|
||||
/// This command can be safely executed even if Meilisearch is running and processing tasks.
|
||||
/// Once the task queue is empty you can restart Meilisearch and no more tasks must be visible,
|
||||
/// even the ones that were processing. However, it's highly possible that you see the processing
|
||||
/// tasks in the queue again with an associated internal error message.
|
||||
ClearTaskQueue,
|
||||
|
||||
/// Exports a dump from the Meilisearch database.
|
||||
///
|
||||
/// Make sure to run this command when Meilisearch is not running or running but not processing tasks.
|
||||
/// If tasks are being processed while a dump is being exported there are chances for the dump to be
|
||||
/// malformed with missing tasks.
|
||||
///
|
||||
/// TODO Verify this claim or make sure it cannot happen and we can export dumps
|
||||
/// without caring about killing Meilisearch first!
|
||||
ExportADump {
|
||||
/// The directory in which the dump will be created.
|
||||
#[arg(long, default_value = "dumps/")]
|
||||
dump_dir: PathBuf,
|
||||
|
||||
/// Skip dumping the enqueued or processing tasks.
|
||||
///
|
||||
/// Can be useful when there are a lot of them and it is not particularly useful
|
||||
/// to keep them. Note that only the enqueued tasks takes up space so skipping
|
||||
/// the processed ones is not particularly interesting.
|
||||
#[arg(long)]
|
||||
skip_enqueued_tasks: bool,
|
||||
},
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
let Cli { db_path, command } = Cli::parse();
|
||||
|
||||
check_version_file(&db_path).context("While checking the version file")?;
|
||||
|
||||
match command {
|
||||
Command::ClearTaskQueue => clear_task_queue(db_path),
|
||||
Command::ExportADump { dump_dir, skip_enqueued_tasks } => {
|
||||
export_a_dump(db_path, dump_dir, skip_enqueued_tasks)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Clears the task queue located at `db_path`.
|
||||
fn clear_task_queue(db_path: PathBuf) -> anyhow::Result<()> {
|
||||
let path = db_path.join("tasks");
|
||||
let env = EnvOpenOptions::new()
|
||||
.max_dbs(100)
|
||||
.open(&path)
|
||||
.with_context(|| format!("While trying to open {:?}", path.display()))?;
|
||||
|
||||
eprintln!("Deleting tasks from the database...");
|
||||
|
||||
let mut wtxn = env.write_txn()?;
|
||||
let all_tasks = try_opening_poly_database(&env, &wtxn, "all-tasks")?;
|
||||
let total = all_tasks.len(&wtxn)?;
|
||||
let status = try_opening_poly_database(&env, &wtxn, "status")?;
|
||||
let kind = try_opening_poly_database(&env, &wtxn, "kind")?;
|
||||
let index_tasks = try_opening_poly_database(&env, &wtxn, "index-tasks")?;
|
||||
let canceled_by = try_opening_poly_database(&env, &wtxn, "canceled_by")?;
|
||||
let enqueued_at = try_opening_poly_database(&env, &wtxn, "enqueued-at")?;
|
||||
let started_at = try_opening_poly_database(&env, &wtxn, "started-at")?;
|
||||
let finished_at = try_opening_poly_database(&env, &wtxn, "finished-at")?;
|
||||
|
||||
try_clearing_poly_database(&mut wtxn, all_tasks, "all-tasks")?;
|
||||
try_clearing_poly_database(&mut wtxn, status, "status")?;
|
||||
try_clearing_poly_database(&mut wtxn, kind, "kind")?;
|
||||
try_clearing_poly_database(&mut wtxn, index_tasks, "index-tasks")?;
|
||||
try_clearing_poly_database(&mut wtxn, canceled_by, "canceled_by")?;
|
||||
try_clearing_poly_database(&mut wtxn, enqueued_at, "enqueued-at")?;
|
||||
try_clearing_poly_database(&mut wtxn, started_at, "started-at")?;
|
||||
try_clearing_poly_database(&mut wtxn, finished_at, "finished-at")?;
|
||||
|
||||
wtxn.commit().context("While committing the transaction")?;
|
||||
|
||||
eprintln!("Successfully deleted {total} tasks from the tasks database!");
|
||||
eprintln!("Deleting the content files from disk...");
|
||||
|
||||
let mut count = 0usize;
|
||||
let update_files = db_path.join("update_files");
|
||||
let entries = read_dir(&update_files).with_context(|| {
|
||||
format!("While trying to read the content of {:?}", update_files.display())
|
||||
})?;
|
||||
for result in entries {
|
||||
match result {
|
||||
Ok(ent) => match remove_file(ent.path()) {
|
||||
Ok(_) => count += 1,
|
||||
Err(e) => eprintln!("Error while deleting {:?}: {}", ent.path().display(), e),
|
||||
},
|
||||
Err(e) => {
|
||||
eprintln!("Error while reading a file in {:?}: {}", update_files.display(), e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
eprintln!("Sucessfully deleted {count} content files from disk!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_opening_database<KC: 'static, DC: 'static>(
|
||||
env: &Env,
|
||||
rtxn: &RoTxn,
|
||||
db_name: &str,
|
||||
) -> anyhow::Result<Database<KC, DC>> {
|
||||
env.open_database(rtxn, Some(db_name))
|
||||
.with_context(|| format!("While opening the {db_name:?} database"))?
|
||||
.with_context(|| format!("Missing the {db_name:?} database"))
|
||||
}
|
||||
|
||||
fn try_opening_poly_database(
|
||||
env: &Env,
|
||||
rtxn: &RoTxn,
|
||||
db_name: &str,
|
||||
) -> anyhow::Result<Database<Unspecified, Unspecified>> {
|
||||
env.database_options()
|
||||
.name(db_name)
|
||||
.open(rtxn)
|
||||
.with_context(|| format!("While opening the {db_name:?} poly database"))?
|
||||
.with_context(|| format!("Missing the {db_name:?} poly database"))
|
||||
}
|
||||
|
||||
fn try_clearing_poly_database(
|
||||
wtxn: &mut RwTxn,
|
||||
database: Database<Unspecified, Unspecified>,
|
||||
db_name: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
database.clear(wtxn).with_context(|| format!("While clearing the {db_name:?} database"))
|
||||
}
|
||||
|
||||
/// Exports a dump into the dump directory.
|
||||
fn export_a_dump(
|
||||
db_path: PathBuf,
|
||||
dump_dir: PathBuf,
|
||||
skip_enqueued_tasks: bool,
|
||||
) -> Result<(), anyhow::Error> {
|
||||
let started_at = OffsetDateTime::now_utc();
|
||||
|
||||
// 1. Extracts the instance UID from disk
|
||||
let instance_uid_path = db_path.join("instance-uid");
|
||||
let instance_uid = match read_to_string(&instance_uid_path) {
|
||||
Ok(content) => match content.trim().parse() {
|
||||
Ok(uuid) => Some(uuid),
|
||||
Err(e) => {
|
||||
eprintln!("Impossible to parse instance-uid: {e}");
|
||||
None
|
||||
}
|
||||
},
|
||||
Err(e) => {
|
||||
eprintln!("Impossible to read {}: {}", instance_uid_path.display(), e);
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
let dump = DumpWriter::new(instance_uid).context("While creating a new dump")?;
|
||||
let file_store =
|
||||
FileStore::new(db_path.join("update_files")).context("While opening the FileStore")?;
|
||||
|
||||
let index_scheduler_path = db_path.join("tasks");
|
||||
let env = EnvOpenOptions::new()
|
||||
.max_dbs(100)
|
||||
.open(&index_scheduler_path)
|
||||
.with_context(|| format!("While trying to open {:?}", index_scheduler_path.display()))?;
|
||||
|
||||
eprintln!("Dumping the keys...");
|
||||
|
||||
// 2. dump the keys
|
||||
let auth_store = AuthController::new(&db_path, &None)
|
||||
.with_context(|| format!("While opening the auth store at {}", db_path.display()))?;
|
||||
let mut dump_keys = dump.create_keys()?;
|
||||
let mut count = 0;
|
||||
for key in auth_store.list_keys()? {
|
||||
dump_keys.push_key(&key)?;
|
||||
count += 1;
|
||||
}
|
||||
dump_keys.flush()?;
|
||||
|
||||
eprintln!("Successfully dumped {count} keys!");
|
||||
|
||||
let rtxn = env.read_txn()?;
|
||||
let all_tasks: Database<BEU32, SerdeJson<Task>> =
|
||||
try_opening_database(&env, &rtxn, "all-tasks")?;
|
||||
let index_mapping: Database<Str, UuidCodec> =
|
||||
try_opening_database(&env, &rtxn, "index-mapping")?;
|
||||
|
||||
if skip_enqueued_tasks {
|
||||
eprintln!("Skip dumping the enqueued tasks...");
|
||||
} else {
|
||||
eprintln!("Dumping the enqueued tasks...");
|
||||
|
||||
// 3. dump the tasks
|
||||
let mut dump_tasks = dump.create_tasks_queue()?;
|
||||
let mut count = 0;
|
||||
for ret in all_tasks.iter(&rtxn)? {
|
||||
let (_, t) = ret?;
|
||||
let status = t.status;
|
||||
let content_file = t.content_uuid();
|
||||
let mut dump_content_file = dump_tasks.push_task(&t.into())?;
|
||||
|
||||
// 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet.
|
||||
if let Some(content_file_uuid) = content_file {
|
||||
if status == Status::Enqueued {
|
||||
let content_file = file_store.get_update(content_file_uuid)?;
|
||||
|
||||
let reader =
|
||||
DocumentsBatchReader::from_reader(content_file).with_context(|| {
|
||||
format!("While reading content file {:?}", content_file_uuid)
|
||||
})?;
|
||||
|
||||
let (mut cursor, documents_batch_index) = reader.into_cursor_and_fields_index();
|
||||
while let Some(doc) = cursor.next_document().with_context(|| {
|
||||
format!("While iterating on content file {:?}", content_file_uuid)
|
||||
})? {
|
||||
dump_content_file
|
||||
.push_document(&obkv_to_object(&doc, &documents_batch_index)?)?;
|
||||
}
|
||||
dump_content_file.flush()?;
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
dump_tasks.flush()?;
|
||||
|
||||
eprintln!("Successfully dumped {count} enqueued tasks!");
|
||||
}
|
||||
|
||||
eprintln!("Dumping the indexes...");
|
||||
|
||||
// 4. Dump the indexes
|
||||
let mut count = 0;
|
||||
for result in index_mapping.iter(&rtxn)? {
|
||||
let (uid, uuid) = result?;
|
||||
let index_path = db_path.join("indexes").join(uuid.to_string());
|
||||
let index = Index::new(EnvOpenOptions::new(), &index_path).with_context(|| {
|
||||
format!("While trying to open the index at path {:?}", index_path.display())
|
||||
})?;
|
||||
|
||||
let rtxn = index.read_txn()?;
|
||||
let metadata = IndexMetadata {
|
||||
uid: uid.to_owned(),
|
||||
primary_key: index.primary_key(&rtxn)?.map(String::from),
|
||||
created_at: index.created_at(&rtxn)?,
|
||||
updated_at: index.updated_at(&rtxn)?,
|
||||
};
|
||||
let mut index_dumper = dump.create_index(uid, &metadata)?;
|
||||
|
||||
let fields_ids_map = index.fields_ids_map(&rtxn)?;
|
||||
let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
|
||||
|
||||
// 4.1. Dump the documents
|
||||
for ret in index.all_documents(&rtxn)? {
|
||||
let (_id, doc) = ret?;
|
||||
let document = obkv_to_json(&all_fields, &fields_ids_map, doc)?;
|
||||
index_dumper.push_document(&document)?;
|
||||
}
|
||||
|
||||
// 4.2. Dump the settings
|
||||
let settings = meilisearch_types::settings::settings(&index, &rtxn)?;
|
||||
index_dumper.settings(&settings)?;
|
||||
count += 1;
|
||||
}
|
||||
|
||||
eprintln!("Successfully dumped {count} indexes!");
|
||||
// We will not dump experimental feature settings
|
||||
eprintln!("The tool is not dumping experimental features, please set them by hand afterward");
|
||||
|
||||
let dump_uid = started_at.format(format_description!(
|
||||
"[year repr:full][month repr:numerical][day padding:zero]-[hour padding:zero][minute padding:zero][second padding:zero][subsecond digits:3]"
|
||||
)).unwrap();
|
||||
|
||||
let path = dump_dir.join(format!("{}.dump", dump_uid));
|
||||
let file = File::create(&path)?;
|
||||
dump.persist_to(BufWriter::new(file))?;
|
||||
|
||||
eprintln!("Dump exported at path {:?}", path.display());
|
||||
|
||||
Ok(())
|
||||
}
|
24
meilitool/src/uuid_codec.rs
Normal file
24
meilitool/src/uuid_codec.rs
Normal file
@ -0,0 +1,24 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use meilisearch_types::heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
use uuid::Uuid;
|
||||
|
||||
/// A heed codec for value of struct Uuid.
|
||||
pub struct UuidCodec;
|
||||
|
||||
impl<'a> BytesDecode<'a> for UuidCodec {
|
||||
type DItem = Uuid;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
bytes.try_into().map(Uuid::from_bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesEncode<'_> for UuidCodec {
|
||||
type EItem = Uuid;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
Ok(Cow::Borrowed(item.as_bytes()))
|
||||
}
|
||||
}
|
@ -17,23 +17,25 @@ bincode = "1.3.3"
|
||||
bstr = "1.4.0"
|
||||
bytemuck = { version = "1.13.1", features = ["extern_crate_alloc"] }
|
||||
byteorder = "1.4.3"
|
||||
charabia = { version = "0.8.3", default-features = false }
|
||||
charabia = { version = "0.8.5", default-features = false }
|
||||
concat-arrays = "0.1.2"
|
||||
crossbeam-channel = "0.5.8"
|
||||
deserr = { version = "0.6.0", features = ["actix-web"]}
|
||||
deserr = "0.6.0"
|
||||
either = { version = "1.8.1", features = ["serde"] }
|
||||
flatten-serde-json = { path = "../flatten-serde-json" }
|
||||
fst = "0.4.7"
|
||||
fxhash = "0.2.1"
|
||||
geoutils = "0.5.1"
|
||||
grenad = { version = "0.4.5", default-features = false, features = [
|
||||
"rayon", "tempfile"
|
||||
"rayon",
|
||||
"tempfile",
|
||||
] }
|
||||
heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
|
||||
"lmdb", "read-txn-no-tls"
|
||||
heed = { version = "0.20.0-alpha.9", default-features = false, features = [
|
||||
"serde-json",
|
||||
"serde-bincode",
|
||||
"read-txn-no-tls",
|
||||
] }
|
||||
indexmap = { version = "2.0.0", features = ["serde"] }
|
||||
instant-distance = { version = "0.6.1", features = ["with-serde"] }
|
||||
json-depth-checker = { path = "../json-depth-checker" }
|
||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||
memmap2 = "0.7.1"
|
||||
@ -72,6 +74,23 @@ puffin = "0.16.0"
|
||||
log = "0.4.17"
|
||||
logging_timer = "1.1.0"
|
||||
csv = "1.2.1"
|
||||
candle-core = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
|
||||
candle-transformers = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
|
||||
candle-nn = { git = "https://github.com/huggingface/candle.git", version = "0.3.1" }
|
||||
tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.14.1", version = "0.14.1", default_features = false, features = ["onig"] }
|
||||
hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [
|
||||
"online",
|
||||
] }
|
||||
tokio = { version = "1.34.0", features = ["rt"] }
|
||||
futures = "0.3.29"
|
||||
reqwest = { version = "0.11.16", features = [
|
||||
"rustls-tls",
|
||||
"json",
|
||||
], default-features = false }
|
||||
tiktoken-rs = "0.5.7"
|
||||
liquid = "0.26.4"
|
||||
arroy = { git = "https://github.com/meilisearch/arroy.git", version = "0.1.0" }
|
||||
rand = "0.8.5"
|
||||
|
||||
[dev-dependencies]
|
||||
mimalloc = { version = "0.1.37", default-features = false }
|
||||
@ -83,7 +102,15 @@ meili-snap = { path = "../meili-snap" }
|
||||
rand = { version = "0.8.5", features = ["small_rng"] }
|
||||
|
||||
[features]
|
||||
all-tokenizations = ["charabia/chinese", "charabia/hebrew", "charabia/japanese", "charabia/thai", "charabia/korean", "charabia/greek"]
|
||||
all-tokenizations = [
|
||||
"charabia/chinese",
|
||||
"charabia/hebrew",
|
||||
"charabia/japanese",
|
||||
"charabia/thai",
|
||||
"charabia/korean",
|
||||
"charabia/greek",
|
||||
"charabia/khmer",
|
||||
]
|
||||
|
||||
# Use POSIX semaphores instead of SysV semaphores in LMDB
|
||||
# For more information on this feature, see heed's Cargo.toml
|
||||
@ -107,3 +134,6 @@ thai = ["charabia/thai"]
|
||||
|
||||
# allow greek specialized tokenization
|
||||
greek = ["charabia/greek"]
|
||||
|
||||
# allow khmer specialized tokenization
|
||||
khmer = ["charabia/khmer"]
|
||||
|
@ -5,8 +5,8 @@ use std::time::Instant;
|
||||
|
||||
use heed::EnvOpenOptions;
|
||||
use milli::{
|
||||
execute_search, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext, SearchLogger,
|
||||
TermsMatchingStrategy,
|
||||
execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, Index, SearchContext,
|
||||
SearchLogger, TermsMatchingStrategy,
|
||||
};
|
||||
|
||||
#[global_allocator]
|
||||
@ -49,14 +49,15 @@ fn main() -> Result<(), Box<dyn Error>> {
|
||||
let start = Instant::now();
|
||||
|
||||
let mut ctx = SearchContext::new(&index, &txn);
|
||||
let universe = filtered_universe(&ctx, &None)?;
|
||||
|
||||
let docs = execute_search(
|
||||
&mut ctx,
|
||||
&(!query.trim().is_empty()).then(|| query.trim().to_owned()),
|
||||
&None,
|
||||
(!query.trim().is_empty()).then(|| query.trim()),
|
||||
TermsMatchingStrategy::Last,
|
||||
milli::score_details::ScoringStrategy::Skip,
|
||||
false,
|
||||
&None,
|
||||
universe,
|
||||
&None,
|
||||
GeoSortStrategy::default(),
|
||||
0,
|
||||
|
@ -1,41 +0,0 @@
|
||||
use std::ops;
|
||||
|
||||
use instant_distance::Point;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::normalize_vector;
|
||||
|
||||
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
|
||||
pub struct NDotProductPoint(Vec<f32>);
|
||||
|
||||
impl NDotProductPoint {
|
||||
pub fn new(point: Vec<f32>) -> Self {
|
||||
NDotProductPoint(normalize_vector(point))
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> Vec<f32> {
|
||||
self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl ops::Deref for NDotProductPoint {
|
||||
type Target = [f32];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0.as_slice()
|
||||
}
|
||||
}
|
||||
|
||||
impl Point for NDotProductPoint {
|
||||
fn distance(&self, other: &Self) -> f32 {
|
||||
let dist = 1.0 - dot_product_similarity(&self.0, &other.0);
|
||||
debug_assert!(!dist.is_nan());
|
||||
dist
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the dot product similarity score that will between 0.0 and 1.0
|
||||
/// if both vectors are normalized. The higher the more similar the vectors are.
|
||||
pub fn dot_product_similarity(a: &[f32], b: &[f32]) -> f32 {
|
||||
a.iter().zip(b).map(|(a, b)| a * b).sum()
|
||||
}
|
@ -1,5 +1,6 @@
|
||||
mod builder;
|
||||
mod enriched;
|
||||
mod primary_key;
|
||||
mod reader;
|
||||
mod serde_impl;
|
||||
|
||||
@ -11,6 +12,7 @@ use bimap::BiHashMap;
|
||||
pub use builder::DocumentsBatchBuilder;
|
||||
pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
|
||||
use obkv::KvReader;
|
||||
pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY};
|
||||
pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@ -87,6 +89,12 @@ impl DocumentsBatchIndex {
|
||||
}
|
||||
}
|
||||
|
||||
impl FieldIdMapper for DocumentsBatchIndex {
|
||||
fn id(&self, name: &str) -> Option<FieldId> {
|
||||
self.id(name)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum Error {
|
||||
#[error("Error parsing number {value:?} at line {line}: {error}")]
|
||||
|
172
milli/src/documents/primary_key.rs
Normal file
172
milli/src/documents/primary_key.rs
Normal file
@ -0,0 +1,172 @@
|
||||
use std::iter;
|
||||
use std::result::Result as StdResult;
|
||||
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::{FieldId, InternalError, Object, Result, UserError};
|
||||
|
||||
/// The symbol used to define levels in a nested primary key.
|
||||
const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
|
||||
|
||||
/// The default primary that is used when not specified.
|
||||
pub const DEFAULT_PRIMARY_KEY: &str = "id";
|
||||
|
||||
/// Trait for objects that can map the name of a field to its [`FieldId`].
|
||||
pub trait FieldIdMapper {
|
||||
/// Attempts to map the passed name to its [`FieldId`].
|
||||
///
|
||||
/// `None` if the field with this name was not found.
|
||||
fn id(&self, name: &str) -> Option<FieldId>;
|
||||
}
|
||||
|
||||
/// A type that represent the type of primary key that has been set
|
||||
/// for this index, a classic flat one or a nested one.
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum PrimaryKey<'a> {
|
||||
Flat { name: &'a str, field_id: FieldId },
|
||||
Nested { name: &'a str },
|
||||
}
|
||||
|
||||
pub enum DocumentIdExtractionError {
|
||||
InvalidDocumentId(UserError),
|
||||
MissingDocumentId,
|
||||
TooManyDocumentIds(usize),
|
||||
}
|
||||
|
||||
impl<'a> PrimaryKey<'a> {
|
||||
pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> {
|
||||
Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
|
||||
Self::Nested { name: path }
|
||||
} else {
|
||||
let field_id = fields.id(path)?;
|
||||
Self::Flat { name: path, field_id }
|
||||
})
|
||||
}
|
||||
|
||||
pub fn name(&self) -> &str {
|
||||
match self {
|
||||
PrimaryKey::Flat { name, .. } => name,
|
||||
PrimaryKey::Nested { name } => name,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn document_id(
|
||||
&self,
|
||||
document: &obkv::KvReader<FieldId>,
|
||||
fields: &impl FieldIdMapper,
|
||||
) -> Result<StdResult<String, DocumentIdExtractionError>> {
|
||||
match self {
|
||||
PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
|
||||
Some(document_id_bytes) => {
|
||||
let document_id = serde_json::from_slice(document_id_bytes)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
match validate_document_id_value(document_id)? {
|
||||
Ok(document_id) => Ok(Ok(document_id)),
|
||||
Err(user_error) => {
|
||||
Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
|
||||
}
|
||||
}
|
||||
}
|
||||
None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
|
||||
},
|
||||
nested @ PrimaryKey::Nested { .. } => {
|
||||
let mut matching_documents_ids = Vec::new();
|
||||
for (first_level_name, right) in nested.possible_level_names() {
|
||||
if let Some(field_id) = fields.id(first_level_name) {
|
||||
if let Some(value_bytes) = document.get(field_id) {
|
||||
let object = serde_json::from_slice(value_bytes)
|
||||
.map_err(InternalError::SerdeJson)?;
|
||||
fetch_matching_values(object, right, &mut matching_documents_ids);
|
||||
|
||||
if matching_documents_ids.len() >= 2 {
|
||||
return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(
|
||||
matching_documents_ids.len(),
|
||||
)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
match matching_documents_ids.pop() {
|
||||
Some(document_id) => match validate_document_id_value(document_id)? {
|
||||
Ok(document_id) => Ok(Ok(document_id)),
|
||||
Err(user_error) => {
|
||||
Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
|
||||
}
|
||||
},
|
||||
None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an `Iterator` that gives all the possible fields names the primary key
|
||||
/// can have depending of the first level name and depth of the objects.
|
||||
pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
|
||||
let name = self.name();
|
||||
name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
|
||||
.map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
|
||||
.chain(iter::once((name, "")))
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
|
||||
match value {
|
||||
Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
|
||||
otherwise => output.push(otherwise),
|
||||
}
|
||||
}
|
||||
|
||||
fn fetch_matching_values_in_object(
|
||||
object: Object,
|
||||
selector: &str,
|
||||
base_key: &str,
|
||||
output: &mut Vec<Value>,
|
||||
) {
|
||||
for (key, value) in object {
|
||||
let base_key = if base_key.is_empty() {
|
||||
key.to_string()
|
||||
} else {
|
||||
format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
|
||||
};
|
||||
|
||||
if starts_with(selector, &base_key) {
|
||||
match value {
|
||||
Value::Object(object) => {
|
||||
fetch_matching_values_in_object(object, selector, &base_key, output)
|
||||
}
|
||||
value => output.push(value),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn starts_with(selector: &str, key: &str) -> bool {
|
||||
selector.strip_prefix(key).map_or(false, |tail| {
|
||||
tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
|
||||
})
|
||||
}
|
||||
|
||||
// FIXME: move to a DocumentId struct
|
||||
|
||||
fn validate_document_id(document_id: &str) -> Option<&str> {
|
||||
if !document_id.is_empty()
|
||||
&& document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
|
||||
{
|
||||
Some(document_id)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
|
||||
match document_id {
|
||||
Value::String(string) => match validate_document_id(&string) {
|
||||
Some(s) if s.len() == string.len() => Ok(Ok(string)),
|
||||
Some(s) => Ok(Ok(s.to_string())),
|
||||
None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
|
||||
},
|
||||
Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
|
||||
content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
|
||||
}
|
||||
}
|
@ -61,6 +61,10 @@ pub enum InternalError {
|
||||
AbortedIndexation,
|
||||
#[error("The matching words list contains at least one invalid member.")]
|
||||
InvalidMatchingWords,
|
||||
#[error(transparent)]
|
||||
ArroyError(#[from] arroy::Error),
|
||||
#[error(transparent)]
|
||||
VectorEmbeddingError(#[from] crate::vector::Error),
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
@ -110,8 +114,10 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
||||
InvalidGeoField(#[from] GeoError),
|
||||
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
|
||||
InvalidVectorDimensions { expected: usize, found: usize },
|
||||
#[error("The `_vectors` field in the document with the id: `{document_id}` is not an array. Was expecting an array of floats or an array of arrays of floats but instead got `{value}`.")]
|
||||
InvalidVectorsType { document_id: Value, value: Value },
|
||||
#[error("The `_vectors.{subfield}` field in the document with id: `{document_id}` is not an array. Was expecting an array of floats or an array of arrays of floats but instead got `{value}`.")]
|
||||
InvalidVectorsType { document_id: Value, value: Value, subfield: String },
|
||||
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
|
||||
InvalidVectorsMapType { document_id: Value, value: Value },
|
||||
#[error("{0}")]
|
||||
InvalidFilter(String),
|
||||
#[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
|
||||
@ -152,7 +158,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
||||
valid_fields: BTreeSet<String>,
|
||||
hidden_fields: bool,
|
||||
},
|
||||
#[error("{}", HeedError::BadOpenOptions)]
|
||||
#[error("an environment is already opened with different options")]
|
||||
InvalidLmdbOpenOptions,
|
||||
#[error("You must specify where `sort` is listed in the rankingRules setting to use the sort parameter at search time.")]
|
||||
SortRankingRuleMissing,
|
||||
@ -180,6 +186,76 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
|
||||
UnknownInternalDocumentId { document_id: DocumentId },
|
||||
#[error("`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {0}` and twoTypos: {1}`.")]
|
||||
InvalidMinTypoWordLenSetting(u8, u8),
|
||||
#[error(transparent)]
|
||||
VectorEmbeddingError(#[from] crate::vector::Error),
|
||||
#[error(transparent)]
|
||||
MissingDocumentField(#[from] crate::prompt::error::RenderPromptError),
|
||||
#[error(transparent)]
|
||||
InvalidPrompt(#[from] crate::prompt::error::NewPromptError),
|
||||
#[error("`.embedders.{0}.documentTemplate`: Invalid template: {1}.")]
|
||||
InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError),
|
||||
#[error("Too many embedders in the configuration. Found {0}, but limited to 256.")]
|
||||
TooManyEmbedders(usize),
|
||||
#[error("Cannot find embedder with name {0}.")]
|
||||
InvalidEmbedder(String),
|
||||
#[error("Too many vectors for document with id {0}: found {1}, but limited to 256.")]
|
||||
TooManyVectors(String, usize),
|
||||
#[error("`.embedders.{embedder_name}`: Field `{field}` unavailable for source `{source_}` (only available for sources: {}). Available fields: {}",
|
||||
allowed_sources_for_field
|
||||
.iter()
|
||||
.map(|accepted| format!("`{}`", accepted))
|
||||
.collect::<Vec<String>>()
|
||||
.join(", "),
|
||||
allowed_fields_for_source
|
||||
.iter()
|
||||
.map(|accepted| format!("`{}`", accepted))
|
||||
.collect::<Vec<String>>()
|
||||
.join(", ")
|
||||
)]
|
||||
InvalidFieldForSource {
|
||||
embedder_name: String,
|
||||
source_: crate::vector::settings::EmbedderSource,
|
||||
field: &'static str,
|
||||
allowed_fields_for_source: &'static [&'static str],
|
||||
allowed_sources_for_field: &'static [crate::vector::settings::EmbedderSource],
|
||||
},
|
||||
#[error("`.embedders.{embedder_name}.model`: Invalid model `{model}` for OpenAI. Supported models: {:?}", crate::vector::openai::EmbeddingModel::supported_models())]
|
||||
InvalidOpenAiModel { embedder_name: String, model: String },
|
||||
#[error("`.embedders.{embedder_name}`: Missing field `{field}` (note: this field is mandatory for source {source_})")]
|
||||
MissingFieldForSource {
|
||||
field: &'static str,
|
||||
source_: crate::vector::settings::EmbedderSource,
|
||||
embedder_name: String,
|
||||
},
|
||||
}
|
||||
|
||||
impl From<crate::vector::Error> for Error {
|
||||
fn from(value: crate::vector::Error) -> Self {
|
||||
match value.fault() {
|
||||
FaultSource::User => Error::UserError(value.into()),
|
||||
FaultSource::Runtime => Error::InternalError(value.into()),
|
||||
FaultSource::Bug => Error::InternalError(value.into()),
|
||||
FaultSource::Undecided => Error::InternalError(value.into()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<arroy::Error> for Error {
|
||||
fn from(value: arroy::Error) -> Self {
|
||||
match value {
|
||||
arroy::Error::Heed(heed) => heed.into(),
|
||||
arroy::Error::Io(io) => io.into(),
|
||||
arroy::Error::InvalidVecDimension { expected, received } => {
|
||||
Error::UserError(UserError::InvalidVectorDimensions { expected, found: received })
|
||||
}
|
||||
arroy::Error::DatabaseFull
|
||||
| arroy::Error::InvalidItemAppend
|
||||
| arroy::Error::UnmatchingDistance { .. }
|
||||
| arroy::Error::MissingMetadata => {
|
||||
Error::InternalError(InternalError::ArroyError(value))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
@ -326,15 +402,36 @@ impl From<HeedError> for Error {
|
||||
HeedError::Mdb(MdbError::MapFull) => UserError(MaxDatabaseSizeReached),
|
||||
HeedError::Mdb(MdbError::Invalid) => UserError(InvalidStoreFile),
|
||||
HeedError::Mdb(error) => InternalError(Store(error)),
|
||||
HeedError::Encoding => InternalError(Serialization(Encoding { db_name: None })),
|
||||
HeedError::Decoding => InternalError(Serialization(Decoding { db_name: None })),
|
||||
// TODO use the encoding
|
||||
HeedError::Encoding(_) => InternalError(Serialization(Encoding { db_name: None })),
|
||||
HeedError::Decoding(_) => InternalError(Serialization(Decoding { db_name: None })),
|
||||
HeedError::InvalidDatabaseTyping => InternalError(InvalidDatabaseTyping),
|
||||
HeedError::DatabaseClosing => InternalError(DatabaseClosing),
|
||||
HeedError::BadOpenOptions => UserError(InvalidLmdbOpenOptions),
|
||||
HeedError::BadOpenOptions { .. } => UserError(InvalidLmdbOpenOptions),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub enum FaultSource {
|
||||
User,
|
||||
Runtime,
|
||||
Bug,
|
||||
Undecided,
|
||||
}
|
||||
|
||||
impl std::fmt::Display for FaultSource {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
let s = match self {
|
||||
FaultSource::User => "user error",
|
||||
FaultSource::Runtime => "runtime error",
|
||||
FaultSource::Bug => "coding error",
|
||||
FaultSource::Undecided => "error",
|
||||
};
|
||||
f.write_str(s)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn conditionally_lookup_for_error_message() {
|
||||
let prefix = "Attribute `name` is not sortable.";
|
||||
|
@ -1,8 +1,7 @@
|
||||
use std::collections::HashMap;
|
||||
|
||||
use heed::types::{OwnedType, Str};
|
||||
use heed::types::Str;
|
||||
use heed::{Database, RoIter, RoTxn, RwTxn};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::{DocumentId, BEU32};
|
||||
|
||||
@ -17,10 +16,10 @@ pub struct DocumentOperation {
|
||||
pub kind: DocumentOperationKind,
|
||||
}
|
||||
|
||||
pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
|
||||
pub struct ExternalDocumentsIds(Database<Str, BEU32>);
|
||||
|
||||
impl ExternalDocumentsIds {
|
||||
pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
|
||||
pub fn new(db: Database<Str, BEU32>) -> ExternalDocumentsIds {
|
||||
ExternalDocumentsIds(db)
|
||||
}
|
||||
|
||||
@ -30,7 +29,7 @@ impl ExternalDocumentsIds {
|
||||
}
|
||||
|
||||
pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> {
|
||||
Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get()))
|
||||
self.0.get(rtxn, external_id.as_ref())
|
||||
}
|
||||
|
||||
/// An helper function to debug this type, returns an `HashMap` of both,
|
||||
@ -39,28 +38,11 @@ impl ExternalDocumentsIds {
|
||||
let mut map = HashMap::default();
|
||||
for result in self.0.iter(rtxn)? {
|
||||
let (external, internal) = result?;
|
||||
map.insert(external.to_owned(), internal.get());
|
||||
map.insert(external.to_owned(), internal);
|
||||
}
|
||||
Ok(map)
|
||||
}
|
||||
|
||||
/// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between
|
||||
/// these internal ids and their external id.
|
||||
///
|
||||
/// The returned iterator has `Result<(String, DocumentId), RoaringBitmap>` as `Item`,
|
||||
/// where the returned values can be:
|
||||
/// - `Ok((external_id, internal_id))`: if a mapping was found
|
||||
/// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found.
|
||||
/// In that case the returned bitmap contains the internal ids whose external ids were not found after traversing
|
||||
/// the entire fst.
|
||||
pub fn find_external_id_of<'t>(
|
||||
&self,
|
||||
rtxn: &'t RoTxn,
|
||||
internal_ids: RoaringBitmap,
|
||||
) -> heed::Result<ExternalToInternalOwnedIterator<'t>> {
|
||||
self.0.iter(rtxn).map(|iter| ExternalToInternalOwnedIterator { iter, internal_ids })
|
||||
}
|
||||
|
||||
/// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
|
||||
///
|
||||
/// If the list contains multiple operations on the same external id, then the result is unspecified.
|
||||
@ -73,7 +55,7 @@ impl ExternalDocumentsIds {
|
||||
for DocumentOperation { external_id, internal_id, kind } in operations {
|
||||
match kind {
|
||||
DocumentOperationKind::Create => {
|
||||
self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?;
|
||||
self.0.put(wtxn, &external_id, &internal_id)?;
|
||||
}
|
||||
DocumentOperationKind::Delete => {
|
||||
if !self.0.delete(wtxn, &external_id)? {
|
||||
@ -87,60 +69,7 @@ impl ExternalDocumentsIds {
|
||||
}
|
||||
|
||||
/// Returns an iterator over all the external ids.
|
||||
pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, OwnedType<BEU32>>> {
|
||||
pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, BEU32>> {
|
||||
self.0.iter(rtxn)
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator over mappings between requested internal ids and external ids.
|
||||
///
|
||||
/// See [`ExternalDocumentsIds::find_external_id_of`] for details.
|
||||
pub struct ExternalToInternalOwnedIterator<'t> {
|
||||
iter: RoIter<'t, Str, OwnedType<BEU32>>,
|
||||
internal_ids: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl<'t> Iterator for ExternalToInternalOwnedIterator<'t> {
|
||||
/// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids.
|
||||
type Item = Result<(&'t str, DocumentId), RoaringBitmap>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
// if all requested ids were found, we won't find any other, so short-circuit
|
||||
if self.internal_ids.is_empty() {
|
||||
return None;
|
||||
}
|
||||
loop {
|
||||
let (external, internal) = match self.iter.next() {
|
||||
Some(Ok((external, internal))) => (external, internal),
|
||||
// TODO manage this better, remove panic
|
||||
Some(Err(e)) => panic!("{}", e),
|
||||
_ => {
|
||||
// we exhausted the stream but we still have some internal ids to find
|
||||
let remaining_ids = std::mem::take(&mut self.internal_ids);
|
||||
return Some(Err(remaining_ids));
|
||||
// note: next calls to `next` will return `None` since we replaced the internal_ids
|
||||
// with the default empty bitmap
|
||||
}
|
||||
};
|
||||
let internal = internal.get();
|
||||
let was_contained = self.internal_ids.remove(internal);
|
||||
if was_contained {
|
||||
return Some(Ok((external, internal)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> ExternalToInternalOwnedIterator<'t> {
|
||||
/// Returns the bitmap of internal ids whose external id are yet to be found
|
||||
pub fn remaining_internal_ids(&self) -> &RoaringBitmap {
|
||||
&self.internal_ids
|
||||
}
|
||||
|
||||
/// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids.
|
||||
///
|
||||
/// Use this when you don't need the mapping between the external and the internal ids.
|
||||
pub fn only_external_ids(self) -> impl Iterator<Item = Result<String, RoaringBitmap>> + 't {
|
||||
self.map(|res| res.map(|(external, _internal)| external.to_owned()))
|
||||
}
|
||||
}
|
||||
|
@ -81,6 +81,12 @@ impl Default for FieldsIdsMap {
|
||||
}
|
||||
}
|
||||
|
||||
impl crate::documents::FieldIdMapper for FieldsIdsMap {
|
||||
fn id(&self, name: &str) -> Option<FieldId> {
|
||||
self.id(name)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
@ -2,26 +2,28 @@ use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::str;
|
||||
|
||||
use heed::BoxedError;
|
||||
|
||||
pub struct BEU16StrCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for BEU16StrCodec {
|
||||
type DItem = (u16, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (n_bytes, str_bytes) = bytes.split_at(2);
|
||||
let n = n_bytes.try_into().map(u16::from_be_bytes).ok()?;
|
||||
let s = str::from_utf8(str_bytes).ok()?;
|
||||
Some((n, s))
|
||||
let n = n_bytes.try_into().map(u16::from_be_bytes)?;
|
||||
let s = str::from_utf8(str_bytes)?;
|
||||
Ok((n, s))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for BEU16StrCodec {
|
||||
type EItem = (u16, &'a str);
|
||||
|
||||
fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode((n, s): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(s.len() + 2);
|
||||
bytes.extend_from_slice(&n.to_be_bytes());
|
||||
bytes.extend_from_slice(s.as_bytes());
|
||||
Some(Cow::Owned(bytes))
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
@ -2,26 +2,28 @@ use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::str;
|
||||
|
||||
use heed::BoxedError;
|
||||
|
||||
pub struct BEU32StrCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for BEU32StrCodec {
|
||||
type DItem = (u32, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (n_bytes, str_bytes) = bytes.split_at(4);
|
||||
let n = n_bytes.try_into().map(u32::from_be_bytes).ok()?;
|
||||
let s = str::from_utf8(str_bytes).ok()?;
|
||||
Some((n, s))
|
||||
let n = n_bytes.try_into().map(u32::from_be_bytes)?;
|
||||
let s = str::from_utf8(str_bytes)?;
|
||||
Ok((n, s))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for BEU32StrCodec {
|
||||
type EItem = (u32, &'a str);
|
||||
|
||||
fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode((n, s): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(s.len() + 4);
|
||||
bytes.extend_from_slice(&n.to_be_bytes());
|
||||
bytes.extend_from_slice(s.as_bytes());
|
||||
Some(Cow::Owned(bytes))
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
@ -1,23 +1,23 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
|
||||
/// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated
|
||||
/// A codec for values of type `&[u8]`. Unlike `Bytes`, its `EItem` and `DItem` associated
|
||||
/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure.
|
||||
pub struct ByteSliceRefCodec;
|
||||
pub struct BytesRefCodec;
|
||||
|
||||
impl<'a> BytesEncode<'a> for ByteSliceRefCodec {
|
||||
impl<'a> BytesEncode<'a> for BytesRefCodec {
|
||||
type EItem = &'a [u8];
|
||||
|
||||
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
|
||||
Some(Cow::Borrowed(item))
|
||||
fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
Ok(Cow::Borrowed(item))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> BytesDecode<'a> for ByteSliceRefCodec {
|
||||
impl<'a> BytesDecode<'a> for BytesRefCodec {
|
||||
type DItem = &'a [u8];
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
Some(bytes)
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Ok(bytes)
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,9 @@
|
||||
use std::borrow::Cow;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
|
||||
use crate::heed_codec::SliceTooShortError;
|
||||
use crate::{try_split_array_at, DocumentId, FieldId};
|
||||
|
||||
pub struct FieldDocIdFacetCodec<C>(PhantomData<C>);
|
||||
@ -13,16 +14,16 @@ where
|
||||
{
|
||||
type DItem = (FieldId, DocumentId, C::DItem);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
|
||||
let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
let (document_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let value = C::bytes_decode(bytes)?;
|
||||
|
||||
Some((field_id, document_id, value))
|
||||
Ok((field_id, document_id, value))
|
||||
}
|
||||
}
|
||||
|
||||
@ -32,13 +33,15 @@ where
|
||||
{
|
||||
type EItem = (FieldId, DocumentId, C::EItem);
|
||||
|
||||
fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode(
|
||||
(field_id, document_id, value): &'a Self::EItem,
|
||||
) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(32);
|
||||
bytes.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes
|
||||
bytes.extend_from_slice(&document_id.to_be_bytes()); // 4 bytes
|
||||
let value_bytes = C::bytes_encode(value)?;
|
||||
// variable length, if f64 -> 16 bytes, if string -> large, potentially
|
||||
bytes.extend_from_slice(&value_bytes);
|
||||
Some(Cow::Owned(bytes))
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
@ -5,8 +5,8 @@ use std::borrow::Cow;
|
||||
use std::convert::TryFrom;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use heed::types::{DecodeIgnore, OwnedType};
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use heed::types::DecodeIgnore;
|
||||
use heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec;
|
||||
@ -18,7 +18,7 @@ pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>;
|
||||
pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>;
|
||||
pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec<DecodeIgnore>;
|
||||
|
||||
pub type FieldIdCodec = OwnedType<BEU16>;
|
||||
pub type FieldIdCodec = BEU16;
|
||||
|
||||
/// Tries to split a slice in half at the given middle point,
|
||||
/// `None` if the slice is too short.
|
||||
@ -58,7 +58,7 @@ where
|
||||
{
|
||||
type EItem = FacetGroupKey<T::EItem>;
|
||||
|
||||
fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
|
||||
fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let mut v = vec![];
|
||||
v.extend_from_slice(&value.field_id.to_be_bytes());
|
||||
v.extend_from_slice(&[value.level]);
|
||||
@ -66,7 +66,7 @@ where
|
||||
let bound = T::bytes_encode(&value.left_bound)?;
|
||||
v.extend_from_slice(&bound);
|
||||
|
||||
Some(Cow::Owned(v))
|
||||
Ok(Cow::Owned(v))
|
||||
}
|
||||
}
|
||||
impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T>
|
||||
@ -75,11 +75,11 @@ where
|
||||
{
|
||||
type DItem = FacetGroupKey<T::DItem>;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?);
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1])?);
|
||||
let level = bytes[2];
|
||||
let bound = T::bytes_decode(&bytes[3..])?;
|
||||
Some(FacetGroupKey { field_id: fid, level, left_bound: bound })
|
||||
Ok(FacetGroupKey { field_id: fid, level, left_bound: bound })
|
||||
}
|
||||
}
|
||||
|
||||
@ -87,17 +87,17 @@ pub struct FacetGroupValueCodec;
|
||||
impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
|
||||
type EItem = FacetGroupValue;
|
||||
|
||||
fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
|
||||
fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let mut v = vec![value.size];
|
||||
CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v);
|
||||
Some(Cow::Owned(v))
|
||||
Ok(Cow::Owned(v))
|
||||
}
|
||||
}
|
||||
impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
|
||||
type DItem = FacetGroupValue;
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let size = bytes[0];
|
||||
let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?;
|
||||
Some(FacetGroupValue { size, bitmap })
|
||||
let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?;
|
||||
Ok(FacetGroupValue { size, bitmap })
|
||||
}
|
||||
}
|
||||
|
@ -1,37 +1,45 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use heed::BytesDecode;
|
||||
use heed::{BoxedError, BytesDecode};
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::heed_codec::SliceTooShortError;
|
||||
|
||||
pub struct OrderedF64Codec;
|
||||
|
||||
impl<'a> BytesDecode<'a> for OrderedF64Codec {
|
||||
type DItem = f64;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
if bytes.len() < 16 {
|
||||
return None;
|
||||
Err(SliceTooShortError.into())
|
||||
} else {
|
||||
bytes[8..].try_into().map(f64::from_be_bytes).map_err(Into::into)
|
||||
}
|
||||
let f = bytes[8..].try_into().ok().map(f64::from_be_bytes)?;
|
||||
Some(f)
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for OrderedF64Codec {
|
||||
type EItem = f64;
|
||||
|
||||
fn bytes_encode(f: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode(f: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let mut buffer = [0u8; 16];
|
||||
|
||||
// write the globally ordered float
|
||||
let bytes = f64_into_bytes(*f)?;
|
||||
let bytes = f64_into_bytes(*f).ok_or(InvalidGloballyOrderedFloatError { float: *f })?;
|
||||
buffer[..8].copy_from_slice(&bytes[..]);
|
||||
// Then the f64 value just to be able to read it back
|
||||
let bytes = f.to_be_bytes();
|
||||
buffer[8..16].copy_from_slice(&bytes[..]);
|
||||
|
||||
Some(Cow::Owned(buffer.to_vec()))
|
||||
Ok(Cow::Owned(buffer.to_vec()))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("the float {float} cannot be converted to a globally ordered representation")]
|
||||
pub struct InvalidGloballyOrderedFloatError {
|
||||
float: f64,
|
||||
}
|
||||
|
@ -1,5 +1,8 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use heed::BoxedError;
|
||||
|
||||
use super::SliceTooShortError;
|
||||
use crate::{try_split_array_at, FieldId};
|
||||
|
||||
pub struct FieldIdWordCountCodec;
|
||||
@ -7,21 +10,21 @@ pub struct FieldIdWordCountCodec;
|
||||
impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
|
||||
type DItem = (FieldId, u8);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
let ([word_count], _nothing) = try_split_array_at(bytes)?;
|
||||
Some((field_id, word_count))
|
||||
let ([word_count], _nothing) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
|
||||
Ok((field_id, word_count))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
|
||||
type EItem = (FieldId, u8);
|
||||
|
||||
fn bytes_encode((field_id, word_count): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode((field_id, word_count): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(2 + 1);
|
||||
bytes.extend_from_slice(&field_id.to_be_bytes());
|
||||
bytes.push(*word_count);
|
||||
Some(Cow::Owned(bytes))
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use fst::Set;
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
|
||||
/// A codec for values of type `Set<&[u8]>`.
|
||||
pub struct FstSetCodec;
|
||||
@ -9,15 +9,15 @@ pub struct FstSetCodec;
|
||||
impl<'a> BytesEncode<'a> for FstSetCodec {
|
||||
type EItem = Set<Vec<u8>>;
|
||||
|
||||
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
|
||||
Some(Cow::Borrowed(item.as_fst().as_bytes()))
|
||||
fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
Ok(Cow::Borrowed(item.as_fst().as_bytes()))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> BytesDecode<'a> for FstSetCodec {
|
||||
type DItem = Set<&'a [u8]>;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
Set::new(bytes).ok()
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Set::new(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
@ -12,8 +12,10 @@ mod str_beu32_codec;
|
||||
mod str_ref;
|
||||
mod str_str_u8_codec;
|
||||
|
||||
pub use byte_slice_ref::ByteSliceRefCodec;
|
||||
pub use byte_slice_ref::BytesRefCodec;
|
||||
use heed::BoxedError;
|
||||
pub use str_ref::StrRefCodec;
|
||||
use thiserror::Error;
|
||||
|
||||
pub use self::beu16_str_codec::BEU16StrCodec;
|
||||
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||
@ -31,5 +33,9 @@ pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||
pub trait BytesDecodeOwned {
|
||||
type DItem;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem>;
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError>;
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("the slice is too short")]
|
||||
pub struct SliceTooShortError;
|
||||
|
@ -1,5 +1,6 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use heed::BoxedError;
|
||||
use obkv::{KvReaderU16, KvWriterU16};
|
||||
|
||||
pub struct ObkvCodec;
|
||||
@ -7,15 +8,15 @@ pub struct ObkvCodec;
|
||||
impl<'a> heed::BytesDecode<'a> for ObkvCodec {
|
||||
type DItem = KvReaderU16<'a>;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
Some(KvReaderU16::new(bytes))
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Ok(KvReaderU16::new(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for ObkvCodec {
|
||||
type EItem = KvWriterU16<Vec<u8>>;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
item.clone().into_inner().map(Cow::Owned).ok()
|
||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
item.clone().into_inner().map(Cow::Owned).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,7 @@ use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::mem::size_of;
|
||||
|
||||
use heed::BytesDecode;
|
||||
use heed::{BoxedError, BytesDecode};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
@ -19,22 +19,22 @@ impl BoRoaringBitmapCodec {
|
||||
impl BytesDecode<'_> for BoRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
|
||||
for chunk in bytes.chunks(size_of::<u32>()) {
|
||||
let bytes = chunk.try_into().ok()?;
|
||||
let bytes = chunk.try_into()?;
|
||||
bitmap.push(u32::from_ne_bytes(bytes));
|
||||
}
|
||||
|
||||
Some(bitmap)
|
||||
Ok(bitmap)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for BoRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Self::bytes_decode(bytes)
|
||||
}
|
||||
}
|
||||
@ -42,9 +42,9 @@ impl BytesDecodeOwned for BoRoaringBitmapCodec {
|
||||
impl heed::BytesEncode<'_> for BoRoaringBitmapCodec {
|
||||
type EItem = RoaringBitmap;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let mut out = Vec::new();
|
||||
BoRoaringBitmapCodec::serialize_into(item, &mut out);
|
||||
Some(Cow::Owned(out))
|
||||
Ok(Cow::Owned(out))
|
||||
}
|
||||
}
|
||||
|
@ -3,6 +3,7 @@ use std::io;
|
||||
use std::mem::size_of;
|
||||
|
||||
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||
use heed::BoxedError;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
@ -132,26 +133,26 @@ impl CboRoaringBitmapCodec {
|
||||
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
Self::deserialize_from(bytes).ok()
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Self::deserialize_from(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for CboRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
Self::deserialize_from(bytes).ok()
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Self::deserialize_from(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
|
||||
type EItem = RoaringBitmap;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let mut vec = Vec::with_capacity(Self::serialized_size(item));
|
||||
Self::serialize_into(item, &mut vec);
|
||||
Some(Cow::Owned(vec))
|
||||
Ok(Cow::Owned(vec))
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use heed::BoxedError;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
@ -9,25 +10,25 @@ pub struct RoaringBitmapCodec;
|
||||
impl heed::BytesDecode<'_> for RoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
RoaringBitmap::deserialize_unchecked_from(bytes).ok()
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
RoaringBitmap::deserialize_unchecked_from(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for RoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
RoaringBitmap::deserialize_from(bytes).ok()
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
RoaringBitmap::deserialize_from(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for RoaringBitmapCodec {
|
||||
type EItem = RoaringBitmap;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(item.serialized_size());
|
||||
item.serialize_into(&mut bytes).ok()?;
|
||||
Some(Cow::Owned(bytes))
|
||||
item.serialize_into(&mut bytes)?;
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::mem;
|
||||
|
||||
use heed::BytesDecode;
|
||||
use heed::{BoxedError, BytesDecode};
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
@ -9,15 +9,15 @@ pub struct BoRoaringBitmapLenCodec;
|
||||
impl BytesDecode<'_> for BoRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
Some((bytes.len() / mem::size_of::<u32>()) as u64)
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Ok((bytes.len() / mem::size_of::<u32>()) as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for BoRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Self::bytes_decode(bytes)
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::mem;
|
||||
|
||||
use heed::BytesDecode;
|
||||
use heed::{BoxedError, BytesDecode};
|
||||
|
||||
use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
|
||||
use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
|
||||
@ -11,7 +11,7 @@ pub struct CboRoaringBitmapLenCodec;
|
||||
impl BytesDecode<'_> for CboRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
if bytes.len() <= THRESHOLD * mem::size_of::<u32>() {
|
||||
// If there is threshold or less than threshold integers that can fit into this array
|
||||
// of bytes it means that we used the ByteOrder codec serializer.
|
||||
@ -27,7 +27,7 @@ impl BytesDecode<'_> for CboRoaringBitmapLenCodec {
|
||||
impl BytesDecodeOwned for CboRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Self::bytes_decode(bytes)
|
||||
}
|
||||
}
|
||||
|
@ -2,6 +2,7 @@ use std::io::{self, BufRead, Read};
|
||||
use std::mem;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt};
|
||||
use heed::BoxedError;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
@ -56,16 +57,16 @@ impl RoaringBitmapLenCodec {
|
||||
impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok()
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for RoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Option<Self::DItem> {
|
||||
RoaringBitmapLenCodec::deserialize_from_slice(bytes).ok()
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,30 +1,31 @@
|
||||
use std::borrow::Cow;
|
||||
use std::ffi::CStr;
|
||||
use std::str;
|
||||
|
||||
use charabia::{Language, Script};
|
||||
use heed::BoxedError;
|
||||
|
||||
pub struct ScriptLanguageCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for ScriptLanguageCodec {
|
||||
type DItem = (Script, Language);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let sep = bytes.iter().position(|b| *b == 0)?;
|
||||
let (s_bytes, l_bytes) = bytes.split_at(sep);
|
||||
let script = str::from_utf8(s_bytes).ok()?;
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let cstr = CStr::from_bytes_until_nul(bytes)?;
|
||||
let script = cstr.to_str()?;
|
||||
let script_name = Script::from_name(script);
|
||||
let lan = str::from_utf8(l_bytes).ok()?;
|
||||
// skip '\0' byte between the two strings.
|
||||
let lan_name = Language::from_name(&lan[1..]);
|
||||
let lan = str::from_utf8(&bytes[script.len() + 1..])?;
|
||||
let lan_name = Language::from_name(lan);
|
||||
|
||||
Some((script_name, lan_name))
|
||||
Ok((script_name, lan_name))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
|
||||
type EItem = (Script, Language);
|
||||
|
||||
fn bytes_encode((script, lan): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode((script, lan): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let script_name = script.name().as_bytes();
|
||||
let lan_name = lan.name().as_bytes();
|
||||
|
||||
@ -33,6 +34,6 @@ impl<'a> heed::BytesEncode<'a> for ScriptLanguageCodec {
|
||||
bytes.push(0);
|
||||
bytes.extend_from_slice(lan_name);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
@ -3,37 +3,41 @@ use std::convert::TryInto;
|
||||
use std::mem::size_of;
|
||||
use std::str;
|
||||
|
||||
use heed::BoxedError;
|
||||
|
||||
use super::SliceTooShortError;
|
||||
|
||||
pub struct StrBEU32Codec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for StrBEU32Codec {
|
||||
type DItem = (&'a str, u32);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let footer_len = size_of::<u32>();
|
||||
|
||||
if bytes.len() < footer_len {
|
||||
return None;
|
||||
return Err(SliceTooShortError.into());
|
||||
}
|
||||
|
||||
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
|
||||
let word = str::from_utf8(word).ok()?;
|
||||
let pos = bytes.try_into().map(u32::from_be_bytes).ok()?;
|
||||
let word = str::from_utf8(word)?;
|
||||
let pos = bytes.try_into().map(u32::from_be_bytes)?;
|
||||
|
||||
Some((word, pos))
|
||||
Ok((word, pos))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
|
||||
type EItem = (&'a str, u32);
|
||||
|
||||
fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode((word, pos): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let pos = pos.to_be_bytes();
|
||||
|
||||
let mut bytes = Vec::with_capacity(word.len() + pos.len());
|
||||
bytes.extend_from_slice(word.as_bytes());
|
||||
bytes.extend_from_slice(&pos[..]);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
@ -42,26 +46,27 @@ pub struct StrBEU16Codec;
|
||||
impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
|
||||
type DItem = (&'a str, u16);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let footer_len = size_of::<u16>();
|
||||
|
||||
if bytes.len() < footer_len + 1 {
|
||||
return None;
|
||||
return Err(SliceTooShortError.into());
|
||||
}
|
||||
|
||||
let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len);
|
||||
let (_, word) = word_plus_nul_byte.split_last()?;
|
||||
let word = str::from_utf8(word).ok()?;
|
||||
let pos = bytes.try_into().map(u16::from_be_bytes).ok()?;
|
||||
// unwrap: we just checked the footer + 1 above.
|
||||
let (_, word) = word_plus_nul_byte.split_last().unwrap();
|
||||
let word = str::from_utf8(word)?;
|
||||
let pos = bytes.try_into().map(u16::from_be_bytes)?;
|
||||
|
||||
Some((word, pos))
|
||||
Ok((word, pos))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
|
||||
type EItem = (&'a str, u16);
|
||||
|
||||
fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode((word, pos): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let pos = pos.to_be_bytes();
|
||||
|
||||
let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len());
|
||||
@ -69,6 +74,6 @@ impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
|
||||
bytes.push(0);
|
||||
bytes.extend_from_slice(&pos[..]);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
use std::borrow::Cow;
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
use heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
|
||||
/// A codec for values of type `&str`. Unlike `Str`, its `EItem` and `DItem` associated
|
||||
/// types are equivalent (= `&'a str`) and these values can reside within another structure.
|
||||
@ -8,15 +8,14 @@ pub struct StrRefCodec;
|
||||
impl<'a> BytesEncode<'a> for StrRefCodec {
|
||||
type EItem = &'a str;
|
||||
|
||||
fn bytes_encode(item: &'a &'a str) -> Option<Cow<'a, [u8]>> {
|
||||
Some(Cow::Borrowed(item.as_bytes()))
|
||||
fn bytes_encode(item: &'a &'a str) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
Ok(Cow::Borrowed(item.as_bytes()))
|
||||
}
|
||||
}
|
||||
impl<'a> BytesDecode<'a> for StrRefCodec {
|
||||
type DItem = &'a str;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let s = std::str::from_utf8(bytes).ok()?;
|
||||
Some(s)
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
std::str::from_utf8(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
@ -1,32 +1,36 @@
|
||||
use std::borrow::Cow;
|
||||
use std::ffi::CStr;
|
||||
use std::str;
|
||||
|
||||
use heed::BoxedError;
|
||||
|
||||
use super::SliceTooShortError;
|
||||
|
||||
pub struct U8StrStrCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for U8StrStrCodec {
|
||||
type DItem = (u8, &'a str, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (n, bytes) = bytes.split_first()?;
|
||||
let s1_end = bytes.iter().position(|b| *b == 0)?;
|
||||
let (s1_bytes, rest) = bytes.split_at(s1_end);
|
||||
let s2_bytes = &rest[1..];
|
||||
let s1 = str::from_utf8(s1_bytes).ok()?;
|
||||
let s2 = str::from_utf8(s2_bytes).ok()?;
|
||||
Some((*n, s1, s2))
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?;
|
||||
let cstr = CStr::from_bytes_until_nul(bytes)?;
|
||||
let s1 = cstr.to_str()?;
|
||||
// skip '\0' byte between the two strings.
|
||||
let s2 = str::from_utf8(&bytes[s1.len() + 1..])?;
|
||||
Ok((*n, s1, s2))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for U8StrStrCodec {
|
||||
type EItem = (u8, &'a str, &'a str);
|
||||
|
||||
fn bytes_encode((n, s1, s2): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode((n, s1, s2): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
|
||||
bytes.push(*n);
|
||||
bytes.extend_from_slice(s1.as_bytes());
|
||||
bytes.push(0);
|
||||
bytes.extend_from_slice(s2.as_bytes());
|
||||
Some(Cow::Owned(bytes))
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
pub struct UncheckedU8StrStrCodec;
|
||||
@ -34,24 +38,25 @@ pub struct UncheckedU8StrStrCodec;
|
||||
impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec {
|
||||
type DItem = (u8, &'a [u8], &'a [u8]);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (n, bytes) = bytes.split_first()?;
|
||||
let s1_end = bytes.iter().position(|b| *b == 0)?;
|
||||
let (s1_bytes, rest) = bytes.split_at(s1_end);
|
||||
let s2_bytes = &rest[1..];
|
||||
Some((*n, s1_bytes, s2_bytes))
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?;
|
||||
let cstr = CStr::from_bytes_until_nul(bytes)?;
|
||||
let s1_bytes = cstr.to_bytes();
|
||||
// skip '\0' byte between the two strings.
|
||||
let s2_bytes = &bytes[s1_bytes.len() + 1..];
|
||||
Ok((*n, s1_bytes, s2_bytes))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec {
|
||||
type EItem = (u8, &'a [u8], &'a [u8]);
|
||||
|
||||
fn bytes_encode((n, s1, s2): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
fn bytes_encode((n, s1, s2): &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
|
||||
bytes.push(*n);
|
||||
bytes.extend_from_slice(s1);
|
||||
bytes.push(0);
|
||||
bytes.extend_from_slice(s2);
|
||||
Some(Cow::Owned(bytes))
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
File diff suppressed because it is too large
Load Diff
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user