mirror of https://github.com/meilisearch/meilisearch.git
synced 2025-12-05 12:15:42 +00:00

Compare commits
19 Commits: v1.9.0-rc. ... japanese-d
| Author | SHA1 | Date |
|---|---|---|
|  | a87c390244 |  |
|  | 0df84bbba7 |  |
|  | e53de15b8e |  |
|  | 8c4921b9dd |  |
|  | f6a00f4a90 |  |
|  | ce08dc509b |  |
|  | 1daaed163a |  |
|  | 298c7b0c93 |  |
|  | 606e108420 |  |
|  | 7be17b7e4c |  |
|  | 1693332cab |  |
|  | ddd564665b |  |
|  | 4ae11bfd31 |  |
|  | 9736e16a88 |  |
|  | 6fa4da8ae7 |  |
|  | 19d7cdc20d |  |
|  | c229200820 |  |
|  | bad28cc9e2 |  |
|  | a04041c8f2 |  |
Cargo.lock (generated), 24 changes
```diff
@@ -381,9 +381,9 @@ dependencies = [

 [[package]]
 name = "arroy"
-version = "0.3.1"
+version = "0.4.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9"
+checksum = "2ece9e5347e7fdaaea3181dec7f916677ad5f3fcbac183648ce1924eb4aeef9a"
 dependencies = [
  "bytemuck",
  "byteorder",
@@ -679,9 +679,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"

 [[package]]
 name = "bytemuck"
-version = "1.15.0"
+version = "1.16.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15"
+checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
 dependencies = [
  "bytemuck_derive",
 ]
@@ -2273,9 +2273,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"

 [[package]]
 name = "heed"
-version = "0.20.1"
+version = "0.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd"
+checksum = "f60d7cff16094be9627830b399c087a25017e93fb3768b87cd656a68ccb1ebe8"
 dependencies = [
  "bitflags 2.5.0",
  "byteorder",
@@ -3172,9 +3172,9 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da"

 [[package]]
 name = "lmdb-master-sys"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a"
+checksum = "a5142795c220effa4c8f4813537bd4c88113a07e45e93100ccb2adc5cec6c7f3"
 dependencies = [
  "cc",
  "doxygen-rs",
@@ -5053,18 +5053,18 @@ dependencies = [

 [[package]]
 name = "thiserror"
-version = "1.0.58"
+version = "1.0.61"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297"
+checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
 dependencies = [
  "thiserror-impl",
 ]

 [[package]]
 name = "thiserror-impl"
-version = "1.0.58"
+version = "1.0.61"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7"
+checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
 dependencies = [
  "proc-macro2",
  "quote",
```
Dockerfile

```diff
@@ -17,7 +17,7 @@ RUN set -eux; \
     if [ "$apkArch" = "aarch64" ]; then \
         export JEMALLOC_SYS_WITH_LG_PAGE=16; \
     fi && \
-    cargo build --release -p meilisearch -p meilitool
+    cargo build --release -p meilisearch -p meilitool --no-default-features --features "analytics mini-dashboard japanese"

 # Run
 FROM alpine:3.16
```
```diff
@@ -40,7 +40,7 @@ ureq = "2.9.7"
 uuid = { version = "1.6.1", features = ["serde", "v4"] }

 [dev-dependencies]
-arroy = "0.3.1"
+arroy = "0.4.0"
 big_s = "1.0.2"
 crossbeam = "0.8.4"
 insta = { version = "1.34.0", features = ["json", "redactions"] }
```
```diff
@@ -5396,7 +5396,7 @@ mod tests {
     let reader = arroy::Reader::open(&rtxn, i as u16, index.vector_arroy)
         .map(Some)
         .or_else(|e| match e {
-            arroy::Error::MissingMetadata => Ok(None),
+            arroy::Error::MissingMetadata(_) => Ok(None),
             e => Err(e),
         })
         .transpose();
```
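arroy 0.4 turns `MissingMetadata` into a payload-carrying variant, so every match arm in this changeset gains a `(_)`. Below is a minimal sketch of the pattern the diff applies at each call site: a missing index is demoted to `None` while every other error propagates. `OpenError` and `open` are hypothetical stand-ins for `arroy::Error` and `arroy::Reader::open`.

```rust
#[allow(dead_code)]
enum OpenError {
    MissingMetadata(String), // arroy 0.4 attaches a payload to this variant
    Other(String),
}

fn open(exists: bool) -> Result<&'static str, OpenError> {
    if exists { Ok("reader") } else { Err(OpenError::MissingMetadata("db".into())) }
}

// Treat "the index was never built" as an absent reader rather than a failure.
fn open_optional(exists: bool) -> Result<Option<&'static str>, OpenError> {
    open(exists)
        .map(Some)
        .or_else(|e| match e {
            OpenError::MissingMetadata(_) => Ok(None),
            e => Err(e),
        })
}

fn main() {
    assert!(matches!(open_optional(true), Ok(Some(_))));
    assert!(matches!(open_optional(false), Ok(None)));
}
```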
```diff
@@ -398,7 +398,8 @@ impl ErrorCode for milli::Error {
     UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
     UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
     UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
-    UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType,
+    UserError::InvalidVectorsMapType { .. }
+    | UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
     UserError::TooManyVectors(_, _) => Code::TooManyVectors,
     UserError::SortError(_) => Code::InvalidSearchSort,
     UserError::InvalidMinTypoWordLenSetting(_, _) => {
```
```diff
@@ -158,5 +158,5 @@ vietnamese = ["meilisearch-types/vietnamese"]
 swedish-recomposition = ["meilisearch-types/swedish-recomposition"]

 [package.metadata.mini-dashboard]
-assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"
-sha1 = "e20cc9b390003c6c844f4b8bcc5c5013191a77ff"
+assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip"
+sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe"
```
```diff
@@ -65,7 +65,7 @@ impl Display for Value {
     write!(
         f,
         "{}",
-        json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" })
+        json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]", ".processingTimeMs" => "[duration]" })
     )
 }
 }
```
```diff
@@ -150,6 +150,35 @@ async fn simple_search() {
     snapshot!(response["semanticHitCount"], @"3");
 }

+#[actix_rt::test]
+async fn limit_offset() {
+    let server = Server::new().await;
+    let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
+
+    let (response, code) = index
+        .search_post(
+            json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true, "offset": 1, "limit": 1}),
+        )
+        .await;
+    snapshot!(code, @"200 OK");
+    snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###);
+    snapshot!(response["semanticHitCount"], @"0");
+    assert_eq!(response["hits"].as_array().unwrap().len(), 1);
+
+    let server = Server::new().await;
+    let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
+
+    let (response, code) = index
+        .search_post(
+            json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.9}, "retrieveVectors": true, "offset": 1, "limit": 1}),
+        )
+        .await;
+    snapshot!(code, @"200 OK");
+    snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}}]"###);
+    snapshot!(response["semanticHitCount"], @"1");
+    assert_eq!(response["hits"].as_array().unwrap().len(), 1);
+}
+
 #[actix_rt::test]
 async fn simple_search_hf() {
     let server = Server::new().await;
```
```diff
@@ -190,6 +190,285 @@ async fn generate_default_user_provided_documents(server: &Server) -> Index {
     index
 }

+#[actix_rt::test]
+async fn user_provided_embeddings_error() {
+    let server = Server::new().await;
+    let index = generate_default_user_provided_documents(&server).await;
+
+    // First case, we forget to specify the `regenerate`
+    let documents =
+        json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [0, 0, 0] }}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": 2,
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
+        "code": "invalid_vectors_type",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    // Second case, we don't specify anything
+    let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": {}}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": 3,
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
+        "code": "invalid_vectors_type",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    // Third case, we specify something wrong in place of regenerate
+    let documents =
+        json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": "yes please" }}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": 4,
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.regenerate`: expected a boolean, but found a string: `\"yes please\"`",
+        "code": "invalid_vectors_type",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    let documents =
+        json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true }}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": 5,
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings`: expected null or an array, but found a boolean: `true`",
+        "code": "invalid_vectors_type",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    let documents =
+        json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true] }}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": 6,
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`",
+        "code": "invalid_vectors_type",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    let documents =
+        json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]] }}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": 7,
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`",
+        "code": "invalid_vectors_type",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [23, 0.1, -12], "regenerate": true }}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task["status"], @r###""succeeded""###);
+
+    let documents =
+        json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false }}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task["status"], @r###""succeeded""###);
+
+    let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [0.1, [0.2, 0.3]] }}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": 10,
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`",
+        "code": "invalid_vectors_type",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, 0.2], 0.3] }}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": 11,
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected an array, but found a number: `0.3`",
+        "code": "invalid_vectors_type",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, true], 0.3] }}});
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, @r###"
+    {
+      "uid": 12,
+      "indexUid": "doggo",
+      "status": "failed",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 1,
+        "indexedDocuments": 0
+      },
+      "error": {
+        "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`",
+        "code": "invalid_vectors_type",
+        "type": "invalid_request",
+        "link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
+      },
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+}
+
 #[actix_rt::test]
 async fn clear_documents() {
     let server = Server::new().await;
```
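Taken together, the failures above pin down the accepted shape of a `_vectors` entry: `regenerate` is a mandatory boolean, and `embeddings` is optional but must be null, a vector of numbers, or an array of such vectors, with no mixing. A hedged sketch of those rules using only `serde_json`; the `is_valid_manual_entry` helper is hypothetical and merely restates what the error messages describe.

```rust
use serde_json::{json, Value};

fn is_valid_manual_entry(entry: &Value) -> bool {
    let Some(obj) = entry.as_object() else { return false };
    // `regenerate` is mandatory and must be a boolean.
    if !obj.get("regenerate").map_or(false, Value::is_boolean) {
        return false;
    }
    // `embeddings` is optional: null, a vector of numbers, or an array of such vectors.
    match obj.get("embeddings") {
        None | Some(Value::Null) => true,
        Some(Value::Array(items)) => {
            items.iter().all(Value::is_number)
                || items.iter().all(|v| {
                    v.as_array().map_or(false, |inner| inner.iter().all(Value::is_number))
                })
        }
        _ => false,
    }
}

fn main() {
    assert!(is_valid_manual_entry(&json!({ "regenerate": true, "embeddings": [23, 0.1, -12] })));
    assert!(is_valid_manual_entry(&json!({ "regenerate": false })));
    assert!(!is_valid_manual_entry(&json!({ "embeddings": [0, 0, 0] }))); // missing `regenerate`
    assert!(!is_valid_manual_entry(&json!({ "regenerate": "yes please" })));
    assert!(!is_valid_manual_entry(&json!({ "regenerate": false, "embeddings": [0.1, [0.2, 0.3]] })));
}
```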
```diff
@@ -213,11 +492,11 @@ async fn clear_documents() {

     // Make sure the arroy DB has been cleared
     let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await;
-    snapshot!(json_string!(documents), @r###"
+    snapshot!(documents, @r###"
     {
       "hits": [],
       "query": "",
-      "processingTimeMs": 0,
+      "processingTimeMs": "[duration]",
       "limit": 20,
       "offset": 0,
       "estimatedTotalHits": 0,
```
```diff
@@ -225,3 +504,85 @@ async fn clear_documents() {
     }
     "###);
 }
+
+#[actix_rt::test]
+async fn add_remove_one_vector_4588() {
+    // https://github.com/meilisearch/meilisearch/issues/4588
+    let server = Server::new().await;
+    let index = server.index("doggo");
+    let (value, code) = server.set_features(json!({"vectorStore": true})).await;
+    snapshot!(code, @"200 OK");
+    snapshot!(value, @r###"
+    {
+      "vectorStore": true,
+      "metrics": false,
+      "logsRoute": false
+    }
+    "###);
+
+    let (response, code) = index
+        .update_settings(json!({
+          "embedders": {
+              "manual": {
+                  "source": "userProvided",
+                  "dimensions": 3,
+              }
+          },
+        }))
+        .await;
+    snapshot!(code, @"202 Accepted");
+    let task = server.wait_task(response.uid()).await;
+    snapshot!(task, name: "settings-processed");
+
+    let documents = json!([
+        {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
+    ]);
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, name: "document-added");
+
+    let documents = json!([
+        {"id": 0, "name": "kefir", "_vectors": { "manual": null }},
+    ]);
+    let (value, code) = index.add_documents(documents, None).await;
+    snapshot!(code, @"202 Accepted");
+    let task = index.wait_task(value.uid()).await;
+    snapshot!(task, name: "document-deleted");
+
+    let (documents, _code) = index.search_post(json!({"vector": [1, 1, 1] })).await;
+    snapshot!(documents, @r###"
+    {
+      "hits": [
+        {
+          "id": 0,
+          "name": "kefir"
+        }
+      ],
+      "query": "",
+      "processingTimeMs": "[duration]",
+      "limit": 20,
+      "offset": 0,
+      "estimatedTotalHits": 1,
+      "semanticHitCount": 1
+    }
+    "###);
+
+    let (documents, _code) = index
+        .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
+        .await;
+    snapshot!(json_string!(documents), @r###"
+    {
+      "results": [
+        {
+          "id": 0,
+          "name": "kefir",
+          "_vectors": {}
+        }
+      ],
+      "offset": 0,
+      "limit": 20,
+      "total": 1
+    }
+    "###);
+}
```
```diff
@@ -0,0 +1,19 @@
+---
+source: meilisearch/tests/vector/mod.rs
+---
+{
+  "uid": 1,
+  "indexUid": "doggo",
+  "status": "succeeded",
+  "type": "documentAdditionOrUpdate",
+  "canceledBy": null,
+  "details": {
+    "receivedDocuments": 1,
+    "indexedDocuments": 1
+  },
+  "error": null,
+  "duration": "[duration]",
+  "enqueuedAt": "[date]",
+  "startedAt": "[date]",
+  "finishedAt": "[date]"
+}
```
```diff
@@ -0,0 +1,19 @@
+---
+source: meilisearch/tests/vector/mod.rs
+---
+{
+  "uid": 2,
+  "indexUid": "doggo",
+  "status": "succeeded",
+  "type": "documentAdditionOrUpdate",
+  "canceledBy": null,
+  "details": {
+    "receivedDocuments": 1,
+    "indexedDocuments": 1
+  },
+  "error": null,
+  "duration": "[duration]",
+  "enqueuedAt": "[date]",
+  "startedAt": "[date]",
+  "finishedAt": "[date]"
+}
```
```diff
@@ -0,0 +1,23 @@
+---
+source: meilisearch/tests/vector/mod.rs
+---
+{
+  "uid": 0,
+  "indexUid": "doggo",
+  "status": "succeeded",
+  "type": "settingsUpdate",
+  "canceledBy": null,
+  "details": {
+    "embedders": {
+      "manual": {
+        "source": "userProvided",
+        "dimensions": 3
+      }
+    }
+  },
+  "error": null,
+  "duration": "[duration]",
+  "enqueuedAt": "[date]",
+  "startedAt": "[date]",
+  "finishedAt": "[date]"
+}
```
```diff
@@ -79,7 +79,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls",
 ] }
 tiktoken-rs = "0.5.8"
 liquid = "0.26.4"
-arroy = "0.3.1"
+arroy = "0.4.0"
 rand = "0.8.5"
 tracing = "0.1.40"
 ureq = { version = "2.9.7", features = ["json"] }
```
```diff
@@ -119,6 +119,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
     InvalidVectorDimensions { expected: usize, found: usize },
     #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
     InvalidVectorsMapType { document_id: String, value: Value },
+    #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
+    InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError },
     #[error("{0}")]
     InvalidFilter(String),
     #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
```
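The new variant relies on `thiserror`'s `#[error(...)]` attribute interpolating the variant's named fields into the user-facing message, which is exactly the text the snapshots above assert on. A minimal, self-contained sketch of that mechanism; `ConfError` is a hypothetical stand-in for `deserr::errors::JsonError`.

```rust
use thiserror::Error;

// Stand-in for the deserialization error; its Display output is what gets
// spliced into the outer message via `{error}`.
#[derive(Debug, Error)]
#[error("Missing field `regenerate` inside `.manual`")]
struct ConfError;

#[derive(Debug, Error)]
enum UserError {
    #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
    InvalidVectorsEmbedderConf { document_id: String, error: ConfError },
}

fn main() {
    let e = UserError::InvalidVectorsEmbedderConf {
        document_id: "\"0\"".into(),
        error: ConfError,
    };
    // Prints the same shape of message the tests snapshot.
    println!("{e}");
}
```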
```diff
@@ -281,8 +283,9 @@ impl From<arroy::Error> for Error {
     arroy::Error::DatabaseFull
     | arroy::Error::InvalidItemAppend
     | arroy::Error::UnmatchingDistance { .. }
-    | arroy::Error::MissingNode
-    | arroy::Error::MissingMetadata => {
+    | arroy::Error::NeedBuild(_)
+    | arroy::Error::MissingKey { .. }
+    | arroy::Error::MissingMetadata(_) => {
         Error::InternalError(InternalError::ArroyError(value))
     }
 }
```
```diff
@@ -1610,7 +1610,7 @@ impl Index {
     arroy::Reader::open(rtxn, k, self.vector_arroy)
         .map(Some)
         .or_else(|e| match e {
-            arroy::Error::MissingMetadata => Ok(None),
+            arroy::Error::MissingMetadata(_) => Ok(None),
             e => Err(e.into()),
         })
         .transpose()
@@ -1643,7 +1643,7 @@ impl Index {
     let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
         .map(Some)
         .or_else(|e| match e {
-            arroy::Error::MissingMetadata => Ok(None),
+            arroy::Error::MissingMetadata(_) => Ok(None),
             e => Err(e),
         })
         .transpose();
```
```diff
@@ -6,9 +6,11 @@ use heed::Result;
 use roaring::RoaringBitmap;

 use super::{get_first_facet_value, get_highest_level};
-use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
+use crate::heed_codec::facet::{
+    FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
+};
 use crate::heed_codec::BytesRefCodec;
-use crate::DocumentId;
+use crate::{CboRoaringBitmapCodec, DocumentId};

 /// Call the given closure on the facet distribution of the candidate documents.
 ///
@@ -31,12 +33,9 @@ pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
 where
     CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
 {
+    let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
     let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
-    let highest_level = get_highest_level(
-        rtxn,
-        db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
-        field_id,
-    )?;
+    let highest_level = get_highest_level(rtxn, db, field_id)?;

     if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
         fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
@@ -75,11 +74,8 @@ where

     // Represents the list of keys that we must explore.
     let mut heap = BinaryHeap::new();
-    let highest_level = get_highest_level(
-        rtxn,
-        db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
-        field_id,
-    )?;
+    let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
+    let highest_level = get_highest_level(rtxn, db, field_id)?;

     if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
         // We first fill the heap with values from the highest level
@@ -92,7 +88,10 @@ where
     if key.field_id != field_id {
         break;
     }
-    let intersection = value.bitmap & candidates;
+    let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
+        value.bitmap_bytes,
+        candidates,
+    )?;
     let count = intersection.len();
     if count != 0 {
         heap.push(LevelEntry {
@@ -121,7 +120,10 @@ where
     if key.field_id != field_id {
         break;
     }
-    let intersection = value.bitmap & candidates;
+    let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
+        value.bitmap_bytes,
+        candidates,
+    )?;
     let count = intersection.len();
     if count != 0 {
         heap.push(LevelEntry {
@@ -146,7 +148,7 @@ where
     CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
 {
     rtxn: &'t heed::RoTxn<'t>,
-    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
+    db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
     field_id: u16,
     callback: CB,
 }
@@ -171,7 +173,10 @@ where
     if key.field_id != self.field_id {
         return Ok(ControlFlow::Break(()));
     }
-    let docids_in_common = value.bitmap & candidates;
+    let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
+        value.bitmap_bytes,
+        candidates,
+    )?;
     if !docids_in_common.is_empty() {
         let any_docid_in_common = docids_in_common.min().unwrap();
         match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
@@ -205,7 +210,10 @@ where
     if key.field_id != self.field_id {
         return Ok(ControlFlow::Break(()));
     }
-    let docids_in_common = value.bitmap & candidates;
+    let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
+        value.bitmap_bytes,
+        candidates,
+    )?;
     if !docids_in_common.is_empty() {
         let cf = self.iterate(
             &docids_in_common,
```
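The hunks above switch the value codec to a lazy one that hands back the serialized bytes (`bitmap_bytes`), so each group's bitmap is only decoded at the moment it is intersected with the candidates rather than eagerly for every key. A conceptual sketch with the `roaring` crate; `FacetGroupLazyValue` and the free-standing `intersection_with_serialized` are simplified stand-ins for meilisearch's codecs, and the real `CboRoaringBitmapCodec` can avoid fully materializing the stored bitmap, which this simple version does not.

```rust
use roaring::RoaringBitmap;

// Lazy value: keeps the on-disk bytes instead of a decoded bitmap.
struct FacetGroupLazyValue<'a> {
    bitmap_bytes: &'a [u8],
}

fn intersection_with_serialized(
    bitmap_bytes: &[u8],
    candidates: &RoaringBitmap,
) -> std::io::Result<RoaringBitmap> {
    // Simple version: decode, then intersect. The production codec can do
    // better by skipping containers that cannot overlap `candidates`.
    Ok(RoaringBitmap::deserialize_from(bitmap_bytes)? & candidates)
}

fn main() -> std::io::Result<()> {
    let stored: RoaringBitmap = (0..100).collect();
    let mut bytes = Vec::new();
    stored.serialize_into(&mut bytes)?;

    let candidates: RoaringBitmap = (50..150).collect();
    let lazy = FacetGroupLazyValue { bitmap_bytes: &bytes };
    let inter = intersection_with_serialized(lazy.bitmap_bytes, &candidates)?;
    assert_eq!(inter.len(), 50);
    Ok(())
}
```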
```diff
@@ -178,16 +178,16 @@ impl<'a> Search<'a> {

     // completely skip semantic search if the results of the keyword search are good enough
     if self.results_good_enough(&keyword_results, semantic_ratio) {
-        return Ok((keyword_results, Some(0)));
+        return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
     }

     // no vector search against placeholder search
     let Some(query) = search.query.take() else {
-        return Ok((keyword_results, Some(0)));
+        return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
     };
     // no embedder, no semantic search
     let Some(SemanticSearch { vector, embedder_name, embedder }) = semantic else {
-        return Ok((keyword_results, Some(0)));
+        return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
     };

     let vector_query = match vector {
@@ -239,3 +239,44 @@ impl<'a> Search<'a> {
         true
     }
 }
+
+fn return_keyword_results(
+    limit: usize,
+    offset: usize,
+    SearchResult {
+        matching_words,
+        candidates,
+        mut documents_ids,
+        mut document_scores,
+        degraded,
+        used_negative_operator,
+    }: SearchResult,
+) -> (SearchResult, Option<u32>) {
+    let (documents_ids, document_scores) = if offset >= documents_ids.len() ||
+    // technically redundant because documents_ids.len() == document_scores.len(),
+    // defensive programming
+    offset >= document_scores.len()
+    {
+        (vec![], vec![])
+    } else {
+        // PANICS: offset < len
+        documents_ids.rotate_left(offset);
+        documents_ids.truncate(limit);
+
+        // PANICS: offset < len
+        document_scores.rotate_left(offset);
+        document_scores.truncate(limit);
+        (documents_ids, document_scores)
+    };
+    (
+        SearchResult {
+            matching_words,
+            candidates,
+            documents_ids,
+            document_scores,
+            degraded,
+            used_negative_operator,
+        },
+        Some(0),
+    )
+}
```
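`return_keyword_results` has to apply `offset` and `limit` itself because, on these early-return paths, the keyword results skip the hybrid stage that would normally page them; this is what the `limit_offset` test exercises. A standalone sketch of the `rotate_left` plus `truncate` trick, assuming only std:

```rust
// Keep one page of `items`, starting at `offset`, at most `limit` long.
fn page<T>(mut items: Vec<T>, offset: usize, limit: usize) -> Vec<T> {
    let len = items.len();
    if offset >= len {
        return Vec::new();
    }
    // Move the page start to the front; safe because offset < len
    // (`rotate_left` panics when the midpoint exceeds the length).
    items.rotate_left(offset);
    // The original can call `truncate(limit)` directly because the keyword
    // search is only ever asked for `offset + limit` results; the `min` keeps
    // this sketch correct for arbitrary inputs by dropping the rotated prefix.
    items.truncate(limit.min(len - offset));
    items
}

fn main() {
    let ids = vec![10, 11, 12, 13, 14];
    assert_eq!(page(ids.clone(), 1, 1), vec![11]);
    assert_eq!(page(ids.clone(), 3, 10), vec![13, 14]);
    assert_eq!(page(ids, 7, 1), Vec::<i32>::new());
}
```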
```diff
@@ -290,7 +290,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
     regenerate_if_prompt_changed(
         obkv,
         (old_prompt, prompt),
-        (&old_fields_ids_map, &new_fields_ids_map),
+        (old_fields_ids_map, new_fields_ids_map),
     )?
 } else {
     // we can simply ignore user provided vectors as they are not regenerated and are
@@ -306,7 +306,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
     prompt,
     (add_to_user_provided, remove_from_user_provided),
     (old, new),
-    (&old_fields_ids_map, &new_fields_ids_map),
+    (old_fields_ids_map, new_fields_ids_map),
     document_id,
 )?,
 };
```
```diff
@@ -11,7 +11,7 @@ mod extract_word_position_docids;

 use std::fs::File;
 use std::io::BufReader;
-use std::sync::Arc;
+use std::sync::{Arc, OnceLock};

 use crossbeam_channel::Sender;
 use rayon::prelude::*;
@@ -32,7 +32,7 @@ use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
 use super::{helpers, TypedChunk};
 use crate::index::IndexEmbeddingConfig;
 use crate::update::settings::InnerIndexSettingsDiff;
-use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
+use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};

 /// Extract data for each databases from obkv documents in parallel.
 /// Send data in grenad file over provided Sender.
@@ -207,6 +207,18 @@ fn run_extraction_task<FE, FS, M>(
     })
 }

+fn request_threads() -> &'static ThreadPoolNoAbort {
+    static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
+
+    REQUEST_THREADS.get_or_init(|| {
+        ThreadPoolNoAbortBuilder::new()
+            .num_threads(crate::vector::REQUEST_PARALLELISM)
+            .thread_name(|index| format!("embedding-request-{index}"))
+            .build()
+            .unwrap()
+    })
+}
+
 /// Extract chunked data and send it into lmdb_writer_sx sender:
 /// - documents
 fn send_original_documents_data(
@@ -219,11 +231,6 @@ fn send_original_documents_data(
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;

-    let request_threads = ThreadPoolNoAbortBuilder::new()
-        .num_threads(crate::vector::REQUEST_PARALLELISM)
-        .thread_name(|index| format!("embedding-request-{index}"))
-        .build()?;
-
     let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
         // no point in indexing vectors without embedders
         && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
@@ -256,7 +263,7 @@ fn send_original_documents_data(
     prompts,
     indexer,
     embedder.clone(),
-    &request_threads,
+    request_threads(),
 ) {
     Ok(results) => Some(results),
     Err(error) => {
```
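Moving the pool behind a `static OnceLock` means the embedding-request threads are created once per process and shared by every caller, instead of being rebuilt on each call to `send_original_documents_data`. A minimal sketch of the pattern, with `expensive_setup` standing in for the thread-pool construction:

```rust
use std::sync::OnceLock;

fn shared_value() -> &'static String {
    static CELL: OnceLock<String> = OnceLock::new();
    // Runs the closure at most once, even under concurrent callers;
    // every later call returns a reference to the same instance.
    CELL.get_or_init(expensive_setup)
}

fn expensive_setup() -> String {
    "built once".to_string()
}

fn main() {
    let a = shared_value() as *const String;
    let b = shared_value() as *const String;
    assert_eq!(a, b); // same instance both times
}
```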
```diff
@@ -547,10 +547,11 @@ where
 pool.install(|| {
     for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
         let writer = arroy::Writer::new(vector_arroy, k, dimension);
-        if writer.is_empty(wtxn)? {
+        if writer.need_build(wtxn)? {
+            writer.build(wtxn, &mut rng, None)?;
+        } else if writer.is_empty(wtxn)? {
             break;
         }
-        writer.build(wtxn, &mut rng, None)?;
     }
     Result::Ok(())
 })
```
```diff
@@ -1,5 +1,6 @@
 use std::collections::{BTreeMap, BTreeSet};

+use deserr::{take_cf_content, DeserializeError, Deserr, Sequence};
 use obkv::KvReader;
 use serde_json::{from_slice, Value};

@@ -10,13 +11,44 @@ use crate::{DocumentId, FieldId, InternalError, UserError};

 pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";

-#[derive(serde::Serialize, serde::Deserialize, Debug)]
+#[derive(serde::Serialize, Debug)]
 #[serde(untagged)]
 pub enum Vectors {
     ImplicitlyUserProvided(VectorOrArrayOfVectors),
     Explicit(ExplicitVectors),
 }

+impl<E: DeserializeError> Deserr<E> for Vectors {
+    fn deserialize_from_value<V: deserr::IntoValue>(
+        value: deserr::Value<V>,
+        location: deserr::ValuePointerRef,
+    ) -> Result<Self, E> {
+        match value {
+            deserr::Value::Sequence(_) | deserr::Value::Null => {
+                Ok(Vectors::ImplicitlyUserProvided(VectorOrArrayOfVectors::deserialize_from_value(
+                    value, location,
+                )?))
+            }
+            deserr::Value::Map(_) => {
+                Ok(Vectors::Explicit(ExplicitVectors::deserialize_from_value(value, location)?))
+            }
+
+            value => Err(take_cf_content(E::error(
+                None,
+                deserr::ErrorKind::IncorrectValueKind {
+                    actual: value,
+                    accepted: &[
+                        deserr::ValueKind::Sequence,
+                        deserr::ValueKind::Map,
+                        deserr::ValueKind::Null,
+                    ],
+                },
+                location,
+            ))),
+        }
+    }
+}
+
 impl Vectors {
     pub fn must_regenerate(&self) -> bool {
         match self {
@@ -37,9 +69,11 @@ impl Vectors {
     }
 }

-#[derive(serde::Serialize, serde::Deserialize, Debug)]
+#[derive(serde::Serialize, Deserr, Debug)]
 #[serde(rename_all = "camelCase")]
 pub struct ExplicitVectors {
+    #[serde(default)]
+    #[deserr(default)]
     pub embeddings: Option<VectorOrArrayOfVectors>,
     pub regenerate: bool,
 }
@@ -149,13 +183,20 @@ impl ParsedVectorsDiff {

 pub struct ParsedVectors(pub BTreeMap<String, Vectors>);

+impl<E: DeserializeError> Deserr<E> for ParsedVectors {
+    fn deserialize_from_value<V: deserr::IntoValue>(
+        value: deserr::Value<V>,
+        location: deserr::ValuePointerRef,
+    ) -> Result<Self, E> {
+        let value = <BTreeMap<String, Vectors>>::deserialize_from_value(value, location)?;
+        Ok(ParsedVectors(value))
+    }
+}
+
 impl ParsedVectors {
     pub fn from_bytes(value: &[u8]) -> Result<Self, Error> {
-        let Ok(value) = from_slice(value) else {
-            return Err(Error::InvalidMap(value));
-        };
-        Ok(ParsedVectors(value))
+        let value: serde_json::Value = from_slice(value).map_err(Error::InternalSerdeJson)?;
+        deserr::deserialize(value).map_err(|error| Error::InvalidEmbedderConf { error })
     }

     pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet<String>) {
```
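`from_bytes` now decodes in two steps on purpose: `serde_json` catches malformed JSON (reported as an internal error), then `deserr` re-walks the resulting `Value` so that a bad shape is reported with a field path such as `.manual.regenerate`, matching the messages in the tests above. A hedged sketch of the same two-step flow against a simplified target type; `ManualEntry` is hypothetical, and the `deserr::deserialize` turbofish follows the signature used elsewhere in this diff.

```rust
use deserr::Deserr;

#[derive(Debug, Deserr)]
struct ManualEntry {
    regenerate: bool,
}

fn from_bytes(bytes: &[u8]) -> Result<ManualEntry, String> {
    // Step 1: syntactic JSON errors surface as internal errors.
    let value: serde_json::Value =
        serde_json::from_slice(bytes).map_err(|e| format!("internal: {e}"))?;
    // Step 2: structural errors become user-facing, carrying a field path.
    deserr::deserialize::<_, _, deserr::errors::JsonError>(value)
        .map_err(|e| format!("invalid conf: {e}"))
}

fn main() {
    assert!(from_bytes(br#"{"regenerate": true}"#).is_ok());
    assert!(from_bytes(br#"{"regenerate": "yes"}"#).unwrap_err().contains("invalid conf"));
    assert!(from_bytes(b"{not json").unwrap_err().contains("internal"));
}
```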
```diff
@@ -165,6 +206,7 @@ impl ParsedVectors {

 pub enum Error {
     InvalidMap(Value),
+    InvalidEmbedderConf { error: deserr::errors::JsonError },
     InternalSerdeJson(serde_json::Error),
 }

@@ -174,6 +216,12 @@ impl Error {
     Error::InvalidMap(value) => {
         crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value })
     }
+    Error::InvalidEmbedderConf { error } => {
+        crate::Error::UserError(UserError::InvalidVectorsEmbedderConf {
+            document_id,
+            error,
+        })
+    }
     Error::InternalSerdeJson(error) => {
         crate::Error::InternalError(InternalError::SerdeJson(error))
     }
@@ -194,13 +242,84 @@ fn to_vector_map(
 }

 /// Represents either a vector or an array of multiple vectors.
-#[derive(serde::Serialize, serde::Deserialize, Debug)]
+#[derive(serde::Serialize, Debug)]
 #[serde(transparent)]
 pub struct VectorOrArrayOfVectors {
     #[serde(with = "either::serde_untagged_optional")]
     inner: Option<either::Either<Vec<Embedding>, Embedding>>,
 }

+impl<E: DeserializeError> Deserr<E> for VectorOrArrayOfVectors {
+    fn deserialize_from_value<V: deserr::IntoValue>(
+        value: deserr::Value<V>,
+        location: deserr::ValuePointerRef,
+    ) -> Result<Self, E> {
+        match value {
+            deserr::Value::Null => Ok(VectorOrArrayOfVectors { inner: None }),
+            deserr::Value::Sequence(seq) => {
+                let mut iter = seq.into_iter();
+                match iter.next().map(|v| v.into_value()) {
+                    None => {
+                        // With the strange way serde serializes the `Either`, we must send the
+                        // left part, otherwise it'll consider we returned [[]]
+                        Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(Vec::new())) })
+                    }
+                    Some(val @ deserr::Value::Sequence(_)) => {
+                        let first = Embedding::deserialize_from_value(val, location.push_index(0))?;
+                        let mut collect = vec![first];
+                        let mut tail = iter
+                            .enumerate()
+                            .map(|(i, v)| {
+                                Embedding::deserialize_from_value(
+                                    v.into_value(),
+                                    location.push_index(i + 1),
+                                )
+                            })
+                            .collect::<Result<Vec<_>, _>>()?;
+                        collect.append(&mut tail);
+
+                        Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(collect)) })
+                    }
+                    Some(
+                        val @ deserr::Value::Integer(_)
+                        | val @ deserr::Value::NegativeInteger(_)
+                        | val @ deserr::Value::Float(_),
+                    ) => {
+                        let first = <f32>::deserialize_from_value(val, location.push_index(0))?;
+                        let mut embedding = iter
+                            .enumerate()
+                            .map(|(i, v)| {
+                                <f32>::deserialize_from_value(
+                                    v.into_value(),
+                                    location.push_index(i + 1),
+                                )
+                            })
+                            .collect::<Result<Vec<_>, _>>()?;
+                        embedding.insert(0, first);
+                        Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Right(embedding)) })
+                    }
+                    Some(value) => Err(take_cf_content(E::error(
+                        None,
+                        deserr::ErrorKind::IncorrectValueKind {
+                            actual: value,
+                            accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Float],
+                        },
+                        location.push_index(0),
+                    ))),
+                }
+            }
+            value => Err(take_cf_content(E::error(
+                None,
+                deserr::ErrorKind::IncorrectValueKind {
+                    actual: value,
+                    accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Null],
+                },
+                location,
+            ))),
+        }
+    }
+}
+
 impl VectorOrArrayOfVectors {
     pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
         match self.inner? {
@@ -234,15 +353,19 @@ impl From<Vec<Embedding>> for VectorOrArrayOfVectors {
 mod test {
     use super::VectorOrArrayOfVectors;

+    fn embedding_from_str(s: &str) -> Result<VectorOrArrayOfVectors, deserr::errors::JsonError> {
+        let value: serde_json::Value = serde_json::from_str(s).unwrap();
+        deserr::deserialize(value)
+    }
+
     #[test]
     fn array_of_vectors() {
-        let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap();
-        let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap();
-        let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap();
-        let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap();
-        let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap();
-        let two_vecs: VectorOrArrayOfVectors =
-            serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap();
+        let null = embedding_from_str("null").unwrap();
+        let empty = embedding_from_str("[]").unwrap();
+        let one = embedding_from_str("[0.1]").unwrap();
+        let two = embedding_from_str("[0.1, 0.2]").unwrap();
+        let one_vec = embedding_from_str("[[0.1, 0.2]]").unwrap();
+        let two_vecs = embedding_from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap();

         insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null");
         insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]");
```