Compare commits

...

19 Commits

Author SHA1 Message Date
ManyTheFish
a87c390244 Activate only the necessary features for Japanese 2024-07-08 18:17:11 +02:00
meili-bors[bot]
0df84bbba7 Merge #4746
4746: Fix hybrid search limit offset r=irevoire a=dureuill

# Pull Request

## Related issue
Fixes #4745

## What does this PR do?
- Apply offset and limit to the keyword search results when they are returned early.
- Add a test that is initially failing, and then passes


Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2024-06-27 12:47:08 +00:00
Louis Dureuil
e53de15b8e Fix behavior of limit and offset for hybrid search when keyword results are returned early
The test is fixed
2024-06-27 14:25:33 +02:00
Louis Dureuil
8c4921b9dd Add failing test on limit+offset for hybrid search 2024-06-27 14:21:34 +02:00
meili-bors[bot]
f6a00f4a90 Merge #4740
4740: Make `embeddings` optional and improve error message for `regenerate` r=dureuill a=irevoire

# Pull Request

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/4741

## What does this PR do?
- Make the `embeddings` parameter optional when manually specifying embeddings for an embedder
- Adds a lot of tests around malformed `_vectors.embedder` objects
- Use `deserr` to deserialize the `_vectors.embedder` field, improving error messages


Co-authored-by: Tamo <tamo@meilisearch.com>
2024-06-27 10:06:28 +00:00
Tamo
ce08dc509b add more tests and improve the location of the error 2024-06-27 11:51:45 +02:00
Tamo
1daaed163a Make _vectors.:embedding.regenerate mandatory + tests + error messages 2024-06-27 11:04:58 +02:00
meili-bors[bot]
298c7b0c93 Merge #4715
4715: Build all arroy indexes that need to be built r=dureuill a=irevoire

# Pull Request

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/4588

## What does this PR do?
- Update arroy
- Ensure we always rebuild the arroy indexes that need to be built


Co-authored-by: Tamo <tamo@meilisearch.com>
2024-06-24 09:32:04 +00:00
Tamo
606e108420 fix all the flaky snapshots 2024-06-24 11:13:45 +02:00
Tamo
7be17b7e4c add the missing snapshots 2024-06-24 10:52:57 +02:00
Tamo
1693332cab Update arroy and always build the trees that need to be built 2024-06-24 10:14:03 +02:00
meili-bors[bot]
ddd564665b Merge #4713
4713: Speed up facet distribution r=ManyTheFish a=Kerollmops

This PR is akin to #4682, but this time, the same logic is applied to the facets. Bitmaps are not decoded, and we do an intersection on the bytes with the search candidates instead of materializing the RoaringBitmap to destroy it just after the operation.

A prospect raised some slow requests when performing facet searches, and I found out that the disk optimization intersection wasn't performed on the facets.

Co-authored-by: Clément Renault <clement@meilisearch.com>
2024-06-24 05:23:46 +00:00
meili-bors[bot]
4ae11bfd31 Merge #4710
4710: Only spawn thread pool once (v1.9) r=irevoire a=dureuill

# Pull Request

See #4707 

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2024-06-20 11:45:32 +00:00
Clément Renault
9736e16a88 Make clippy happy 2024-06-20 13:02:44 +02:00
Clément Renault
6fa4da8ae7 Improve facet distribution speed in count mode 2024-06-20 12:58:51 +02:00
Clément Renault
19d7cdc20d Improve facet distribution speed in lexico mode 2024-06-20 12:57:08 +02:00
meili-bors[bot]
c229200820 Merge #4712
4712: Update mini-dashboard 2.14 r=irevoire a=curquiza

Fixes #4668

Co-authored-by: curquiza <clementine@meilisearch.com>
2024-06-20 08:47:22 +00:00
curquiza
bad28cc9e2 Update mini-dashboard 2.14 2024-06-20 10:01:36 +02:00
Louis Dureuil
a04041c8f2 Only spawn the pool once 2024-06-19 16:25:33 +02:00
21 changed files with 708 additions and 73 deletions

24
Cargo.lock generated
View File

@@ -381,9 +381,9 @@ dependencies = [
[[package]]
name = "arroy"
version = "0.3.1"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73897699bf04bac935c0b120990d2a511e91e563e0f9769f9c8bb983d98dfbc9"
checksum = "2ece9e5347e7fdaaea3181dec7f916677ad5f3fcbac183648ce1924eb4aeef9a"
dependencies = [
"bytemuck",
"byteorder",
@@ -679,9 +679,9 @@ checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
[[package]]
name = "bytemuck"
version = "1.15.0"
version = "1.16.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15"
checksum = "b236fc92302c97ed75b38da1f4917b5cdda4984745740f153a5d3059e48d725e"
dependencies = [
"bytemuck_derive",
]
@@ -2273,9 +2273,9 @@ checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8"
[[package]]
name = "heed"
version = "0.20.1"
version = "0.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f7acb9683d7c7068aa46d47557bfa4e35a277964b350d9504a87b03610163fd"
checksum = "f60d7cff16094be9627830b399c087a25017e93fb3768b87cd656a68ccb1ebe8"
dependencies = [
"bitflags 2.5.0",
"byteorder",
@@ -3172,9 +3172,9 @@ checksum = "f9d642685b028806386b2b6e75685faadd3eb65a85fff7df711ce18446a422da"
[[package]]
name = "lmdb-master-sys"
version = "0.2.0"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc9048db3a58c0732d7236abc4909058f9d2708cfb6d7d047eb895fddec6419a"
checksum = "a5142795c220effa4c8f4813537bd4c88113a07e45e93100ccb2adc5cec6c7f3"
dependencies = [
"cc",
"doxygen-rs",
@@ -5053,18 +5053,18 @@ dependencies = [
[[package]]
name = "thiserror"
version = "1.0.58"
version = "1.0.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03468839009160513471e86a034bb2c5c0e4baae3b43f79ffc55c4a5427b3297"
checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.58"
version = "1.0.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c61f3ba182994efc43764a46c018c347bc492c79f024e705f46567b418f6d4f7"
checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
dependencies = [
"proc-macro2",
"quote",

View File

@@ -17,7 +17,7 @@ RUN set -eux; \
if [ "$apkArch" = "aarch64" ]; then \
export JEMALLOC_SYS_WITH_LG_PAGE=16; \
fi && \
cargo build --release -p meilisearch -p meilitool
cargo build --release -p meilisearch -p meilitool --no-default-features --features "analytics mini-dashboard japanese"
# Run
FROM alpine:3.16

View File

@@ -40,7 +40,7 @@ ureq = "2.9.7"
uuid = { version = "1.6.1", features = ["serde", "v4"] }
[dev-dependencies]
arroy = "0.3.1"
arroy = "0.4.0"
big_s = "1.0.2"
crossbeam = "0.8.4"
insta = { version = "1.34.0", features = ["json", "redactions"] }

View File

@@ -5396,7 +5396,7 @@ mod tests {
let reader = arroy::Reader::open(&rtxn, i as u16, index.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata => Ok(None),
arroy::Error::MissingMetadata(_) => Ok(None),
e => Err(e),
})
.transpose();

View File

@@ -398,7 +398,8 @@ impl ErrorCode for milli::Error {
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType,
UserError::InvalidVectorsMapType { .. }
| UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
UserError::TooManyVectors(_, _) => Code::TooManyVectors,
UserError::SortError(_) => Code::InvalidSearchSort,
UserError::InvalidMinTypoWordLenSetting(_, _) => {

View File

@@ -158,5 +158,5 @@ vietnamese = ["meilisearch-types/vietnamese"]
swedish-recomposition = ["meilisearch-types/swedish-recomposition"]
[package.metadata.mini-dashboard]
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.13/build.zip"
sha1 = "e20cc9b390003c6c844f4b8bcc5c5013191a77ff"
assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.14/build.zip"
sha1 = "592d1b5a3459d621d0aae1dded8fe3154f5c38fe"

View File

@@ -65,7 +65,7 @@ impl Display for Value {
write!(
f,
"{}",
json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" })
json_string!(self, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]", ".processingTimeMs" => "[duration]" })
)
}
}

View File

@@ -150,6 +150,35 @@ async fn simple_search() {
snapshot!(response["semanticHitCount"], @"3");
}
// Checks that `offset` and `limit` are honoured by hybrid search for two different
// `semanticRatio` values. Both requests use `offset: 1` and `limit: 1`, and both must
// return exactly one hit.
#[actix_rt::test]
async fn limit_offset() {
let server = Server::new().await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
// First request: `semanticRatio` of 0.2. The snapshot pins a `semanticHitCount` of 0,
// i.e. the single returned hit is not counted as a semantic hit.
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true, "offset": 1, "limit": 1}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}}]"###);
snapshot!(response["semanticHitCount"], @"0");
assert_eq!(response["hits"].as_array().unwrap().len(), 1);
// Second request, on a fresh server: same query with a `semanticRatio` of 0.9.
// The snapshot pins a `semanticHitCount` of 1 and a different returned document.
let server = Server::new().await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let (response, code) = index
.search_post(
json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.9}, "retrieveVectors": true, "offset": 1, "limit": 1}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}}]"###);
snapshot!(response["semanticHitCount"], @"1");
assert_eq!(response["hits"].as_array().unwrap().len(), 1);
}
#[actix_rt::test]
async fn simple_search_hf() {
let server = Server::new().await;

View File

@@ -190,6 +190,285 @@ async fn generate_default_user_provided_documents(server: &Server) -> Index {
index
}
// Sends a series of documents whose `_vectors.manual` object is malformed in different ways
// and snapshots the resulting task, pinning the exact error message and the location it
// reports. Two well-formed documents are sent in the middle and must succeed.
// The task uids in the snapshots are sequential, so the order of the cases matters.
#[actix_rt::test]
async fn user_provided_embeddings_error() {
let server = Server::new().await;
let index = generate_default_user_provided_documents(&server).await;
// First case, we forget to specify the `regenerate`
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [0, 0, 0] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 2,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Second case, we don't specify anything
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": {}}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 3,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Third case, we specify something wrong in place of regenerate
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": "yes please" }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 4,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.regenerate`: expected a boolean, but found a string: `\"yes please\"`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Fourth case, `embeddings` is a boolean instead of null or an array
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 5,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings`: expected null or an array, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Fifth case, `embeddings` is an array whose first element is a boolean
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 6,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Sixth case, `embeddings` is an array of arrays whose innermost element is a boolean
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 7,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Valid document: both `embeddings` and `regenerate` are provided, the task must succeed
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [23, 0.1, -12], "regenerate": true }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task["status"], @r###""succeeded""###);
// Valid document: only `regenerate` is provided (no `embeddings`), the task must succeed
let documents =
json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task["status"], @r###""succeeded""###);
// `embeddings` mixes a number followed by an array: the array is rejected
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [0.1, [0.2, 0.3]] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 10,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected a number, but found an array: `[0.2,0.3]`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// `embeddings` mixes an array followed by a number: the number is rejected
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, 0.2], 0.3] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 11,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[1]`: expected an array, but found a number: `0.3`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
// Two problems at once (a boolean inside the first array, then a bare number):
// the snapshot pins that the error reported is the one at `[0][1]`
let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": false, "embeddings": [[0.1, true], 0.3] }}});
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, @r###"
{
"uid": 12,
"indexUid": "doggo",
"status": "failed",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 0
},
"error": {
"message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][1]`: expected a number, but found a boolean: `true`",
"code": "invalid_vectors_type",
"type": "invalid_request",
"link": "https://docs.meilisearch.com/errors#invalid_vectors_type"
},
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}
"###);
}
#[actix_rt::test]
async fn clear_documents() {
let server = Server::new().await;
@@ -213,11 +492,11 @@ async fn clear_documents() {
// Make sure the arroy DB has been cleared
let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await;
snapshot!(json_string!(documents), @r###"
snapshot!(documents, @r###"
{
"hits": [],
"query": "",
"processingTimeMs": 0,
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 0,
@@ -225,3 +504,85 @@ async fn clear_documents() {
}
"###);
}
// Regression test for the issue linked below: a document is added with a manually provided
// vector, then sent again with `_vectors.manual` set to `null`, and the state of the index
// is snapshotted through a vector search and a document listing.
#[actix_rt::test]
async fn add_remove_one_vector_4588() {
// https://github.com/meilisearch/meilisearch/issues/4588
let server = Server::new().await;
let index = server.index("doggo");
// Enable the `vectorStore` feature before configuring an embedder.
let (value, code) = server.set_features(json!({"vectorStore": true})).await;
snapshot!(code, @"200 OK");
snapshot!(value, @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false
}
"###);
// Declare a `userProvided` embedder named `manual` with 3 dimensions.
let (response, code) = index
.update_settings(json!({
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3,
}
},
}))
.await;
snapshot!(code, @"202 Accepted");
let task = server.wait_task(response.uid()).await;
snapshot!(task, name: "settings-processed");
// Add one document carrying a vector for the `manual` embedder.
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, name: "document-added");
// Send the same document again, this time with `null` in place of the vector.
let documents = json!([
{"id": 0, "name": "kefir", "_vectors": { "manual": null }},
]);
let (value, code) = index.add_documents(documents, None).await;
snapshot!(code, @"202 Accepted");
let task = index.wait_task(value.uid()).await;
snapshot!(task, name: "document-deleted");
// Run a pure vector search; the snapshot pins the hits and counts returned at this point.
let (documents, _code) = index.search_post(json!({"vector": [1, 1, 1] })).await;
snapshot!(documents, @r###"
{
"hits": [
{
"id": 0,
"name": "kefir"
}
],
"query": "",
"processingTimeMs": "[duration]",
"limit": 20,
"offset": 0,
"estimatedTotalHits": 1,
"semanticHitCount": 1
}
"###);
// List the documents with their vectors; the snapshot expects an empty `_vectors` object.
let (documents, _code) = index
.get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() })
.await;
snapshot!(json_string!(documents), @r###"
{
"results": [
{
"id": 0,
"name": "kefir",
"_vectors": {}
}
],
"offset": 0,
"limit": 20,
"total": 1
}
"###);
}

View File

@@ -0,0 +1,19 @@
---
source: meilisearch/tests/vector/mod.rs
---
{
"uid": 1,
"indexUid": "doggo",
"status": "succeeded",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 1
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@@ -0,0 +1,19 @@
---
source: meilisearch/tests/vector/mod.rs
---
{
"uid": 2,
"indexUid": "doggo",
"status": "succeeded",
"type": "documentAdditionOrUpdate",
"canceledBy": null,
"details": {
"receivedDocuments": 1,
"indexedDocuments": 1
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@@ -0,0 +1,23 @@
---
source: meilisearch/tests/vector/mod.rs
---
{
"uid": 0,
"indexUid": "doggo",
"status": "succeeded",
"type": "settingsUpdate",
"canceledBy": null,
"details": {
"embedders": {
"manual": {
"source": "userProvided",
"dimensions": 3
}
}
},
"error": null,
"duration": "[duration]",
"enqueuedAt": "[date]",
"startedAt": "[date]",
"finishedAt": "[date]"
}

View File

@@ -79,7 +79,7 @@ hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls",
] }
tiktoken-rs = "0.5.8"
liquid = "0.26.4"
arroy = "0.3.1"
arroy = "0.4.0"
rand = "0.8.5"
tracing = "0.1.40"
ureq = { version = "2.9.7", features = ["json"] }

View File

@@ -119,6 +119,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco
InvalidVectorDimensions { expected: usize, found: usize },
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
InvalidVectorsMapType { document_id: String, value: Value },
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError },
#[error("{0}")]
InvalidFilter(String),
#[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))]
@@ -281,8 +283,9 @@ impl From<arroy::Error> for Error {
arroy::Error::DatabaseFull
| arroy::Error::InvalidItemAppend
| arroy::Error::UnmatchingDistance { .. }
| arroy::Error::MissingNode
| arroy::Error::MissingMetadata => {
| arroy::Error::NeedBuild(_)
| arroy::Error::MissingKey { .. }
| arroy::Error::MissingMetadata(_) => {
Error::InternalError(InternalError::ArroyError(value))
}
}

View File

@@ -1610,7 +1610,7 @@ impl Index {
arroy::Reader::open(rtxn, k, self.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata => Ok(None),
arroy::Error::MissingMetadata(_) => Ok(None),
e => Err(e.into()),
})
.transpose()
@@ -1643,7 +1643,7 @@ impl Index {
let reader = arroy::Reader::open(rtxn, embedder_id | (i as u16), self.vector_arroy)
.map(Some)
.or_else(|e| match e {
arroy::Error::MissingMetadata => Ok(None),
arroy::Error::MissingMetadata(_) => Ok(None),
e => Err(e),
})
.transpose();

View File

@@ -6,9 +6,11 @@ use heed::Result;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::DocumentId;
use crate::{CboRoaringBitmapCodec, DocumentId};
/// Call the given closure on the facet distribution of the candidate documents.
///
@@ -31,12 +33,9 @@ pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
)?;
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
@@ -75,11 +74,8 @@ where
// Represents the list of keys that we must explore.
let mut heap = BinaryHeap::new();
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
)?;
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
// We first fill the heap with values from the highest level
@@ -92,7 +88,10 @@ where
if key.field_id != field_id {
break;
}
let intersection = value.bitmap & candidates;
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
let count = intersection.len();
if count != 0 {
heap.push(LevelEntry {
@@ -121,7 +120,10 @@ where
if key.field_id != field_id {
break;
}
let intersection = value.bitmap & candidates;
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
let count = intersection.len();
if count != 0 {
heap.push(LevelEntry {
@@ -146,7 +148,7 @@ where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
field_id: u16,
callback: CB,
}
@@ -171,7 +173,10 @@ where
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = value.bitmap & candidates;
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
if !docids_in_common.is_empty() {
let any_docid_in_common = docids_in_common.min().unwrap();
match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
@@ -205,7 +210,10 @@ where
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = value.bitmap & candidates;
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
if !docids_in_common.is_empty() {
let cf = self.iterate(
&docids_in_common,

View File

@@ -178,16 +178,16 @@ impl<'a> Search<'a> {
// completely skip semantic search if the results of the keyword search are good enough
if self.results_good_enough(&keyword_results, semantic_ratio) {
return Ok((keyword_results, Some(0)));
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
}
// no vector search against placeholder search
let Some(query) = search.query.take() else {
return Ok((keyword_results, Some(0)));
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
};
// no embedder, no semantic search
let Some(SemanticSearch { vector, embedder_name, embedder }) = semantic else {
return Ok((keyword_results, Some(0)));
return Ok(return_keyword_results(self.limit, self.offset, keyword_results));
};
let vector_query = match vector {
@@ -239,3 +239,44 @@ impl<'a> Search<'a> {
true
}
}
/// Applies `offset` and `limit` to keyword search results that are returned as-is.
///
/// The first `offset` documents (and their scores) are dropped, then at most `limit` of the
/// remaining ones are kept, in their original order. If `offset` is past the end of either
/// list, both lists come back empty. All other fields of the `SearchResult` are passed
/// through unchanged.
///
/// The second element of the returned tuple is always `Some(0)`, the value the early-return
/// call sites pair with keyword-only results (presumably the semantic hit count; confirm
/// against the caller).
fn return_keyword_results(
    limit: usize,
    offset: usize,
    SearchResult {
        matching_words,
        candidates,
        mut documents_ids,
        mut document_scores,
        degraded,
        used_negative_operator,
    }: SearchResult,
) -> (SearchResult, Option<u32>) {
    // `documents_ids.len() == document_scores.len()` is expected, so checking both lengths is
    // technically redundant; this is defensive programming.
    if offset >= documents_ids.len() || offset >= document_scores.len() {
        documents_ids.clear();
        document_scores.clear();
    } else {
        // Remove the skipped prefix instead of rotating it to the back: after a rotation,
        // `truncate(limit)` would keep the skipped entries whenever `limit` is larger than
        // the number of entries left after `offset`.
        // PANICS: `drain(..offset)` cannot panic because offset < len.
        documents_ids.drain(..offset);
        documents_ids.truncate(limit);

        // PANICS: `drain(..offset)` cannot panic because offset < len.
        document_scores.drain(..offset);
        document_scores.truncate(limit);
    }

    (
        SearchResult {
            matching_words,
            candidates,
            documents_ids,
            document_scores,
            degraded,
            used_negative_operator,
        },
        Some(0),
    )
}

View File

@@ -290,7 +290,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
regenerate_if_prompt_changed(
obkv,
(old_prompt, prompt),
(&old_fields_ids_map, &new_fields_ids_map),
(old_fields_ids_map, new_fields_ids_map),
)?
} else {
// we can simply ignore user provided vectors as they are not regenerated and are
@@ -306,7 +306,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
prompt,
(add_to_user_provided, remove_from_user_provided),
(old, new),
(&old_fields_ids_map, &new_fields_ids_map),
(old_fields_ids_map, new_fields_ids_map),
document_id,
)?,
};

View File

@@ -11,7 +11,7 @@ mod extract_word_position_docids;
use std::fs::File;
use std::io::BufReader;
use std::sync::Arc;
use std::sync::{Arc, OnceLock};
use crossbeam_channel::Sender;
use rayon::prelude::*;
@@ -32,7 +32,7 @@ use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters};
use super::{helpers, TypedChunk};
use crate::index::IndexEmbeddingConfig;
use crate::update::settings::InnerIndexSettingsDiff;
use crate::{FieldId, Result, ThreadPoolNoAbortBuilder};
use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder};
/// Extract data for each databases from obkv documents in parallel.
/// Send data in grenad file over provided Sender.
@@ -207,6 +207,18 @@ fn run_extraction_task<FE, FS, M>(
})
}
fn request_threads() -> &'static ThreadPoolNoAbort {
static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();
REQUEST_THREADS.get_or_init(|| {
ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()
.unwrap()
})
}
/// Extract chunked data and send it into lmdb_writer_sx sender:
/// - documents
fn send_original_documents_data(
@@ -219,11 +231,6 @@ fn send_original_documents_data(
let original_documents_chunk =
original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
let request_threads = ThreadPoolNoAbortBuilder::new()
.num_threads(crate::vector::REQUEST_PARALLELISM)
.thread_name(|index| format!("embedding-request-{index}"))
.build()?;
let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only())
// no point in indexing vectors without embedders
&& (!settings_diff.new.embedding_configs.inner_as_ref().is_empty());
@@ -256,7 +263,7 @@ fn send_original_documents_data(
prompts,
indexer,
embedder.clone(),
&request_threads,
request_threads(),
) {
Ok(results) => Some(results),
Err(error) => {

View File

@@ -547,10 +547,11 @@ where
pool.install(|| {
for k in crate::vector::arroy_db_range_for_embedder(embedder_index) {
let writer = arroy::Writer::new(vector_arroy, k, dimension);
if writer.is_empty(wtxn)? {
if writer.need_build(wtxn)? {
writer.build(wtxn, &mut rng, None)?;
} else if writer.is_empty(wtxn)? {
break;
}
writer.build(wtxn, &mut rng, None)?;
}
Result::Ok(())
})

View File

@@ -1,5 +1,6 @@
use std::collections::{BTreeMap, BTreeSet};
use deserr::{take_cf_content, DeserializeError, Deserr, Sequence};
use obkv::KvReader;
use serde_json::{from_slice, Value};
@@ -10,13 +11,44 @@ use crate::{DocumentId, FieldId, InternalError, UserError};
pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";
/// Value stored in a `ParsedVectors` map, in either of the two accepted forms.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[derive(serde::Serialize, Debug)]
#[serde(untagged)]
pub enum Vectors {
/// Bare form: the value is directly a `VectorOrArrayOfVectors`.
ImplicitlyUserProvided(VectorOrArrayOfVectors),
/// Object form: the value is an `ExplicitVectors` payload.
Explicit(ExplicitVectors),
}
impl<E: DeserializeError> Deserr<E> for Vectors {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: deserr::ValuePointerRef,
) -> Result<Self, E> {
match value {
deserr::Value::Sequence(_) | deserr::Value::Null => {
Ok(Vectors::ImplicitlyUserProvided(VectorOrArrayOfVectors::deserialize_from_value(
value, location,
)?))
}
deserr::Value::Map(_) => {
Ok(Vectors::Explicit(ExplicitVectors::deserialize_from_value(value, location)?))
}
value => Err(take_cf_content(E::error(
None,
deserr::ErrorKind::IncorrectValueKind {
actual: value,
accepted: &[
deserr::ValueKind::Sequence,
deserr::ValueKind::Map,
deserr::ValueKind::Null,
],
},
location,
))),
}
}
}
impl Vectors {
pub fn must_regenerate(&self) -> bool {
match self {
@@ -37,9 +69,11 @@ impl Vectors {
}
}
/// Object form of `Vectors`: optional embeddings together with a `regenerate` flag.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[derive(serde::Serialize, Deserr, Debug)]
#[serde(rename_all = "camelCase")]
pub struct ExplicitVectors {
/// Optional; falls back to its default (`None`) when the field is absent.
#[serde(default)]
#[deserr(default)]
pub embeddings: Option<VectorOrArrayOfVectors>,
/// No default is declared for this field.
// NOTE(review): presumably this makes the field mandatory in the input — confirm against the derive rules.
pub regenerate: bool,
}
@@ -149,13 +183,20 @@ impl ParsedVectorsDiff {
/// Map from a `String` key to the `Vectors` stored under that key.
// NOTE(review): keys are presumably embedder names (see `retain_not_embedded_vectors`) — confirm.
pub struct ParsedVectors(pub BTreeMap<String, Vectors>);
impl<E: DeserializeError> Deserr<E> for ParsedVectors {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: deserr::ValuePointerRef,
) -> Result<Self, E> {
let value = <BTreeMap<String, Vectors>>::deserialize_from_value(value, location)?;
Ok(ParsedVectors(value))
}
}
impl ParsedVectors {
/// Parses JSON-encoded bytes into a `ParsedVectors`.
///
/// # Errors
///
/// Returns an `Error` when the bytes cannot be parsed as JSON or when the parsed
/// value is rejected as a vectors map.
pub fn from_bytes(value: &[u8]) -> Result<Self, Error> {
let Ok(value) = from_slice(value) else {
let value = from_slice(value).map_err(Error::InternalSerdeJson)?;
return Err(Error::InvalidMap(value));
};
Ok(ParsedVectors(value))
let value: serde_json::Value = from_slice(value).map_err(Error::InternalSerdeJson)?;
deserr::deserialize(value).map_err(|error| Error::InvalidEmbedderConf { error })
}
pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet<String>) {
@@ -165,6 +206,7 @@ impl ParsedVectors {
/// Failures that can be reported while parsing vectors.
pub enum Error {
/// Carries the offending JSON value.
InvalidMap(Value),
/// Carries the `deserr` JSON error raised while deserializing the value.
InvalidEmbedderConf { error: deserr::errors::JsonError },
/// Carries the underlying `serde_json` error.
InternalSerdeJson(serde_json::Error),
}
@@ -174,6 +216,12 @@ impl Error {
Error::InvalidMap(value) => {
crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value })
}
Error::InvalidEmbedderConf { error } => {
crate::Error::UserError(UserError::InvalidVectorsEmbedderConf {
document_id,
error,
})
}
Error::InternalSerdeJson(error) => {
crate::Error::InternalError(InternalError::SerdeJson(error))
}
@@ -194,13 +242,84 @@ fn to_vector_map(
}
/// Represents either a vector or an array of multiple vectors.
#[derive(serde::Serialize, serde::Deserialize, Debug)]
#[derive(serde::Serialize, Debug)]
#[serde(transparent)]
pub struct VectorOrArrayOfVectors {
// `None` when no value is present; otherwise `Left` holds an array of vectors
// and `Right` holds a single vector.
#[serde(with = "either::serde_untagged_optional")]
inner: Option<either::Either<Vec<Embedding>, Embedding>>,
}
impl<E: DeserializeError> Deserr<E> for VectorOrArrayOfVectors {
fn deserialize_from_value<V: deserr::IntoValue>(
value: deserr::Value<V>,
location: deserr::ValuePointerRef,
) -> Result<Self, E> {
match value {
deserr::Value::Null => Ok(VectorOrArrayOfVectors { inner: None }),
deserr::Value::Sequence(seq) => {
let mut iter = seq.into_iter();
match iter.next().map(|v| v.into_value()) {
None => {
// With the strange way serde serialize the `Either`, we must send the left part
// otherwise it'll consider we returned [[]]
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(Vec::new())) })
}
Some(val @ deserr::Value::Sequence(_)) => {
let first = Embedding::deserialize_from_value(val, location.push_index(0))?;
let mut collect = vec![first];
let mut tail = iter
.enumerate()
.map(|(i, v)| {
Embedding::deserialize_from_value(
v.into_value(),
location.push_index(i + 1),
)
})
.collect::<Result<Vec<_>, _>>()?;
collect.append(&mut tail);
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(collect)) })
}
Some(
val @ deserr::Value::Integer(_)
| val @ deserr::Value::NegativeInteger(_)
| val @ deserr::Value::Float(_),
) => {
let first = <f32>::deserialize_from_value(val, location.push_index(0))?;
let mut embedding = iter
.enumerate()
.map(|(i, v)| {
<f32>::deserialize_from_value(
v.into_value(),
location.push_index(i + 1),
)
})
.collect::<Result<Vec<_>, _>>()?;
embedding.insert(0, first);
Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Right(embedding)) })
}
Some(value) => Err(take_cf_content(E::error(
None,
deserr::ErrorKind::IncorrectValueKind {
actual: value,
accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Float],
},
location.push_index(0),
))),
}
}
value => Err(take_cf_content(E::error(
None,
deserr::ErrorKind::IncorrectValueKind {
actual: value,
accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Null],
},
location,
))),
}
}
}
impl VectorOrArrayOfVectors {
pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
match self.inner? {
@@ -234,15 +353,19 @@ impl From<Vec<Embedding>> for VectorOrArrayOfVectors {
mod test {
use super::VectorOrArrayOfVectors;
/// Test helper: parses `s` as JSON, then deserializes the resulting value into a
/// `VectorOrArrayOfVectors`.
///
/// Panics if `s` is not valid JSON; deserialization errors are returned to the caller.
fn embedding_from_str(s: &str) -> Result<VectorOrArrayOfVectors, deserr::errors::JsonError> {
    let parsed = serde_json::from_str::<serde_json::Value>(s).unwrap();
    deserr::deserialize(parsed)
}
#[test]
fn array_of_vectors() {
let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap();
let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap();
let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap();
let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap();
let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap();
let two_vecs: VectorOrArrayOfVectors =
serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap();
let null = embedding_from_str("null").unwrap();
let empty = embedding_from_str("[]").unwrap();
let one = embedding_from_str("[0.1]").unwrap();
let two = embedding_from_str("[0.1, 0.2]").unwrap();
let one_vec = embedding_from_str("[[0.1, 0.2]]").unwrap();
let two_vecs = embedding_from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap();
insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null");
insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]");