4633: Allow to mark vectors as "userProvided" r=Kerollmops a=dureuill

# Pull Request

## Related issue
Fixes #4606 

## What does this PR do?

[See usage in PRD](https://meilisearch.notion.site/v1-9-AI-search-changes-e90d6803eca8417aa70a1ac5d0225697#deb96fb0595947bda7d4a371100326eb)

- Extends the shape of the special `_vectors` field in documents.
    - previously, the `_vectors` field had to be an object, with each field the name of a configured embedder, and each value either `null`, an embedding (array of numbers), or an array of embeddings.
    - In this PR, the value of an embedder in the `_vectors` field can additionally be an object. The object has two fields:
      1. `embeddings`: `null`, an embedding (array of numbers), or an array of embeddings.
      2. `userProvided`: a boolean indicating if the vector was provided by the user.
    - The previous form `embedder_or_array_of_embedders` is semantically equivalent to:
    ```json
    {
        "embeddings": embedder_or_array_of_embedders,
        "userProvided": true
    }
    ```
- During the indexing step, the subfields and values of the `_vectors` field that have `userProvided` set to **false** are added in the vector DB, but not in the documents DB: that means that future modifications of the documents will trigger a regeneration of that particular vector using the document template.
- This allows **importing** embeddings as a one-shot process, while still retaining the ability to regenerate embeddings on document change.
- The dump process now uses this ability: it enriches the `_vectors` fields of documents with the embeddings that were autogenerated, marking them as not `userProvided`. This allows importing the vectors from a dump without regenerating them.

### Tests

This PR adds the following tests

- Long-needed hybrid search tests of a simple hf embedder
- Dump test that imports vectors. Due to the difficulty of actually importing a dump in tests, we just read the dump and check it contains the expected content.
- Tests in the index-scheduler: this tests that documents containing the same kind of instructions as in the dump indexes as expected


Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
meili-bors[bot]
2024-05-23 08:17:54 +00:00
committed by GitHub
32 changed files with 4581 additions and 297 deletions

View File

@ -5,7 +5,10 @@ use crate::common::index::Index;
use crate::common::{Server, Value};
use crate::json;
async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
async fn index_with_documents_user_provided<'a>(
server: &'a Server,
documents: &Value,
) -> Index<'a> {
let index = server.index("test");
let (response, code) = server.set_features(json!({"vectorStore": true})).await;
@ -34,7 +37,39 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Inde
index
}
static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
async fn index_with_documents_hf<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
let index = server.index("test");
let (response, code) = server.set_features(json!({"vectorStore": true})).await;
meili_snap::snapshot!(code, @"200 OK");
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
{
"vectorStore": true,
"metrics": false,
"logsRoute": false,
"exportPuffinReports": false
}
"###);
let (response, code) = index
.update_settings(json!({ "embedders": {"default": {
"source": "huggingFace",
"model": "sentence-transformers/all-MiniLM-L6-v2",
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
"documentTemplate": "{{doc.title}}, {{doc.desc}}"
}}} ))
.await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await;
let (response, code) = index.add_documents(documents.clone(), None).await;
assert_eq!(202, code, "{:?}", response);
index.wait_task(response.uid()).await;
index
}
static SIMPLE_SEARCH_DOCUMENTS_VEC: Lazy<Value> = Lazy::new(|| {
json!([
{
"title": "Shazam!",
@ -56,7 +91,7 @@ static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
}])
});
static SINGLE_DOCUMENT: Lazy<Value> = Lazy::new(|| {
static SINGLE_DOCUMENT_VEC: Lazy<Value> = Lazy::new(|| {
json!([{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
@ -65,10 +100,29 @@ static SINGLE_DOCUMENT: Lazy<Value> = Lazy::new(|| {
}])
});
static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
json!([
{
"title": "Shazam!",
"desc": "a Captain Marvel ersatz",
"id": "1",
},
{
"title": "Captain Planet",
"desc": "He's not part of the Marvel Cinematic Universe",
"id": "2",
},
{
"title": "Captain Marvel",
"desc": "a Shazam ersatz",
"id": "3",
}])
});
#[actix_rt::test]
async fn simple_search() {
let server = Server::new().await;
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let (response, code) = index
.search_post(
@ -98,10 +152,59 @@ async fn simple_search() {
snapshot!(response["semanticHitCount"], @"3");
}
#[actix_rt::test]
async fn simple_search_hf() {
let server = Server::new().await;
let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let (response, code) =
index.search_post(json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}})).await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"}]"###);
snapshot!(response["semanticHitCount"], @"0");
let (response, code) = index
.search_post(
// disable ranking score as the vectors between architectures are not equal
json!({"q": "Captain", "hybrid": {"semanticRatio": 0.55}, "showRankingScore": false}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"}]"###);
snapshot!(response["semanticHitCount"], @"1");
let (response, code) = index
.search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 0.8}, "showRankingScore": false}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"}]"###);
snapshot!(response["semanticHitCount"], @"3");
let (response, code) = index
.search_post(
json!({"q": "Movie World", "hybrid": {"semanticRatio": 0.2}, "showRankingScore": false}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}]"###);
snapshot!(response["semanticHitCount"], @"3");
let (response, code) = index
.search_post(
json!({"q": "Wonder replacement", "hybrid": {"semanticRatio": 0.2}, "showRankingScore": false}),
)
.await;
snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"}]"###);
snapshot!(response["semanticHitCount"], @"3");
}
#[actix_rt::test]
async fn distribution_shift() {
let server = Server::new().await;
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}});
let (response, code) = index.search_post(search.clone()).await;
@ -133,7 +236,7 @@ async fn distribution_shift() {
#[actix_rt::test]
async fn highlighter() {
let server = Server::new().await;
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
@ -184,7 +287,7 @@ async fn highlighter() {
#[actix_rt::test]
async fn invalid_semantic_ratio() {
let server = Server::new().await;
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let (response, code) = index
.search_post(
@ -256,7 +359,7 @@ async fn invalid_semantic_ratio() {
#[actix_rt::test]
async fn single_document() {
let server = Server::new().await;
let index = index_with_documents(&server, &SINGLE_DOCUMENT).await;
let index = index_with_documents_user_provided(&server, &SINGLE_DOCUMENT_VEC).await;
let (response, code) = index
.search_post(
@ -272,7 +375,7 @@ async fn single_document() {
#[actix_rt::test]
async fn query_combination() {
let server = Server::new().await;
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
// search without query and vector, but with hybrid => still placeholder
let (response, code) = index

View File

@ -895,9 +895,9 @@ async fn test_score_details() {
"id": "166428",
"_vectors": {
"manual": [
-100,
231,
32
-100.0,
231.0,
32.0
]
},
"_rankingScoreDetails": {
@ -1096,9 +1096,9 @@ async fn experimental_feature_vector_store() {
"id": "287947",
"_vectors": {
"manual": [
1,
2,
3
1.0,
2.0,
3.0
]
},
"_rankingScore": 1.0
@ -1108,9 +1108,9 @@ async fn experimental_feature_vector_store() {
"id": "299537",
"_vectors": {
"manual": [
1,
2,
54
1.0,
2.0,
54.0
]
},
"_rankingScore": 0.9129111766815186
@ -1120,9 +1120,9 @@ async fn experimental_feature_vector_store() {
"id": "450465",
"_vectors": {
"manual": [
-100,
340,
90
-100.0,
340.0,
90.0
]
},
"_rankingScore": 0.8106412887573242
@ -1132,9 +1132,9 @@ async fn experimental_feature_vector_store() {
"id": "166428",
"_vectors": {
"manual": [
-100,
231,
32
-100.0,
231.0,
32.0
]
},
"_rankingScore": 0.7412010431289673
@ -1144,9 +1144,9 @@ async fn experimental_feature_vector_store() {
"id": "522681",
"_vectors": {
"manual": [
10,
-23,
32
10.0,
-23.0,
32.0
]
},
"_rankingScore": 0.6972063183784485
@ -1405,9 +1405,9 @@ async fn simple_search_with_strange_synonyms() {
"id": "166428",
"_vectors": {
"manual": [
-100,
231,
32
-100.0,
231.0,
32.0
]
}
}
@ -1426,9 +1426,9 @@ async fn simple_search_with_strange_synonyms() {
"id": "166428",
"_vectors": {
"manual": [
-100,
231,
32
-100.0,
231.0,
32.0
]
}
}
@ -1447,9 +1447,9 @@ async fn simple_search_with_strange_synonyms() {
"id": "166428",
"_vectors": {
"manual": [
-100,
231,
32
-100.0,
231.0,
32.0
]
}
}

View File

@ -75,9 +75,9 @@ async fn simple_search_single_index() {
"id": "450465",
"_vectors": {
"manual": [
-100,
340,
90
-100.0,
340.0,
90.0
]
}
}
@ -96,9 +96,9 @@ async fn simple_search_single_index() {
"id": "299537",
"_vectors": {
"manual": [
1,
2,
54
1.0,
2.0,
54.0
]
}
}
@ -194,9 +194,9 @@ async fn simple_search_two_indexes() {
"id": "450465",
"_vectors": {
"manual": [
-100,
340,
90
-100.0,
340.0,
90.0
]
}
}
@ -227,9 +227,9 @@ async fn simple_search_two_indexes() {
"cattos": "pésti",
"_vectors": {
"manual": [
1,
2,
3
1.0,
2.0,
3.0
]
}
},
@ -249,9 +249,9 @@ async fn simple_search_two_indexes() {
],
"_vectors": {
"manual": [
1,
2,
54
1.0,
2.0,
54.0
]
}
}