mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-07-26 16:21:07 +00:00
Merge #4633
4633: Allow to mark vectors as "userProvided" r=Kerollmops a=dureuill # Pull Request ## Related issue Fixes #4606 ## What does this PR do? [See usage in PRD](https://meilisearch.notion.site/v1-9-AI-search-changes-e90d6803eca8417aa70a1ac5d0225697#deb96fb0595947bda7d4a371100326eb) - Extends the shape of the special `_vectors` field in documents. - previously, the `_vectors` field had to be an object, with each field the name of a configured embedder, and each value either `null`, an embedding (array of numbers), or an array of embeddings. - In this PR, the value of an embedder in the `_vectors` field can additionally be an object. The object has two fields: 1. `embeddings`: `null`, an embedding (array of numbers), or an array of embeddings. 2. `userProvided`: a boolean indicating if the vector was provided by the user. - The previous form `embedder_or_array_of_embedders` is semantically equivalent to: ```json { "embeddings": embedder_or_array_of_embedders, "userProvided": true } ``` - During the indexing step, the subfields and values of the `_vectors` field that have `userProvided` set to **false** are added in the vector DB, but not in the documents DB: that means that future modifications of the documents will trigger a regeneration of that particular vector using the document template. - This allows **importing** embeddings as a one-shot process, while still retaining the ability to regenerate embeddings on document change. - The dump process now uses this ability: it enriches the `_vectors` fields of documents with the embeddings that were autogenerated, marking them as not `userProvided`. This allows importing the vectors from a dump without regenerating them. ### Tests This PR adds the following tests - Long-needed hybrid search tests of a simple hf embedder - Dump test that imports vectors. Due to the difficulty of actually importing a dump in tests, we just read the dump and check it contains the expected content. - Tests in the index-scheduler: this tests that documents containing the same kind of instructions as in the dump indexes as expected Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
@ -5,7 +5,10 @@ use crate::common::index::Index;
|
||||
use crate::common::{Server, Value};
|
||||
use crate::json;
|
||||
|
||||
async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
|
||||
async fn index_with_documents_user_provided<'a>(
|
||||
server: &'a Server,
|
||||
documents: &Value,
|
||||
) -> Index<'a> {
|
||||
let index = server.index("test");
|
||||
|
||||
let (response, code) = server.set_features(json!({"vectorStore": true})).await;
|
||||
@ -34,7 +37,39 @@ async fn index_with_documents<'a>(server: &'a Server, documents: &Value) -> Inde
|
||||
index
|
||||
}
|
||||
|
||||
static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
async fn index_with_documents_hf<'a>(server: &'a Server, documents: &Value) -> Index<'a> {
|
||||
let index = server.index("test");
|
||||
|
||||
let (response, code) = server.set_features(json!({"vectorStore": true})).await;
|
||||
|
||||
meili_snap::snapshot!(code, @"200 OK");
|
||||
meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
|
||||
{
|
||||
"vectorStore": true,
|
||||
"metrics": false,
|
||||
"logsRoute": false,
|
||||
"exportPuffinReports": false
|
||||
}
|
||||
"###);
|
||||
|
||||
let (response, code) = index
|
||||
.update_settings(json!({ "embedders": {"default": {
|
||||
"source": "huggingFace",
|
||||
"model": "sentence-transformers/all-MiniLM-L6-v2",
|
||||
"revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
|
||||
"documentTemplate": "{{doc.title}}, {{doc.desc}}"
|
||||
}}} ))
|
||||
.await;
|
||||
assert_eq!(202, code, "{:?}", response);
|
||||
index.wait_task(response.uid()).await;
|
||||
|
||||
let (response, code) = index.add_documents(documents.clone(), None).await;
|
||||
assert_eq!(202, code, "{:?}", response);
|
||||
index.wait_task(response.uid()).await;
|
||||
index
|
||||
}
|
||||
|
||||
static SIMPLE_SEARCH_DOCUMENTS_VEC: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{
|
||||
"title": "Shazam!",
|
||||
@ -56,7 +91,7 @@ static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
}])
|
||||
});
|
||||
|
||||
static SINGLE_DOCUMENT: Lazy<Value> = Lazy::new(|| {
|
||||
static SINGLE_DOCUMENT_VEC: Lazy<Value> = Lazy::new(|| {
|
||||
json!([{
|
||||
"title": "Shazam!",
|
||||
"desc": "a Captain Marvel ersatz",
|
||||
@ -65,10 +100,29 @@ static SINGLE_DOCUMENT: Lazy<Value> = Lazy::new(|| {
|
||||
}])
|
||||
});
|
||||
|
||||
static SIMPLE_SEARCH_DOCUMENTS: Lazy<Value> = Lazy::new(|| {
|
||||
json!([
|
||||
{
|
||||
"title": "Shazam!",
|
||||
"desc": "a Captain Marvel ersatz",
|
||||
"id": "1",
|
||||
},
|
||||
{
|
||||
"title": "Captain Planet",
|
||||
"desc": "He's not part of the Marvel Cinematic Universe",
|
||||
"id": "2",
|
||||
},
|
||||
{
|
||||
"title": "Captain Marvel",
|
||||
"desc": "a Shazam ersatz",
|
||||
"id": "3",
|
||||
}])
|
||||
});
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_search() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
@ -98,10 +152,59 @@ async fn simple_search() {
|
||||
snapshot!(response["semanticHitCount"], @"3");
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn simple_search_hf() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
|
||||
let (response, code) =
|
||||
index.search_post(json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}})).await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"}]"###);
|
||||
snapshot!(response["semanticHitCount"], @"0");
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
// disable ranking score as the vectors between architectures are not equal
|
||||
json!({"q": "Captain", "hybrid": {"semanticRatio": 0.55}, "showRankingScore": false}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"}]"###);
|
||||
snapshot!(response["semanticHitCount"], @"1");
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"q": "Captain", "hybrid": {"semanticRatio": 0.8}, "showRankingScore": false}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"}]"###);
|
||||
snapshot!(response["semanticHitCount"], @"3");
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"q": "Movie World", "hybrid": {"semanticRatio": 0.2}, "showRankingScore": false}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}]"###);
|
||||
snapshot!(response["semanticHitCount"], @"3");
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
json!({"q": "Wonder replacement", "hybrid": {"semanticRatio": 0.2}, "showRankingScore": false}),
|
||||
)
|
||||
.await;
|
||||
snapshot!(code, @"200 OK");
|
||||
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"}]"###);
|
||||
snapshot!(response["semanticHitCount"], @"3");
|
||||
}
|
||||
|
||||
#[actix_rt::test]
|
||||
async fn distribution_shift() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
|
||||
|
||||
let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}});
|
||||
let (response, code) = index.search_post(search.clone()).await;
|
||||
@ -133,7 +236,7 @@ async fn distribution_shift() {
|
||||
#[actix_rt::test]
|
||||
async fn highlighter() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
|
||||
@ -184,7 +287,7 @@ async fn highlighter() {
|
||||
#[actix_rt::test]
|
||||
async fn invalid_semantic_ratio() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
@ -256,7 +359,7 @@ async fn invalid_semantic_ratio() {
|
||||
#[actix_rt::test]
|
||||
async fn single_document() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SINGLE_DOCUMENT).await;
|
||||
let index = index_with_documents_user_provided(&server, &SINGLE_DOCUMENT_VEC).await;
|
||||
|
||||
let (response, code) = index
|
||||
.search_post(
|
||||
@ -272,7 +375,7 @@ async fn single_document() {
|
||||
#[actix_rt::test]
|
||||
async fn query_combination() {
|
||||
let server = Server::new().await;
|
||||
let index = index_with_documents(&server, &SIMPLE_SEARCH_DOCUMENTS).await;
|
||||
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
|
||||
|
||||
// search without query and vector, but with hybrid => still placeholder
|
||||
let (response, code) = index
|
||||
|
@ -895,9 +895,9 @@ async fn test_score_details() {
|
||||
"id": "166428",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
231,
|
||||
32
|
||||
-100.0,
|
||||
231.0,
|
||||
32.0
|
||||
]
|
||||
},
|
||||
"_rankingScoreDetails": {
|
||||
@ -1096,9 +1096,9 @@ async fn experimental_feature_vector_store() {
|
||||
"id": "287947",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1,
|
||||
2,
|
||||
3
|
||||
1.0,
|
||||
2.0,
|
||||
3.0
|
||||
]
|
||||
},
|
||||
"_rankingScore": 1.0
|
||||
@ -1108,9 +1108,9 @@ async fn experimental_feature_vector_store() {
|
||||
"id": "299537",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1,
|
||||
2,
|
||||
54
|
||||
1.0,
|
||||
2.0,
|
||||
54.0
|
||||
]
|
||||
},
|
||||
"_rankingScore": 0.9129111766815186
|
||||
@ -1120,9 +1120,9 @@ async fn experimental_feature_vector_store() {
|
||||
"id": "450465",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
340,
|
||||
90
|
||||
-100.0,
|
||||
340.0,
|
||||
90.0
|
||||
]
|
||||
},
|
||||
"_rankingScore": 0.8106412887573242
|
||||
@ -1132,9 +1132,9 @@ async fn experimental_feature_vector_store() {
|
||||
"id": "166428",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
231,
|
||||
32
|
||||
-100.0,
|
||||
231.0,
|
||||
32.0
|
||||
]
|
||||
},
|
||||
"_rankingScore": 0.7412010431289673
|
||||
@ -1144,9 +1144,9 @@ async fn experimental_feature_vector_store() {
|
||||
"id": "522681",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
10,
|
||||
-23,
|
||||
32
|
||||
10.0,
|
||||
-23.0,
|
||||
32.0
|
||||
]
|
||||
},
|
||||
"_rankingScore": 0.6972063183784485
|
||||
@ -1405,9 +1405,9 @@ async fn simple_search_with_strange_synonyms() {
|
||||
"id": "166428",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
231,
|
||||
32
|
||||
-100.0,
|
||||
231.0,
|
||||
32.0
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1426,9 +1426,9 @@ async fn simple_search_with_strange_synonyms() {
|
||||
"id": "166428",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
231,
|
||||
32
|
||||
-100.0,
|
||||
231.0,
|
||||
32.0
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1447,9 +1447,9 @@ async fn simple_search_with_strange_synonyms() {
|
||||
"id": "166428",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
231,
|
||||
32
|
||||
-100.0,
|
||||
231.0,
|
||||
32.0
|
||||
]
|
||||
}
|
||||
}
|
||||
|
@ -75,9 +75,9 @@ async fn simple_search_single_index() {
|
||||
"id": "450465",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
340,
|
||||
90
|
||||
-100.0,
|
||||
340.0,
|
||||
90.0
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -96,9 +96,9 @@ async fn simple_search_single_index() {
|
||||
"id": "299537",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1,
|
||||
2,
|
||||
54
|
||||
1.0,
|
||||
2.0,
|
||||
54.0
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -194,9 +194,9 @@ async fn simple_search_two_indexes() {
|
||||
"id": "450465",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
-100,
|
||||
340,
|
||||
90
|
||||
-100.0,
|
||||
340.0,
|
||||
90.0
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -227,9 +227,9 @@ async fn simple_search_two_indexes() {
|
||||
"cattos": "pésti",
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1,
|
||||
2,
|
||||
3
|
||||
1.0,
|
||||
2.0,
|
||||
3.0
|
||||
]
|
||||
}
|
||||
},
|
||||
@ -249,9 +249,9 @@ async fn simple_search_two_indexes() {
|
||||
],
|
||||
"_vectors": {
|
||||
"manual": [
|
||||
1,
|
||||
2,
|
||||
54
|
||||
1.0,
|
||||
2.0,
|
||||
54.0
|
||||
]
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user