diff --git a/crates/meilisearch/tests/common/mod.rs b/crates/meilisearch/tests/common/mod.rs index 1a73a7532..1345fa197 100644 --- a/crates/meilisearch/tests/common/mod.rs +++ b/crates/meilisearch/tests/common/mod.rs @@ -3,8 +3,12 @@ pub mod index; pub mod server; pub mod service; -use std::fmt::{self, Display}; +use std::{ + collections::BTreeMap, + fmt::{self, Display}, +}; +use actix_http::StatusCode; #[allow(unused)] pub use index::GetAllDocumentsOptions; use meili_snap::json_string; @@ -13,6 +17,10 @@ use serde::{Deserialize, Serialize}; #[allow(unused)] pub use server::{default_settings, Server}; use tokio::sync::OnceCell; +use wiremock::{ + matchers::{method, path}, + Mock, MockServer, Request, ResponseTemplate, +}; use crate::common::index::Index; @@ -508,3 +516,166 @@ pub async fn shared_index_with_geo_documents() -> &'static Index<'static, Shared }) .await } + +pub async fn shared_index_for_fragments() -> Index<'static, Shared> { + static INDEX: OnceCell<(Server, String)> = OnceCell::const_new(); + let (server, uid) = INDEX + .get_or_init(|| async { + let (server, uid, _) = init_fragments_index().await; + (server.into_shared(), uid) + }) + .await; + server._index(uid).to_shared() +} + +async fn fragment_mock_server() -> String { + let text_to_embedding: BTreeMap<_, _> = vec![ + ("kefir", [0.5, -0.5, 0.0]), + ("intel", [1.0, 1.0, 0.0]), + ("dustin", [-0.5, 0.5, 0.0]), + ("bulldog", [0.0, 0.0, 1.0]), + ("labrador", [0.0, 0.0, -1.0]), + ("{{ doc.", [-9999.0, -9999.0, -9999.0]), // If a template didn't render + ] + .into_iter() + .collect(); + + let mock_server = Box::leak(Box::new(MockServer::start().await)); + + Mock::given(method("POST")) + .and(path("/")) + .respond_with(move |req: &Request| { + let text = String::from_utf8_lossy(&req.body).to_string(); + + let mut data = [0.0, 0.0, 0.0]; + for (inner_text, inner_data) in &text_to_embedding { + if text.contains(inner_text) { + for (i, &value) in inner_data.iter().enumerate() { + data[i] += value; + } + } + } + ResponseTemplate::new(200).set_body_json(json!({ "data": data })) + }) + .mount(mock_server) + .await; + + mock_server.uri() +} + +pub async fn init_fragments_index() -> (Server, String, crate::common::Value) { + let url = fragment_mock_server().await; + let server = Server::new().await; + let index = server.unique_index(); + + let (_response, code) = server.set_features(json!({"multimodal": true})).await; + assert_eq!(code, StatusCode::OK); + + // Configure the index to use our mock embedder + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "withBreed": {"value": "{{ doc.name }} is a {{ doc.breed }}"}, + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "justBreed": {"value": "It's a {{ media.breed }}"}, + "justName": {"value": "{{ media.name }} is a dog"}, + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + }, + }); + let (response, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, StatusCode::ACCEPTED); + + server.wait_task(response.uid()).await.succeeded(); + + // Send documents + let documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel", "breed": "labrador"}, + {"id": 3, "name": "dustin", "breed": "bulldog"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + assert_eq!(code, StatusCode::ACCEPTED); + + let _task = index.wait_task(value.uid()).await.succeeded(); + + let uid = index.uid.clone(); + (server, uid, settings) +} + +pub async fn init_fragments_index_composite() -> (Server, String, crate::common::Value) { + let url = fragment_mock_server().await; + let server = Server::new().await; + let index = server.unique_index(); + + let (_response, code) = server.set_features(json!({"multimodal": true})).await; + assert_eq!(code, StatusCode::OK); + + let (_response, code) = server.set_features(json!({"compositeEmbedders": true})).await; + assert_eq!(code, StatusCode::OK); + + // Configure the index to use our mock embedder + let settings = json!({ + "embedders": { + "rest": { + "source": "composite", + "searchEmbedder": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + "indexingEmbedder": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "withBreed": {"value": "{{ doc.name }} is a {{ doc.breed }}"}, + "basic": {"value": "{{ doc.name }} is a dog"}, + } + }, + }, + }, + }); + let (response, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, StatusCode::ACCEPTED); + + server.wait_task(response.uid()).await.succeeded(); + + // Send documents + let documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel", "breed": "labrador"}, + {"id": 3, "name": "dustin", "breed": "bulldog"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + assert_eq!(code, StatusCode::ACCEPTED); + + index.wait_task(value.uid()).await.succeeded(); + + let uid = index.uid.clone(); + (server, uid, settings) +} diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs index aa4501636..5f82bb380 100644 --- a/crates/meilisearch/tests/common/server.rs +++ b/crates/meilisearch/tests/common/server.rs @@ -35,7 +35,7 @@ pub struct Server { pub static TEST_TEMP_DIR: Lazy = Lazy::new(|| TempDir::new().unwrap()); impl Server { - fn into_shared(self) -> Server { + pub(super) fn into_shared(self) -> Server { Server { service: self.service, _dir: self._dir, _marker: PhantomData } } diff --git a/crates/meilisearch/tests/search/multi/proxy.rs b/crates/meilisearch/tests/search/multi/proxy.rs index 55736d058..c1d40ef3b 100644 --- a/crates/meilisearch/tests/search/multi/proxy.rs +++ b/crates/meilisearch/tests/search/multi/proxy.rs @@ -2499,7 +2499,7 @@ pub struct LocalMeiliParams { /// A server that exploits [`MockServer`] to provide an URL for testing network and the network. pub struct LocalMeili { - mock_server: MockServer, + mock_server: &'static MockServer, } impl LocalMeili { @@ -2508,7 +2508,7 @@ impl LocalMeili { } pub async fn with_params(server: Arc, params: LocalMeiliParams) -> Self { - let mock_server = MockServer::start().await; + let mock_server = Box::leak(Box::new(MockServer::start().await)); // tokio won't let us execute asynchronous code from a sync function inside of an async test, // so instead we spawn another thread that will call the service on a brand new tokio runtime @@ -2572,7 +2572,7 @@ impl LocalMeili { response.set_body_json(value) } }) - .mount(&mock_server) + .mount(mock_server) .await; Self { mock_server } } diff --git a/crates/meilisearch/tests/vector/fragments.rs b/crates/meilisearch/tests/vector/fragments.rs new file mode 100644 index 000000000..a994eb64c --- /dev/null +++ b/crates/meilisearch/tests/vector/fragments.rs @@ -0,0 +1,2120 @@ +use meili_snap::{json_string, snapshot}; + +use crate::common::{ + init_fragments_index, init_fragments_index_composite, shared_index_for_fragments, +}; +use crate::json; +use crate::vector::{GetAllDocumentsOptions, Server}; + +#[actix_rt::test] +async fn experimental_feature_not_enabled() { + let server = Server::new().await; + let index = server.unique_index(); + + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": "http://localhost:1337", + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + }, + }); + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r#" + { + "message": "setting `indexingFragments` requires enabling the `multimodal` experimental feature. See https://github.com/orgs/meilisearch/discussions/846", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "#); +} + +#[actix_rt::test] +async fn indexing_fragments() { + let index = shared_index_for_fragments().await; + + // Make sure the documents have been indexed and their embeddings retrieved + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(documents, @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn replace_document() { + let (server, uid, _settings) = init_fragments_index().await; + let index = server.index(uid); + + let documents = json!([ + { "id": 0, "name": "kefir", "breed": "sorry-I-forgot" }, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + + index.wait_task(value.uid()).await.succeeded(); + + // Make sure kefir now has 2 vectors + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(documents, @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "breed": "sorry-I-forgot", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ], + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn search_with_vector() { + let index = shared_index_for_fragments().await; + + let (value, code) = index.search_post( + json!({"vector": [1.0, 1.0, 1.0], "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, "limit": 1} + )).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 1, + "name": "echo" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} + +#[actix_rt::test] +async fn search_with_media() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "media": { "breed": "labrador" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 2, + "name": "intel", + "breed": "labrador" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} + +#[actix_rt::test] +async fn search_with_media_and_vector() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "vector": [1.0, 1.0, 1.0], + "media": { "breed": "labrador" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(value, @r#" + { + "message": "Invalid request: both `media` and `vector` parameters are present.", + "code": "invalid_search_media_and_vector", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_media_and_vector" + } + "#); +} + +#[actix_rt::test] +async fn search_with_media_matching_multiple_fragments() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "media": { "name": "dustin", "breed": "labrador" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(value, @r#" + { + "message": "Error while generating embeddings: user error: Query matches multiple search fragments.\n - Note: First matched fragment `justBreed`.\n - Note: Second matched fragment `justName`.\n - Note: {\"q\":null,\"media\":{\"name\":\"dustin\",\"breed\":\"labrador\"}}", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "#); +} + +#[actix_rt::test] +async fn search_with_media_matching_no_fragment() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "media": { "ticker": "GME", "section": "portfolio" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(value, @r#" + { + "message": "Error while generating embeddings: user error: Query matches no search fragment.\n - Note: {\"q\":null,\"media\":{\"ticker\":\"GME\",\"section\":\"portfolio\"}}", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "#); +} + +#[actix_rt::test] +async fn search_with_query() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "q": "bulldog", + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 3, + "name": "dustin", + "breed": "bulldog" + } + ], + "query": "bulldog", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} + +#[actix_rt::test] +async fn deleting_fragments_deletes_vectors() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"]["basic"] = serde_json::Value::Null; + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": null, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (value, code) = index.settings().await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(value["embedders"], { + ".rest.url" => "[url]", + }), @r#" + { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "headers": {} + } + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); +} + +#[actix_rt::test] +async fn modifying_fragments_modifies_vectors() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"]["basic"]["value"] = + serde_json::Value::String("{{ doc.name }} is a dog (maybe bulldog?)".to_string()); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog (maybe bulldog?)" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn swapping_fragments() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + let basic = settings["embedders"]["rest"]["indexingFragments"]["basic"].clone(); + let with_breed = settings["embedders"]["rest"]["indexingFragments"]["withBreed"].clone(); + settings["embedders"]["rest"]["indexingFragments"]["basic"] = with_breed; + settings["embedders"]["rest"]["indexingFragments"]["withBreed"] = basic; + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + }, + "withBreed": { + "value": "{{ doc.name }} is a dog" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(documents, @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ], + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ], + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn ommitted_fragment_isnt_removed() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"]["basic"] = serde_json::Value::Null; // basic is removed + settings["embedders"]["rest"]["indexingFragments"].as_object_mut().unwrap().remove("withBreed"); // withBreed isn't specified + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": null + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + // Make sure withBreed is still here because it wasn't specified + let (value, code) = index.settings().await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(value["embedders"], { + ".rest.url" => "[url]", + }), @r#" + { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "headers": {} + } + } + "#); +} + +#[actix_rt::test] +async fn fragment_insertion() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"].as_object_mut().unwrap().insert( + String::from("useless"), + serde_json::json!({ + "value": "This fragment is useless" + }), + ); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "useless": { + "value": "This fragment is useless" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ], + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ], + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ], + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn multiple_embedders() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + let url = settings["embedders"]["rest"]["url"].as_str().unwrap(); + + let settings2 = json!({ + "embedders": { + "rest2": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "withBreed": {"value": "{{ doc.name }} is a {{ doc.breed }}"}, + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + "rest3": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + }, + }); + let (response, code) = index.update_settings(settings2).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest2": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + }, + "rest3": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + } + }, + "searchFragments": { + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + }, + "rest2": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + }, + "rest2": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest2": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest2": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); + + // Remove Rest2 + + settings["embedders"]["rest2"] = serde_json::Value::Null; + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + }, + "rest3": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); + + // Remove rest's basic fragment + + settings["embedders"]["rest"]["indexingFragments"]["basic"] = serde_json::Value::Null; + //settings["embedders"].as_object_mut().unwrap().remove("rest2"); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + }, + "rest3": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn remove_non_existant_embedder() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"] + .as_object_mut() + .unwrap() + .insert(String::from("non-existant"), serde_json::Value::Null); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "non-existant": null, + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); +} + +#[actix_rt::test] +async fn double_remove_embedder() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"] + .as_object_mut() + .unwrap() + .insert(String::from("rest"), serde_json::Value::Null); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": null + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": null + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); +} + +#[actix_rt::test] +async fn complex_fragment() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"].as_object_mut().unwrap().insert( + String::from("complex"), + serde_json::json!({ + "value": { + "breed": "{{ doc.breed }}", + "breeds": [ + "{{ doc.breed }}", + { + "breed": "{{ doc.breed }}", + } + ] + } + }), + ); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "complex": { + "value": { + "breed": "{{ doc.breed }}", + "breeds": [ + "{{ doc.breed }}", + { + "breed": "{{ doc.breed }}" + } + ] + } + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ], + [ + 0.0, + 0.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ], + [ + 0.0, + 0.0, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn both_fragments_and_document_template() { + let server = Server::new().await; + let index = server.unique_index(); + + let (_response, code) = server.set_features(json!({"multimodal": true})).await; + snapshot!(code, @"200 OK"); + + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": "http://localhost:1337", + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "justBreed": {"value": "It's a {{ media.breed }}"}, + }, + "documentTemplate": "{{ doc.name }} is a dog", + }, + }, + }); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r#" + { + "message": "Error while generating embeddings: user error: cannot pass both fragments and a document template.\n - Note: 1 fragments declared in `indexingFragments` and 1 fragments declared in `search_fragments_len`.\n - Hint: remove the declared fragments or remove the `documentTemplate`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "#); +} + +#[ignore = "failing due to issue #5746"] +#[actix_rt::test] +async fn set_fragments_then_document_template() { + let (server, uid, settings) = init_fragments_index().await; + let index = server.index(uid); + + let url = settings["embedders"]["rest"]["url"].as_str().unwrap(); + + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "documentTemplate": "{{ doc.name }} is a dog", + }, + }, + }); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r""); + + let (settings, code) = index.settings().await; + snapshot!(code, @"200 OK"); + snapshot!(settings, @r#""#); // Should have removed fragments +} + +#[actix_rt::test] +async fn composite() { + let (server, uid, _settings) = init_fragments_index_composite().await; + let index = server.index(uid); + + let (value, code) = index.search_post( + json!({"vector": [1.0, 1.0, 1.0], "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, "limit": 1} + )).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 1, + "name": "echo" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); + + let (value, code) = index + .search_post( + json!({"q": "bulldog", "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, "limit": 1} + ), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 3, + "name": "dustin", + "breed": "bulldog" + } + ], + "query": "bulldog", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 98555dfac..7f54489b6 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -1,4 +1,5 @@ mod binary_quantized; +mod fragments; #[cfg(feature = "test-ollama")] mod ollama; mod openai; diff --git a/crates/meilisearch/tests/vector/openai.rs b/crates/meilisearch/tests/vector/openai.rs index 4ae8cb041..e207c3eb6 100644 --- a/crates/meilisearch/tests/vector/openai.rs +++ b/crates/meilisearch/tests/vector/openai.rs @@ -136,7 +136,7 @@ fn long_text() -> &'static str { }) } -async fn create_mock_tokenized() -> (MockServer, Value) { +async fn create_mock_tokenized() -> (&'static MockServer, Value) { create_mock_with_template("{{doc.text}}", ModelDimensions::Large, false, false).await } @@ -145,8 +145,8 @@ async fn create_mock_with_template( model_dimensions: ModelDimensions, fallible: bool, slow: bool, -) -> (MockServer, Value) { - let mock_server = MockServer::start().await; +) -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); const API_KEY: &str = "my-api-key"; const API_KEY_BEARER: &str = "Bearer my-api-key"; @@ -299,7 +299,7 @@ async fn create_mock_with_template( } })) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -321,27 +321,27 @@ const DOGGO_TEMPLATE: &str = r#"{%- if doc.gender == "F" -%}Une chienne nommée Un chien nommé {{doc.name}}, né en {{doc.birthyear}} {%- endif %}, de race {{doc.breed}}."#; -async fn create_mock() -> (MockServer, Value) { +async fn create_mock() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, false, false).await } -async fn create_mock_dimensions() -> (MockServer, Value) { +async fn create_mock_dimensions() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large512, false, false).await } -async fn create_mock_small_embedding_model() -> (MockServer, Value) { +async fn create_mock_small_embedding_model() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Small, false, false).await } -async fn create_mock_legacy_embedding_model() -> (MockServer, Value) { +async fn create_mock_legacy_embedding_model() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Ada, false, false).await } -async fn create_fallible_mock() -> (MockServer, Value) { +async fn create_fallible_mock() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true, false).await } -async fn create_slow_mock() -> (MockServer, Value) { +async fn create_slow_mock() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true, true).await } diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index e03563bcc..974341cd0 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -12,8 +12,8 @@ use crate::common::Value; use crate::json; use crate::vector::{get_server_vector, GetAllDocumentsOptions}; -async fn create_mock() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -32,7 +32,7 @@ async fn create_mock() -> (MockServer, Value) { json!({ "data": text_to_embedding.get(text.as_str()).unwrap_or(&[99., 99., 99.]) }), ) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -50,8 +50,8 @@ async fn create_mock() -> (MockServer, Value) { (mock_server, embedder_settings) } -async fn create_mock_default_template() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock_default_template() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -73,7 +73,7 @@ async fn create_mock_default_template() -> (MockServer, Value) { .set_body_json(json!({"error": "text not found", "text": text})), } }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -106,8 +106,8 @@ struct SingleResponse { embedding: Vec, } -async fn create_mock_multiple() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock_multiple() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -146,7 +146,7 @@ async fn create_mock_multiple() -> (MockServer, Value) { ResponseTemplate::new(200).set_body_json(response) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -176,8 +176,8 @@ struct SingleRequest { input: String, } -async fn create_mock_single_response_in_array() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock_single_response_in_array() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -212,7 +212,7 @@ async fn create_mock_single_response_in_array() -> (MockServer, Value) { ResponseTemplate::new(200).set_body_json(response) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -236,8 +236,8 @@ async fn create_mock_single_response_in_array() -> (MockServer, Value) { (mock_server, embedder_settings) } -async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock_raw_with_custom_header() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -277,7 +277,7 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { ResponseTemplate::new(200).set_body_json(output) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -293,8 +293,8 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { (mock_server, embedder_settings) } -async fn create_mock_raw() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock_raw() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -321,7 +321,7 @@ async fn create_mock_raw() -> (MockServer, Value) { ResponseTemplate::new(200).set_body_json(output) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -337,8 +337,8 @@ async fn create_mock_raw() -> (MockServer, Value) { (mock_server, embedder_settings) } -async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let count = AtomicUsize::new(0); Mock::given(method("POST")) @@ -355,7 +355,7 @@ async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (MockServer, Value) ResponseTemplate::new(500).set_body_string("Service Unavailable") } }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index d2f74da2a..bca8fbc59 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -101,6 +101,10 @@ impl Setting { matches!(self, Self::NotSet) } + pub const fn is_reset(&self) -> bool { + matches!(self, Self::Reset) + } + /// If `Self` is `Reset`, then map self to `Set` with the provided `val`. pub fn or_reset(self, val: T) -> Self { match self { @@ -1213,6 +1217,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // new config EitherOrBoth::Right((name, mut setting)) => { tracing::debug!(embedder = name, "new embedder"); + // if we are asked to reset an embedder that doesn't exist, just ignore it + if setting.is_reset() { + continue; + } // apply the default source in case the source was not set so that it gets validated crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting); crate::vector::settings::EmbeddingSettings::apply_default_openai_model( diff --git a/crates/milli/src/vector/composite.rs b/crates/milli/src/vector/composite.rs index 8314b8649..2e31da094 100644 --- a/crates/milli/src/vector/composite.rs +++ b/crates/milli/src/vector/composite.rs @@ -59,12 +59,24 @@ pub struct EmbedderOptions { impl Embedder { pub fn new( - EmbedderOptions { search, index }: EmbedderOptions, + EmbedderOptions { search: search_options, index: index_options }: EmbedderOptions, cache_cap: usize, ) -> Result { - let search = SubEmbedder::new(search, cache_cap)?; + // don't check similarity if one child is a rest embedder with fragments + // FIXME: skipping the check isn't ideal but we are unsure how to handle fragments in this context + let mut skip_similarity_check = false; + for options in [&search_options, &index_options] { + if let SubEmbedderOptions::Rest(options) = &options { + if !options.search_fragments.is_empty() || !options.indexing_fragments.is_empty() { + skip_similarity_check = true; + break; + } + } + } + + let search = SubEmbedder::new(search_options, cache_cap)?; // cache is only used at search - let index = SubEmbedder::new(index, 0)?; + let index = SubEmbedder::new(index_options, 0)?; // check dimensions if search.dimensions() != index.dimensions() { @@ -73,7 +85,12 @@ impl Embedder { index.dimensions(), )); } + // check similarity + if skip_similarity_check { + return Ok(Self { search, index }); + } + let search_embeddings = search .embed( vec![