diff --git a/.gitignore b/.gitignore index fc24b8306..44cfa8f75 100644 --- a/.gitignore +++ b/.gitignore @@ -5,18 +5,24 @@ **/*.json_lines **/*.rs.bk /*.mdb -/data.ms +/*.ms /snapshots /dumps /bench /_xtask_benchmark.ms /benchmarks +.DS_Store # Snapshots ## ... large *.full.snap -## ... unreviewed +## ... unreviewed *.snap.new +## ... pending +*.pending-snap + +# Tmp files +.tmp* # Database snapshot crates/meilisearch/db.snapshot diff --git a/Cargo.lock b/Cargo.lock index ceec0a05e..8413b3d14 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3775,6 +3775,7 @@ dependencies = [ "meili-snap", "meilisearch-auth", "meilisearch-types", + "memmap2", "mimalloc", "mime", "mopa-maintained", @@ -3908,9 +3909,9 @@ checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" [[package]] name = "memmap2" -version = "0.9.5" +version = "0.9.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +checksum = "483758ad303d734cec05e5c12b41d7e93e6a6390c5e9dae6bdeb7c1259012d28" dependencies = [ "libc", "stable_deref_trait", diff --git a/crates/benchmarks/Cargo.toml b/crates/benchmarks/Cargo.toml index 9dccc444b..f60f0979c 100644 --- a/crates/benchmarks/Cargo.toml +++ b/crates/benchmarks/Cargo.toml @@ -14,7 +14,7 @@ license.workspace = true anyhow = "1.0.98" bumpalo = "3.18.1" csv = "1.3.1" -memmap2 = "0.9.5" +memmap2 = "0.9.7" milli = { path = "../milli" } mimalloc = { version = "0.1.47", default-features = false } serde_json = { version = "1.0.140", features = ["preserve_order"] } @@ -51,3 +51,7 @@ harness = false [[bench]] name = "indexing" harness = false + +[[bench]] +name = "sort" +harness = false diff --git a/crates/benchmarks/benches/sort.rs b/crates/benchmarks/benches/sort.rs new file mode 100644 index 000000000..c3e934432 --- /dev/null +++ b/crates/benchmarks/benches/sort.rs @@ -0,0 +1,114 @@ +//! 
This benchmark module is used to compare the performance of sorting documents in /search VS /documents +//! +//! The tests/benchmarks were designed in the context of a query returning only 20 documents. + +mod datasets_paths; +mod utils; + +use criterion::{criterion_group, criterion_main}; +use milli::update::Settings; +use utils::Conf; + +#[cfg(not(windows))] +#[global_allocator] +static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; + +fn base_conf(builder: &mut Settings) { + let displayed_fields = + ["geonameid", "name", "asciiname", "alternatenames", "_geo", "population"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_displayed_fields(displayed_fields); + + let sortable_fields = + ["_geo", "name", "population", "elevation", "timezone", "modification-date"] + .iter() + .map(|s| s.to_string()) + .collect(); + builder.set_sortable_fields(sortable_fields); +} + +#[rustfmt::skip] +const BASE_CONF: Conf = Conf { + dataset: datasets_paths::SMOL_ALL_COUNTRIES, + dataset_format: "jsonl", + configure: base_conf, + primary_key: Some("geonameid"), + queries: &[""], + offsets: &[ + Some((0, 20)), // The most common query in the real world + Some((0, 500)), // A query that ranges over many documents + Some((980, 20)), // The worst query that could happen in the real world + Some((800_000, 20)) // The worst query + ], + get_documents: true, + ..Conf::BASE +}; + +fn bench_sort(c: &mut criterion::Criterion) { + #[rustfmt::skip] + let confs = &[ + utils::Conf { + group_name: "without sort", + sort: None, + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many different values", + sort: Some(vec!["name:asc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many similar values", + sort: Some(vec!["timezone:desc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many similar then different values", + sort: Some(vec!["timezone:desc", "name:asc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many different then 
similar values", + sort: Some(vec!["timezone:desc", "name:asc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "geo sort", + sample_size: Some(10), + sort: Some(vec!["_geoPoint(45.4777599, 9.1967508):asc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many similar values then geo sort", + sample_size: Some(50), + sort: Some(vec!["timezone:desc", "_geoPoint(45.4777599, 9.1967508):asc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many different values then geo sort", + sample_size: Some(50), + sort: Some(vec!["name:desc", "_geoPoint(45.4777599, 9.1967508):asc"]), + ..BASE_CONF + }, + + utils::Conf { + group_name: "sort on many fields", + sort: Some(vec!["population:asc", "name:asc", "elevation:asc", "timezone:asc"]), + ..BASE_CONF + }, + ]; + + utils::run_benches(c, confs); +} + +criterion_group!(benches, bench_sort); +criterion_main!(benches); diff --git a/crates/benchmarks/benches/utils.rs b/crates/benchmarks/benches/utils.rs index b12408051..0abbd6c71 100644 --- a/crates/benchmarks/benches/utils.rs +++ b/crates/benchmarks/benches/utils.rs @@ -9,6 +9,7 @@ use anyhow::Context; use bumpalo::Bump; use criterion::BenchmarkId; use memmap2::Mmap; +use milli::documents::sort::recursive_sort; use milli::heed::EnvOpenOptions; use milli::progress::Progress; use milli::update::new::indexer; @@ -35,6 +36,12 @@ pub struct Conf<'a> { pub configure: fn(&mut Settings), pub filter: Option<&'a str>, pub sort: Option>, + /// set to skip documents (offset, limit) + pub offsets: &'a [Option<(usize, usize)>], + /// enable if you want to bench getting documents without querying + pub get_documents: bool, + /// configure the benchmark sample size + pub sample_size: Option, /// enable or disable the optional words on the query pub optional_words: bool, /// primary key, if there is None we'll auto-generate docids for every documents @@ -52,6 +59,9 @@ impl Conf<'_> { configure: |_| (), filter: None, sort: None, + offsets: &[None], + get_documents: 
false, + sample_size: None, optional_words: true, primary_key: None, }; @@ -145,25 +155,79 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { let file_name = Path::new(conf.dataset).file_name().and_then(|f| f.to_str()).unwrap(); let name = format!("{}: {}", file_name, conf.group_name); let mut group = c.benchmark_group(&name); + if let Some(sample_size) = conf.sample_size { + group.sample_size(sample_size); + } for &query in conf.queries { - group.bench_with_input(BenchmarkId::from_parameter(query), &query, |b, &query| { - b.iter(|| { - let rtxn = index.read_txn().unwrap(); - let mut search = index.search(&rtxn); - search.query(query).terms_matching_strategy(TermsMatchingStrategy::default()); - if let Some(filter) = conf.filter { - let filter = Filter::from_str(filter).unwrap().unwrap(); - search.filter(filter); - } - if let Some(sort) = &conf.sort { - let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); - search.sort_criteria(sort); - } - let _ids = search.execute().unwrap(); - }); - }); + for offset in conf.offsets { + let parameter = match offset { + None => query.to_string(), + Some((offset, limit)) => format!("{query}[{offset}:{limit}]"), + }; + group.bench_with_input( + BenchmarkId::from_parameter(parameter), + &query, + |b, &query| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + search + .query(query) + .terms_matching_strategy(TermsMatchingStrategy::default()); + if let Some(filter) = conf.filter { + let filter = Filter::from_str(filter).unwrap().unwrap(); + search.filter(filter); + } + if let Some(sort) = &conf.sort { + let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); + search.sort_criteria(sort); + } + if let Some((offset, limit)) = offset { + search.offset(*offset).limit(*limit); + } + + let _ids = search.execute().unwrap(); + }); + }, + ); + } } + + if conf.get_documents { + for offset in conf.offsets { + let parameter = match offset { + None => 
String::from("get_documents"), + Some((offset, limit)) => format!("get_documents[{offset}:{limit}]"), + }; + group.bench_with_input(BenchmarkId::from_parameter(parameter), &(), |b, &()| { + b.iter(|| { + let rtxn = index.read_txn().unwrap(); + if let Some(sort) = &conf.sort { + let sort = sort.iter().map(|sort| sort.parse().unwrap()).collect(); + let all_docs = index.documents_ids(&rtxn).unwrap(); + let facet_sort = + recursive_sort(&index, &rtxn, sort, &all_docs).unwrap(); + let iter = facet_sort.iter().unwrap(); + if let Some((offset, limit)) = offset { + let _results = iter.skip(*offset).take(*limit).collect::>(); + } else { + let _results = iter.collect::>(); + } + } else { + let all_docs = index.documents_ids(&rtxn).unwrap(); + if let Some((offset, limit)) = offset { + let _results = + all_docs.iter().skip(*offset).take(*limit).collect::>(); + } else { + let _results = all_docs.iter().collect::>(); + } + } + }); + }); + } + } + group.finish(); index.prepare_for_closing().wait(); diff --git a/crates/dump/src/reader/compat/v1_to_v2.rs b/crates/dump/src/reader/compat/v1_to_v2.rs index 0d050497b..35d369c3a 100644 --- a/crates/dump/src/reader/compat/v1_to_v2.rs +++ b/crates/dump/src/reader/compat/v1_to_v2.rs @@ -1,3 +1,4 @@ +use std::fs::File; use std::str::FromStr; use super::v2_to_v3::CompatV2ToV3; @@ -94,6 +95,10 @@ impl CompatIndexV1ToV2 { self.from.documents().map(|it| Box::new(it) as Box>) } + pub fn documents_file(&self) -> &File { + self.from.documents_file() + } + pub fn settings(&mut self) -> Result> { Ok(v2::settings::Settings::::from(self.from.settings()?).check()) } diff --git a/crates/dump/src/reader/compat/v2_to_v3.rs b/crates/dump/src/reader/compat/v2_to_v3.rs index e7516e708..62326040e 100644 --- a/crates/dump/src/reader/compat/v2_to_v3.rs +++ b/crates/dump/src/reader/compat/v2_to_v3.rs @@ -1,3 +1,4 @@ +use std::fs::File; use std::str::FromStr; use time::OffsetDateTime; @@ -122,6 +123,13 @@ impl CompatIndexV2ToV3 { } } + pub fn 
documents_file(&self) -> &File { + match self { + CompatIndexV2ToV3::V2(v2) => v2.documents_file(), + CompatIndexV2ToV3::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { let settings = match self { CompatIndexV2ToV3::V2(from) => from.settings()?, diff --git a/crates/dump/src/reader/compat/v3_to_v4.rs b/crates/dump/src/reader/compat/v3_to_v4.rs index 5bb70e9b2..1dba37771 100644 --- a/crates/dump/src/reader/compat/v3_to_v4.rs +++ b/crates/dump/src/reader/compat/v3_to_v4.rs @@ -1,3 +1,5 @@ +use std::fs::File; + use super::v2_to_v3::{CompatIndexV2ToV3, CompatV2ToV3}; use super::v4_to_v5::CompatV4ToV5; use crate::reader::{v3, v4, UpdateFile}; @@ -252,6 +254,13 @@ impl CompatIndexV3ToV4 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV3ToV4::V3(v3) => v3.documents_file(), + CompatIndexV3ToV4::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { Ok(match self { CompatIndexV3ToV4::V3(v3) => { diff --git a/crates/dump/src/reader/compat/v4_to_v5.rs b/crates/dump/src/reader/compat/v4_to_v5.rs index e52acb176..3f47b5b48 100644 --- a/crates/dump/src/reader/compat/v4_to_v5.rs +++ b/crates/dump/src/reader/compat/v4_to_v5.rs @@ -1,3 +1,5 @@ +use std::fs::File; + use super::v3_to_v4::{CompatIndexV3ToV4, CompatV3ToV4}; use super::v5_to_v6::CompatV5ToV6; use crate::reader::{v4, v5, Document}; @@ -241,6 +243,13 @@ impl CompatIndexV4ToV5 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV4ToV5::V4(v4) => v4.documents_file(), + CompatIndexV4ToV5::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { match self { CompatIndexV4ToV5::V4(v4) => Ok(v5::Settings::from(v4.settings()?).check()), diff --git a/crates/dump/src/reader/compat/v5_to_v6.rs b/crates/dump/src/reader/compat/v5_to_v6.rs index f7bda81c6..f173bb6bd 100644 --- a/crates/dump/src/reader/compat/v5_to_v6.rs +++ b/crates/dump/src/reader/compat/v5_to_v6.rs 
@@ -1,3 +1,4 @@ +use std::fs::File; use std::num::NonZeroUsize; use std::str::FromStr; @@ -243,6 +244,13 @@ impl CompatIndexV5ToV6 { } } + pub fn documents_file(&self) -> &File { + match self { + CompatIndexV5ToV6::V5(v5) => v5.documents_file(), + CompatIndexV5ToV6::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { match self { CompatIndexV5ToV6::V5(v5) => Ok(v6::Settings::from(v5.settings()?).check()), diff --git a/crates/dump/src/reader/mod.rs b/crates/dump/src/reader/mod.rs index 23e7eec9e..c894c255f 100644 --- a/crates/dump/src/reader/mod.rs +++ b/crates/dump/src/reader/mod.rs @@ -192,6 +192,14 @@ impl DumpIndexReader { } } + /// A reference to a file in the NDJSON format containing all the documents of the index + pub fn documents_file(&self) -> &File { + match self { + DumpIndexReader::Current(v6) => v6.documents_file(), + DumpIndexReader::Compat(compat) => compat.documents_file(), + } + } + pub fn settings(&mut self) -> Result> { match self { DumpIndexReader::Current(v6) => v6.settings(), diff --git a/crates/dump/src/reader/v1/mod.rs b/crates/dump/src/reader/v1/mod.rs index ac7324d9a..d86ede62c 100644 --- a/crates/dump/src/reader/v1/mod.rs +++ b/crates/dump/src/reader/v1/mod.rs @@ -72,6 +72,10 @@ impl V1IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result { Ok(serde_json::from_reader(&mut self.settings)?) } diff --git a/crates/dump/src/reader/v2/mod.rs b/crates/dump/src/reader/v2/mod.rs index 14a643c2d..a74687381 100644 --- a/crates/dump/src/reader/v2/mod.rs +++ b/crates/dump/src/reader/v2/mod.rs @@ -203,6 +203,10 @@ impl V2IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) 
})) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v3/mod.rs b/crates/dump/src/reader/v3/mod.rs index 920e1dc6e..5f89eb861 100644 --- a/crates/dump/src/reader/v3/mod.rs +++ b/crates/dump/src/reader/v3/mod.rs @@ -215,6 +215,10 @@ impl V3IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v4/mod.rs b/crates/dump/src/reader/v4/mod.rs index 585786ae4..16a1e27c2 100644 --- a/crates/dump/src/reader/v4/mod.rs +++ b/crates/dump/src/reader/v4/mod.rs @@ -210,6 +210,10 @@ impl V4IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v5/mod.rs b/crates/dump/src/reader/v5/mod.rs index dfbc6346c..0123db433 100644 --- a/crates/dump/src/reader/v5/mod.rs +++ b/crates/dump/src/reader/v5/mod.rs @@ -247,6 +247,10 @@ impl V5IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) })) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { Ok(self.settings.clone()) } diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 449a7e5fe..08d4700e5 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -284,6 +284,10 @@ impl V6IndexReader { .map(|line| -> Result<_> { Ok(serde_json::from_str(&line?)?) 
})) } + pub fn documents_file(&self) -> &File { + self.documents.get_ref() + } + pub fn settings(&mut self) -> Result> { let mut settings: Settings = serde_json::from_reader(&mut self.settings)?; patch_embedders(&mut settings); diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index de0d01935..20cc49686 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -26,7 +26,7 @@ flate2 = "1.1.2" indexmap = "2.9.0" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } -memmap2 = "0.9.5" +memmap2 = "0.9.7" page_size = "0.6.0" rayon = "1.10.0" roaring = { version = "0.10.12", features = ["serde"] } diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 0cbbb2514..32ce131b5 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -20,6 +20,7 @@ pub fn snapshot_index_scheduler(scheduler: &IndexScheduler) -> String { let IndexScheduler { cleanup_enabled: _, + experimental_no_edition_2024_for_dumps: _, processing_tasks, env, version, diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index b2f27d66b..46566b9ba 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -168,6 +168,9 @@ pub struct IndexScheduler { /// Whether we should automatically cleanup the task queue or not. pub(crate) cleanup_enabled: bool, + /// Whether we should use the old document indexer or the new one. + pub(crate) experimental_no_edition_2024_for_dumps: bool, + /// The webhook url we should send tasks to after processing every batches. pub(crate) webhook_url: Option, /// The Authorization header to send to the webhook URL. 
@@ -210,6 +213,7 @@ impl IndexScheduler { index_mapper: self.index_mapper.clone(), cleanup_enabled: self.cleanup_enabled, + experimental_no_edition_2024_for_dumps: self.experimental_no_edition_2024_for_dumps, webhook_url: self.webhook_url.clone(), webhook_authorization_header: self.webhook_authorization_header.clone(), embedders: self.embedders.clone(), @@ -296,6 +300,9 @@ impl IndexScheduler { index_mapper, env, cleanup_enabled: options.cleanup_enabled, + experimental_no_edition_2024_for_dumps: options + .indexer_config + .experimental_no_edition_2024_for_dumps, webhook_url: options.webhook_url, webhook_authorization_header: options.webhook_authorization_header, embedders: Default::default(), @@ -594,6 +601,11 @@ impl IndexScheduler { Ok(nbr_index_processing_tasks > 0) } + /// Whether the index should use the old document indexer. + pub fn no_edition_2024_for_dumps(&self) -> bool { + self.experimental_no_edition_2024_for_dumps + } + /// Return the tasks matching the query from the user's point of view along /// with the total number of tasks matching the query, ignoring from and limit. 
/// diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index b8d100415..b14f23d0b 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -5,6 +5,7 @@ use std::sync::atomic::Ordering; use dump::IndexMetadata; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{self}; @@ -227,12 +228,21 @@ impl IndexScheduler { return Err(Error::from_milli(user_err, Some(uid.to_string()))); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. 
+ !has_fragments, }; vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); } diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 2062e1c28..0cd06f2e4 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -9,6 +9,7 @@ use flate2::write::GzEncoder; use flate2::Compression; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; use meilisearch_types::milli::update::{request_threads, Setting}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; @@ -62,13 +63,14 @@ impl IndexScheduler { let ExportIndexSettings { filter, override_settings } = export_settings; let index = self.index(uid)?; let index_rtxn = index.read_txn()?; + let bearer = api_key.map(|api_key| format!("Bearer {api_key}")); // First, check if the index already exists let url = format!("{base_url}/indexes/{uid}"); let response = retry(&must_stop_processing, || { let mut request = agent.get(&url); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); + if let Some(bearer) = &bearer { + request = request.set("Authorization", bearer); } request.send_bytes(Default::default()).map_err(into_backoff_error) @@ -90,8 +92,8 @@ impl IndexScheduler { let url = format!("{base_url}/indexes"); retry(&must_stop_processing, || { let mut request = agent.post(&url); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); + if let Some(bearer) = &bearer { + request = request.set("Authorization", bearer); } let index_param = json!({ "uid": uid, "primaryKey": primary_key }); 
request.send_json(&index_param).map_err(into_backoff_error) @@ -103,8 +105,8 @@ impl IndexScheduler { let url = format!("{base_url}/indexes/{uid}"); retry(&must_stop_processing, || { let mut request = agent.patch(&url); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); + if let Some(bearer) = &bearer { + request = request.set("Authorization", bearer); } let index_param = json!({ "primaryKey": primary_key }); request.send_json(&index_param).map_err(into_backoff_error) @@ -122,7 +124,6 @@ impl IndexScheduler { } // Retry logic for sending settings let url = format!("{base_url}/indexes/{uid}/settings"); - let bearer = api_key.map(|api_key| format!("Bearer {api_key}")); retry(&must_stop_processing, || { let mut request = agent.patch(&url); if let Some(bearer) = bearer.as_ref() { @@ -167,10 +168,10 @@ impl IndexScheduler { }, ); - let limit = payload_size.map(|ps| ps.as_u64() as usize).unwrap_or(50 * 1024 * 1024); // defaults to 50 MiB + let limit = payload_size.map(|ps| ps.as_u64() as usize).unwrap_or(20 * 1024 * 1024); // defaults to 20 MiB let documents_url = format!("{base_url}/indexes/{uid}/documents"); - request_threads() + let results = request_threads() .broadcast(|ctx| { let index_rtxn = index .read_txn() @@ -229,12 +230,21 @@ impl IndexScheduler { )); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some( VectorOrArrayOfVectors::from_array_of_vectors(embeddings), ), - regenerate, + regenerate: regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. 
+ !has_fragments, }; vectors.insert( embedder_name, @@ -265,9 +275,8 @@ impl IndexScheduler { let mut request = agent.post(&documents_url); request = request.set("Content-Type", "application/x-ndjson"); request = request.set("Content-Encoding", "gzip"); - if let Some(api_key) = api_key { - request = request - .set("Authorization", &(format!("Bearer {api_key}"))); + if let Some(bearer) = &bearer { + request = request.set("Authorization", bearer); } request.send_bytes(&compressed_buffer).map_err(into_backoff_error) })?; @@ -276,7 +285,7 @@ impl IndexScheduler { } buffer.extend_from_slice(&tmp_buffer); - if i % 100 == 0 { + if i > 0 && i % 100 == 0 { step.fetch_add(100, atomic::Ordering::Relaxed); } } @@ -284,8 +293,8 @@ impl IndexScheduler { retry(&must_stop_processing, || { let mut request = agent.post(&documents_url); request = request.set("Content-Type", "application/x-ndjson"); - if let Some(api_key) = api_key { - request = request.set("Authorization", &(format!("Bearer {api_key}"))); + if let Some(bearer) = &bearer { + request = request.set("Authorization", bearer); } request.send_bytes(&buffer).map_err(into_backoff_error) })?; @@ -298,6 +307,9 @@ impl IndexScheduler { Some(uid.to_string()), ) })?; + for result in results { + result?; + } step.store(total_documents, atomic::Ordering::Relaxed); } diff --git a/crates/index-scheduler/src/scheduler/test_embedders.rs b/crates/index-scheduler/src/scheduler/test_embedders.rs index a9b920bd2..791fed4d8 100644 --- a/crates/index-scheduler/src/scheduler/test_embedders.rs +++ b/crates/index-scheduler/src/scheduler/test_embedders.rs @@ -3,6 +3,7 @@ use std::collections::BTreeMap; use big_s::S; use insta::assert_json_snapshot; use meili_snap::{json_string, snapshot}; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; use meilisearch_types::milli::vector::SearchQuery; @@ -220,8 +221,8 @@ fn 
import_vectors() { let embeddings = index.embeddings(&rtxn, 0).unwrap(); - assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == lab_embed, @"true"); - assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].embeddings[0] == lab_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].embeddings[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -311,9 +312,9 @@ fn import_vectors() { let embeddings = index.embeddings(&rtxn, 0).unwrap(); // automatically changed to patou because set to regenerate - assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == patou_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].embeddings[0] == patou_embed, @"true"); // remained beagle - assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].embeddings[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -497,13 +498,13 @@ fn import_vectors_first_and_embedder_later() { let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["my_doggo_embedder"]; - assert!(!embedding.is_empty(), "{embedding:?}"); + let EmbeddingsWithMetadata { embeddings, .. } = &embeddings["my_doggo_embedder"]; + assert!(!embeddings.is_empty(), "{embeddings:?}"); // the document with the id 3 should keep its original embedding let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embeddings, _) = &embeddings["my_doggo_embedder"]; + let EmbeddingsWithMetadata { embeddings, .. 
} = &embeddings["my_doggo_embedder"]; snapshot!(embeddings.len(), @"1"); assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); @@ -558,7 +559,7 @@ fn import_vectors_first_and_embedder_later() { "###); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["my_doggo_embedder"]; + let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); @@ -566,7 +567,7 @@ fn import_vectors_first_and_embedder_later() { // the document with the id 4 should generate an embedding let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["my_doggo_embedder"]; + let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); } @@ -696,7 +697,7 @@ fn delete_document_containing_vector() { "###); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let (embedding, _) = &embeddings["manual"]; + let EmbeddingsWithMetadata { embeddings: embedding, .. } = &embeddings["manual"]; assert!(!embedding.is_empty(), "{embedding:?}"); index_scheduler diff --git a/crates/meilisearch-auth/src/lib.rs b/crates/meilisearch-auth/src/lib.rs index 27d163192..6f5a5c2a2 100644 --- a/crates/meilisearch-auth/src/lib.rs +++ b/crates/meilisearch-auth/src/lib.rs @@ -158,7 +158,7 @@ impl AuthController { self.store.delete_all_keys() } - /// Delete all the keys in the DB. + /// Insert a key directly into the store. 
pub fn raw_insert_key(&mut self, key: Key) -> Result<()> { self.store.put_api_key(key)?; Ok(()) @@ -351,6 +351,7 @@ pub struct IndexSearchRules { fn generate_default_keys(store: &HeedAuthStore) -> Result<()> { store.put_api_key(Key::default_chat())?; + store.put_api_key(Key::default_read_only_admin())?; store.put_api_key(Key::default_admin())?; store.put_api_key(Key::default_search())?; diff --git a/crates/meilisearch-auth/src/store.rs b/crates/meilisearch-auth/src/store.rs index bae27afe4..eb2170f08 100644 --- a/crates/meilisearch-auth/src/store.rs +++ b/crates/meilisearch-auth/src/store.rs @@ -88,7 +88,13 @@ impl HeedAuthStore { let mut actions = HashSet::new(); for action in &key.actions { match action { - Action::All => actions.extend(enum_iterator::all::()), + Action::All => { + actions.extend(enum_iterator::all::()); + actions.remove(&Action::AllGet); + } + Action::AllGet => { + actions.extend(enum_iterator::all::().filter(|a| a.is_read())) + } Action::DocumentsAll => { actions.extend( [Action::DocumentsGet, Action::DocumentsDelete, Action::DocumentsAdd] diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index faf59643f..f3279a094 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -24,7 +24,7 @@ enum-iterator = "2.1.0" file-store = { path = "../file-store" } flate2 = "1.1.2" fst = "0.4.7" -memmap2 = "0.9.5" +memmap2 = "0.9.7" milli = { path = "../milli" } roaring = { version = "0.10.12", features = ["serde"] } rustc-hash = "2.1.1" diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index c57e2d042..458034c00 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -237,6 +237,7 @@ InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQU MissingDocumentFilter , InvalidRequest , BAD_REQUEST ; MissingDocumentEditionFunction , InvalidRequest , BAD_REQUEST ; InvalidDocumentFilter , InvalidRequest , 
BAD_REQUEST ; +InvalidDocumentSort , InvalidRequest , BAD_REQUEST ; InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ; InvalidVectorDimensions , InvalidRequest , BAD_REQUEST ; InvalidVectorsType , InvalidRequest , BAD_REQUEST ; @@ -415,6 +416,7 @@ InvalidChatCompletionPrompts , InvalidRequest , BAD_REQU InvalidChatCompletionSystemPrompt , InvalidRequest , BAD_REQUEST ; InvalidChatCompletionSearchDescriptionPrompt , InvalidRequest , BAD_REQUEST ; InvalidChatCompletionSearchQueryParamPrompt , InvalidRequest , BAD_REQUEST ; +InvalidChatCompletionSearchFilterParamPrompt , InvalidRequest , BAD_REQUEST ; InvalidChatCompletionSearchIndexUidParamPrompt , InvalidRequest , BAD_REQUEST ; InvalidChatCompletionPreQueryPrompt , InvalidRequest , BAD_REQUEST } @@ -476,7 +478,8 @@ impl ErrorCode for milli::Error { UserError::InvalidDistinctAttribute { .. } => Code::InvalidSearchDistinct, UserError::SortRankingRuleMissing => Code::InvalidSearchSort, UserError::InvalidFacetsDistribution { .. } => Code::InvalidSearchFacets, - UserError::InvalidSortableAttribute { .. } => Code::InvalidSearchSort, + UserError::InvalidSearchSortableAttribute { .. } => Code::InvalidSearchSort, + UserError::InvalidDocumentSortableAttribute { .. } => Code::InvalidDocumentSort, UserError::InvalidSearchableAttribute { .. } => { Code::InvalidSearchAttributesToSearchOn } @@ -492,7 +495,8 @@ impl ErrorCode for milli::Error { UserError::InvalidVectorsMapType { .. } | UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType, UserError::TooManyVectors(_, _) => Code::TooManyVectors, - UserError::SortError(_) => Code::InvalidSearchSort, + UserError::SortError { search: true, .. } => Code::InvalidSearchSort, + UserError::SortError { search: false, .. 
} => Code::InvalidDocumentSort, UserError::InvalidMinTypoWordLenSetting(_, _) => { Code::InvalidSettingsTypoTolerance } diff --git a/crates/meilisearch-types/src/features.rs b/crates/meilisearch-types/src/features.rs index 3c78035e8..ddffb107c 100644 --- a/crates/meilisearch-types/src/features.rs +++ b/crates/meilisearch-types/src/features.rs @@ -4,10 +4,11 @@ use serde::{Deserialize, Serialize}; use crate::error::{Code, ResponseError}; -pub const DEFAULT_CHAT_SYSTEM_PROMPT: &str = "You are a highly capable research assistant with access to powerful search tools. IMPORTANT INSTRUCTIONS:1. When answering questions, you MUST make multiple tool calls (at least 2-3) to gather comprehensive information.2. Use different search queries for each tool call - vary keywords, rephrase questions, and explore different semantic angles to ensure broad coverage.3. Always explicitly announce BEFORE making each tool call by saying: \"I'll search for [specific information] now.\"4. Combine information from ALL tool calls to provide complete, nuanced answers rather than relying on a single source.5. For complex topics, break down your research into multiple targeted queries rather than using a single generic search."; +pub const DEFAULT_CHAT_SYSTEM_PROMPT: &str = "You are a highly capable research assistant with access to powerful search tools. IMPORTANT INSTRUCTIONS:1. When answering questions, you MUST make multiple tool calls (at least 2-3) to gather comprehensive information.2. Use different search queries for each tool call - vary keywords, rephrase questions, and explore different semantic angles to ensure broad coverage.3. Always explicitly announce BEFORE making each tool call by saying: \"I'll search for [specific information] now.\"4. Combine information from ALL tool calls to provide complete, nuanced answers rather than relying on a single source.5. For complex topics, break down your research into multiple targeted queries rather than using a single generic search. 
Meilisearch doesn't use the colon (:) syntax to filter but rather the equal (=) one. Separate filters from query and keep the q parameter empty if needed. Same for the filter parameter: keep it empty if need be. If you need to find documents that CONTAINS keywords simply put the keywords in the q parameter do no use a filter for this purpose. Whenever you get an error, read the error message and fix your error. "; pub const DEFAULT_CHAT_SEARCH_DESCRIPTION_PROMPT: &str = - "Search the database for relevant JSON documents using an optional query."; + "Query: 'best story about Rust before 2018' with year: 2018, 2020, 2021\nlabel: analysis, golang, javascript\ntype: story, link\nvote: 300, 298, 278\n: {\"q\": \"\", \"filter\": \"category = Rust AND type = story AND year < 2018 AND vote > 100\"}\nQuery: 'A black or green car that can go fast with red brakes' with maxspeed_kmh: 200, 150, 130\ncolor: black, grey, red, green\nbrand: Toyota, Renault, Jeep, Ferrari\n: {\"q\": \"red brakes\", \"filter\": \"maxspeed_kmh > 150 AND color IN ['black', green]\"}\nQuery: 'Superman movie released in 2018 or after' with year: 2018, 2020, 2021\ngenres: Drama, Comedy, Adventure, Fiction\n: {\"q\":\"Superman\",\"filter\":\"genres IN [Adventure, Fiction] AND year >= 2018\"}"; pub const DEFAULT_CHAT_SEARCH_Q_PARAM_PROMPT: &str = "The search query string used to find relevant documents in the index. This should contain keywords or phrases that best represent what the user is looking for. More specific queries will yield more precise results."; +pub const DEFAULT_CHAT_SEARCH_FILTER_PARAM_PROMPT: &str = "The search filter string used to find relevant documents in the index. It supports parentheses, `=`, `!=`, `>=`, `>`, `<=`, `<`, `IN`, `NOT IN`, `TO`, `EXISTS`, `NOT EXISTS`, `IS NULL`, `IS NOT NULL`, `IS EMPTY`, `IS NOT EMPTY`, `_geoRadius`, or `_geoBoundingBox`. Here is an example: \"price > 100 AND category = 'electronics'\". 
The following is a list of fields that can be filtered on: "; pub const DEFAULT_CHAT_SEARCH_INDEX_UID_PARAM_PROMPT: &str = "The name of the index to search within. An index is a collection of documents organized for search. Selecting the right index ensures the most relevant results for the user query."; #[derive(Serialize, Deserialize, Debug, Clone, Copy, Default, PartialEq, Eq)] @@ -161,18 +162,31 @@ impl ChatCompletionSource { #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] #[serde(rename_all = "camelCase")] pub struct ChatCompletionPrompts { + #[serde(default)] pub system: String, + #[serde(default)] pub search_description: String, + #[serde(default)] pub search_q_param: String, + #[serde(default = "default_search_filter_param")] + pub search_filter_param: String, + #[serde(default)] pub search_index_uid_param: String, } +/// Serde default for `search_filter_param`, used when the field is +/// not provided, which can happen when the database is in v1.15. +fn default_search_filter_param() -> String { + DEFAULT_CHAT_SEARCH_FILTER_PARAM_PROMPT.to_string() +} + impl Default for ChatCompletionPrompts { fn default() -> Self { Self { system: DEFAULT_CHAT_SYSTEM_PROMPT.to_string(), search_description: DEFAULT_CHAT_SEARCH_DESCRIPTION_PROMPT.to_string(), search_q_param: DEFAULT_CHAT_SEARCH_Q_PARAM_PROMPT.to_string(), + search_filter_param: DEFAULT_CHAT_SEARCH_FILTER_PARAM_PROMPT.to_string(), search_index_uid_param: DEFAULT_CHAT_SEARCH_INDEX_UID_PARAM_PROMPT.to_string(), } } diff --git a/crates/meilisearch-types/src/keys.rs b/crates/meilisearch-types/src/keys.rs index 3ba31c2cb..aec3199a3 100644 --- a/crates/meilisearch-types/src/keys.rs +++ b/crates/meilisearch-types/src/keys.rs @@ -144,6 +144,21 @@ impl Key { } } + pub fn default_read_only_admin() -> Self { + let now = OffsetDateTime::now_utc(); + let uid = Uuid::new_v4(); + Self { + name: Some("Default Read-Only Admin API Key".to_string()), + description: Some("Use it to read information across the whole
database. Caution! Do not expose this key on a public frontend".to_string()), + uid, + actions: vec![Action::AllGet, Action::KeysGet], + indexes: vec![IndexUidPattern::all()], + expires_at: None, + created_at: now, + updated_at: now, + } + } + pub fn default_search() -> Self { let now = OffsetDateTime::now_utc(); let uid = Uuid::new_v4(); @@ -347,6 +362,9 @@ pub enum Action { #[serde(rename = "chatsSettings.update")] #[deserr(rename = "chatsSettings.update")] ChatsSettingsUpdate, + #[serde(rename = "*.get")] + #[deserr(rename = "*.get")] + AllGet, } impl Action { @@ -385,6 +403,7 @@ impl Action { METRICS_GET => Some(Self::MetricsGet), DUMPS_ALL => Some(Self::DumpsAll), DUMPS_CREATE => Some(Self::DumpsCreate), + SNAPSHOTS_ALL => Some(Self::SnapshotsAll), SNAPSHOTS_CREATE => Some(Self::SnapshotsCreate), VERSION => Some(Self::Version), KEYS_CREATE => Some(Self::KeysAdd), @@ -393,12 +412,60 @@ impl Action { KEYS_DELETE => Some(Self::KeysDelete), EXPERIMENTAL_FEATURES_GET => Some(Self::ExperimentalFeaturesGet), EXPERIMENTAL_FEATURES_UPDATE => Some(Self::ExperimentalFeaturesUpdate), + EXPORT => Some(Self::Export), NETWORK_GET => Some(Self::NetworkGet), NETWORK_UPDATE => Some(Self::NetworkUpdate), + ALL_GET => Some(Self::AllGet), _otherwise => None, } } + /// Whether the action should be included in [Action::AllGet]. + pub fn is_read(&self) -> bool { + use Action::*; + + // It's using an exhaustive match to force the addition of new actions. + match self { + // Any action that expands to others must return false, as it wouldn't be able to expand recursively.
+ All | AllGet | DocumentsAll | IndexesAll | ChatsAll | TasksAll | SettingsAll + | StatsAll | MetricsAll | DumpsAll | SnapshotsAll | ChatsSettingsAll => false, + + Search => true, + DocumentsAdd => false, + DocumentsGet => true, + DocumentsDelete => false, + Export => true, + IndexesAdd => false, + IndexesGet => true, + IndexesUpdate => false, + IndexesDelete => false, + IndexesSwap => false, + TasksCancel => false, + TasksDelete => false, + TasksGet => true, + SettingsGet => true, + SettingsUpdate => false, + StatsGet => true, + MetricsGet => true, + DumpsCreate => false, + SnapshotsCreate => false, + Version => true, + KeysAdd => false, + KeysGet => false, // Disabled in order to prevent privilege escalation + KeysUpdate => false, + KeysDelete => false, + ExperimentalFeaturesGet => true, + ExperimentalFeaturesUpdate => false, + NetworkGet => true, + NetworkUpdate => false, + ChatCompletions => false, // Disabled because it might trigger generation of new chats + ChatsGet => true, + ChatsDelete => false, + ChatsSettingsGet => true, + ChatsSettingsUpdate => false, + } + } + pub const fn repr(&self) -> u8 { *self as u8 } @@ -408,6 +475,7 @@ pub mod actions { use super::Action::*; pub(crate) const ALL: u8 = All.repr(); + pub const ALL_GET: u8 = AllGet.repr(); pub const SEARCH: u8 = Search.repr(); pub const DOCUMENTS_ALL: u8 = DocumentsAll.repr(); pub const DOCUMENTS_ADD: u8 = DocumentsAdd.repr(); @@ -432,6 +500,7 @@ pub mod actions { pub const METRICS_GET: u8 = MetricsGet.repr(); pub const DUMPS_ALL: u8 = DumpsAll.repr(); pub const DUMPS_CREATE: u8 = DumpsCreate.repr(); + pub const SNAPSHOTS_ALL: u8 = SnapshotsAll.repr(); pub const SNAPSHOTS_CREATE: u8 = SnapshotsCreate.repr(); pub const VERSION: u8 = Version.repr(); pub const KEYS_CREATE: u8 = KeysAdd.repr(); @@ -454,3 +523,68 @@ pub mod actions { pub const CHATS_SETTINGS_GET: u8 = ChatsSettingsGet.repr(); pub const CHATS_SETTINGS_UPDATE: u8 = ChatsSettingsUpdate.repr(); } + +#[cfg(test)] +pub(crate) mod test { + 
use super::actions::*; + use super::Action::*; + use super::*; + + #[test] + fn test_action_repr_and_constants() { + assert!(All.repr() == 0 && ALL == 0); + assert!(Search.repr() == 1 && SEARCH == 1); + assert!(DocumentsAll.repr() == 2 && DOCUMENTS_ALL == 2); + assert!(DocumentsAdd.repr() == 3 && DOCUMENTS_ADD == 3); + assert!(DocumentsGet.repr() == 4 && DOCUMENTS_GET == 4); + assert!(DocumentsDelete.repr() == 5 && DOCUMENTS_DELETE == 5); + assert!(IndexesAll.repr() == 6 && INDEXES_ALL == 6); + assert!(IndexesAdd.repr() == 7 && INDEXES_CREATE == 7); + assert!(IndexesGet.repr() == 8 && INDEXES_GET == 8); + assert!(IndexesUpdate.repr() == 9 && INDEXES_UPDATE == 9); + assert!(IndexesDelete.repr() == 10 && INDEXES_DELETE == 10); + assert!(IndexesSwap.repr() == 11 && INDEXES_SWAP == 11); + assert!(TasksAll.repr() == 12 && TASKS_ALL == 12); + assert!(TasksCancel.repr() == 13 && TASKS_CANCEL == 13); + assert!(TasksDelete.repr() == 14 && TASKS_DELETE == 14); + assert!(TasksGet.repr() == 15 && TASKS_GET == 15); + assert!(SettingsAll.repr() == 16 && SETTINGS_ALL == 16); + assert!(SettingsGet.repr() == 17 && SETTINGS_GET == 17); + assert!(SettingsUpdate.repr() == 18 && SETTINGS_UPDATE == 18); + assert!(StatsAll.repr() == 19 && STATS_ALL == 19); + assert!(StatsGet.repr() == 20 && STATS_GET == 20); + assert!(MetricsAll.repr() == 21 && METRICS_ALL == 21); + assert!(MetricsGet.repr() == 22 && METRICS_GET == 22); + assert!(DumpsAll.repr() == 23 && DUMPS_ALL == 23); + assert!(DumpsCreate.repr() == 24 && DUMPS_CREATE == 24); + assert!(SnapshotsAll.repr() == 25 && SNAPSHOTS_ALL == 25); + assert!(SnapshotsCreate.repr() == 26 && SNAPSHOTS_CREATE == 26); + assert!(Version.repr() == 27 && VERSION == 27); + assert!(KeysAdd.repr() == 28 && KEYS_CREATE == 28); + assert!(KeysGet.repr() == 29 && KEYS_GET == 29); + assert!(KeysUpdate.repr() == 30 && KEYS_UPDATE == 30); + assert!(KeysDelete.repr() == 31 && KEYS_DELETE == 31); + assert!(ExperimentalFeaturesGet.repr() == 32 && 
EXPERIMENTAL_FEATURES_GET == 32); + assert!(ExperimentalFeaturesUpdate.repr() == 33 && EXPERIMENTAL_FEATURES_UPDATE == 33); + assert!(Export.repr() == 34 && EXPORT == 34); + assert!(NetworkGet.repr() == 35 && NETWORK_GET == 35); + assert!(NetworkUpdate.repr() == 36 && NETWORK_UPDATE == 36); + assert!(ChatCompletions.repr() == 37 && CHAT_COMPLETIONS == 37); + assert!(ChatsAll.repr() == 38 && CHATS_ALL == 38); + assert!(ChatsGet.repr() == 39 && CHATS_GET == 39); + assert!(ChatsDelete.repr() == 40 && CHATS_DELETE == 40); + assert!(ChatsSettingsAll.repr() == 41 && CHATS_SETTINGS_ALL == 41); + assert!(ChatsSettingsGet.repr() == 42 && CHATS_SETTINGS_GET == 42); + assert!(ChatsSettingsUpdate.repr() == 43 && CHATS_SETTINGS_UPDATE == 43); + assert!(AllGet.repr() == 44 && ALL_GET == 44); + } + + #[test] + fn test_from_repr() { + for action in enum_iterator::all::() { + let repr = action.repr(); + let action_from_repr = Action::from_repr(repr); + assert_eq!(Some(action), action_from_repr, "Failed for action: {:?}", action); + } + } +} diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index 83eb439d9..5cbbb6666 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -50,6 +50,7 @@ jsonwebtoken = "9.3.1" lazy_static = "1.5.0" meilisearch-auth = { path = "../meilisearch-auth" } meilisearch-types = { path = "../meilisearch-types" } +memmap2 = "0.9.7" mimalloc = { version = "0.1.47", default-features = false } mime = "0.3.17" num_cpus = "1.17.0" @@ -169,5 +170,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.20/build.zip" -sha1 = "82a7ddd7bf14bb5323c3d235d2b62892a98b6a59" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.22/build.zip" +sha1 = "b70b2036b5f167da9ea0b637da8b320c7ea88254" diff --git a/crates/meilisearch/src/analytics/mock_analytics.rs 
b/crates/meilisearch/src/analytics/mock_analytics.rs index 54b8d4f1b..062240018 100644 --- a/crates/meilisearch/src/analytics/mock_analytics.rs +++ b/crates/meilisearch/src/analytics/mock_analytics.rs @@ -104,6 +104,4 @@ impl Analytics for MockAnalytics { _request: &HttpRequest, ) { } - fn get_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} - fn post_fetch_documents(&self, _documents_query: &DocumentFetchKind, _request: &HttpRequest) {} } diff --git a/crates/meilisearch/src/analytics/mod.rs b/crates/meilisearch/src/analytics/mod.rs index bd14b0bfa..0d1a860e1 100644 --- a/crates/meilisearch/src/analytics/mod.rs +++ b/crates/meilisearch/src/analytics/mod.rs @@ -73,12 +73,6 @@ pub enum DocumentDeletionKind { PerFilter, } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum DocumentFetchKind { - PerDocumentId { retrieve_vectors: bool }, - Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, -} - /// To send an event to segment, your event must be able to aggregate itself with another event of the same type. pub trait Aggregate: 'static + mopa::Any + Send { /// The name of the event that will be sent to segment. 
diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index 0abc5c817..a2a0f0c05 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -203,6 +203,7 @@ struct Infos { experimental_composite_embedders: bool, experimental_embedding_cache_entries: usize, experimental_no_snapshot_compaction: bool, + experimental_no_edition_2024_for_dumps: bool, experimental_no_edition_2024_for_settings: bool, gpu_enabled: bool, db_path: bool, @@ -293,6 +294,7 @@ impl Infos { max_indexing_threads, skip_index_budget: _, experimental_no_edition_2024_for_settings, + experimental_no_edition_2024_for_dumps, } = indexer_options; let RuntimeTogglableFeatures { @@ -329,6 +331,7 @@ impl Infos { experimental_composite_embedders: composite_embedders, experimental_embedding_cache_entries, experimental_no_snapshot_compaction, + experimental_no_edition_2024_for_dumps, gpu_enabled: meilisearch_types::milli::vector::is_cuda_enabled(), db_path: db_path != PathBuf::from("./data.ms"), import_dump: import_dump.is_some(), diff --git a/crates/meilisearch/src/error.rs b/crates/meilisearch/src/error.rs index 91c6c23fa..8d4430f07 100644 --- a/crates/meilisearch/src/error.rs +++ b/crates/meilisearch/src/error.rs @@ -49,7 +49,7 @@ pub enum MeilisearchHttpError { TooManySearchRequests(usize), #[error("Internal error: Search limiter is down.")] SearchLimiterIsDown, - #[error("The provided payload reached the size limit. The maximum accepted payload size is {}.", Byte::from_u64(*.0 as u64).get_appropriate_unit(UnitType::Binary))] + #[error("The provided payload reached the size limit. The maximum accepted payload size is {}.", Byte::from_u64(*.0 as u64).get_appropriate_unit(if *.0 % 1024 == 0 { UnitType::Binary } else { UnitType::Decimal }))] PayloadTooLarge(usize), #[error("Two indexes must be given for each swap. 
The list `[{}]` contains {} indexes.", .0.iter().map(|uid| format!("\"{uid}\"")).collect::>().join(", "), .0.len() diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 43d7afe0e..0fb93b65a 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -30,6 +30,7 @@ use actix_web::web::Data; use actix_web::{web, HttpRequest}; use analytics::Analytics; use anyhow::bail; +use bumpalo::Bump; use error::PayloadError; use extractors::payload::PayloadConfig; use index_scheduler::versioning::Versioning; @@ -38,6 +39,7 @@ use meilisearch_auth::{open_auth_store_env, AuthController}; use meilisearch_types::milli::constants::VERSION_MAJOR; use meilisearch_types::milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use meilisearch_types::milli::progress::{EmbedderStats, Progress}; +use meilisearch_types::milli::update::new::indexer; use meilisearch_types::milli::update::{ default_thread_pool_and_threads, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, }; @@ -533,7 +535,7 @@ fn import_dump( let mut index_reader = index_reader?; let metadata = index_reader.metadata(); let uid = metadata.uid.clone(); - tracing::info!("Importing index `{}`.", metadata.uid); + tracing::info!("Importing index `{uid}`."); let date = Some((metadata.created_at, metadata.updated_at)); let index = index_scheduler.create_raw_index(&metadata.uid, date)?; @@ -552,48 +554,100 @@ fn import_dump( apply_settings_to_builder(&settings, &mut builder); let embedder_stats: Arc = Default::default(); builder.execute(&|| false, &progress, embedder_stats.clone())?; + wtxn.commit()?; - // 5.3 Import the documents. - // 5.3.1 We need to recreate the grenad+obkv format accepted by the index. - tracing::info!("Importing the documents."); - let file = tempfile::tempfile()?; - let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file)); - for document in index_reader.documents()? 
{ - builder.append_json_object(&document?)?; + let mut wtxn = index.write_txn()?; + let rtxn = index.read_txn()?; + + if index_scheduler.no_edition_2024_for_dumps() { + // 5.3 Import the documents. + // 5.3.1 We need to recreate the grenad+obkv format accepted by the index. + tracing::info!("Importing the documents."); + let file = tempfile::tempfile()?; + let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file)); + for document in index_reader.documents()? { + builder.append_json_object(&document?)?; + } + + // This flush the content of the batch builder. + let file = builder.into_inner()?.into_inner()?; + + // 5.3.2 We feed it to the milli index. + let reader = BufReader::new(file); + let reader = DocumentsBatchReader::from_reader(reader)?; + + let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?; + let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?; + + let builder = milli::update::IndexDocuments::new( + &mut wtxn, + &index, + indexer_config, + IndexDocumentsConfig { + update_method: IndexDocumentsMethod::ReplaceDocuments, + ..Default::default() + }, + |indexing_step| tracing::trace!("update: {:?}", indexing_step), + || false, + &embedder_stats, + )?; + + let builder = builder.with_embedders(embedders); + + let (builder, user_result) = builder.add_documents(reader)?; + let user_result = user_result?; + tracing::info!(documents_found = user_result, "{} documents found.", user_result); + builder.execute()?; + } else { + let db_fields_ids_map = index.fields_ids_map(&rtxn)?; + let primary_key = index.primary_key(&rtxn)?; + let mut new_fields_ids_map = db_fields_ids_map.clone(); + + let mut indexer = indexer::DocumentOperation::new(); + let embedders = index.embedding_configs().embedding_configs(&rtxn)?; + let embedders = index_scheduler.embedders(uid.clone(), embedders)?; + + let mmap = unsafe { memmap2::Mmap::map(index_reader.documents_file())? 
}; + + indexer.replace_documents(&mmap)?; + + let indexer_config = index_scheduler.indexer_config(); + let pool = &indexer_config.thread_pool; + + let indexer_alloc = Bump::new(); + let (document_changes, mut operation_stats, primary_key) = indexer.into_changes( + &indexer_alloc, + &index, + &rtxn, + primary_key, + &mut new_fields_ids_map, + &|| false, // never stop processing a dump + progress.clone(), + )?; + + let operation_stats = operation_stats.pop().unwrap(); + if let Some(error) = operation_stats.error { + return Err(error.into()); + } + + let _congestion = indexer::index( + &mut wtxn, + &index, + pool, + indexer_config.grenad_parameters(), + &db_fields_ids_map, + new_fields_ids_map, + primary_key, + &document_changes, + embedders, + &|| false, // never stop processing a dump + &progress, + &embedder_stats, + )?; } - // This flush the content of the batch builder. - let file = builder.into_inner()?.into_inner()?; - - // 5.3.2 We feed it to the milli index. - let reader = BufReader::new(file); - let reader = DocumentsBatchReader::from_reader(reader)?; - - let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?; - let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?; - - let builder = milli::update::IndexDocuments::new( - &mut wtxn, - &index, - indexer_config, - IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }, - |indexing_step| tracing::trace!("update: {:?}", indexing_step), - || false, - &embedder_stats, - )?; - - let builder = builder.with_embedders(embedders); - - let (builder, user_result) = builder.add_documents(reader)?; - let user_result = user_result?; - tracing::info!(documents_found = user_result, "{} documents found.", user_result); - builder.execute()?; wtxn.commit()?; tracing::info!("All documents successfully imported."); - index_scheduler.refresh_index_stats(&uid)?; } diff --git a/crates/meilisearch/src/metrics.rs 
b/crates/meilisearch/src/metrics.rs index d52e04cc6..607bc91eb 100644 --- a/crates/meilisearch/src/metrics.rs +++ b/crates/meilisearch/src/metrics.rs @@ -15,30 +15,33 @@ lazy_static! { "Meilisearch number of degraded search requests" )) .expect("Can't create a metric"); - pub static ref MEILISEARCH_CHAT_SEARCH_REQUESTS: IntCounterVec = register_int_counter_vec!( + pub static ref MEILISEARCH_CHAT_SEARCHES_TOTAL: IntCounterVec = register_int_counter_vec!( opts!( - "meilisearch_chat_search_requests", - "Meilisearch number of search requests performed by the chat route itself" + "meilisearch_chat_searches_total", + "Total number of searches performed by the chat route" ), &["type"] ) .expect("Can't create a metric"); - pub static ref MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!( - opts!("meilisearch_chat_prompt_tokens_usage", "Meilisearch Chat Prompt Tokens Usage"), + pub static ref MEILISEARCH_CHAT_PROMPT_TOKENS_TOTAL: IntCounterVec = register_int_counter_vec!( + opts!("meilisearch_chat_prompt_tokens_total", "Total number of prompt tokens consumed"), &["workspace", "model"] ) .expect("Can't create a metric"); - pub static ref MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE: IntCounterVec = + pub static ref MEILISEARCH_CHAT_COMPLETION_TOKENS_TOTAL: IntCounterVec = register_int_counter_vec!( opts!( - "meilisearch_chat_completion_tokens_usage", - "Meilisearch Chat Completion Tokens Usage" + "meilisearch_chat_completion_tokens_total", + "Total number of completion tokens consumed" ), &["workspace", "model"] ) .expect("Can't create a metric"); - pub static ref MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!( - opts!("meilisearch_chat_total_tokens_usage", "Meilisearch Chat Total Tokens Usage"), + pub static ref MEILISEARCH_CHAT_TOKENS_TOTAL: IntCounterVec = register_int_counter_vec!( + opts!( + "meilisearch_chat_tokens_total", + "Total number of tokens consumed (prompt + completion)" + ), &["workspace", "model"] ) 
.expect("Can't create a metric"); diff --git a/crates/meilisearch/src/option.rs b/crates/meilisearch/src/option.rs index 9658352c8..dd77a1222 100644 --- a/crates/meilisearch/src/option.rs +++ b/crates/meilisearch/src/option.rs @@ -68,6 +68,8 @@ const MEILI_EXPERIMENTAL_LIMIT_BATCHED_TASKS_TOTAL_SIZE: &str = const MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES: &str = "MEILI_EXPERIMENTAL_EMBEDDING_CACHE_ENTRIES"; const MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION: &str = "MEILI_EXPERIMENTAL_NO_SNAPSHOT_COMPACTION"; +const MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS: &str = + "MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS"; const DEFAULT_CONFIG_FILE_PATH: &str = "./config.toml"; const DEFAULT_DB_PATH: &str = "./data.ms"; const DEFAULT_HTTP_ADDR: &str = "localhost:7700"; @@ -759,6 +761,15 @@ pub struct IndexerOpts { #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_SETTINGS)] #[serde(default)] pub experimental_no_edition_2024_for_settings: bool, + + /// Experimental make dump imports use the old document indexer. + /// + /// When enabled, Meilisearch will use the old document indexer when importing dumps. + /// + /// For more information, see . 
+ #[clap(long, env = MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS)] + #[serde(default)] + pub experimental_no_edition_2024_for_dumps: bool, } impl IndexerOpts { @@ -769,6 +780,7 @@ impl IndexerOpts { max_indexing_threads, skip_index_budget: _, experimental_no_edition_2024_for_settings, + experimental_no_edition_2024_for_dumps, } = self; if let Some(max_indexing_memory) = max_indexing_memory.0 { export_to_env_if_not_present( @@ -788,6 +800,12 @@ impl IndexerOpts { experimental_no_edition_2024_for_settings.to_string(), ); } + if experimental_no_edition_2024_for_dumps { + export_to_env_if_not_present( + MEILI_EXPERIMENTAL_NO_EDITION_2024_FOR_DUMPS, + experimental_no_edition_2024_for_dumps.to_string(), + ); + } } } @@ -808,6 +826,7 @@ impl TryFrom<&IndexerOpts> for IndexerConfig { skip_index_budget: other.skip_index_budget, experimental_no_edition_2024_for_settings: other .experimental_no_edition_2024_for_settings, + experimental_no_edition_2024_for_dumps: other.experimental_no_edition_2024_for_dumps, chunk_compression_type: Default::default(), chunk_compression_level: Default::default(), documents_chunk_size: Default::default(), diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs index 4f7087ae8..f2c17a696 100644 --- a/crates/meilisearch/src/routes/chats/chat_completions.rs +++ b/crates/meilisearch/src/routes/chats/chat_completions.rs @@ -27,9 +27,10 @@ use meilisearch_types::features::{ ChatCompletionPrompts as DbChatCompletionPrompts, ChatCompletionSource as DbChatCompletionSource, SystemRole, }; +use meilisearch_types::heed::RoTxn; use meilisearch_types::keys::actions; use meilisearch_types::milli::index::ChatConfig; -use meilisearch_types::milli::{all_obkv_to_json, obkv_to_json, TimeBudget}; +use meilisearch_types::milli::{all_obkv_to_json, obkv_to_json, OrderBy, PatternMatch, TimeBudget}; use meilisearch_types::{Document, Index}; use serde::Deserialize; use serde_json::json; @@ -49,8 +50,8 
@@ use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _}; use crate::metrics::{ - MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE, MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE, - MEILISEARCH_CHAT_SEARCH_REQUESTS, MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE, + MEILISEARCH_CHAT_COMPLETION_TOKENS_TOTAL, MEILISEARCH_CHAT_PROMPT_TOKENS_TOTAL, + MEILISEARCH_CHAT_SEARCHES_TOTAL, MEILISEARCH_CHAT_TOKENS_TOTAL, MEILISEARCH_DEGRADED_SEARCH_REQUESTS, }; use crate::routes::chats::utils::SseEventSender; @@ -169,6 +170,7 @@ fn setup_search_tool( let mut index_uids = Vec::new(); let mut function_description = prompts.search_description.clone(); + let mut filter_description = prompts.search_filter_param.clone(); index_scheduler.try_for_each_index::<_, ()>(|name, index| { // Make sure to skip unauthorized indexes if !filters.is_index_authorized(name) { @@ -180,16 +182,22 @@ fn setup_search_tool( let index_description = chat_config.description; let _ = writeln!(&mut function_description, "\n\n - {name}: {index_description}\n"); index_uids.push(name.to_string()); + let facet_distributions = format_facet_distributions(index, &rtxn, 10).unwrap(); // TODO do not unwrap + let _ = writeln!(&mut filter_description, "\n## Facet distributions of the {name} index"); + let _ = writeln!(&mut filter_description, "{facet_distributions}"); Ok(()) })?; + tracing::debug!("LLM function description: {function_description}"); + tracing::debug!("LLM filter description: {filter_description}"); + let tool = ChatCompletionToolArgs::default() .r#type(ChatCompletionToolType::Function) .function( FunctionObjectArgs::default() .name(MEILI_SEARCH_IN_INDEX_FUNCTION_NAME) - .description(&function_description) + .description(function_description) .parameters(json!({ "type": "object", "properties": { @@ -203,9 +211,13 @@ fn setup_search_tool( // "type": ["string", "null"], "type": "string", 
"description": prompts.search_q_param, + }, + "filter": { + "type": "string", + "description": filter_description, } }, - "required": ["index_uid", "q"], + "required": ["index_uid", "q", "filter"], "additionalProperties": false, })) .strict(true) @@ -247,11 +259,19 @@ async fn process_search_request( auth_token: &str, index_uid: String, q: Option, + filter: Option, ) -> Result<(Index, Vec, String), ResponseError> { let index = index_scheduler.index(&index_uid)?; let rtxn = index.static_read_txn()?; let ChatConfig { description: _, prompt: _, search_parameters } = index.chat_config(&rtxn)?; - let mut query = SearchQuery { q, ..SearchQuery::from(search_parameters) }; + let mut query = SearchQuery { + q, + filter: filter.map(serde_json::Value::from), + ..SearchQuery::from(search_parameters) + }; + + tracing::debug!("LLM query: {:?}", query); + let auth_filter = ActionPolicy::<{ actions::SEARCH }>::authenticate( auth_ctrl, auth_token, @@ -280,17 +300,26 @@ async fn process_search_request( let (search, _is_finite_pagination, _max_total_hits, _offset) = prepare_search(&index_cloned, &rtxn, &query, &search_kind, time_budget, features)?; - search_from_kind(index_uid, search_kind, search) - .map(|(search_results, _)| (rtxn, search_results)) - .map_err(ResponseError::from) + match search_from_kind(index_uid, search_kind, search) { + Ok((search_results, _)) => Ok((rtxn, Ok(search_results))), + Err(MeilisearchHttpError::Milli { + error: meilisearch_types::milli::Error::UserError(user_error), + index_name: _, + }) => Ok((rtxn, Err(user_error))), + Err(err) => Err(ResponseError::from(err)), + } }) .await; permit.drop().await; - let output = output?; + let output = match output? 
{ + Ok((rtxn, Ok(search_results))) => Ok((rtxn, search_results)), + Ok((_rtxn, Err(error))) => return Ok((index, Vec::new(), error.to_string())), + Err(err) => Err(err), + }; let mut documents = Vec::new(); if let Ok((ref rtxn, ref search_result)) = output { - MEILISEARCH_CHAT_SEARCH_REQUESTS.with_label_values(&["internal"]).inc(); + MEILISEARCH_CHAT_SEARCHES_TOTAL.with_label_values(&["internal"]).inc(); if search_result.degraded { MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc(); } @@ -395,16 +424,19 @@ async fn non_streamed_chat( for call in meili_calls { let result = match serde_json::from_str(&call.function.arguments) { - Ok(SearchInIndexParameters { index_uid, q }) => process_search_request( - &index_scheduler, - auth_ctrl.clone(), - &search_queue, - auth_token, - index_uid, - q, - ) - .await - .map_err(|e| e.to_string()), + Ok(SearchInIndexParameters { index_uid, q, filter }) => { + process_search_request( + &index_scheduler, + auth_ctrl.clone(), + &search_queue, + auth_token, + index_uid, + q, + filter, + ) + .await + .map_err(|e| e.to_string()) + } Err(err) => Err(err.to_string()), }; @@ -564,13 +596,13 @@ async fn run_conversation( match result { Ok(resp) => { if let Some(usage) = resp.usage.as_ref() { - MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE + MEILISEARCH_CHAT_PROMPT_TOKENS_TOTAL .with_label_values(&[workspace_uid, &chat_completion.model]) .inc_by(usage.prompt_tokens as u64); - MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE + MEILISEARCH_CHAT_COMPLETION_TOKENS_TOTAL .with_label_values(&[workspace_uid, &chat_completion.model]) .inc_by(usage.completion_tokens as u64); - MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE + MEILISEARCH_CHAT_TOKENS_TOTAL .with_label_values(&[workspace_uid, &chat_completion.model]) .inc_by(usage.total_tokens as u64); } @@ -719,13 +751,14 @@ async fn handle_meili_tools( let mut error = None; let result = match serde_json::from_str(&call.function.arguments) { - Ok(SearchInIndexParameters { index_uid, q }) => match process_search_request( + 
Ok(SearchInIndexParameters { index_uid, q, filter }) => match process_search_request( index_scheduler, auth_ctrl.clone(), search_queue, auth_token, index_uid, q, + filter, ) .await { @@ -801,4 +834,42 @@ struct SearchInIndexParameters { index_uid: String, /// The query parameter to use. q: Option, + /// The filter parameter to use. + filter: Option, +} + +fn format_facet_distributions( + index: &Index, + rtxn: &RoTxn, + max_values_per_facet: usize, +) -> meilisearch_types::milli::Result { + let universe = index.documents_ids(rtxn)?; + let rules = index.filterable_attributes_rules(rtxn)?; + let fields_ids_map = index.fields_ids_map(rtxn)?; + let filterable_attributes = fields_ids_map + .names() + .filter(|name| rules.iter().any(|rule| matches!(rule.match_str(name), PatternMatch::Match))) + .map(|name| (name, OrderBy::Count)); + let facets_distribution = index + .facets_distribution(rtxn) + .max_values_per_facet(max_values_per_facet) + .candidates(universe) + .facets(filterable_attributes) + .execute()?; + + let mut output = String::new(); + for (facet_name, entries) in facets_distribution { + let _ = write!(&mut output, "{}: ", facet_name); + let total_entries = entries.len(); + for (i, (value, _count)) in entries.into_iter().enumerate() { + let _ = if total_entries.saturating_sub(1) == i { + write!(&mut output, "{value}.") + } else { + write!(&mut output, "{value}, ") + }; + } + let _ = writeln!(&mut output); + } + + Ok(output) } diff --git a/crates/meilisearch/src/routes/chats/settings.rs b/crates/meilisearch/src/routes/chats/settings.rs index 38eb0d3c5..44c099c14 100644 --- a/crates/meilisearch/src/routes/chats/settings.rs +++ b/crates/meilisearch/src/routes/chats/settings.rs @@ -8,8 +8,8 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::features::{ ChatCompletionPrompts as DbChatCompletionPrompts, ChatCompletionSettings, ChatCompletionSource as DbChatCompletionSource, DEFAULT_CHAT_SEARCH_DESCRIPTION_PROMPT, - 
DEFAULT_CHAT_SEARCH_INDEX_UID_PARAM_PROMPT, DEFAULT_CHAT_SEARCH_Q_PARAM_PROMPT, - DEFAULT_CHAT_SYSTEM_PROMPT, + DEFAULT_CHAT_SEARCH_FILTER_PARAM_PROMPT, DEFAULT_CHAT_SEARCH_INDEX_UID_PARAM_PROMPT, + DEFAULT_CHAT_SEARCH_Q_PARAM_PROMPT, DEFAULT_CHAT_SYSTEM_PROMPT, }; use meilisearch_types::keys::actions; use meilisearch_types::milli::update::Setting; @@ -84,6 +84,11 @@ async fn patch_settings( Setting::Reset => DEFAULT_CHAT_SEARCH_Q_PARAM_PROMPT.to_string(), Setting::NotSet => old_settings.prompts.search_q_param, }, + search_filter_param: match new_prompts.search_filter_param { + Setting::Set(new_description) => new_description, + Setting::Reset => DEFAULT_CHAT_SEARCH_FILTER_PARAM_PROMPT.to_string(), + Setting::NotSet => old_settings.prompts.search_filter_param, + }, search_index_uid_param: match new_prompts.search_index_uid_param { Setting::Set(new_description) => new_description, Setting::Reset => DEFAULT_CHAT_SEARCH_INDEX_UID_PARAM_PROMPT.to_string(), @@ -252,6 +257,10 @@ pub struct ChatPrompts { #[schema(value_type = Option, example = json!("This is query parameter..."))] pub search_q_param: Setting, #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + #[schema(value_type = Option, example = json!("This is filter parameter..."))] + pub search_filter_param: Setting, + #[serde(default)] #[deserr(default, error = DeserrJsonError)] #[schema(value_type = Option, example = json!("This is index you want to search in..."))] pub search_index_uid_param: Setting, diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index a93d736f7..138f5140f 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1,6 +1,7 @@ use std::collections::HashSet; use std::io::{ErrorKind, Seek as _}; use std::marker::PhantomData; +use std::str::FromStr; use actix_web::http::header::CONTENT_TYPE; use actix_web::web::Data; @@ -17,9 +18,11 @@ use 
meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; +use meilisearch_types::milli::documents::sort::recursive_sort; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; -use meilisearch_types::milli::DocumentId; +use meilisearch_types::milli::{AscDesc, DocumentId}; use meilisearch_types::serde_cs::vec::CS; use meilisearch_types::star_or::OptionStarOrList; use meilisearch_types::tasks::KindWithContent; @@ -42,6 +45,7 @@ use crate::extractors::authentication::policies::*; use crate::extractors::authentication::GuardedData; use crate::extractors::payload::Payload; use crate::extractors::sequential_extractor::SeqHandler; +use crate::routes::indexes::search::fix_sort_query_parameters; use crate::routes::{ get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; @@ -135,6 +139,8 @@ pub struct DocumentsFetchAggregator { per_document_id: bool, // if a filter was used per_filter: bool, + // if documents were sorted + sort: bool, #[serde(rename = "vector.retrieve_vectors")] retrieve_vectors: bool, @@ -151,39 +157,6 @@ pub struct DocumentsFetchAggregator { marker: std::marker::PhantomData, } -#[derive(Copy, Clone, Debug, PartialEq, Eq)] -pub enum DocumentFetchKind { - PerDocumentId { retrieve_vectors: bool }, - Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool, ids: usize }, -} - -impl DocumentsFetchAggregator { - pub fn from_query(query: &DocumentFetchKind) -> Self { - let (limit, offset, retrieve_vectors) = match query { - DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), - DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. 
} => { - (*limit, *offset, *retrieve_vectors) - } - }; - - let ids = match query { - DocumentFetchKind::Normal { ids, .. } => *ids, - DocumentFetchKind::PerDocumentId { .. } => 0, - }; - - Self { - per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), - per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), - max_limit: limit, - max_offset: offset, - retrieve_vectors, - max_document_ids: ids, - - marker: PhantomData, - } - } -} - impl Aggregate for DocumentsFetchAggregator { fn event_name(&self) -> &'static str { Method::event_name() @@ -193,6 +166,7 @@ impl Aggregate for DocumentsFetchAggregator { Box::new(Self { per_document_id: self.per_document_id | new.per_document_id, per_filter: self.per_filter | new.per_filter, + sort: self.sort | new.sort, retrieve_vectors: self.retrieve_vectors | new.retrieve_vectors, max_limit: self.max_limit.max(new.max_limit), max_offset: self.max_offset.max(new.max_offset), @@ -276,6 +250,7 @@ pub async fn get_document( retrieve_vectors: param_retrieve_vectors.0, per_document_id: true, per_filter: false, + sort: false, max_limit: 0, max_offset: 0, max_document_ids: 0, @@ -406,6 +381,8 @@ pub struct BrowseQueryGet { #[param(default, value_type = Option, example = "popularity > 1000")] #[deserr(default, error = DeserrQueryParamError)] filter: Option, + #[deserr(default, error = DeserrQueryParamError)] + sort: Option, } #[derive(Debug, Deserr, ToSchema)] @@ -430,6 +407,9 @@ pub struct BrowseQuery { #[schema(default, value_type = Option, example = "popularity > 1000")] #[deserr(default, error = DeserrJsonError)] filter: Option, + #[schema(default, value_type = Option>, example = json!(["title:asc", "rating:desc"]))] + #[deserr(default, error = DeserrJsonError)] + sort: Option>, } /// Get documents with POST @@ -495,6 +475,7 @@ pub async fn documents_by_query_post( analytics.publish( DocumentsFetchAggregator:: { per_filter: body.filter.is_some(), + sort: body.sort.is_some(), 
retrieve_vectors: body.retrieve_vectors, max_limit: body.limit, max_offset: body.offset, @@ -571,7 +552,7 @@ pub async fn get_documents( ) -> Result { debug!(parameters = ?params, "Get documents GET"); - let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter, ids } = + let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter, ids, sort } = params.into_inner(); let filter = match filter { @@ -582,20 +563,20 @@ pub async fn get_documents( None => None, }; - let ids = ids.map(|ids| ids.into_iter().map(Into::into).collect()); - let query = BrowseQuery { offset: offset.0, limit: limit.0, fields: fields.merge_star_and_none(), retrieve_vectors: retrieve_vectors.0, filter, - ids, + ids: ids.map(|ids| ids.into_iter().map(Into::into).collect()), + sort: sort.map(|attr| fix_sort_query_parameters(&attr)), }; analytics.publish( DocumentsFetchAggregator:: { per_filter: query.filter.is_some(), + sort: query.sort.is_some(), retrieve_vectors: query.retrieve_vectors, max_limit: query.limit, max_offset: query.offset, @@ -615,7 +596,7 @@ fn documents_by_query( query: BrowseQuery, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let BrowseQuery { offset, limit, fields, retrieve_vectors, filter, ids } = query; + let BrowseQuery { offset, limit, fields, retrieve_vectors, filter, ids, sort } = query; let retrieve_vectors = RetrieveVectors::new(retrieve_vectors); @@ -633,6 +614,18 @@ fn documents_by_query( None }; + let sort_criteria = if let Some(sort) = &sort { + let sorts: Vec<_> = match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { + Ok(sorts) => sorts, + Err(asc_desc_error) => { + return Err(milli::SortError::from(asc_desc_error).into_document_error().into()) + } + }; + Some(sorts) + } else { + None + }; + let index = index_scheduler.index(&index_uid)?; let (total, documents) = retrieve_documents( &index, @@ -643,6 +636,7 @@ fn documents_by_query( fields, retrieve_vectors, index_scheduler.features(), + sort_criteria, 
)?; let ret = PaginationView::new(offset, limit, total as usize, documents); @@ -1467,9 +1461,13 @@ fn some_documents<'a, 't: 'a>( Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, (vector, regenerate)) in index.embeddings(rtxn, key)? { + for ( + name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments: _ }, + ) in index.embeddings(rtxn, key)? + { let embeddings = - ExplicitVectors { embeddings: Some(vector.into()), regenerate }; + ExplicitVectors { embeddings: Some(embeddings.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, @@ -1494,6 +1492,7 @@ fn retrieve_documents>( attributes_to_retrieve: Option>, retrieve_vectors: RetrieveVectors, features: RoFeatures, + sort_criteria: Option>, ) -> Result<(u64, Vec), ResponseError> { let rtxn = index.read_txn()?; let filter = &filter; @@ -1526,15 +1525,32 @@ fn retrieve_documents>( })? } - let (it, number_of_documents) = { + let (it, number_of_documents) = if let Some(sort) = sort_criteria { + let number_of_documents = candidates.len(); + let facet_sort = recursive_sort(index, &rtxn, sort, &candidates)?; + let iter = facet_sort.iter()?; + let mut documents = Vec::with_capacity(limit); + for result in iter.skip(offset).take(limit) { + documents.push(result?); + } + ( + itertools::Either::Left(some_documents( + index, + &rtxn, + documents.into_iter(), + retrieve_vectors, + )?), + number_of_documents, + ) + } else { let number_of_documents = candidates.len(); ( - some_documents( + itertools::Either::Right(some_documents( index, &rtxn, candidates.into_iter().skip(offset).take(limit), retrieve_vectors, - )?, + )?), number_of_documents, ) }; diff --git a/crates/meilisearch/src/search/federated/perform.rs b/crates/meilisearch/src/search/federated/perform.rs index 5ad64d63c..c0fec01e8 100644 --- a/crates/meilisearch/src/search/federated/perform.rs +++ b/crates/meilisearch/src/search/federated/perform.rs @@ -745,10 +745,9 @@ impl 
SearchByIndex { match sort.iter().map(|s| milli::AscDesc::from_str(s)).collect() { Ok(sorts) => sorts, Err(asc_desc_error) => { - return Err(milli::Error::from(milli::SortError::from( - asc_desc_error, - )) - .into()) + return Err(milli::SortError::from(asc_desc_error) + .into_search_error() + .into()) } }; Some(sorts) diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 1c987a70c..82096e7b4 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -16,7 +16,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::locales::Locale; -use meilisearch_types::milli::index::{self, SearchParameters}; +use meilisearch_types::milli::index::{self, EmbeddingsWithMetadata, SearchParameters}; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; @@ -1051,6 +1051,7 @@ pub fn prepare_search<'t>( .unwrap_or(DEFAULT_PAGINATION_MAX_TOTAL_HITS); search.exhaustive_number_hits(is_finite_pagination); + search.max_total_hits(Some(max_total_hits)); search.scoring_strategy( if query.show_ranking_score || query.show_ranking_score_details @@ -1091,7 +1092,7 @@ pub fn prepare_search<'t>( let sort = match sort.iter().map(|s| AscDesc::from_str(s)).collect() { Ok(sorts) => sorts, Err(asc_desc_error) => { - return Err(milli::Error::from(SortError::from(asc_desc_error)).into()) + return Err(SortError::from(asc_desc_error).into_search_error().into()) } }; @@ -1527,8 +1528,11 @@ impl<'a> HitMaker<'a> { Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, (vector, regenerate)) in self.index.embeddings(self.rtxn, id)? 
{ - let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate }; + for (name, EmbeddingsWithMetadata { embeddings, regenerate, has_fragments: _ }) in + self.index.embeddings(self.rtxn, id)? + { + let embeddings = + ExplicitVectors { embeddings: Some(embeddings.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?, diff --git a/crates/meilisearch/tests/auth/api_keys.rs b/crates/meilisearch/tests/auth/api_keys.rs index 2688dd918..6dc3f429b 100644 --- a/crates/meilisearch/tests/auth/api_keys.rs +++ b/crates/meilisearch/tests/auth/api_keys.rs @@ -419,14 +419,14 @@ async fn error_add_api_key_invalid_parameters_actions() { let (response, code) = server.add_api_key(content).await; meili_snap::snapshot!(code, @"400 Bad Request"); - meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r###" + meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r#" { - "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", + "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, 
`indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`, `*.get`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" } - "###); + "#); } #[actix_rt::test] @@ -790,7 +790,7 @@ async fn list_api_keys() { meili_snap::snapshot!(code, @"201 Created"); let (response, code) = server.list_api_keys("").await; - meili_snap::snapshot!(meili_snap::json_string!(response, { ".results[].createdAt" => "[ignored]", ".results[].updatedAt" => "[ignored]", ".results[].uid" => "[ignored]", ".results[].key" => "[ignored]" }), @r###" + meili_snap::snapshot!(meili_snap::json_string!(response, { ".results[].createdAt" => "[ignored]", ".results[].updatedAt" => "[ignored]", ".results[].uid" => "[ignored]", ".results[].key" => "[ignored]" }), @r#" { "results": [ { @@ -850,6 +850,22 @@ async fn list_api_keys() { "createdAt": "[ignored]", "updatedAt": "[ignored]" }, + { + "name": "Default Read-Only Admin API Key", + "description": "Use it to read information across the whole database. Caution! 
Do not expose this key on a public frontend", + "key": "[ignored]", + "uid": "[ignored]", + "actions": [ + "*.get", + "keys.get" + ], + "indexes": [ + "*" + ], + "expiresAt": null, + "createdAt": "[ignored]", + "updatedAt": "[ignored]" + }, { "name": "Default Chat API Key", "description": "Use it to chat and search from the frontend", @@ -869,9 +885,9 @@ async fn list_api_keys() { ], "offset": 0, "limit": 20, - "total": 4 + "total": 5 } - "###); + "#); meili_snap::snapshot!(code, @"200 OK"); } diff --git a/crates/meilisearch/tests/auth/errors.rs b/crates/meilisearch/tests/auth/errors.rs index 687cb67a0..b16ccb2f5 100644 --- a/crates/meilisearch/tests/auth/errors.rs +++ b/crates/meilisearch/tests/auth/errors.rs @@ -91,14 +91,14 @@ async fn create_api_key_bad_actions() { // can't parse let (response, code) = server.add_api_key(json!({ "actions": ["doggo"] })).await; snapshot!(code, @"400 Bad Request"); - snapshot!(json_string!(response), @r###" + snapshot!(json_string!(response), @r#" { - "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", + "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, 
`indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`, `*.get`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" } - "###); + "#); } #[actix_rt::test] diff --git a/crates/meilisearch/tests/common/index.rs b/crates/meilisearch/tests/common/index.rs index b4ae151f3..bb1506022 100644 --- a/crates/meilisearch/tests/common/index.rs +++ b/crates/meilisearch/tests/common/index.rs @@ -551,5 +551,7 @@ pub struct GetAllDocumentsOptions { pub offset: Option, #[serde(skip_serializing_if = "Option::is_none")] pub fields: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub sort: Option>, pub retrieve_vectors: bool, } diff --git a/crates/meilisearch/tests/common/mod.rs b/crates/meilisearch/tests/common/mod.rs index 1395aac5a..03b1271f1 100644 --- a/crates/meilisearch/tests/common/mod.rs +++ b/crates/meilisearch/tests/common/mod.rs @@ -3,8 +3,10 @@ pub mod index; pub mod server; pub mod service; +use std::collections::BTreeMap; use std::fmt::{self, Display}; +use actix_http::StatusCode; #[allow(unused)] pub use index::GetAllDocumentsOptions; use meili_snap::json_string; @@ -13,6 +15,8 @@ use serde::{Deserialize, Serialize}; #[allow(unused)] pub use server::{default_settings, Server}; use tokio::sync::OnceCell; +use wiremock::matchers::{method, path}; +use wiremock::{Mock, MockServer, Request, ResponseTemplate}; use crate::common::index::Index; @@ -517,3 +521,166 @@ pub async fn shared_index_with_geo_documents() -> &'static 
Index<'static, Shared }) .await } + +pub async fn shared_index_for_fragments() -> Index<'static, Shared> { + static INDEX: OnceCell<(Server, String)> = OnceCell::const_new(); + let (server, uid) = INDEX + .get_or_init(|| async { + let (server, uid, _) = init_fragments_index().await; + (server.into_shared(), uid) + }) + .await; + server._index(uid).to_shared() +} + +async fn fragment_mock_server() -> String { + let text_to_embedding: BTreeMap<_, _> = vec![ + ("kefir", [0.5, -0.5, 0.0]), + ("intel", [1.0, 1.0, 0.0]), + ("dustin", [-0.5, 0.5, 0.0]), + ("bulldog", [0.0, 0.0, 1.0]), + ("labrador", [0.0, 0.0, -1.0]), + ("{{ doc.", [-9999.0, -9999.0, -9999.0]), // If a template didn't render + ] + .into_iter() + .collect(); + + let mock_server = Box::leak(Box::new(MockServer::start().await)); + + Mock::given(method("POST")) + .and(path("/")) + .respond_with(move |req: &Request| { + let text = String::from_utf8_lossy(&req.body).to_string(); + + let mut data = [0.0, 0.0, 0.0]; + for (inner_text, inner_data) in &text_to_embedding { + if text.contains(inner_text) { + for (i, &value) in inner_data.iter().enumerate() { + data[i] += value; + } + } + } + ResponseTemplate::new(200).set_body_json(json!({ "data": data })) + }) + .mount(mock_server) + .await; + + mock_server.uri() +} + +pub async fn init_fragments_index() -> (Server, String, crate::common::Value) { + let url = fragment_mock_server().await; + let server = Server::new().await; + let index = server.unique_index(); + + let (_response, code) = server.set_features(json!({"multimodal": true})).await; + assert_eq!(code, StatusCode::OK); + + // Configure the index to use our mock embedder + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "withBreed": {"value": "{{ doc.name }} is a {{ doc.breed }}"}, + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + 
"searchFragments": { + "justBreed": {"value": "It's a {{ media.breed }}"}, + "justName": {"value": "{{ media.name }} is a dog"}, + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + }, + }); + let (response, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, StatusCode::ACCEPTED); + + server.wait_task(response.uid()).await.succeeded(); + + // Send documents + let documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel", "breed": "labrador"}, + {"id": 3, "name": "dustin", "breed": "bulldog"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + assert_eq!(code, StatusCode::ACCEPTED); + + let _task = server.wait_task(value.uid()).await.succeeded(); + + let uid = index.uid.clone(); + (server, uid, settings) +} + +pub async fn init_fragments_index_composite() -> (Server, String, crate::common::Value) { + let url = fragment_mock_server().await; + let server = Server::new().await; + let index = server.unique_index(); + + let (_response, code) = server.set_features(json!({"multimodal": true})).await; + assert_eq!(code, StatusCode::OK); + + let (_response, code) = server.set_features(json!({"compositeEmbedders": true})).await; + assert_eq!(code, StatusCode::OK); + + // Configure the index to use our mock embedder + let settings = json!({ + "embedders": { + "rest": { + "source": "composite", + "searchEmbedder": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + "indexingEmbedder": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "withBreed": {"value": "{{ doc.name }} is a {{ doc.breed }}"}, + "basic": {"value": "{{ doc.name }} is a dog"}, + } + }, + }, + 
}, + }); + let (response, code) = index.update_settings(settings.clone()).await; + assert_eq!(code, StatusCode::ACCEPTED); + + server.wait_task(response.uid()).await.succeeded(); + + // Send documents + let documents = json!([ + {"id": 0, "name": "kefir"}, + {"id": 1, "name": "echo", "_vectors": { "rest": [1, 1, 1] }}, + {"id": 2, "name": "intel", "breed": "labrador"}, + {"id": 3, "name": "dustin", "breed": "bulldog"}, + ]); + let (value, code) = index.add_documents(documents, None).await; + assert_eq!(code, StatusCode::ACCEPTED); + + server.wait_task(value.uid()).await.succeeded(); + + let uid = index.uid.clone(); + (server, uid, settings) +} diff --git a/crates/meilisearch/tests/common/server.rs b/crates/meilisearch/tests/common/server.rs index 89c5a3aaa..63c990466 100644 --- a/crates/meilisearch/tests/common/server.rs +++ b/crates/meilisearch/tests/common/server.rs @@ -35,7 +35,7 @@ pub struct Server { pub static TEST_TEMP_DIR: Lazy = Lazy::new(|| TempDir::new().unwrap()); impl Server { - fn into_shared(self) -> Server { + pub(super) fn into_shared(self) -> Server { Server { service: self.service, _dir: self._dir, _marker: PhantomData } } @@ -97,6 +97,7 @@ impl Server { self.use_api_key(master_key); let (response, code) = self.list_api_keys("").await; assert_eq!(200, code, "{:?}", response); + // TODO: relying on the order of keys is not ideal, we should use the name instead let admin_key = &response["results"][1]["key"]; self.use_api_key(admin_key.as_str().unwrap()); } @@ -465,6 +466,7 @@ pub fn default_settings(dir: impl AsRef) -> Opt { // Having 2 threads makes the tests way faster max_indexing_threads: MaxThreads::from_str("2").unwrap(), experimental_no_edition_2024_for_settings: false, + experimental_no_edition_2024_for_dumps: false, }, experimental_enable_metrics: false, ..Parser::parse_from(None as Option<&str>) diff --git a/crates/meilisearch/tests/documents/get_documents.rs b/crates/meilisearch/tests/documents/get_documents.rs index 44eb181df..b3c68351f 
100644 --- a/crates/meilisearch/tests/documents/get_documents.rs +++ b/crates/meilisearch/tests/documents/get_documents.rs @@ -5,8 +5,8 @@ use urlencoding::encode as urlencode; use crate::common::encoder::Encoder; use crate::common::{ - shared_does_not_exists_index, shared_empty_index, shared_index_with_test_set, - GetAllDocumentsOptions, Server, Value, + shared_does_not_exists_index, shared_empty_index, shared_index_with_geo_documents, + shared_index_with_test_set, GetAllDocumentsOptions, Server, Value, }; use crate::json; @@ -83,6 +83,311 @@ async fn get_document() { ); } +#[actix_rt::test] +async fn get_document_sorted() { + let server = Server::new_shared(); + let index = server.unique_index(); + index.load_test_set(server).await; + + let (task, _status_code) = + index.update_settings_sortable_attributes(json!(["age", "email", "gender", "name"])).await; + server.wait_task(task.uid()).await.succeeded(); + + let (response, _code) = index + .get_all_documents(GetAllDocumentsOptions { + fields: Some(vec!["id", "age", "email"]), + sort: Some(vec!["age:asc", "email:desc"]), + ..Default::default() + }) + .await; + let results = response["results"].as_array().unwrap(); + snapshot!(json_string!(results), @r#" + [ + { + "id": 5, + "age": 20, + "email": "warrenwatson@chorizon.com" + }, + { + "id": 6, + "age": 20, + "email": "sheliaberry@chorizon.com" + }, + { + "id": 57, + "age": 20, + "email": "kaitlinconner@chorizon.com" + }, + { + "id": 45, + "age": 20, + "email": "irenebennett@chorizon.com" + }, + { + "id": 40, + "age": 21, + "email": "staffordemerson@chorizon.com" + }, + { + "id": 41, + "age": 21, + "email": "salinasgamble@chorizon.com" + }, + { + "id": 63, + "age": 21, + "email": "knowleshebert@chorizon.com" + }, + { + "id": 50, + "age": 21, + "email": "guerramcintyre@chorizon.com" + }, + { + "id": 44, + "age": 22, + "email": "jonispears@chorizon.com" + }, + { + "id": 56, + "age": 23, + "email": "tuckerbarry@chorizon.com" + }, + { + "id": 51, + "age": 23, + "email": 
"keycervantes@chorizon.com" + }, + { + "id": 60, + "age": 23, + "email": "jodyherrera@chorizon.com" + }, + { + "id": 70, + "age": 23, + "email": "glassperkins@chorizon.com" + }, + { + "id": 75, + "age": 24, + "email": "emmajacobs@chorizon.com" + }, + { + "id": 68, + "age": 24, + "email": "angelinadyer@chorizon.com" + }, + { + "id": 17, + "age": 25, + "email": "ortegabrennan@chorizon.com" + }, + { + "id": 76, + "age": 25, + "email": "claricegardner@chorizon.com" + }, + { + "id": 43, + "age": 25, + "email": "arnoldbender@chorizon.com" + }, + { + "id": 12, + "age": 25, + "email": "aidakirby@chorizon.com" + }, + { + "id": 9, + "age": 26, + "email": "kellimendez@chorizon.com" + } + ] + "#); + + let (response, _code) = index + .get_all_documents(GetAllDocumentsOptions { + fields: Some(vec!["id", "gender", "name"]), + sort: Some(vec!["gender:asc", "name:asc"]), + ..Default::default() + }) + .await; + let results = response["results"].as_array().unwrap(); + snapshot!(json_string!(results), @r#" + [ + { + "id": 3, + "name": "Adeline Flynn", + "gender": "female" + }, + { + "id": 12, + "name": "Aida Kirby", + "gender": "female" + }, + { + "id": 68, + "name": "Angelina Dyer", + "gender": "female" + }, + { + "id": 15, + "name": "Aurelia Contreras", + "gender": "female" + }, + { + "id": 36, + "name": "Barbra Valenzuela", + "gender": "female" + }, + { + "id": 23, + "name": "Blanca Mcclain", + "gender": "female" + }, + { + "id": 53, + "name": "Caitlin Burnett", + "gender": "female" + }, + { + "id": 71, + "name": "Candace Sawyer", + "gender": "female" + }, + { + "id": 65, + "name": "Carole Rowland", + "gender": "female" + }, + { + "id": 33, + "name": "Cecilia Greer", + "gender": "female" + }, + { + "id": 1, + "name": "Cherry Orr", + "gender": "female" + }, + { + "id": 38, + "name": "Christina Short", + "gender": "female" + }, + { + "id": 7, + "name": "Chrystal Boyd", + "gender": "female" + }, + { + "id": 76, + "name": "Clarice Gardner", + "gender": "female" + }, + { + "id": 73, + 
"name": "Eleanor Shepherd", + "gender": "female" + }, + { + "id": 75, + "name": "Emma Jacobs", + "gender": "female" + }, + { + "id": 16, + "name": "Estella Bass", + "gender": "female" + }, + { + "id": 62, + "name": "Estelle Ramirez", + "gender": "female" + }, + { + "id": 20, + "name": "Florence Long", + "gender": "female" + }, + { + "id": 42, + "name": "Graciela Russell", + "gender": "female" + } + ] + "#); +} + +#[actix_rt::test] +async fn get_document_geosorted() { + let index = shared_index_with_geo_documents().await; + + let (response, _code) = index + .get_all_documents(GetAllDocumentsOptions { + sort: Some(vec!["_geoPoint(45.4777599, 9.1967508):asc"]), + ..Default::default() + }) + .await; + let results = response["results"].as_array().unwrap(); + snapshot!(json_string!(results), @r#" + [ + { + "id": 2, + "name": "La Bella Italia", + "address": "456 Elm Street, Townsville", + "type": "Italian", + "rating": 9, + "_geo": { + "lat": "45.4777599", + "lng": "9.1967508" + } + }, + { + "id": 1, + "name": "Taco Truck", + "address": "444 Salsa Street, Burritoville", + "type": "Mexican", + "rating": 9, + "_geo": { + "lat": 34.0522, + "lng": -118.2437 + } + }, + { + "id": 3, + "name": "Crêpe Truck", + "address": "2 Billig Avenue, Rouenville", + "type": "French", + "rating": 10 + } + ] + "#); +} + +#[actix_rt::test] +async fn get_document_sort_the_unsortable() { + let index = shared_index_with_test_set().await; + + let (response, _code) = index + .get_all_documents(GetAllDocumentsOptions { + fields: Some(vec!["id", "name"]), + sort: Some(vec!["name:asc"]), + ..Default::default() + }) + .await; + + snapshot!(json_string!(response), @r#" + { + "message": "Attribute `name` is not sortable. 
This index does not have configured sortable attributes.", + "code": "invalid_document_sort", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_sort" + } + "#); +} + #[actix_rt::test] async fn error_get_unexisting_index_all_documents() { let index = shared_does_not_exists_index().await; diff --git a/crates/meilisearch/tests/index/stats.rs b/crates/meilisearch/tests/index/stats.rs index 610601318..7f2ca9b4a 100644 --- a/crates/meilisearch/tests/index/stats.rs +++ b/crates/meilisearch/tests/index/stats.rs @@ -1,5 +1,4 @@ use crate::common::{shared_does_not_exists_index, Server}; - use crate::json; #[actix_rt::test] diff --git a/crates/meilisearch/tests/search/multi/proxy.rs b/crates/meilisearch/tests/search/multi/proxy.rs index 311f69d9e..943295da5 100644 --- a/crates/meilisearch/tests/search/multi/proxy.rs +++ b/crates/meilisearch/tests/search/multi/proxy.rs @@ -2500,7 +2500,7 @@ pub struct LocalMeiliParams { /// A server that exploits [`MockServer`] to provide an URL for testing network and the network. 
pub struct LocalMeili { - mock_server: MockServer, + mock_server: &'static MockServer, } impl LocalMeili { @@ -2509,7 +2509,7 @@ impl LocalMeili { } pub async fn with_params(server: Arc, params: LocalMeiliParams) -> Self { - let mock_server = MockServer::start().await; + let mock_server = Box::leak(Box::new(MockServer::start().await)); // tokio won't let us execute asynchronous code from a sync function inside of an async test, // so instead we spawn another thread that will call the service on a brand new tokio runtime @@ -2573,7 +2573,7 @@ impl LocalMeili { response.set_body_json(value) } }) - .mount(&mock_server) + .mount(mock_server) .await; Self { mock_server } } diff --git a/crates/meilisearch/tests/search/pagination.rs b/crates/meilisearch/tests/search/pagination.rs index c0752e7ec..6dd8b3181 100644 --- a/crates/meilisearch/tests/search/pagination.rs +++ b/crates/meilisearch/tests/search/pagination.rs @@ -1,6 +1,7 @@ use super::shared_index_with_documents; use crate::common::Server; use crate::json; +use meili_snap::{json_string, snapshot}; #[actix_rt::test] async fn default_search_should_return_estimated_total_hit() { @@ -133,3 +134,61 @@ async fn ensure_placeholder_search_hit_count_valid() { .await; } } + +#[actix_rt::test] +async fn test_issue_5274() { + let server = Server::new_shared(); + let index = server.unique_index(); + + let documents = json!([ + { + "id": 1, + "title": "Document 1", + "content": "This is the first." + }, + { + "id": 2, + "title": "Document 2", + "content": "This is the second doc." 
+ } + ]); + let (task, _code) = index.add_documents(documents, None).await; + server.wait_task(task.uid()).await.succeeded(); + + // Find out the lowest ranking score among the documents + let (rep, _status) = index + .search_post(json!({"q": "doc", "page": 1, "hitsPerPage": 2, "showRankingScore": true})) + .await; + let hits = rep["hits"].as_array().expect("Missing hits array"); + let second_hit = hits.get(1).expect("Missing second hit"); + let ranking_score = second_hit + .get("_rankingScore") + .expect("Missing _rankingScore field") + .as_f64() + .expect("Expected _rankingScore to be a f64"); + + // Search with a ranking score threshold just above and expect to be a single hit + let (rep, _status) = index + .search_post(json!({"q": "doc", "page": 1, "hitsPerPage": 1, "rankingScoreThreshold": ranking_score + 0.0001})) + .await; + + snapshot!(json_string!(rep, { + ".processingTimeMs" => "[ignored]", + }), @r#" + { + "hits": [ + { + "id": 2, + "title": "Document 2", + "content": "This is the second doc." 
+ } + ], + "query": "doc", + "processingTimeMs": "[ignored]", + "hitsPerPage": 1, + "page": 1, + "totalPages": 1, + "totalHits": 1 + } + "#); +} diff --git a/crates/meilisearch/tests/settings/get_settings.rs b/crates/meilisearch/tests/settings/get_settings.rs index 47e699380..f50f7f940 100644 --- a/crates/meilisearch/tests/settings/get_settings.rs +++ b/crates/meilisearch/tests/settings/get_settings.rs @@ -692,3 +692,68 @@ async fn granular_filterable_attributes() { ] "###); } + +#[actix_rt::test] +async fn test_searchable_attributes_order() { + let server = Server::new_shared(); + let index = server.unique_index(); + + // 1) Create an index with settings "searchableAttributes": ["title", "overview"] + let (response, code) = index.create(None).await; + assert_eq!(code, 202, "{response}"); + server.wait_task(response.uid()).await.succeeded(); + + let (task, code) = index + .update_settings(json!({ + "searchableAttributes": ["title", "overview"] + })) + .await; + assert_eq!(code, 202, "{task}"); + server.wait_task(task.uid()).await.succeeded(); + + // 2) Add documents in the index + let documents = json!([ + { + "id": 1, + "title": "The Matrix", + "overview": "A computer hacker learns from mysterious rebels about the true nature of his reality." + }, + { + "id": 2, + "title": "Inception", + "overview": "A thief who steals corporate secrets through dream-sharing technology." 
+ } + ]); + + let (response, code) = index.add_documents(documents, None).await; + assert_eq!(code, 202, "{response}"); + server.wait_task(response.uid()).await.succeeded(); + + // 3) Modify the settings "searchableAttributes": ["overview", "title"] (overview is put first) + let (task, code) = index + .update_settings(json!({ + "searchableAttributes": ["overview", "title"] + })) + .await; + assert_eq!(code, 202, "{task}"); + server.wait_task(task.uid()).await.succeeded(); + + // 4) Check if it has been applied + let (response, code) = index.settings().await; + assert_eq!(code, 200, "{response}"); + assert_eq!(response["searchableAttributes"], json!(["overview", "title"])); + + // 5) Re-modify the settings "searchableAttributes": ["title", "overview"] (title is put first) + let (task, code) = index + .update_settings(json!({ + "searchableAttributes": ["title", "overview"] + })) + .await; + assert_eq!(code, 202, "{task}"); + server.wait_task(task.uid()).await.succeeded(); + + // 6) Check if it has been applied + let (response, code) = index.settings().await; + assert_eq!(code, 200, "{response}"); + assert_eq!(response["searchableAttributes"], json!(["title", "overview"])); +} diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/kefir_settings.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/kefir_settings.snap index af7e82c8b..3c97dbe70 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/kefir_settings.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/kefir_settings.snap @@ -61,7 +61,16 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "pagination": { "maxTotalHits": 15 }, - "embedders": {}, + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "pooling": 
"forceMean", + "documentTemplate": "{{doc.description}}", + "documentTemplateMaxBytes": 400 + } + }, "searchCutoffMs": 8000, "localizedAttributes": [ { diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/search_with_retrieve_vectors.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/search_with_retrieve_vectors.snap new file mode 100644 index 000000000..5baf8155c --- /dev/null +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_features/search_with_retrieve_vectors.snap @@ -0,0 +1,40 @@ +--- +source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +--- +[ + { + "id": 1, + "name": "kefir", + "surname": [ + "kef", + "kefkef", + "kefirounet", + "boubou" + ], + "age": 1.4, + "description": "kefir est un petit chien blanc très mignon", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 2, + "name": "intel", + "surname": [ + "untel", + "tétel", + "iouiou" + ], + "age": 11.5, + "description": "intel est un grand beagle très mignon", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": false + } + } + } +] diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap index f4edae51b..b56cc5ca3 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -4,7 +4,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs { "results": [ { - "uid": 24, + "uid": 30, 
"progress": null, "details": { "upgradeFrom": "v1.12.0", @@ -26,6 +26,155 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "finishedAt": "[date]", "batchStrategy": "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type." }, + { + "uid": 29, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.067201S", + "startedAt": "2025-07-07T13:43:08.772854Z", + "finishedAt": "2025-07-07T13:43:08.840055Z", + "batchStrategy": "unspecified" + }, + { + "uid": 28, + "progress": null, + "details": { + "deletedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.012727S", + "startedAt": "2025-07-07T13:42:50.745461Z", + "finishedAt": "2025-07-07T13:42:50.758188Z", + "batchStrategy": "unspecified" + }, + { + "uid": 27, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.059920S", + "startedAt": "2025-07-07T13:42:15.625413Z", + "finishedAt": "2025-07-07T13:42:15.685333Z", + "batchStrategy": "unspecified" + }, + { + "uid": 26, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.088879S", + "startedAt": "2025-07-07T13:40:01.461741Z", + "finishedAt": "2025-07-07T13:40:01.55062Z", + "batchStrategy": "unspecified" + }, + { + "uid": 25, + "progress": null, + 
"details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.312911S", + "startedAt": "2025-07-07T13:32:46.139785Z", + "finishedAt": "2025-07-07T13:32:46.452696Z", + "batchStrategy": "unspecified" + }, + { + "uid": 24, + "progress": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.description}}" + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.247378S", + "startedAt": "2025-07-07T13:28:27.391344Z", + "finishedAt": "2025-07-07T13:28:27.638722Z", + "batchStrategy": "unspecified" + }, { "uid": 23, "progress": null, @@ -348,179 +497,10 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "startedAt": "2025-01-16T17:01:14.112756687Z", "finishedAt": "2025-01-16T17:01:14.120064527Z", "batchStrategy": "unspecified" - }, - { - "uid": 10, - "progress": null, - "details": { - "faceting": { - "maxValuesPerFacet": 99 - }, - "pagination": { - "maxTotalHits": 15 - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.007391353S", - "startedAt": "2025-01-16T17:00:29.201180268Z", - "finishedAt": "2025-01-16T17:00:29.208571621Z", - "batchStrategy": "unspecified" - }, - { - "uid": 9, - "progress": null, - "details": { - "faceting": { - "maxValuesPerFacet": 100 - }, - "pagination": { - "maxTotalHits": 1000 - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - 
"duration": "PT0.007445825S", - "startedAt": "2025-01-16T17:00:15.77629445Z", - "finishedAt": "2025-01-16T17:00:15.783740275Z", - "batchStrategy": "unspecified" - }, - { - "uid": 8, - "progress": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 - }, - "disableOnWords": [ - "kefir" - ], - "disableOnAttributes": [ - "surname" - ] - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.012020083S", - "startedAt": "2025-01-16T16:59:42.744086671Z", - "finishedAt": "2025-01-16T16:59:42.756106754Z", - "batchStrategy": "unspecified" - }, - { - "uid": 7, - "progress": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 - } - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.007440092S", - "startedAt": "2025-01-16T16:58:41.2155771Z", - "finishedAt": "2025-01-16T16:58:41.223017192Z", - "batchStrategy": "unspecified" - }, - { - "uid": 6, - "progress": null, - "details": { - "synonyms": { - "boubou": [ - "kefir" - ] - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.007565161S", - "startedAt": "2025-01-16T16:54:51.940332781Z", - "finishedAt": "2025-01-16T16:54:51.947897942Z", - "batchStrategy": "unspecified" - }, - { - "uid": 5, - "progress": null, - "details": { - "stopWords": [ - "le", - "un" - ] - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.016307263S", - "startedAt": "2025-01-16T16:53:19.913351957Z", - "finishedAt": "2025-01-16T16:53:19.92965922Z", - "batchStrategy": "unspecified" } ], - "total": 23, + "total": 29, 
"limit": 20, - "from": 24, - "next": 4 + "from": 30, + "next": 10 } diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap index f4edae51b..b56cc5ca3 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -4,7 +4,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs { "results": [ { - "uid": 24, + "uid": 30, "progress": null, "details": { "upgradeFrom": "v1.12.0", @@ -26,6 +26,155 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "finishedAt": "[date]", "batchStrategy": "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type." 
}, + { + "uid": 29, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.067201S", + "startedAt": "2025-07-07T13:43:08.772854Z", + "finishedAt": "2025-07-07T13:43:08.840055Z", + "batchStrategy": "unspecified" + }, + { + "uid": 28, + "progress": null, + "details": { + "deletedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.012727S", + "startedAt": "2025-07-07T13:42:50.745461Z", + "finishedAt": "2025-07-07T13:42:50.758188Z", + "batchStrategy": "unspecified" + }, + { + "uid": 27, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.059920S", + "startedAt": "2025-07-07T13:42:15.625413Z", + "finishedAt": "2025-07-07T13:42:15.685333Z", + "batchStrategy": "unspecified" + }, + { + "uid": 26, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.088879S", + "startedAt": "2025-07-07T13:40:01.461741Z", + "finishedAt": "2025-07-07T13:40:01.55062Z", + "batchStrategy": "unspecified" + }, + { + "uid": 25, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.312911S", + "startedAt": 
"2025-07-07T13:32:46.139785Z", + "finishedAt": "2025-07-07T13:32:46.452696Z", + "batchStrategy": "unspecified" + }, + { + "uid": 24, + "progress": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.description}}" + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.247378S", + "startedAt": "2025-07-07T13:28:27.391344Z", + "finishedAt": "2025-07-07T13:28:27.638722Z", + "batchStrategy": "unspecified" + }, { "uid": 23, "progress": null, @@ -348,179 +497,10 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "startedAt": "2025-01-16T17:01:14.112756687Z", "finishedAt": "2025-01-16T17:01:14.120064527Z", "batchStrategy": "unspecified" - }, - { - "uid": 10, - "progress": null, - "details": { - "faceting": { - "maxValuesPerFacet": 99 - }, - "pagination": { - "maxTotalHits": 15 - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.007391353S", - "startedAt": "2025-01-16T17:00:29.201180268Z", - "finishedAt": "2025-01-16T17:00:29.208571621Z", - "batchStrategy": "unspecified" - }, - { - "uid": 9, - "progress": null, - "details": { - "faceting": { - "maxValuesPerFacet": 100 - }, - "pagination": { - "maxTotalHits": 1000 - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.007445825S", - "startedAt": "2025-01-16T17:00:15.77629445Z", - "finishedAt": "2025-01-16T17:00:15.783740275Z", - "batchStrategy": "unspecified" - }, - { - "uid": 8, - "progress": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 
- }, - "disableOnWords": [ - "kefir" - ], - "disableOnAttributes": [ - "surname" - ] - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.012020083S", - "startedAt": "2025-01-16T16:59:42.744086671Z", - "finishedAt": "2025-01-16T16:59:42.756106754Z", - "batchStrategy": "unspecified" - }, - { - "uid": 7, - "progress": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 - } - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.007440092S", - "startedAt": "2025-01-16T16:58:41.2155771Z", - "finishedAt": "2025-01-16T16:58:41.223017192Z", - "batchStrategy": "unspecified" - }, - { - "uid": 6, - "progress": null, - "details": { - "synonyms": { - "boubou": [ - "kefir" - ] - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.007565161S", - "startedAt": "2025-01-16T16:54:51.940332781Z", - "finishedAt": "2025-01-16T16:54:51.947897942Z", - "batchStrategy": "unspecified" - }, - { - "uid": 5, - "progress": null, - "details": { - "stopWords": [ - "le", - "un" - ] - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.016307263S", - "startedAt": "2025-01-16T16:53:19.913351957Z", - "finishedAt": "2025-01-16T16:53:19.92965922Z", - "batchStrategy": "unspecified" } ], - "total": 23, + "total": 29, "limit": 20, - "from": 24, - "next": 4 + "from": 30, + "next": 10 } diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap 
b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap index f4edae51b..b56cc5ca3 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -4,7 +4,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs { "results": [ { - "uid": 24, + "uid": 30, "progress": null, "details": { "upgradeFrom": "v1.12.0", @@ -26,6 +26,155 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "finishedAt": "[date]", "batchStrategy": "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type." }, + { + "uid": 29, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.067201S", + "startedAt": "2025-07-07T13:43:08.772854Z", + "finishedAt": "2025-07-07T13:43:08.840055Z", + "batchStrategy": "unspecified" + }, + { + "uid": 28, + "progress": null, + "details": { + "deletedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.012727S", + "startedAt": "2025-07-07T13:42:50.745461Z", + "finishedAt": "2025-07-07T13:42:50.758188Z", + "batchStrategy": "unspecified" + }, + { + "uid": 27, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": 
"PT0.059920S", + "startedAt": "2025-07-07T13:42:15.625413Z", + "finishedAt": "2025-07-07T13:42:15.685333Z", + "batchStrategy": "unspecified" + }, + { + "uid": 26, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.088879S", + "startedAt": "2025-07-07T13:40:01.461741Z", + "finishedAt": "2025-07-07T13:40:01.55062Z", + "batchStrategy": "unspecified" + }, + { + "uid": 25, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.312911S", + "startedAt": "2025-07-07T13:32:46.139785Z", + "finishedAt": "2025-07-07T13:32:46.452696Z", + "batchStrategy": "unspecified" + }, + { + "uid": 24, + "progress": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.description}}" + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.247378S", + "startedAt": "2025-07-07T13:28:27.391344Z", + "finishedAt": "2025-07-07T13:28:27.638722Z", + "batchStrategy": "unspecified" + }, { "uid": 23, "progress": null, @@ -348,179 +497,10 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "startedAt": "2025-01-16T17:01:14.112756687Z", "finishedAt": "2025-01-16T17:01:14.120064527Z", "batchStrategy": "unspecified" - }, - { - "uid": 10, - "progress": null, - "details": { - "faceting": { - "maxValuesPerFacet": 99 - }, - "pagination": { - "maxTotalHits": 15 - } - }, - "stats": { - 
"totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.007391353S", - "startedAt": "2025-01-16T17:00:29.201180268Z", - "finishedAt": "2025-01-16T17:00:29.208571621Z", - "batchStrategy": "unspecified" - }, - { - "uid": 9, - "progress": null, - "details": { - "faceting": { - "maxValuesPerFacet": 100 - }, - "pagination": { - "maxTotalHits": 1000 - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.007445825S", - "startedAt": "2025-01-16T17:00:15.77629445Z", - "finishedAt": "2025-01-16T17:00:15.783740275Z", - "batchStrategy": "unspecified" - }, - { - "uid": 8, - "progress": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 - }, - "disableOnWords": [ - "kefir" - ], - "disableOnAttributes": [ - "surname" - ] - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.012020083S", - "startedAt": "2025-01-16T16:59:42.744086671Z", - "finishedAt": "2025-01-16T16:59:42.756106754Z", - "batchStrategy": "unspecified" - }, - { - "uid": 7, - "progress": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 - } - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.007440092S", - "startedAt": "2025-01-16T16:58:41.2155771Z", - "finishedAt": "2025-01-16T16:58:41.223017192Z", - "batchStrategy": "unspecified" - }, - { - "uid": 6, - "progress": null, - "details": { - "synonyms": { - "boubou": [ - "kefir" - ] - } - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - 
"duration": "PT0.007565161S", - "startedAt": "2025-01-16T16:54:51.940332781Z", - "finishedAt": "2025-01-16T16:54:51.947897942Z", - "batchStrategy": "unspecified" - }, - { - "uid": 5, - "progress": null, - "details": { - "stopWords": [ - "le", - "un" - ] - }, - "stats": { - "totalNbTasks": 1, - "status": { - "succeeded": 1 - }, - "types": { - "settingsUpdate": 1 - }, - "indexUids": { - "kefir": 1 - } - }, - "duration": "PT0.016307263S", - "startedAt": "2025-01-16T16:53:19.913351957Z", - "finishedAt": "2025-01-16T16:53:19.92965922Z", - "batchStrategy": "unspecified" } ], - "total": 23, + "total": 29, "limit": 20, - "from": 24, - "next": 4 + "from": 30, + "next": 10 } diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap index 01d2ea341..a52072f56 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -4,8 +4,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs { "results": [ { - "uid": 25, - "batchUid": 24, + "uid": 31, + "batchUid": 30, "indexUid": null, "status": "succeeded", "type": "upgradeDatabase", @@ -20,6 +20,118 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "startedAt": "[date]", "finishedAt": "[date]" }, + { + "uid": 30, + "batchUid": 29, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.067201S", + "enqueuedAt": "2025-07-07T13:43:08.772432Z", + "startedAt": 
"2025-07-07T13:43:08.772854Z", + "finishedAt": "2025-07-07T13:43:08.840055Z" + }, + { + "uid": 29, + "batchUid": 28, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 1 + }, + "error": null, + "duration": "PT0.012727S", + "enqueuedAt": "2025-07-07T13:42:50.744793Z", + "startedAt": "2025-07-07T13:42:50.745461Z", + "finishedAt": "2025-07-07T13:42:50.758188Z" + }, + { + "uid": 28, + "batchUid": 27, + "indexUid": "kefir", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `kefir`: Bad embedder configuration in the document with id: `2`. Could not parse `._vectors.doggo_embedder`: trailing characters at line 1 column 13", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "PT0.059920S", + "enqueuedAt": "2025-07-07T13:42:15.624598Z", + "startedAt": "2025-07-07T13:42:15.625413Z", + "finishedAt": "2025-07-07T13:42:15.685333Z" + }, + { + "uid": 27, + "batchUid": 26, + "indexUid": "mieli", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.088879S", + "enqueuedAt": "2025-07-07T13:40:01.46081Z", + "startedAt": "2025-07-07T13:40:01.461741Z", + "finishedAt": "2025-07-07T13:40:01.55062Z" + }, + { + "uid": 26, + "batchUid": 25, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.312911S", + "enqueuedAt": "2025-07-07T13:32:46.13871Z", + "startedAt": "2025-07-07T13:32:46.139785Z", + "finishedAt": "2025-07-07T13:32:46.452696Z" + }, + { + "uid": 25, + "batchUid": 24, + 
"indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.description}}" + } + } + }, + "error": null, + "duration": "PT0.247378S", + "enqueuedAt": "2025-07-07T13:28:27.390054Z", + "startedAt": "2025-07-07T13:28:27.391344Z", + "finishedAt": "2025-07-07T13:28:27.638722Z" + }, { "uid": 24, "batchUid": 23, @@ -264,134 +376,10 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "enqueuedAt": "2025-01-16T17:02:52.527382964Z", "startedAt": "2025-01-16T17:02:52.539749853Z", "finishedAt": "2025-01-16T17:02:52.547390016Z" - }, - { - "uid": 11, - "batchUid": 11, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "searchCutoffMs": 8000 - }, - "error": null, - "duration": "PT0.007307840S", - "enqueuedAt": "2025-01-16T17:01:14.100316617Z", - "startedAt": "2025-01-16T17:01:14.112756687Z", - "finishedAt": "2025-01-16T17:01:14.120064527Z" - }, - { - "uid": 10, - "batchUid": 10, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "faceting": { - "maxValuesPerFacet": 99 - }, - "pagination": { - "maxTotalHits": 15 - } - }, - "error": null, - "duration": "PT0.007391353S", - "enqueuedAt": "2025-01-16T17:00:29.188815062Z", - "startedAt": "2025-01-16T17:00:29.201180268Z", - "finishedAt": "2025-01-16T17:00:29.208571621Z" - }, - { - "uid": 9, - "batchUid": 9, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "faceting": { - "maxValuesPerFacet": 100 - }, - "pagination": { - "maxTotalHits": 1000 - } - }, - "error": null, - "duration": "PT0.007445825S", - "enqueuedAt": "2025-01-16T17:00:15.759501709Z", - "startedAt": 
"2025-01-16T17:00:15.77629445Z", - "finishedAt": "2025-01-16T17:00:15.783740275Z" - }, - { - "uid": 8, - "batchUid": 8, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 - }, - "disableOnWords": [ - "kefir" - ], - "disableOnAttributes": [ - "surname" - ] - } - }, - "error": null, - "duration": "PT0.012020083S", - "enqueuedAt": "2025-01-16T16:59:42.727292501Z", - "startedAt": "2025-01-16T16:59:42.744086671Z", - "finishedAt": "2025-01-16T16:59:42.756106754Z" - }, - { - "uid": 7, - "batchUid": 7, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 - } - } - }, - "error": null, - "duration": "PT0.007440092S", - "enqueuedAt": "2025-01-16T16:58:41.203145044Z", - "startedAt": "2025-01-16T16:58:41.2155771Z", - "finishedAt": "2025-01-16T16:58:41.223017192Z" - }, - { - "uid": 6, - "batchUid": 6, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "synonyms": { - "boubou": [ - "kefir" - ] - } - }, - "error": null, - "duration": "PT0.007565161S", - "enqueuedAt": "2025-01-16T16:54:51.927866243Z", - "startedAt": "2025-01-16T16:54:51.940332781Z", - "finishedAt": "2025-01-16T16:54:51.947897942Z" } ], - "total": 24, + "total": 30, "limit": 20, - "from": 25, - "next": 5 + "from": 31, + "next": 11 } diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap index 01d2ea341..a52072f56 100644 --- 
a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -4,8 +4,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs { "results": [ { - "uid": 25, - "batchUid": 24, + "uid": 31, + "batchUid": 30, "indexUid": null, "status": "succeeded", "type": "upgradeDatabase", @@ -20,6 +20,118 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "startedAt": "[date]", "finishedAt": "[date]" }, + { + "uid": 30, + "batchUid": 29, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.067201S", + "enqueuedAt": "2025-07-07T13:43:08.772432Z", + "startedAt": "2025-07-07T13:43:08.772854Z", + "finishedAt": "2025-07-07T13:43:08.840055Z" + }, + { + "uid": 29, + "batchUid": 28, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 1 + }, + "error": null, + "duration": "PT0.012727S", + "enqueuedAt": "2025-07-07T13:42:50.744793Z", + "startedAt": "2025-07-07T13:42:50.745461Z", + "finishedAt": "2025-07-07T13:42:50.758188Z" + }, + { + "uid": 28, + "batchUid": 27, + "indexUid": "kefir", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `kefir`: Bad embedder configuration in the document with id: `2`. 
Could not parse `._vectors.doggo_embedder`: trailing characters at line 1 column 13", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "PT0.059920S", + "enqueuedAt": "2025-07-07T13:42:15.624598Z", + "startedAt": "2025-07-07T13:42:15.625413Z", + "finishedAt": "2025-07-07T13:42:15.685333Z" + }, + { + "uid": 27, + "batchUid": 26, + "indexUid": "mieli", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.088879S", + "enqueuedAt": "2025-07-07T13:40:01.46081Z", + "startedAt": "2025-07-07T13:40:01.461741Z", + "finishedAt": "2025-07-07T13:40:01.55062Z" + }, + { + "uid": 26, + "batchUid": 25, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.312911S", + "enqueuedAt": "2025-07-07T13:32:46.13871Z", + "startedAt": "2025-07-07T13:32:46.139785Z", + "finishedAt": "2025-07-07T13:32:46.452696Z" + }, + { + "uid": 25, + "batchUid": 24, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.description}}" + } + } + }, + "error": null, + "duration": "PT0.247378S", + "enqueuedAt": "2025-07-07T13:28:27.390054Z", + "startedAt": "2025-07-07T13:28:27.391344Z", + "finishedAt": "2025-07-07T13:28:27.638722Z" + }, { "uid": 24, "batchUid": 23, @@ -264,134 +376,10 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "enqueuedAt": "2025-01-16T17:02:52.527382964Z", "startedAt": "2025-01-16T17:02:52.539749853Z", "finishedAt": 
"2025-01-16T17:02:52.547390016Z" - }, - { - "uid": 11, - "batchUid": 11, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "searchCutoffMs": 8000 - }, - "error": null, - "duration": "PT0.007307840S", - "enqueuedAt": "2025-01-16T17:01:14.100316617Z", - "startedAt": "2025-01-16T17:01:14.112756687Z", - "finishedAt": "2025-01-16T17:01:14.120064527Z" - }, - { - "uid": 10, - "batchUid": 10, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "faceting": { - "maxValuesPerFacet": 99 - }, - "pagination": { - "maxTotalHits": 15 - } - }, - "error": null, - "duration": "PT0.007391353S", - "enqueuedAt": "2025-01-16T17:00:29.188815062Z", - "startedAt": "2025-01-16T17:00:29.201180268Z", - "finishedAt": "2025-01-16T17:00:29.208571621Z" - }, - { - "uid": 9, - "batchUid": 9, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "faceting": { - "maxValuesPerFacet": 100 - }, - "pagination": { - "maxTotalHits": 1000 - } - }, - "error": null, - "duration": "PT0.007445825S", - "enqueuedAt": "2025-01-16T17:00:15.759501709Z", - "startedAt": "2025-01-16T17:00:15.77629445Z", - "finishedAt": "2025-01-16T17:00:15.783740275Z" - }, - { - "uid": 8, - "batchUid": 8, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 - }, - "disableOnWords": [ - "kefir" - ], - "disableOnAttributes": [ - "surname" - ] - } - }, - "error": null, - "duration": "PT0.012020083S", - "enqueuedAt": "2025-01-16T16:59:42.727292501Z", - "startedAt": "2025-01-16T16:59:42.744086671Z", - "finishedAt": "2025-01-16T16:59:42.756106754Z" - }, - { - "uid": 7, - "batchUid": 7, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "typoTolerance": { - 
"minWordSizeForTypos": { - "oneTypo": 4 - } - } - }, - "error": null, - "duration": "PT0.007440092S", - "enqueuedAt": "2025-01-16T16:58:41.203145044Z", - "startedAt": "2025-01-16T16:58:41.2155771Z", - "finishedAt": "2025-01-16T16:58:41.223017192Z" - }, - { - "uid": 6, - "batchUid": 6, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "synonyms": { - "boubou": [ - "kefir" - ] - } - }, - "error": null, - "duration": "PT0.007565161S", - "enqueuedAt": "2025-01-16T16:54:51.927866243Z", - "startedAt": "2025-01-16T16:54:51.940332781Z", - "finishedAt": "2025-01-16T16:54:51.947897942Z" } ], - "total": 24, + "total": 30, "limit": 20, - "from": 25, - "next": 5 + "from": 31, + "next": 11 } diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap index 01d2ea341..a52072f56 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -4,8 +4,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs { "results": [ { - "uid": 25, - "batchUid": 24, + "uid": 31, + "batchUid": 30, "indexUid": null, "status": "succeeded", "type": "upgradeDatabase", @@ -20,6 +20,118 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "startedAt": "[date]", "finishedAt": "[date]" }, + { + "uid": 30, + "batchUid": 29, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.067201S", 
+ "enqueuedAt": "2025-07-07T13:43:08.772432Z", + "startedAt": "2025-07-07T13:43:08.772854Z", + "finishedAt": "2025-07-07T13:43:08.840055Z" + }, + { + "uid": 29, + "batchUid": 28, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 1 + }, + "error": null, + "duration": "PT0.012727S", + "enqueuedAt": "2025-07-07T13:42:50.744793Z", + "startedAt": "2025-07-07T13:42:50.745461Z", + "finishedAt": "2025-07-07T13:42:50.758188Z" + }, + { + "uid": 28, + "batchUid": 27, + "indexUid": "kefir", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `kefir`: Bad embedder configuration in the document with id: `2`. Could not parse `._vectors.doggo_embedder`: trailing characters at line 1 column 13", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "PT0.059920S", + "enqueuedAt": "2025-07-07T13:42:15.624598Z", + "startedAt": "2025-07-07T13:42:15.625413Z", + "finishedAt": "2025-07-07T13:42:15.685333Z" + }, + { + "uid": 27, + "batchUid": 26, + "indexUid": "mieli", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.088879S", + "enqueuedAt": "2025-07-07T13:40:01.46081Z", + "startedAt": "2025-07-07T13:40:01.461741Z", + "finishedAt": "2025-07-07T13:40:01.55062Z" + }, + { + "uid": 26, + "batchUid": 25, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.312911S", + "enqueuedAt": "2025-07-07T13:32:46.13871Z", + "startedAt": "2025-07-07T13:32:46.139785Z", + "finishedAt": 
"2025-07-07T13:32:46.452696Z" + }, + { + "uid": 25, + "batchUid": 24, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.description}}" + } + } + }, + "error": null, + "duration": "PT0.247378S", + "enqueuedAt": "2025-07-07T13:28:27.390054Z", + "startedAt": "2025-07-07T13:28:27.391344Z", + "finishedAt": "2025-07-07T13:28:27.638722Z" + }, { "uid": 24, "batchUid": 23, @@ -264,134 +376,10 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "enqueuedAt": "2025-01-16T17:02:52.527382964Z", "startedAt": "2025-01-16T17:02:52.539749853Z", "finishedAt": "2025-01-16T17:02:52.547390016Z" - }, - { - "uid": 11, - "batchUid": 11, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "searchCutoffMs": 8000 - }, - "error": null, - "duration": "PT0.007307840S", - "enqueuedAt": "2025-01-16T17:01:14.100316617Z", - "startedAt": "2025-01-16T17:01:14.112756687Z", - "finishedAt": "2025-01-16T17:01:14.120064527Z" - }, - { - "uid": 10, - "batchUid": 10, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "faceting": { - "maxValuesPerFacet": 99 - }, - "pagination": { - "maxTotalHits": 15 - } - }, - "error": null, - "duration": "PT0.007391353S", - "enqueuedAt": "2025-01-16T17:00:29.188815062Z", - "startedAt": "2025-01-16T17:00:29.201180268Z", - "finishedAt": "2025-01-16T17:00:29.208571621Z" - }, - { - "uid": 9, - "batchUid": 9, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "faceting": { - "maxValuesPerFacet": 100 - }, - "pagination": { - "maxTotalHits": 1000 - } - }, - "error": null, - "duration": "PT0.007445825S", - "enqueuedAt": 
"2025-01-16T17:00:15.759501709Z", - "startedAt": "2025-01-16T17:00:15.77629445Z", - "finishedAt": "2025-01-16T17:00:15.783740275Z" - }, - { - "uid": 8, - "batchUid": 8, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 - }, - "disableOnWords": [ - "kefir" - ], - "disableOnAttributes": [ - "surname" - ] - } - }, - "error": null, - "duration": "PT0.012020083S", - "enqueuedAt": "2025-01-16T16:59:42.727292501Z", - "startedAt": "2025-01-16T16:59:42.744086671Z", - "finishedAt": "2025-01-16T16:59:42.756106754Z" - }, - { - "uid": 7, - "batchUid": 7, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "typoTolerance": { - "minWordSizeForTypos": { - "oneTypo": 4 - } - } - }, - "error": null, - "duration": "PT0.007440092S", - "enqueuedAt": "2025-01-16T16:58:41.203145044Z", - "startedAt": "2025-01-16T16:58:41.2155771Z", - "finishedAt": "2025-01-16T16:58:41.223017192Z" - }, - { - "uid": 6, - "batchUid": 6, - "indexUid": "kefir", - "status": "succeeded", - "type": "settingsUpdate", - "canceledBy": null, - "details": { - "synonyms": { - "boubou": [ - "kefir" - ] - } - }, - "error": null, - "duration": "PT0.007565161S", - "enqueuedAt": "2025-01-16T16:54:51.927866243Z", - "startedAt": "2025-01-16T16:54:51.940332781Z", - "finishedAt": "2025-01-16T16:54:51.947897942Z" } ], - "total": 24, + "total": 30, "limit": 20, - "from": 25, - "next": 5 + "from": 31, + "next": 11 } diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap index fb62b35da..81b50fb92 100644 --- 
a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap @@ -4,7 +4,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs { "results": [ { - "uid": 24, + "uid": 30, "progress": null, "details": { "upgradeFrom": "v1.12.0", @@ -26,6 +26,155 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "finishedAt": "[date]", "batchStrategy": "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type." }, + { + "uid": 29, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.067201S", + "startedAt": "2025-07-07T13:43:08.772854Z", + "finishedAt": "2025-07-07T13:43:08.840055Z", + "batchStrategy": "unspecified" + }, + { + "uid": 28, + "progress": null, + "details": { + "deletedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "indexDeletion": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.012727S", + "startedAt": "2025-07-07T13:42:50.745461Z", + "finishedAt": "2025-07-07T13:42:50.758188Z", + "batchStrategy": "unspecified" + }, + { + "uid": 27, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "failed": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.059920S", + "startedAt": "2025-07-07T13:42:15.625413Z", + "finishedAt": "2025-07-07T13:42:15.685333Z", + "batchStrategy": "unspecified" + }, + { + "uid": 26, + "progress": null, + 
"details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "mieli": 1 + } + }, + "duration": "PT0.088879S", + "startedAt": "2025-07-07T13:40:01.461741Z", + "finishedAt": "2025-07-07T13:40:01.55062Z", + "batchStrategy": "unspecified" + }, + { + "uid": 25, + "progress": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "documentAdditionOrUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.312911S", + "startedAt": "2025-07-07T13:32:46.139785Z", + "finishedAt": "2025-07-07T13:32:46.452696Z", + "batchStrategy": "unspecified" + }, + { + "uid": 24, + "progress": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.description}}" + } + } + }, + "stats": { + "totalNbTasks": 1, + "status": { + "succeeded": 1 + }, + "types": { + "settingsUpdate": 1 + }, + "indexUids": { + "kefir": 1 + } + }, + "duration": "PT0.247378S", + "startedAt": "2025-07-07T13:28:27.391344Z", + "finishedAt": "2025-07-07T13:28:27.638722Z", + "batchStrategy": "unspecified" + }, { "uid": 23, "progress": null, @@ -642,8 +791,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "batchStrategy": "unspecified" } ], - "total": 25, + "total": 31, "limit": 1000, - "from": 24, + "from": 30, "next": null } diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap index abb4dcdd9..1ec334fed 100644 --- 
a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap @@ -4,8 +4,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs { "results": [ { - "uid": 25, - "batchUid": 24, + "uid": 31, + "batchUid": 30, "indexUid": null, "status": "succeeded", "type": "upgradeDatabase", @@ -20,6 +20,118 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "startedAt": "[date]", "finishedAt": "[date]" }, + { + "uid": 30, + "batchUid": 29, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.067201S", + "enqueuedAt": "2025-07-07T13:43:08.772432Z", + "startedAt": "2025-07-07T13:43:08.772854Z", + "finishedAt": "2025-07-07T13:43:08.840055Z" + }, + { + "uid": 29, + "batchUid": 28, + "indexUid": "mieli", + "status": "succeeded", + "type": "indexDeletion", + "canceledBy": null, + "details": { + "deletedDocuments": 1 + }, + "error": null, + "duration": "PT0.012727S", + "enqueuedAt": "2025-07-07T13:42:50.744793Z", + "startedAt": "2025-07-07T13:42:50.745461Z", + "finishedAt": "2025-07-07T13:42:50.758188Z" + }, + { + "uid": 28, + "batchUid": 27, + "indexUid": "kefir", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `kefir`: Bad embedder configuration in the document with id: `2`. 
Could not parse `._vectors.doggo_embedder`: trailing characters at line 1 column 13", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "PT0.059920S", + "enqueuedAt": "2025-07-07T13:42:15.624598Z", + "startedAt": "2025-07-07T13:42:15.625413Z", + "finishedAt": "2025-07-07T13:42:15.685333Z" + }, + { + "uid": 27, + "batchUid": 26, + "indexUid": "mieli", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.088879S", + "enqueuedAt": "2025-07-07T13:40:01.46081Z", + "startedAt": "2025-07-07T13:40:01.461741Z", + "finishedAt": "2025-07-07T13:40:01.55062Z" + }, + { + "uid": 26, + "batchUid": 25, + "indexUid": "kefir", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "PT0.312911S", + "enqueuedAt": "2025-07-07T13:32:46.13871Z", + "startedAt": "2025-07-07T13:32:46.139785Z", + "finishedAt": "2025-07-07T13:32:46.452696Z" + }, + { + "uid": 25, + "batchUid": 24, + "indexUid": "kefir", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.description}}" + } + } + }, + "error": null, + "duration": "PT0.247378S", + "enqueuedAt": "2025-07-07T13:28:27.390054Z", + "startedAt": "2025-07-07T13:28:27.391344Z", + "finishedAt": "2025-07-07T13:28:27.638722Z" + }, { "uid": 24, "batchUid": 23, @@ -497,8 +609,8 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "finishedAt": "2025-01-16T16:45:16.131303739Z" } ], - "total": 26, + "total": 32, "limit": 1000, - "from": 25, + "from": 
31, "next": null } diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/auth/lock.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/auth/lock.mdb index 4c80ffe2c..80fb2b9d5 100644 Binary files a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/auth/lock.mdb and b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/auth/lock.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/data.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/data.mdb index c31db3415..95ca0a9da 100644 Binary files a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/data.mdb and b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/data.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/lock.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/lock.mdb index c99608b77..5fa5e6b49 100644 Binary files a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/lock.mdb and b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/indexes/381abe91-f939-4b91-92f2-01a24c2e8e3d/lock.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/data.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/data.mdb index 226be2332..f2bcb1b8b 100644 Binary files a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/data.mdb and b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/data.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/lock.mdb b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/lock.mdb index 6d38eab08..b8e0e358d 100644 Binary files a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/lock.mdb and 
b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.ms/tasks/lock.mdb differ diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index 1b2ae054c..b98f27b2d 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -114,13 +114,13 @@ async fn check_the_index_scheduler(server: &Server) { // All the indexes are still present let (indexes, _) = server.list_indexes(None, None).await; - snapshot!(indexes, @r#" + snapshot!(indexes, @r###" { "results": [ { "uid": "kefir", "createdAt": "2025-01-16T16:45:16.020663157Z", - "updatedAt": "2025-01-23T11:36:22.634859166Z", + "updatedAt": "2025-07-07T13:43:08.835381Z", "primaryKey": "id" } ], @@ -128,7 +128,7 @@ async fn check_the_index_scheduler(server: &Server) { "limit": 20, "total": 1 } - "#); + "###); // And their metadata are still right let (stats, _) = server.stats().await; assert_json_snapshot!(stats, { @@ -141,21 +141,21 @@ async fn check_the_index_scheduler(server: &Server) { { "databaseSize": "[bytes]", "usedDatabaseSize": "[bytes]", - "lastUpdate": "2025-01-23T11:36:22.634859166Z", + "lastUpdate": "2025-07-07T13:43:08.835381Z", "indexes": { "kefir": { - "numberOfDocuments": 1, + "numberOfDocuments": 2, "rawDocumentDbSize": "[bytes]", "avgDocumentSize": "[bytes]", "isIndexing": false, - "numberOfEmbeddings": 0, - "numberOfEmbeddedDocuments": 0, + "numberOfEmbeddings": 2, + "numberOfEmbeddedDocuments": 2, "fieldDistribution": { - "age": 1, - "description": 1, - "id": 1, - "name": 1, - "surname": 1 + "age": 2, + "description": 2, + "id": 2, + "name": 2, + "surname": 2 } } } @@ -227,21 +227,21 @@ async fn check_the_index_scheduler(server: &Server) { { "databaseSize": "[bytes]", "usedDatabaseSize": "[bytes]", - "lastUpdate": "2025-01-23T11:36:22.634859166Z", + "lastUpdate": "2025-07-07T13:43:08.835381Z", "indexes": { "kefir": { - "numberOfDocuments": 1, + "numberOfDocuments": 2, "rawDocumentDbSize": 
"[bytes]", "avgDocumentSize": "[bytes]", "isIndexing": false, - "numberOfEmbeddings": 0, - "numberOfEmbeddedDocuments": 0, + "numberOfEmbeddings": 2, + "numberOfEmbeddedDocuments": 2, "fieldDistribution": { - "age": 1, - "description": 1, - "id": 1, - "name": 1, - "surname": 1 + "age": 2, + "description": 2, + "id": 2, + "name": 2, + "surname": 2 } } } @@ -254,18 +254,18 @@ async fn check_the_index_scheduler(server: &Server) { ".avgDocumentSize" => "[bytes]", }), @r###" { - "numberOfDocuments": 1, + "numberOfDocuments": 2, "rawDocumentDbSize": "[bytes]", "avgDocumentSize": "[bytes]", "isIndexing": false, - "numberOfEmbeddings": 0, - "numberOfEmbeddedDocuments": 0, + "numberOfEmbeddings": 2, + "numberOfEmbeddedDocuments": 2, "fieldDistribution": { - "age": 1, - "description": 1, - "id": 1, - "name": 1, - "surname": 1 + "age": 2, + "description": 2, + "id": 2, + "name": 2, + "surname": 2 } } "###); @@ -295,4 +295,8 @@ async fn check_the_index_features(server: &Server) { let (results, _status) = kefir.search_post(json!({ "sort": ["age:asc"], "filter": "surname = kefirounet" })).await; snapshot!(results, name: "search_with_sort_and_filter"); + + // ensuring we can get the vectors and their `regenerate` is still good. 
+ let (results, _status) = kefir.search_post(json!({"retrieveVectors": true})).await; + snapshot!(json_string!(results["hits"], {"[]._vectors.doggo_embedder.embeddings" => "[vector]"}), name: "search_with_retrieve_vectors"); } diff --git a/crates/meilisearch/tests/vector/fragments.rs b/crates/meilisearch/tests/vector/fragments.rs new file mode 100644 index 000000000..81c2e3a55 --- /dev/null +++ b/crates/meilisearch/tests/vector/fragments.rs @@ -0,0 +1,2120 @@ +use meili_snap::{json_string, snapshot}; + +use crate::common::{ + init_fragments_index, init_fragments_index_composite, shared_index_for_fragments, +}; +use crate::json; +use crate::vector::{GetAllDocumentsOptions, Server}; + +#[actix_rt::test] +async fn experimental_feature_not_enabled() { + let server = Server::new().await; + let index = server.unique_index(); + + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": "http://localhost:1337", + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + }, + }); + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r#" + { + "message": "setting `indexingFragments` requires enabling the `multimodal` experimental feature. 
See https://github.com/orgs/meilisearch/discussions/846", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "#); +} + +#[actix_rt::test] +async fn indexing_fragments() { + let index = shared_index_for_fragments().await; + + // Make sure the documents have been indexed and their embeddings retrieved + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(documents, @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn replace_document() { + let (server, uid, _settings) = init_fragments_index().await; + let index = server.index(uid); + + let documents = json!([ + { "id": 0, "name": "kefir", "breed": "sorry-I-forgot" }, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + + server.wait_task(value.uid()).await.succeeded(); + + // Make sure kefir now has 2 vectors + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(documents, @r#" + { + 
"results": [ + { + "id": 0, + "name": "kefir", + "breed": "sorry-I-forgot", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ], + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn search_with_vector() { + let index = shared_index_for_fragments().await; + + let (value, code) = index.search_post( + json!({"vector": [1.0, 1.0, 1.0], "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, "limit": 1} + )).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 1, + "name": "echo" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} + +#[actix_rt::test] +async fn search_with_media() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "media": { "breed": "labrador" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 2, + "name": "intel", + "breed": "labrador" + } + ], + "query": "", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} + +#[actix_rt::test] +async fn search_with_media_and_vector() { + let index = 
shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "vector": [1.0, 1.0, 1.0], + "media": { "breed": "labrador" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(value, @r#" + { + "message": "Invalid request: both `media` and `vector` parameters are present.", + "code": "invalid_search_media_and_vector", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_media_and_vector" + } + "#); +} + +#[actix_rt::test] +async fn search_with_media_matching_multiple_fragments() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "media": { "name": "dustin", "breed": "labrador" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(value, @r#" + { + "message": "Error while generating embeddings: user error: Query matches multiple search fragments.\n - Note: First matched fragment `justBreed`.\n - Note: Second matched fragment `justName`.\n - Note: {\"q\":null,\"media\":{\"name\":\"dustin\",\"breed\":\"labrador\"}}", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "#); +} + +#[actix_rt::test] +async fn search_with_media_matching_no_fragment() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "media": { "ticker": "GME", "section": "portfolio" }, + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"400 Bad Request"); + snapshot!(value, @r#" + { + "message": "Error while generating embeddings: user error: Query matches no search fragment.\n - Note: {\"q\":null,\"media\":{\"ticker\":\"GME\",\"section\":\"portfolio\"}}", + "code": "vector_embedding_error", + "type": 
"invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "#); +} + +#[actix_rt::test] +async fn search_with_query() { + let index = shared_index_for_fragments().await; + + let (value, code) = index + .search_post(json!({ + "q": "bulldog", + "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, + "limit": 1 + } + )) + .await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 3, + "name": "dustin", + "breed": "bulldog" + } + ], + "query": "bulldog", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} + +#[actix_rt::test] +async fn deleting_fragments_deletes_vectors() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"]["basic"] = serde_json::Value::Null; + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": null, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (value, code) = index.settings().await; + snapshot!(code, @"200 OK"); + 
snapshot!(json_string!(value["embedders"], { + ".rest.url" => "[url]", + }), @r#" + { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "headers": {} + } + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "###); +} + +#[actix_rt::test] +async fn modifying_fragments_modifies_vectors() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"]["basic"]["value"] = + serde_json::Value::String("{{ doc.name }} is a dog (maybe bulldog?)".to_string()); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { 
+ "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog (maybe bulldog?)" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn swapping_fragments() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + let 
basic = settings["embedders"]["rest"]["indexingFragments"]["basic"].clone(); + let with_breed = settings["embedders"]["rest"]["indexingFragments"]["withBreed"].clone(); + settings["embedders"]["rest"]["indexingFragments"]["basic"] = with_breed; + settings["embedders"]["rest"]["indexingFragments"]["withBreed"] = basic; + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + }, + "withBreed": { + "value": "{{ doc.name }} is a dog" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(documents, @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ], + [ + 
1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ], + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn ommitted_fragment_isnt_removed() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"]["basic"] = serde_json::Value::Null; // basic is removed + settings["embedders"]["rest"]["indexingFragments"].as_object_mut().unwrap().remove("withBreed"); // withBreed isn't specified + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": null + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + // Make sure withBreed is still here because it wasn't specified + let (value, code) = index.settings().await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(value["embedders"], { + ".rest.url" => "[url]", + }), @r#" + { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "withBreed": { + 
"value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "headers": {} + } + } + "#); +} + +#[actix_rt::test] +async fn fragment_insertion() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"].as_object_mut().unwrap().insert( + String::from("useless"), + serde_json::json!({ + "value": "This fragment is useless" + }), + ); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let value = server.wait_task(response.uid()).await.succeeded(); + snapshot!(value, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "useless": { + "value": "This fragment is useless" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + 
snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ], + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ], + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ], + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn multiple_embedders() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + let url = settings["embedders"]["rest"]["url"].as_str().unwrap(); + + let settings2 = json!({ + "embedders": { + "rest2": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "withBreed": {"value": "{{ doc.name }} is a {{ doc.breed }}"}, + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + "rest3": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": { + "query": {"value": "Some pre-prompt for query {{ q }}"}, + } + }, + }, + }); + let (response, code) = index.update_settings(settings2).await; + snapshot!(code, @"202 Accepted"); + let task = 
server.wait_task(response.uid()).await.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest2": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + }, + "rest3": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + } + }, + "searchFragments": { + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + }, + "rest2": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + }, + "rest2": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + 
} + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest2": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest2": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); + + // Remove Rest2 + + settings["embedders"]["rest2"] = serde_json::Value::Null; + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + }, + "rest3": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ] 
+ ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); + + // Remove rest's basic fragment + + settings["embedders"]["rest"]["indexingFragments"]["basic"] = serde_json::Value::Null; + //settings["embedders"].as_object_mut().unwrap().remove("rest2"); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + }, + "rest3": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + -1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 1.0 + ] + ], + "regenerate": true + }, + "rest3": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ] + ], + 
"regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn remove_non_existant_embedder() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"] + .as_object_mut() + .unwrap() + .insert(String::from("non-existant"), serde_json::Value::Null); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "non-existant": null, + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + "value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); +} + +#[actix_rt::test] +async fn double_remove_embedder() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"] + .as_object_mut() + .unwrap() + .insert(String::from("rest"), serde_json::Value::Null); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": 
"succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": null + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": null + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); +} + +#[actix_rt::test] +async fn complex_fragment() { + let (server, uid, mut settings) = init_fragments_index().await; + let index = server.index(uid); + + settings["embedders"]["rest"]["indexingFragments"].as_object_mut().unwrap().insert( + String::from("complex"), + serde_json::json!({ + "value": { + "breed": "{{ doc.breed }}", + "breeds": [ + "{{ doc.breed }}", + { + "breed": "{{ doc.breed }}", + } + ] + } + }), + ); + + let (response, code) = index.update_settings(settings).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await.succeeded(); + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "[uuid]", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "indexingFragments": { + "basic": { + "value": "{{ doc.name }} is a dog" + }, + "complex": { + "value": { + "breed": "{{ doc.breed }}", + "breeds": [ + "{{ doc.breed }}", + { + "breed": "{{ doc.breed }}" + } + ] + } + }, + "withBreed": { + "value": "{{ doc.name }} is a {{ doc.breed }}" + } + }, + "searchFragments": { + "justBreed": { + 
"value": "It's a {{ media.breed }}" + }, + "justName": { + "value": "{{ media.name }} is a dog" + }, + "query": { + "value": "Some pre-prompt for query {{ q }}" + } + }, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); + + let (documents, code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(code, @"200 OK"); + snapshot!(json_string!(documents), @r#" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "rest": { + "embeddings": [ + [ + 0.5, + -0.5, + 0.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": "intel", + "breed": "labrador", + "_vectors": { + "rest": { + "embeddings": [ + [ + 1.0, + 1.0, + 0.0 + ], + [ + 1.0, + 1.0, + -1.0 + ], + [ + 0.0, + 0.0, + -1.0 + ] + ], + "regenerate": true + } + } + }, + { + "id": 3, + "name": "dustin", + "breed": "bulldog", + "_vectors": { + "rest": { + "embeddings": [ + [ + -0.5, + 0.5, + 0.0 + ], + [ + -0.5, + 0.5, + 1.0 + ], + [ + 0.0, + 0.0, + 1.0 + ] + ], + "regenerate": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 4 + } + "#); +} + +#[actix_rt::test] +async fn both_fragments_and_document_template() { + let server = Server::new().await; + let index = server.unique_index(); + + let (_response, code) = server.set_features(json!({"multimodal": true})).await; + snapshot!(code, @"200 OK"); + + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": "http://localhost:1337", + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "indexingFragments": { + "basic": {"value": "{{ doc.name }} is a dog"}, + }, + "searchFragments": 
{ + "justBreed": {"value": "It's a {{ media.breed }}"}, + }, + "documentTemplate": "{{ doc.name }} is a dog", + }, + }, + }); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(response, @r#" + { + "message": "Error while generating embeddings: user error: cannot pass both fragments and a document template.\n - Note: 1 fragments declared in `indexingFragments` and 1 fragments declared in `search_fragments_len`.\n - Hint: remove the declared fragments or remove the `documentTemplate`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "#); +} + +#[ignore = "failing due to issue #5746"] +#[actix_rt::test] +async fn set_fragments_then_document_template() { + let (server, uid, settings) = init_fragments_index().await; + let index = server.index(uid); + + let url = settings["embedders"]["rest"]["url"].as_str().unwrap(); + + let settings = json!({ + "embedders": { + "rest": { + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{fragment}}", + "response": { + "data": "{{embedding}}" + }, + "documentTemplate": "{{ doc.name }} is a dog", + }, + }, + }); + + let (response, code) = index.update_settings(settings.clone()).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r""); + + let (settings, code) = index.settings().await; + snapshot!(code, @"200 OK"); + snapshot!(settings, @r#""#); // Should have removed fragments +} + +#[actix_rt::test] +async fn composite() { + let (server, uid, _settings) = init_fragments_index_composite().await; + let index = server.index(uid); + + let (value, code) = index.search_post( + json!({"vector": [1.0, 1.0, 1.0], "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, "limit": 1} + )).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 1, + "name": "echo" + } + ], + 
"query": "", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); + + let (value, code) = index + .search_post( + json!({"q": "bulldog", "hybrid": {"semanticRatio": 1.0, "embedder": "rest"}, "limit": 1} + ), + ) + .await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r#" + { + "hits": [ + { + "id": 3, + "name": "dustin", + "breed": "bulldog" + } + ], + "query": "bulldog", + "processingTimeMs": "[duration]", + "limit": 1, + "offset": 0, + "estimatedTotalHits": 4, + "semanticHitCount": 1 + } + "#); +} diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index ca2ecc998..8538f5f1e 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -1,4 +1,5 @@ mod binary_quantized; +mod fragments; #[cfg(feature = "test-ollama")] mod ollama; mod openai; diff --git a/crates/meilisearch/tests/vector/openai.rs b/crates/meilisearch/tests/vector/openai.rs index 19b13228a..1d7e94a23 100644 --- a/crates/meilisearch/tests/vector/openai.rs +++ b/crates/meilisearch/tests/vector/openai.rs @@ -136,7 +136,7 @@ fn long_text() -> &'static str { }) } -async fn create_mock_tokenized() -> (MockServer, Value) { +async fn create_mock_tokenized() -> (&'static MockServer, Value) { create_mock_with_template("{{doc.text}}", ModelDimensions::Large, false, false).await } @@ -145,8 +145,8 @@ async fn create_mock_with_template( model_dimensions: ModelDimensions, fallible: bool, slow: bool, -) -> (MockServer, Value) { - let mock_server = MockServer::start().await; +) -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); const API_KEY: &str = "my-api-key"; const API_KEY_BEARER: &str = "Bearer my-api-key"; @@ -299,7 +299,7 @@ async fn create_mock_with_template( } })) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -321,27 +321,27 @@ const DOGGO_TEMPLATE: &str = 
r#"{%- if doc.gender == "F" -%}Une chienne nommée Un chien nommé {{doc.name}}, né en {{doc.birthyear}} {%- endif %}, de race {{doc.breed}}."#; -async fn create_mock() -> (MockServer, Value) { +async fn create_mock() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, false, false).await } -async fn create_mock_dimensions() -> (MockServer, Value) { +async fn create_mock_dimensions() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large512, false, false).await } -async fn create_mock_small_embedding_model() -> (MockServer, Value) { +async fn create_mock_small_embedding_model() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Small, false, false).await } -async fn create_mock_legacy_embedding_model() -> (MockServer, Value) { +async fn create_mock_legacy_embedding_model() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Ada, false, false).await } -async fn create_fallible_mock() -> (MockServer, Value) { +async fn create_fallible_mock() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true, false).await } -async fn create_slow_mock() -> (MockServer, Value) { +async fn create_slow_mock() -> (&'static MockServer, Value) { create_mock_with_template(DOGGO_TEMPLATE, ModelDimensions::Large, true, true).await } diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 768d03eb9..e68a87dbf 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -12,8 +12,8 @@ use crate::common::Value; use crate::json; use crate::vector::{get_server_vector, GetAllDocumentsOptions}; -async fn create_mock() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock() -> (&'static MockServer, Value) { + let mock_server = 
Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -32,7 +32,7 @@ async fn create_mock() -> (MockServer, Value) { json!({ "data": text_to_embedding.get(text.as_str()).unwrap_or(&[99., 99., 99.]) }), ) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -50,8 +50,8 @@ async fn create_mock() -> (MockServer, Value) { (mock_server, embedder_settings) } -async fn create_mock_default_template() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock_default_template() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -73,7 +73,7 @@ async fn create_mock_default_template() -> (MockServer, Value) { .set_body_json(json!({"error": "text not found", "text": text})), } }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -106,8 +106,8 @@ struct SingleResponse { embedding: Vec, } -async fn create_mock_multiple() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock_multiple() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -146,7 +146,7 @@ async fn create_mock_multiple() -> (MockServer, Value) { ResponseTemplate::new(200).set_body_json(response) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -176,8 +176,8 @@ struct SingleRequest { input: String, } -async fn create_mock_single_response_in_array() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock_single_response_in_array() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ 
-212,7 +212,7 @@ async fn create_mock_single_response_in_array() -> (MockServer, Value) { ResponseTemplate::new(200).set_body_json(response) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -236,8 +236,8 @@ async fn create_mock_single_response_in_array() -> (MockServer, Value) { (mock_server, embedder_settings) } -async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock_raw_with_custom_header() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -277,7 +277,7 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { ResponseTemplate::new(200).set_body_json(output) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -293,8 +293,8 @@ async fn create_mock_raw_with_custom_header() -> (MockServer, Value) { (mock_server, embedder_settings) } -async fn create_mock_raw() -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_mock_raw() -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let text_to_embedding: BTreeMap<_, _> = vec![ // text -> embedding @@ -321,7 +321,7 @@ async fn create_mock_raw() -> (MockServer, Value) { ResponseTemplate::new(200).set_body_json(output) }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); @@ -337,8 +337,8 @@ async fn create_mock_raw() -> (MockServer, Value) { (mock_server, embedder_settings) } -async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (MockServer, Value) { - let mock_server = MockServer::start().await; +async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (&'static MockServer, Value) { + let mock_server = Box::leak(Box::new(MockServer::start().await)); let count = AtomicUsize::new(0); 
Mock::given(method("POST")) @@ -355,7 +355,7 @@ async fn create_faulty_mock_raw(sender: mpsc::Sender<()>) -> (MockServer, Value) ResponseTemplate::new(500).set_body_string("Service Unavailable") } }) - .mount(&mock_server) + .mount(mock_server) .await; let url = mock_server.uri(); diff --git a/crates/meilisearch/tests/vector/settings.rs b/crates/meilisearch/tests/vector/settings.rs index 50253f930..d26174faf 100644 --- a/crates/meilisearch/tests/vector/settings.rs +++ b/crates/meilisearch/tests/vector/settings.rs @@ -101,14 +101,7 @@ async fn reset_embedder_documents() { server.wait_task(response.uid()).await; // Make sure the documents are still present - let (documents, _code) = index - .get_all_documents(GetAllDocumentsOptions { - limit: None, - offset: None, - retrieve_vectors: false, - fields: None, - }) - .await; + let (documents, _code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; snapshot!(json_string!(documents), @r###" { "results": [ diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index b967e620c..170bbdcc8 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -15,6 +15,7 @@ use meilisearch_types::heed::{ }; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader}; +use meilisearch_types::milli::index::EmbeddingsWithMetadata; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, BEU32}; use meilisearch_types::tasks::{Status, Task}; @@ -591,12 +592,21 @@ fn export_documents( .into()); }; - for (embedder_name, (embeddings, regenerate)) in embeddings { + for ( + embedder_name, + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, + ) in embeddings + { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate, + regenerate: 
regenerate && + // Meilisearch does not handle well dumps with fragments, because as the fragments + // are marked as user-provided, + // all embeddings would be regenerated on any settings change or document update. + // To prevent this, we mark embeddings has non regenerate in this case. + !has_fragments, }; vectors .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); diff --git a/crates/milli/Cargo.toml b/crates/milli/Cargo.toml index 3d08252ac..d94a4d4e1 100644 --- a/crates/milli/Cargo.toml +++ b/crates/milli/Cargo.toml @@ -40,7 +40,7 @@ indexmap = { version = "2.9.0", features = ["serde"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } memchr = "2.7.5" -memmap2 = "0.9.5" +memmap2 = "0.9.7" obkv = "0.3.0" once_cell = "1.21.3" ordered-float = "5.0.0" diff --git a/crates/milli/src/asc_desc.rs b/crates/milli/src/asc_desc.rs index e75adf83d..d7288faa3 100644 --- a/crates/milli/src/asc_desc.rs +++ b/crates/milli/src/asc_desc.rs @@ -168,6 +168,16 @@ pub enum SortError { ReservedNameForFilter { name: String }, } +impl SortError { + pub fn into_search_error(self) -> Error { + Error::UserError(UserError::SortError { error: self, search: true }) + } + + pub fn into_document_error(self) -> Error { + Error::UserError(UserError::SortError { error: self, search: false }) + } +} + impl From for SortError { fn from(error: AscDescError) -> Self { match error { @@ -190,12 +200,6 @@ impl From for SortError { } } -impl From for Error { - fn from(error: SortError) -> Self { - Self::UserError(UserError::SortError(error)) - } -} - #[cfg(test)] mod tests { use big_s::S; diff --git a/crates/milli/src/documents/geo_sort.rs b/crates/milli/src/documents/geo_sort.rs new file mode 100644 index 000000000..0750dfe5c --- /dev/null +++ b/crates/milli/src/documents/geo_sort.rs @@ -0,0 +1,294 @@ +use crate::{ + distance_between_two_points, + heed_codec::facet::{FieldDocIdFacetCodec, 
OrderedF64Codec}, + lat_lng_to_xyz, + search::new::{facet_string_values, facet_values_prefix_key}, + GeoPoint, Index, +}; +use heed::{ + types::{Bytes, Unit}, + RoPrefix, RoTxn, +}; +use roaring::RoaringBitmap; +use rstar::RTree; +use std::collections::VecDeque; + +#[derive(Debug, Clone, Copy)] +pub struct GeoSortParameter { + // Define the strategy used by the geo sort + pub strategy: GeoSortStrategy, + // Limit the number of docs in a single bucket to avoid unexpectedly large overhead + pub max_bucket_size: u64, + // Considering the errors of GPS and geographical calculations, distances less than distance_error_margin will be treated as equal + pub distance_error_margin: f64, +} + +impl Default for GeoSortParameter { + fn default() -> Self { + Self { + strategy: GeoSortStrategy::default(), + max_bucket_size: 1000, + distance_error_margin: 1.0, + } + } +} +/// Define the strategy used by the geo sort. +/// The parameter represents the cache size, and, in the case of the Dynamic strategy, +/// the point where we move from using the iterative strategy to the rtree. 
/// Strategy used by the geo sort.
///
/// The payload of every variant is the cache size. For [`GeoSortStrategy::Dynamic`] it is
/// also the number of candidates from which the rtree replaces the iterative strategy.
#[derive(Debug, Clone, Copy)]
pub enum GeoSortStrategy {
    /// Never use the rtree.
    AlwaysIterative(usize),
    /// Always use the rtree.
    AlwaysRtree(usize),
    /// Use the rtree once the number of candidates reaches the payload.
    Dynamic(usize),
}

impl Default for GeoSortStrategy {
    fn default() -> Self {
        Self::Dynamic(1000)
    }
}

impl GeoSortStrategy {
    /// Returns whether the rtree should be used for `candidates` documents.
    pub fn use_rtree(&self, candidates: usize) -> bool {
        match *self {
            Self::AlwaysRtree(_) => true,
            Self::AlwaysIterative(_) => false,
            Self::Dynamic(threshold) => threshold <= candidates,
        }
    }

    /// Returns the cache size carried by the strategy.
    pub fn cache_size(&self) -> usize {
        let (Self::AlwaysIterative(size) | Self::AlwaysRtree(size) | Self::Dynamic(size)) = *self;
        size
    }
}
+ Some(&*rtree.insert(rtree2)) + } + } else { + None + }; + + let cache_size = strategy.cache_size(); + if let Some(rtree) = rtree { + if ascending { + let point = lat_lng_to_xyz(&target_point); + for point in rtree.nearest_neighbor_iter(&point) { + if geo_candidates.contains(point.data.0) { + cached_sorted_docids.push_back(point.data); + if cached_sorted_docids.len() >= cache_size { + break; + } + } + } + } else { + // in the case of the desc geo sort we look for the closest point to the opposite of the queried point + // and we insert the points in reverse order they get reversed when emptying the cache later on + let point = lat_lng_to_xyz(&opposite_of(target_point)); + for point in rtree.nearest_neighbor_iter(&point) { + if geo_candidates.contains(point.data.0) { + cached_sorted_docids.push_front(point.data); + if cached_sorted_docids.len() >= cache_size { + break; + } + } + } + } + } else { + // the iterative version + let [lat, lng] = field_ids.expect("fill_buffer can't be called without the lat&lng"); + + let mut documents = geo_candidates + .iter() + .map(|id| -> crate::Result<_> { Ok((id, geo_value(id, lat, lng, index, txn)?)) }) + .collect::>>()?; + // computing the distance between two points is expensive thus we cache the result + documents + .sort_by_cached_key(|(_, p)| distance_between_two_points(&target_point, p) as usize); + cached_sorted_docids.extend(documents); + }; + + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +pub fn next_bucket( + index: &Index, + txn: &RoTxn, + universe: &RoaringBitmap, + ascending: bool, + target_point: [f64; 2], + field_ids: &Option<[u16; 2]>, + rtree: &mut Option>, + cached_sorted_docids: &mut VecDeque<(u32, [f64; 2])>, + geo_candidates: &RoaringBitmap, + parameter: GeoSortParameter, +) -> crate::Result)>> { + let mut geo_candidates = geo_candidates & universe; + + if geo_candidates.is_empty() { + return Ok(Some((universe.clone(), None))); + } + + let next = |cache: &mut VecDeque<_>| { + if ascending { + 
cache.pop_front() + } else { + cache.pop_back() + } + }; + let put_back = |cache: &mut VecDeque<_>, x: _| { + if ascending { + cache.push_front(x) + } else { + cache.push_back(x) + } + }; + + let mut current_bucket = RoaringBitmap::new(); + // current_distance stores the first point and distance in current bucket + let mut current_distance: Option<([f64; 2], f64)> = None; + loop { + // The loop will only exit when we have found all points with equal distance or have exhausted the candidates. + if let Some((id, point)) = next(cached_sorted_docids) { + if geo_candidates.contains(id) { + let distance = distance_between_two_points(&target_point, &point); + if let Some((point0, bucket_distance)) = current_distance.as_ref() { + if (bucket_distance - distance).abs() > parameter.distance_error_margin { + // different distance, point belongs to next bucket + put_back(cached_sorted_docids, (id, point)); + return Ok(Some((current_bucket, Some(point0.to_owned())))); + } else { + // same distance, point belongs to current bucket + current_bucket.insert(id); + // remove from candidates to prevent it from being added to the cache again + geo_candidates.remove(id); + // current bucket size reaches limit, force return + if current_bucket.len() == parameter.max_bucket_size { + return Ok(Some((current_bucket, Some(point0.to_owned())))); + } + } + } else { + // first doc in current bucket + current_distance = Some((point, distance)); + current_bucket.insert(id); + geo_candidates.remove(id); + // current bucket size reaches limit, force return + if current_bucket.len() == parameter.max_bucket_size { + return Ok(Some((current_bucket, Some(point.to_owned())))); + } + } + } + } else { + // cache exhausted, we need to refill it + fill_cache( + index, + txn, + parameter.strategy, + ascending, + target_point, + field_ids, + rtree, + &geo_candidates, + cached_sorted_docids, + )?; + + if cached_sorted_docids.is_empty() { + // candidates exhausted, exit + if let Some((point0, _)) = 
/// Compute the antipodal coordinate of `coord`, given as `[lat, lng]`.
pub(crate) fn opposite_of(coord: [f64; 2]) -> [f64; 2] {
    let [lat, lng] = coord;
    // A non-positive longitude is shifted up, so that x,0 maps to x,180 rather than x,-180.
    let shift = if lng > 0. { -180. } else { 180. };
    [lat * -1., lng + shift]
}
+pub enum SortedDocumentsIterator<'ctx> { + Leaf { + /// The exact number of documents remaining + size: usize, + values: Box + 'ctx>, + }, + Branch { + /// The current child, got from the children iterator + current_child: Option>>, + /// The exact number of documents remaining, excluding documents in the current child + next_children_size: usize, + /// Iterators to become the current child once it is exhausted + next_children: + Box>> + 'ctx>, + }, +} + +impl SortedDocumentsIterator<'_> { + /// Takes care of updating the current child if it is `None`, and also updates the size + fn update_current<'ctx>( + current_child: &mut Option>>, + next_children_size: &mut usize, + next_children: &mut Box< + dyn Iterator>> + 'ctx, + >, + ) -> crate::Result<()> { + if current_child.is_none() { + *current_child = match next_children.next() { + Some(Ok(builder)) => { + let next_child = Box::new(builder.build()?); + *next_children_size -= next_child.size_hint().0; + Some(next_child) + } + Some(Err(e)) => return Err(e), + None => return Ok(()), + }; + } + Ok(()) + } +} + +impl Iterator for SortedDocumentsIterator<'_> { + type Item = crate::Result; + + /// Implementing the `nth` method allows for efficient access to the nth document in the sorted order. + /// It's used by `skip` internally. + /// The default implementation of `nth` would iterate over all children, which is inefficient for large datasets. + /// This implementation will jump over whole chunks of children until it gets close. 
+ fn nth(&mut self, n: usize) -> Option { + if n == 0 { + return self.next(); + } + + // If it's at the leaf level, just forward the call to the values iterator + let (current_child, next_children, next_children_size) = match self { + SortedDocumentsIterator::Leaf { values, size } => { + *size = size.saturating_sub(n); + return values.nth(n).map(Ok); + } + SortedDocumentsIterator::Branch { + current_child, + next_children, + next_children_size, + } => (current_child, next_children, next_children_size), + }; + + // Otherwise don't directly iterate over children, skip them if we know we will go further + let mut to_skip = n - 1; + while to_skip > 0 { + if let Err(e) = SortedDocumentsIterator::update_current( + current_child, + next_children_size, + next_children, + ) { + return Some(Err(e)); + } + let Some(inner) = current_child else { + return None; // No more inner iterators, everything has been consumed. + }; + + if to_skip >= inner.size_hint().0 { + // The current child isn't large enough to contain the nth element. + // Skip it and continue with the next one. + to_skip -= inner.size_hint().0; + *current_child = None; + continue; + } else { + // The current iterator is large enough, so we can forward the call to it. + return inner.nth(to_skip + 1); + } + } + + self.next() + } + + /// Iterators need to keep track of their size so that they can be skipped efficiently by the `nth` method. + fn size_hint(&self) -> (usize, Option) { + let size = match self { + SortedDocumentsIterator::Leaf { size, .. } => *size, + SortedDocumentsIterator::Branch { + next_children_size, + current_child: Some(current_child), + .. + } => current_child.size_hint().0 + next_children_size, + SortedDocumentsIterator::Branch { next_children_size, current_child: None, .. 
} => { + *next_children_size + } + }; + + (size, Some(size)) + } + + fn next(&mut self) -> Option { + match self { + SortedDocumentsIterator::Leaf { values, size } => { + let result = values.next().map(Ok); + if result.is_some() { + *size -= 1; + } + result + } + SortedDocumentsIterator::Branch { + current_child, + next_children_size, + next_children, + } => { + let mut result = None; + while result.is_none() { + // Ensure we have selected an iterator to work with + if let Err(e) = SortedDocumentsIterator::update_current( + current_child, + next_children_size, + next_children, + ) { + return Some(Err(e)); + } + let Some(inner) = current_child else { + return None; + }; + + result = inner.next(); + + // If the current iterator is exhausted, we need to try the next one + if result.is_none() { + *current_child = None; + } + } + result + } + } + } +} + +/// Builder for a [`SortedDocumentsIterator`]. +/// Most builders won't ever be built, because pagination will skip them. +pub struct SortedDocumentsIteratorBuilder<'ctx> { + index: &'ctx crate::Index, + rtxn: &'ctx heed::RoTxn<'ctx>, + number_db: Database, FacetGroupValueCodec>, + string_db: Database, FacetGroupValueCodec>, + fields: &'ctx [AscDescId], + candidates: RoaringBitmap, + geo_candidates: &'ctx RoaringBitmap, +} + +impl<'ctx> SortedDocumentsIteratorBuilder<'ctx> { + /// Performs the sort and builds a [`SortedDocumentsIterator`]. + fn build(self) -> crate::Result> { + let size = self.candidates.len() as usize; + + match self.fields { + [] => Ok(SortedDocumentsIterator::Leaf { + size, + values: Box::new(self.candidates.into_iter()), + }), + [AscDescId::Facet { field_id, ascending }, next_fields @ ..] => { + SortedDocumentsIteratorBuilder::build_facet( + self.index, + self.rtxn, + self.number_db, + self.string_db, + next_fields, + self.candidates, + self.geo_candidates, + *field_id, + *ascending, + ) + } + [AscDescId::Geo { field_ids, target_point, ascending }, next_fields @ ..] 
=> { + SortedDocumentsIteratorBuilder::build_geo( + self.index, + self.rtxn, + self.number_db, + self.string_db, + next_fields, + self.candidates, + self.geo_candidates, + *field_ids, + *target_point, + *ascending, + ) + } + } + } + + /// Builds a [`SortedDocumentsIterator`] based on the results of a facet sort. + #[allow(clippy::too_many_arguments)] + fn build_facet( + index: &'ctx crate::Index, + rtxn: &'ctx heed::RoTxn<'ctx>, + number_db: Database, FacetGroupValueCodec>, + string_db: Database, FacetGroupValueCodec>, + next_fields: &'ctx [AscDescId], + candidates: RoaringBitmap, + geo_candidates: &'ctx RoaringBitmap, + field_id: u16, + ascending: bool, + ) -> crate::Result> { + let size = candidates.len() as usize; + + // Perform the sort on the first field + let (number_iter, string_iter) = if ascending { + let number_iter = ascending_facet_sort(rtxn, number_db, field_id, candidates.clone())?; + let string_iter = ascending_facet_sort(rtxn, string_db, field_id, candidates)?; + + (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter)) + } else { + let number_iter = descending_facet_sort(rtxn, number_db, field_id, candidates.clone())?; + let string_iter = descending_facet_sort(rtxn, string_db, field_id, candidates)?; + + (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) + }; + + // Create builders for the next level of the tree + let number_iter = number_iter.map(|r| r.map(|(d, _)| d)); + let string_iter = string_iter.map(|r| r.map(|(d, _)| d)); + let next_children = number_iter.chain(string_iter).map(move |r| { + Ok(SortedDocumentsIteratorBuilder { + index, + rtxn, + number_db, + string_db, + fields: next_fields, + candidates: r?, + geo_candidates, + }) + }); + + Ok(SortedDocumentsIterator::Branch { + current_child: None, + next_children_size: size, + next_children: Box::new(next_children), + }) + } + + /// Builds a [`SortedDocumentsIterator`] based on the (lazy) results of a geo sort. 
+ #[allow(clippy::too_many_arguments)] + fn build_geo( + index: &'ctx crate::Index, + rtxn: &'ctx heed::RoTxn<'ctx>, + number_db: Database, FacetGroupValueCodec>, + string_db: Database, FacetGroupValueCodec>, + next_fields: &'ctx [AscDescId], + candidates: RoaringBitmap, + geo_candidates: &'ctx RoaringBitmap, + field_ids: [u16; 2], + target_point: [f64; 2], + ascending: bool, + ) -> crate::Result> { + let mut cache = VecDeque::new(); + let mut rtree = None; + let size = candidates.len() as usize; + let not_geo_candidates = candidates.clone() - geo_candidates; + let mut geo_remaining = size - not_geo_candidates.len() as usize; + let mut not_geo_candidates = Some(not_geo_candidates); + + let next_children = std::iter::from_fn(move || { + // Find the next bucket of geo-sorted documents. + // next_bucket loops and will go back to the beginning so we use a variable to track how many are left. + if geo_remaining > 0 { + if let Ok(Some((docids, _point))) = next_bucket( + index, + rtxn, + &candidates, + ascending, + target_point, + &Some(field_ids), + &mut rtree, + &mut cache, + geo_candidates, + GeoSortParameter::default(), + ) { + geo_remaining -= docids.len() as usize; + return Some(Ok(SortedDocumentsIteratorBuilder { + index, + rtxn, + number_db, + string_db, + fields: next_fields, + candidates: docids, + geo_candidates, + })); + } + } + + // Once all geo candidates have been processed, we can return the others + if let Some(not_geo_candidates) = not_geo_candidates.take() { + if !not_geo_candidates.is_empty() { + return Some(Ok(SortedDocumentsIteratorBuilder { + index, + rtxn, + number_db, + string_db, + fields: next_fields, + candidates: not_geo_candidates, + geo_candidates, + })); + } + } + + None + }); + + Ok(SortedDocumentsIterator::Branch { + current_child: None, + next_children_size: size, + next_children: Box::new(next_children), + }) + } +} + +/// A structure owning the data needed during the lifetime of a [`SortedDocumentsIterator`]. 
+pub struct SortedDocuments<'ctx> { + index: &'ctx crate::Index, + rtxn: &'ctx heed::RoTxn<'ctx>, + fields: Vec, + number_db: Database, FacetGroupValueCodec>, + string_db: Database, FacetGroupValueCodec>, + candidates: &'ctx RoaringBitmap, + geo_candidates: RoaringBitmap, +} + +impl<'ctx> SortedDocuments<'ctx> { + pub fn iter(&'ctx self) -> crate::Result> { + let builder = SortedDocumentsIteratorBuilder { + index: self.index, + rtxn: self.rtxn, + number_db: self.number_db, + string_db: self.string_db, + fields: &self.fields, + candidates: self.candidates.clone(), + geo_candidates: &self.geo_candidates, + }; + builder.build() + } +} + +pub fn recursive_sort<'ctx>( + index: &'ctx crate::Index, + rtxn: &'ctx heed::RoTxn<'ctx>, + sort: Vec, + candidates: &'ctx RoaringBitmap, +) -> crate::Result> { + let sortable_fields: BTreeSet<_> = index.sortable_fields(rtxn)?.into_iter().collect(); + let fields_ids_map = index.fields_ids_map(rtxn)?; + + // Retrieve the field ids that are used for sorting + let mut fields = Vec::new(); + let mut need_geo_candidates = false; + for asc_desc in sort { + let (field, geofield) = match asc_desc { + AscDesc::Asc(Member::Field(field)) => (Some((field, true)), None), + AscDesc::Desc(Member::Field(field)) => (Some((field, false)), None), + AscDesc::Asc(Member::Geo(target_point)) => (None, Some((target_point, true))), + AscDesc::Desc(Member::Geo(target_point)) => (None, Some((target_point, false))), + }; + if let Some((field, ascending)) = field { + if is_faceted(&field, &sortable_fields) { + if let Some(field_id) = fields_ids_map.id(&field) { + fields.push(AscDescId::Facet { field_id, ascending }); + continue; + } + } + return Err(UserError::InvalidDocumentSortableAttribute { + field: field.to_string(), + sortable_fields: sortable_fields.clone(), + } + .into()); + } + if let Some((target_point, ascending)) = geofield { + if sortable_fields.contains(RESERVED_GEO_FIELD_NAME) { + if let (Some(lat), Some(lng)) = + (fields_ids_map.id("_geo.lat"), 
fields_ids_map.id("_geo.lng")) + { + need_geo_candidates = true; + fields.push(AscDescId::Geo { field_ids: [lat, lng], target_point, ascending }); + continue; + } + } + return Err(UserError::InvalidDocumentSortableAttribute { + field: RESERVED_GEO_FIELD_NAME.to_string(), + sortable_fields: sortable_fields.clone(), + } + .into()); + } + } + + let geo_candidates = if need_geo_candidates { + index.geo_faceted_documents_ids(rtxn)? + } else { + RoaringBitmap::new() + }; + + let number_db = index.facet_id_f64_docids.remap_key_type::>(); + let string_db = + index.facet_id_string_docids.remap_key_type::>(); + + Ok(SortedDocuments { index, rtxn, fields, number_db, string_db, candidates, geo_candidates }) +} diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index f8886da8e..9ad9d0511 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -191,7 +191,21 @@ and can not be more than 511 bytes.", .document_id.to_string() ), } )] - InvalidSortableAttribute { field: String, valid_fields: BTreeSet, hidden_fields: bool }, + InvalidSearchSortableAttribute { + field: String, + valid_fields: BTreeSet, + hidden_fields: bool, + }, + #[error("Attribute `{}` is not sortable. {}", + .field, + match .sortable_fields.is_empty() { + true => "This index does not have configured sortable attributes.".to_string(), + false => format!("Available sortable attributes are: `{}`.", + sortable_fields.iter().map(AsRef::as_ref).collect::>().join(", ") + ), + } + )] + InvalidDocumentSortableAttribute { field: String, sortable_fields: BTreeSet }, #[error("Attribute `{}` is not filterable and thus, cannot be used as distinct attribute. 
{}", .field, match (.valid_patterns.is_empty(), .matching_rule_index) { @@ -272,8 +286,8 @@ and can not be more than 511 bytes.", .document_id.to_string() PrimaryKeyCannotBeChanged(String), #[error(transparent)] SerdeJson(serde_json::Error), - #[error(transparent)] - SortError(#[from] SortError), + #[error("{error}")] + SortError { error: SortError, search: bool }, #[error("An unknown internal document id have been used: `{document_id}`.")] UnknownInternalDocumentId { document_id: DocumentId }, #[error("`minWordSizeForTypos` setting is invalid. `oneTypo` and `twoTypos` fields should be between `0` and `255`, and `twoTypos` should be greater or equals to `oneTypo` but found `oneTypo: {0}` and twoTypos: {1}`.")] @@ -616,7 +630,7 @@ fn conditionally_lookup_for_error_message() { ]; for (list, suffix) in messages { - let err = UserError::InvalidSortableAttribute { + let err = UserError::InvalidSearchSortableAttribute { field: "name".to_string(), valid_fields: list, hidden_fields: false, diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index b2ec992ba..9f32fdb04 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1766,20 +1766,22 @@ impl Index { &self, rtxn: &RoTxn<'_>, docid: DocumentId, - ) -> Result, bool)>> { + ) -> Result> { let mut res = BTreeMap::new(); let embedders = self.embedding_configs(); for config in embedders.embedding_configs(rtxn)? 
{ let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap(); + let has_fragments = config.config.embedder_options.has_fragments(); let reader = ArroyWrapper::new( self.vector_arroy, embedder_info.embedder_id, config.config.quantized(), ); let embeddings = reader.item_vectors(rtxn, docid)?; + let regenerate = embedder_info.embedding_status.must_regenerate(docid); res.insert( config.name.to_owned(), - (embeddings, embedder_info.embedding_status.must_regenerate(docid)), + EmbeddingsWithMetadata { embeddings, regenerate, has_fragments }, ); } Ok(res) @@ -1919,6 +1921,12 @@ impl Index { } } +pub struct EmbeddingsWithMetadata { + pub embeddings: Vec, + pub regenerate: bool, + pub has_fragments: bool, +} + #[derive(Debug, Default, Deserialize, Serialize)] pub struct ChatConfig { pub description: String, diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index 504b4c68d..6fdae86b3 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -43,12 +43,13 @@ use std::fmt; use std::hash::BuildHasherDefault; use charabia::normalizer::{CharNormalizer, CompatibilityDecompositionNormalizer}; +pub use documents::GeoSortStrategy; pub use filter_parser::{Condition, FilterCondition, Span, Token}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; pub use search::new::{ - execute_search, filtered_universe, DefaultSearchLogger, GeoSortStrategy, SearchContext, - SearchLogger, VisualSearchLogger, + execute_search, filtered_universe, DefaultSearchLogger, SearchContext, SearchLogger, + VisualSearchLogger, }; use serde_json::Value; pub use thread_pool_no_abort::{PanicCatched, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; diff --git a/crates/milli/src/search/hybrid.rs b/crates/milli/src/search/hybrid.rs index c906e1eb7..a29b6c4c7 100644 --- a/crates/milli/src/search/hybrid.rs +++ b/crates/milli/src/search/hybrid.rs @@ -210,6 +210,7 @@ impl Search<'_> { scoring_strategy: ScoringStrategy::Detailed, words_limit: self.words_limit, 
exhaustive_number_hits: self.exhaustive_number_hits, + max_total_hits: self.max_total_hits, rtxn: self.rtxn, index: self.index, semantic: self.semantic.clone(), diff --git a/crates/milli/src/search/mod.rs b/crates/milli/src/search/mod.rs index 97d542524..8742db24d 100644 --- a/crates/milli/src/search/mod.rs +++ b/crates/milli/src/search/mod.rs @@ -9,6 +9,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, Filter, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::new::matches::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords}; use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats}; +use crate::documents::GeoSortParameter; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::index::MatchingStrategy; use crate::score_details::{ScoreDetails, ScoringStrategy}; @@ -47,11 +48,12 @@ pub struct Search<'a> { sort_criteria: Option>, distinct: Option, searchable_attributes: Option<&'a [String]>, - geo_param: new::GeoSortParameter, + geo_param: GeoSortParameter, terms_matching_strategy: TermsMatchingStrategy, scoring_strategy: ScoringStrategy, words_limit: usize, exhaustive_number_hits: bool, + max_total_hits: Option, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, semantic: Option, @@ -70,10 +72,11 @@ impl<'a> Search<'a> { sort_criteria: None, distinct: None, searchable_attributes: None, - geo_param: new::GeoSortParameter::default(), + geo_param: GeoSortParameter::default(), terms_matching_strategy: TermsMatchingStrategy::default(), scoring_strategy: Default::default(), exhaustive_number_hits: false, + max_total_hits: None, words_limit: 10, rtxn, index, @@ -147,7 +150,7 @@ impl<'a> Search<'a> { } #[cfg(test)] - pub fn geo_sort_strategy(&mut self, strategy: new::GeoSortStrategy) -> &mut Search<'a> { + pub fn geo_sort_strategy(&mut self, strategy: crate::GeoSortStrategy) -> &mut Search<'a> { self.geo_param.strategy = strategy; self } @@ -165,6 +168,11 @@ impl<'a> Search<'a> 
{ self } + pub fn max_total_hits(&mut self, max_total_hits: Option) -> &mut Search<'a> { + self.max_total_hits = max_total_hits; + self + } + pub fn time_budget(&mut self, time_budget: TimeBudget) -> &mut Search<'a> { self.time_budget = time_budget; self @@ -243,6 +251,8 @@ impl<'a> Search<'a> { &mut ctx, vector, self.scoring_strategy, + self.exhaustive_number_hits, + self.max_total_hits, universe, &self.sort_criteria, &self.distinct, @@ -261,6 +271,7 @@ impl<'a> Search<'a> { self.terms_matching_strategy, self.scoring_strategy, self.exhaustive_number_hits, + self.max_total_hits, universe, &self.sort_criteria, &self.distinct, @@ -314,6 +325,7 @@ impl fmt::Debug for Search<'_> { scoring_strategy, words_limit, exhaustive_number_hits, + max_total_hits, rtxn: _, index: _, semantic, @@ -333,6 +345,7 @@ impl fmt::Debug for Search<'_> { .field("terms_matching_strategy", terms_matching_strategy) .field("scoring_strategy", scoring_strategy) .field("exhaustive_number_hits", exhaustive_number_hits) + .field("max_total_hits", max_total_hits) .field("words_limit", words_limit) .field( "semantic.embedder_name", diff --git a/crates/milli/src/search/new/bucket_sort.rs b/crates/milli/src/search/new/bucket_sort.rs index 3c26cad5c..645d36e16 100644 --- a/crates/milli/src/search/new/bucket_sort.rs +++ b/crates/milli/src/search/new/bucket_sort.rs @@ -32,6 +32,8 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( logger: &mut dyn SearchLogger, time_budget: TimeBudget, ranking_score_threshold: Option, + exhaustive_number_hits: bool, + max_total_hits: Option, ) -> Result { logger.initial_query(query); logger.ranking_rules(&ranking_rules); @@ -159,7 +161,13 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( }; } - while valid_docids.len() < length { + let max_len_to_evaluate = + match (max_total_hits, exhaustive_number_hits && ranking_score_threshold.is_some()) { + (Some(max_total_hits), true) => max_total_hits, + _ => length, + }; + + while valid_docids.len() < max_len_to_evaluate { 
if time_budget.exceeded() { loop { let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]); diff --git a/crates/milli/src/search/new/distinct.rs b/crates/milli/src/search/new/distinct.rs index 36172302a..455b495f5 100644 --- a/crates/milli/src/search/new/distinct.rs +++ b/crates/milli/src/search/new/distinct.rs @@ -82,7 +82,7 @@ fn facet_value_docids( } /// Return an iterator over each number value in the given field of the given document. -fn facet_number_values<'a>( +pub(crate) fn facet_number_values<'a>( docid: u32, field_id: u16, index: &Index, @@ -118,7 +118,7 @@ pub fn facet_string_values<'a>( } #[allow(clippy::drop_non_drop)] -fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] { +pub(crate) fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] { concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) } diff --git a/crates/milli/src/search/new/geo_sort.rs b/crates/milli/src/search/new/geo_sort.rs index 3e7fe3458..6c7d7b03b 100644 --- a/crates/milli/src/search/new/geo_sort.rs +++ b/crates/milli/src/search/new/geo_sort.rs @@ -1,96 +1,18 @@ use std::collections::VecDeque; -use heed::types::{Bytes, Unit}; -use heed::{RoPrefix, RoTxn}; use roaring::RoaringBitmap; use rstar::RTree; -use super::facet_string_values; use super::ranking_rules::{RankingRule, RankingRuleOutput, RankingRuleQueryTrait}; -use crate::heed_codec::facet::{FieldDocIdFacetCodec, OrderedF64Codec}; +use crate::documents::geo_sort::{fill_cache, next_bucket}; +use crate::documents::{GeoSortParameter, GeoSortStrategy}; use crate::score_details::{self, ScoreDetails}; -use crate::{ - distance_between_two_points, lat_lng_to_xyz, GeoPoint, Index, Result, SearchContext, - SearchLogger, -}; - -const FID_SIZE: usize = 2; -const DOCID_SIZE: usize = 4; - -#[allow(clippy::drop_non_drop)] -fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] { - 
concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) -} - -/// Return an iterator over each number value in the given field of the given document. -fn facet_number_values<'a>( - docid: u32, - field_id: u16, - index: &Index, - txn: &'a RoTxn<'a>, -) -> Result, Unit>> { - let key = facet_values_prefix_key(field_id, docid); - - let iter = index - .field_id_docid_facet_f64s - .remap_key_type::() - .prefix_iter(txn, &key)? - .remap_key_type(); - - Ok(iter) -} - -#[derive(Debug, Clone, Copy)] -pub struct Parameter { - // Define the strategy used by the geo sort - pub strategy: Strategy, - // Limit the number of docs in a single bucket to avoid unexpectedly large overhead - pub max_bucket_size: u64, - // Considering the errors of GPS and geographical calculations, distances less than distance_error_margin will be treated as equal - pub distance_error_margin: f64, -} - -impl Default for Parameter { - fn default() -> Self { - Self { strategy: Strategy::default(), max_bucket_size: 1000, distance_error_margin: 1.0 } - } -} -/// Define the strategy used by the geo sort. -/// The parameter represents the cache size, and, in the case of the Dynamic strategy, -/// the point where we move from using the iterative strategy to the rtree. 
-#[derive(Debug, Clone, Copy)] -pub enum Strategy { - AlwaysIterative(usize), - AlwaysRtree(usize), - Dynamic(usize), -} - -impl Default for Strategy { - fn default() -> Self { - Strategy::Dynamic(1000) - } -} - -impl Strategy { - pub fn use_rtree(&self, candidates: usize) -> bool { - match self { - Strategy::AlwaysIterative(_) => false, - Strategy::AlwaysRtree(_) => true, - Strategy::Dynamic(i) => candidates >= *i, - } - } - - pub fn cache_size(&self) -> usize { - match self { - Strategy::AlwaysIterative(i) | Strategy::AlwaysRtree(i) | Strategy::Dynamic(i) => *i, - } - } -} +use crate::{GeoPoint, Result, SearchContext, SearchLogger}; pub struct GeoSort { query: Option, - strategy: Strategy, + strategy: GeoSortStrategy, ascending: bool, point: [f64; 2], field_ids: Option<[u16; 2]>, @@ -107,12 +29,12 @@ pub struct GeoSort { impl GeoSort { pub fn new( - parameter: Parameter, + parameter: GeoSortParameter, geo_faceted_docids: RoaringBitmap, point: [f64; 2], ascending: bool, ) -> Result { - let Parameter { strategy, max_bucket_size, distance_error_margin } = parameter; + let GeoSortParameter { strategy, max_bucket_size, distance_error_margin } = parameter; Ok(Self { query: None, strategy, @@ -134,98 +56,22 @@ impl GeoSort { ctx: &mut SearchContext<'_>, geo_candidates: &RoaringBitmap, ) -> Result<()> { - debug_assert!(self.field_ids.is_some(), "fill_buffer can't be called without the lat&lng"); - debug_assert!(self.cached_sorted_docids.is_empty()); - - // lazily initialize the rtree if needed by the strategy, and cache it in `self.rtree` - let rtree = if self.strategy.use_rtree(geo_candidates.len() as usize) { - if let Some(rtree) = self.rtree.as_ref() { - // get rtree from cache - Some(rtree) - } else { - let rtree = ctx.index.geo_rtree(ctx.txn)?.expect("geo candidates but no rtree"); - // insert rtree in cache and returns it. - // Can't use `get_or_insert_with` because getting the rtree from the DB is a fallible operation. 
- Some(&*self.rtree.insert(rtree)) - } - } else { - None - }; - - let cache_size = self.strategy.cache_size(); - if let Some(rtree) = rtree { - if self.ascending { - let point = lat_lng_to_xyz(&self.point); - for point in rtree.nearest_neighbor_iter(&point) { - if geo_candidates.contains(point.data.0) { - self.cached_sorted_docids.push_back(point.data); - if self.cached_sorted_docids.len() >= cache_size { - break; - } - } - } - } else { - // in the case of the desc geo sort we look for the closest point to the opposite of the queried point - // and we insert the points in reverse order they get reversed when emptying the cache later on - let point = lat_lng_to_xyz(&opposite_of(self.point)); - for point in rtree.nearest_neighbor_iter(&point) { - if geo_candidates.contains(point.data.0) { - self.cached_sorted_docids.push_front(point.data); - if self.cached_sorted_docids.len() >= cache_size { - break; - } - } - } - } - } else { - // the iterative version - let [lat, lng] = self.field_ids.unwrap(); - - let mut documents = geo_candidates - .iter() - .map(|id| -> Result<_> { Ok((id, geo_value(id, lat, lng, ctx.index, ctx.txn)?)) }) - .collect::>>()?; - // computing the distance between two points is expensive thus we cache the result - documents - .sort_by_cached_key(|(_, p)| distance_between_two_points(&self.point, p) as usize); - self.cached_sorted_docids.extend(documents); - }; + fill_cache( + ctx.index, + ctx.txn, + self.strategy, + self.ascending, + self.point, + &self.field_ids, + &mut self.rtree, + geo_candidates, + &mut self.cached_sorted_docids, + )?; Ok(()) } } -/// Extracts the lat and long values from a single document. -/// -/// If it is not able to find it in the facet number index it will extract it -/// from the facet string index and parse it as f64 (as the geo extraction behaves). 
-fn geo_value( - docid: u32, - field_lat: u16, - field_lng: u16, - index: &Index, - rtxn: &RoTxn<'_>, -) -> Result<[f64; 2]> { - let extract_geo = |geo_field: u16| -> Result { - match facet_number_values(docid, geo_field, index, rtxn)?.next() { - Some(Ok(((_, _, geo), ()))) => Ok(geo), - Some(Err(e)) => Err(e.into()), - None => match facet_string_values(docid, geo_field, index, rtxn)?.next() { - Some(Ok((_, geo))) => { - Ok(geo.parse::().expect("cannot parse geo field as f64")) - } - Some(Err(e)) => Err(e.into()), - None => panic!("A geo faceted document doesn't contain any lat or lng"), - }, - } - }; - - let lat = extract_geo(field_lat)?; - let lng = extract_geo(field_lng)?; - - Ok([lat, lng]) -} - impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { fn id(&self) -> String { "geo_sort".to_owned() @@ -267,124 +113,33 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { ) -> Result>> { let query = self.query.as_ref().unwrap().clone(); - let mut geo_candidates = &self.geo_candidates & universe; - - if geo_candidates.is_empty() { - return Ok(Some(RankingRuleOutput { + next_bucket( + ctx.index, + ctx.txn, + universe, + self.ascending, + self.point, + &self.field_ids, + &mut self.rtree, + &mut self.cached_sorted_docids, + &self.geo_candidates, + GeoSortParameter { + strategy: self.strategy, + max_bucket_size: self.max_bucket_size, + distance_error_margin: self.distance_error_margin, + }, + ) + .map(|o| { + o.map(|(candidates, point)| RankingRuleOutput { query, - candidates: universe.clone(), + candidates, score: ScoreDetails::GeoSort(score_details::GeoSort { target_point: self.point, ascending: self.ascending, - value: None, + value: point, }), - })); - } - - let ascending = self.ascending; - let next = |cache: &mut VecDeque<_>| { - if ascending { - cache.pop_front() - } else { - cache.pop_back() - } - }; - let put_back = |cache: &mut VecDeque<_>, x: _| { - if ascending { - cache.push_front(x) - } else { - cache.push_back(x) - } 
- }; - - let mut current_bucket = RoaringBitmap::new(); - // current_distance stores the first point and distance in current bucket - let mut current_distance: Option<([f64; 2], f64)> = None; - loop { - // The loop will only exit when we have found all points with equal distance or have exhausted the candidates. - if let Some((id, point)) = next(&mut self.cached_sorted_docids) { - if geo_candidates.contains(id) { - let distance = distance_between_two_points(&self.point, &point); - if let Some((point0, bucket_distance)) = current_distance.as_ref() { - if (bucket_distance - distance).abs() > self.distance_error_margin { - // different distance, point belongs to next bucket - put_back(&mut self.cached_sorted_docids, (id, point)); - return Ok(Some(RankingRuleOutput { - query, - candidates: current_bucket, - score: ScoreDetails::GeoSort(score_details::GeoSort { - target_point: self.point, - ascending: self.ascending, - value: Some(point0.to_owned()), - }), - })); - } else { - // same distance, point belongs to current bucket - current_bucket.insert(id); - // remove from cadidates to prevent it from being added to the cache again - geo_candidates.remove(id); - // current bucket size reaches limit, force return - if current_bucket.len() == self.max_bucket_size { - return Ok(Some(RankingRuleOutput { - query, - candidates: current_bucket, - score: ScoreDetails::GeoSort(score_details::GeoSort { - target_point: self.point, - ascending: self.ascending, - value: Some(point0.to_owned()), - }), - })); - } - } - } else { - // first doc in current bucket - current_distance = Some((point, distance)); - current_bucket.insert(id); - geo_candidates.remove(id); - // current bucket size reaches limit, force return - if current_bucket.len() == self.max_bucket_size { - return Ok(Some(RankingRuleOutput { - query, - candidates: current_bucket, - score: ScoreDetails::GeoSort(score_details::GeoSort { - target_point: self.point, - ascending: self.ascending, - value: Some(point.to_owned()), - 
}), - })); - } - } - } - } else { - // cache exhausted, we need to refill it - self.fill_buffer(ctx, &geo_candidates)?; - - if self.cached_sorted_docids.is_empty() { - // candidates exhausted, exit - if let Some((point0, _)) = current_distance.as_ref() { - return Ok(Some(RankingRuleOutput { - query, - candidates: current_bucket, - score: ScoreDetails::GeoSort(score_details::GeoSort { - target_point: self.point, - ascending: self.ascending, - value: Some(point0.to_owned()), - }), - })); - } else { - return Ok(Some(RankingRuleOutput { - query, - candidates: universe.clone(), - score: ScoreDetails::GeoSort(score_details::GeoSort { - target_point: self.point, - ascending: self.ascending, - value: None, - }), - })); - } - } - } - } + }) + }) } #[tracing::instrument(level = "trace", skip_all, target = "search::geo_sort")] @@ -394,16 +149,3 @@ impl<'ctx, Q: RankingRuleQueryTrait> RankingRule<'ctx, Q> for GeoSort { self.cached_sorted_docids.clear(); } } - -/// Compute the antipodal coordinate of `coord` -fn opposite_of(mut coord: [f64; 2]) -> [f64; 2] { - coord[0] *= -1.; - // in the case of x,0 we want to return x,180 - if coord[1] > 0. 
{ - coord[1] -= 180.; - } else { - coord[1] += 180.; - } - - coord -} diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index 2d6f2cf17..66f65f5e5 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -510,6 +510,7 @@ mod tests { crate::TermsMatchingStrategy::default(), crate::score_details::ScoringStrategy::Skip, false, + None, universe, &None, &None, diff --git a/crates/milli/src/search/new/mod.rs b/crates/milli/src/search/new/mod.rs index a65b4076b..e22883839 100644 --- a/crates/milli/src/search/new/mod.rs +++ b/crates/milli/src/search/new/mod.rs @@ -1,7 +1,7 @@ mod bucket_sort; mod db_cache; mod distinct; -mod geo_sort; +pub(crate) mod geo_sort; mod graph_based_ranking_rule; mod interner; mod limits; @@ -46,14 +46,14 @@ use resolve_query_graph::{compute_query_graph_docids, PhraseDocIdsCache}; use roaring::RoaringBitmap; use sort::Sort; -use self::distinct::facet_string_values; +pub(crate) use self::distinct::{facet_string_values, facet_values_prefix_key}; use self::geo_sort::GeoSort; -pub use self::geo_sort::{Parameter as GeoSortParameter, Strategy as GeoSortStrategy}; use self::graph_based_ranking_rule::Words; use self::interner::Interned; use self::vector_sort::VectorSort; use crate::attribute_patterns::{match_pattern, PatternMatch}; use crate::constants::RESERVED_GEO_FIELD_NAME; +use crate::documents::GeoSortParameter; use crate::index::PrefixSearch; use crate::localized_attributes_rules::LocalizedFieldIds; use crate::score_details::{ScoreDetails, ScoringStrategy}; @@ -319,7 +319,7 @@ fn resolve_negative_phrases( fn get_ranking_rules_for_placeholder_search<'ctx>( ctx: &SearchContext<'ctx>, sort_criteria: &Option>, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, ) -> Result>> { let mut sort = false; let mut sorted_fields = HashSet::new(); @@ -371,7 +371,7 @@ fn get_ranking_rules_for_placeholder_search<'ctx>( fn 
get_ranking_rules_for_vector<'ctx>( ctx: &SearchContext<'ctx>, sort_criteria: &Option>, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, limit_plus_offset: usize, target: &[f32], embedder_name: &str, @@ -448,7 +448,7 @@ fn get_ranking_rules_for_vector<'ctx>( fn get_ranking_rules_for_query_graph_search<'ctx>( ctx: &SearchContext<'ctx>, sort_criteria: &Option>, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, terms_matching_strategy: TermsMatchingStrategy, ) -> Result>> { // query graph search @@ -559,7 +559,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( ranking_rules: &mut Vec>, sorted_fields: &mut HashSet, geo_sorted: &mut bool, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, ) -> Result<()> { let sort_criteria = sort_criteria.clone().unwrap_or_default(); ranking_rules.reserve(sort_criteria.len()); @@ -626,10 +626,12 @@ pub fn execute_vector_search( ctx: &mut SearchContext<'_>, vector: &[f32], scoring_strategy: ScoringStrategy, + exhaustive_number_hits: bool, + max_total_hits: Option, universe: RoaringBitmap, sort_criteria: &Option>, distinct: &Option, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, from: usize, length: usize, embedder_name: &str, @@ -669,6 +671,8 @@ pub fn execute_vector_search( placeholder_search_logger, time_budget, ranking_score_threshold, + exhaustive_number_hits, + max_total_hits, )?; Ok(PartialSearchResult { @@ -689,10 +693,11 @@ pub fn execute_search( terms_matching_strategy: TermsMatchingStrategy, scoring_strategy: ScoringStrategy, exhaustive_number_hits: bool, + max_total_hits: Option, mut universe: RoaringBitmap, sort_criteria: &Option>, distinct: &Option, - geo_param: geo_sort::Parameter, + geo_param: GeoSortParameter, from: usize, length: usize, words_limit: Option, @@ -825,6 +830,8 @@ pub fn execute_search( query_graph_logger, time_budget, ranking_score_threshold, + exhaustive_number_hits, + max_total_hits, )? 
} else { let ranking_rules = @@ -841,6 +848,8 @@ pub fn execute_search( placeholder_search_logger, time_budget, ranking_score_threshold, + exhaustive_number_hits, + max_total_hits, )? }; @@ -872,7 +881,7 @@ pub fn execute_search( }) } -fn check_sort_criteria( +pub(crate) fn check_sort_criteria( ctx: &SearchContext<'_>, sort_criteria: Option<&Vec>, ) -> Result<()> { @@ -902,7 +911,7 @@ fn check_sort_criteria( let (valid_fields, hidden_fields) = ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?; - return Err(UserError::InvalidSortableAttribute { + return Err(UserError::InvalidSearchSortableAttribute { field: field.to_string(), valid_fields, hidden_fields, @@ -913,7 +922,7 @@ fn check_sort_criteria( let (valid_fields, hidden_fields) = ctx.index.remove_hidden_fields(ctx.txn, sortable_fields)?; - return Err(UserError::InvalidSortableAttribute { + return Err(UserError::InvalidSearchSortableAttribute { field: RESERVED_GEO_FIELD_NAME.to_string(), valid_fields, hidden_fields, diff --git a/crates/milli/src/update/chat.rs b/crates/milli/src/update/chat.rs index 2f364894d..a6c0b3fbc 100644 --- a/crates/milli/src/update/chat.rs +++ b/crates/milli/src/update/chat.rs @@ -93,7 +93,7 @@ pub struct ChatSearchParams { pub hybrid: Setting, #[serde(default, skip_serializing_if = "Setting::is_not_set")] - #[deserr(default = Setting::Set(20))] + #[deserr(default)] #[schema(value_type = Option)] pub limit: Setting, diff --git a/crates/milli/src/update/clear_documents.rs b/crates/milli/src/update/clear_documents.rs index 01631e9a3..84eeca7f9 100644 --- a/crates/milli/src/update/clear_documents.rs +++ b/crates/milli/src/update/clear_documents.rs @@ -2,7 +2,7 @@ use heed::RwTxn; use roaring::RoaringBitmap; use time::OffsetDateTime; -use crate::{FieldDistribution, Index, Result}; +use crate::{database_stats::DatabaseStats, FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'i> { wtxn: &'t mut RwTxn<'i>, @@ -92,6 +92,10 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { 
documents.clear(self.wtxn)?; + // Update the stats of the documents database after clearing all documents. + let stats = DatabaseStats::new(self.index.documents.remap_data_type(), self.wtxn)?; + self.index.put_documents_stats(self.wtxn, stats)?; + Ok(number_of_documents) } } @@ -122,6 +126,9 @@ mod tests { let rtxn = index.read_txn().unwrap(); + // Variables for statistics verification + let stats = index.documents_stats(&rtxn).unwrap().unwrap(); + // the value is 7 because there is `[id, name, age, country, _geo, _geo.lng, _geo.lat]` assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 7); @@ -142,5 +149,9 @@ mod tests { assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); assert!(index.field_id_docid_facet_strings.is_empty(&rtxn).unwrap()); assert!(index.documents.is_empty(&rtxn).unwrap()); + + // Verify that the statistics are correctly updated after clearing documents + assert_eq!(index.number_of_documents(&rtxn).unwrap(), 0); + assert_eq!(stats.number_of_entries(), 0); } } diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 064cfd154..a1dfa1aad 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -23,7 +23,7 @@ use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; -use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta}; +use crate::vector::db::{EmbedderInfo, EmbeddingStatusDelta}; use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution}; use crate::vector::extractor::{Extractor, ExtractorDiff, RequestFragmentExtractor}; use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState}; @@ -441,6 +441,8 @@ pub fn 
extract_vector_points( { let embedder_is_manual = matches!(*runtime.embedder, Embedder::UserProvided(_)); + let (old_is_user_provided, old_must_regenerate) = + embedder_info.embedding_status.is_user_provided_must_regenerate(docid); let (old, new) = parsed_vectors.remove(embedder_name); let new_must_regenerate = new.must_regenerate(); let delta = match action { @@ -499,16 +501,19 @@ pub fn extract_vector_points( let is_adding_fragments = has_fragments && !old_has_fragments; - if is_adding_fragments { + if !has_fragments { + // removing fragments + regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)? + } else if is_adding_fragments || + // regenerate all fragments when going from user provided to ! user provided + old_is_user_provided + { regenerate_all_fragments( runtime.fragments(), &doc_alloc, new_fields_ids_map, obkv, ) - } else if !has_fragments { - // removing fragments - regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)? } else { let mut fragment_diff = Vec::new(); let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); @@ -600,7 +605,8 @@ pub fn extract_vector_points( docid, &delta, new_must_regenerate, - &embedder_info.embedding_status, + old_is_user_provided, + old_must_regenerate, ); // and we finally push the unique vectors into the writer @@ -657,10 +663,9 @@ fn push_embedding_status_delta( docid: DocumentId, delta: &VectorStateDelta, new_must_regenerate: bool, - embedding_status: &EmbeddingStatus, + old_is_user_provided: bool, + old_must_regenerate: bool, ) { - let (old_is_user_provided, old_must_regenerate) = - embedding_status.is_user_provided_must_regenerate(docid); let new_is_user_provided = match delta { VectorStateDelta::NoChange => old_is_user_provided, VectorStateDelta::NowRemoved => { diff --git a/crates/milli/src/update/indexer_config.rs b/crates/milli/src/update/indexer_config.rs index a0f901818..845da5a51 100644 --- a/crates/milli/src/update/indexer_config.rs +++ 
b/crates/milli/src/update/indexer_config.rs @@ -16,6 +16,7 @@ pub struct IndexerConfig { pub max_positions_per_attributes: Option, pub skip_index_budget: bool, pub experimental_no_edition_2024_for_settings: bool, + pub experimental_no_edition_2024_for_dumps: bool, } impl IndexerConfig { @@ -65,6 +66,7 @@ impl Default for IndexerConfig { max_positions_per_attributes: None, skip_index_budget: false, experimental_no_edition_2024_for_settings: false, + experimental_no_edition_2024_for_dumps: false, } } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 4ca68027c..71fa9bf09 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -620,12 +620,35 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { where 'a: 'doc, { - match &mut self.kind { - ChunkType::Fragments { fragments: _, session } => { - let doc_alloc = session.doc_alloc(); + self.set_status(docid, old_is_user_provided, true, false, true); - if old_is_user_provided | full_reindex { + match &mut self.kind { + ChunkType::Fragments { fragments, session } => { + let doc_alloc = session.doc_alloc(); + let reindex_all_fragments = + // when the vectors were user-provided, Meilisearch cannot know if they come from a particular fragment, + // and so Meilisearch needs to clear all embeddings in that case. + // Fortunately, as dump export fragment vector with `regenerate` set to `false`, + // this case should be rare and opt-in. 
+ old_is_user_provided || + // full-reindex case + full_reindex; + + if reindex_all_fragments { session.on_embed_mut().clear_vectors(docid); + let extractors = fragments.iter().map(|fragment| { + RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors() + }); + insert_autogenerated( + docid, + external_docid, + extractors, + document, + &(), + session, + unused_vectors_distribution, + )?; + return Ok(()); } settings_delta.try_for_each_fragment_diff( @@ -669,7 +692,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { Result::Ok(()) }, )?; - self.set_status(docid, old_is_user_provided, true, false, true); } ChunkType::DocumentTemplate { document_template, session } => { let doc_alloc = session.doc_alloc(); @@ -690,12 +712,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? { ExtractorDiff::Removed => { + if old_is_user_provided || full_reindex { + session.on_embed_mut().clear_vectors(docid); + } OnEmbed::process_embedding_response( session.on_embed_mut(), crate::vector::session::EmbeddingResponse { metadata, embedding: None }, ); } ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { + if old_is_user_provided || full_reindex { + session.on_embed_mut().clear_vectors(docid); + } session.request_embedding(metadata, input, unused_vectors_distribution)?; } ExtractorDiff::Unchanged => { /* do nothing */ } @@ -722,6 +750,13 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { where 'a: 'doc, { + self.set_status( + docid, + old_is_user_provided, + old_must_regenerate, + false, + new_must_regenerate, + ); match &mut self.kind { ChunkType::DocumentTemplate { document_template, session } => { let doc_alloc = session.doc_alloc(); @@ -731,10 +766,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_fields_ids_map, ); - if old_is_user_provided { - session.on_embed_mut().clear_vectors(docid); - } - update_autogenerated( docid, external_docid, @@ -743,6 
+774,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_document, &external_docid, old_must_regenerate, + old_is_user_provided, session, unused_vectors_distribution, )? @@ -754,7 +786,21 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { }); if old_is_user_provided { + // when the document was `userProvided`, Meilisearch cannot know whose fragments a particular + // vector was referring to. + // So as a result Meilisearch will regenerate all fragments on this case. + // Fortunately, since dumps for fragments set regenerate to false, this case should be rare. session.on_embed_mut().clear_vectors(docid); + insert_autogenerated( + docid, + external_docid, + extractors, + new_document, + &(), + session, + unused_vectors_distribution, + )?; + return Ok(()); } update_autogenerated( @@ -765,25 +811,18 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_document, &(), old_must_regenerate, + false, session, unused_vectors_distribution, )? } }; - self.set_status( - docid, - old_is_user_provided, - old_must_regenerate, - false, - new_must_regenerate, - ); - Ok(()) } #[allow(clippy::too_many_arguments)] - pub fn insert_autogenerated + Debug>( + pub fn insert_autogenerated<'doc, D: Document<'doc> + Debug>( &mut self, docid: DocumentId, external_docid: &'a str, @@ -791,7 +830,10 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { new_fields_ids_map: &'a RefCell, unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, new_must_regenerate: bool, - ) -> Result<()> { + ) -> Result<()> + where + 'a: 'doc, + { let (default_is_user_provided, default_must_regenerate) = (false, true); self.set_status( docid, @@ -956,6 +998,7 @@ fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>( new_document: ND, meta: &E::DocumentMetadata, old_must_regenerate: bool, + mut must_clear_on_generation: bool, session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, ) -> Result<()> 
@@ -984,6 +1027,11 @@ where }; if must_regenerate { + if must_clear_on_generation { + must_clear_on_generation = false; + session.on_embed_mut().clear_vectors(docid); + } + let metadata = Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; @@ -1002,7 +1050,7 @@ where Ok(()) } -fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>( +fn insert_autogenerated<'doc, 'a: 'doc, 'b, E, D: Document<'doc> + Debug>( docid: DocumentId, external_docid: &'a str, extractors: impl IntoIterator, diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 911f51865..bca8fbc59 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -101,6 +101,10 @@ impl Setting { matches!(self, Self::NotSet) } + pub const fn is_reset(&self) -> bool { + matches!(self, Self::Reset) + } + /// If `Self` is `Reset`, then map self to `Set` with the provided `val`. pub fn or_reset(self, val: T) -> Self { match self { @@ -554,10 +558,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { match self.searchable_fields { Setting::Set(ref fields) => { // Check to see if the searchable fields changed before doing anything else - let old_fields = self.index.searchable_fields(self.wtxn)?; + let old_fields = self.index.user_defined_searchable_fields(self.wtxn)?; let did_change = { let new_fields = fields.iter().map(String::as_str).collect::>(); - new_fields != old_fields + old_fields.is_none_or(|old| new_fields != old) }; if !did_change { return Ok(false); @@ -1213,6 +1217,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // new config EitherOrBoth::Right((name, mut setting)) => { tracing::debug!(embedder = name, "new embedder"); + // if we are asked to reset an embedder that doesn't exist, just ignore it + if setting.is_reset() { + continue; + } // apply the default source in case the source was not set so that it gets validated crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting); 
crate::vector::settings::EmbeddingSettings::apply_default_openai_model( diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs index 9f64ca0e3..f53319a37 100644 --- a/crates/milli/src/update/upgrade/mod.rs +++ b/crates/milli/src/update/upgrade/mod.rs @@ -2,6 +2,7 @@ mod v1_12; mod v1_13; mod v1_14; mod v1_15; +mod v1_16; use heed::RwTxn; use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; use v1_13::{V1_13_0_To_V1_13_1, V1_13_1_To_Latest_V1_13}; @@ -10,6 +11,7 @@ use v1_15::Latest_V1_14_To_Latest_V1_15; use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; use crate::progress::{Progress, VariableNameStep}; +use crate::update::upgrade::v1_16::Latest_V1_15_To_V1_16_0; use crate::{Index, InternalError, Result}; trait UpgradeIndex { @@ -24,6 +26,59 @@ trait UpgradeIndex { fn target_version(&self) -> (u32, u32, u32); } +const UPGRADE_FUNCTIONS: &[&dyn UpgradeIndex] = &[ + &V1_12_To_V1_12_3 {}, + &V1_12_3_To_V1_13_0 {}, + &V1_13_0_To_V1_13_1 {}, + &V1_13_1_To_Latest_V1_13 {}, + &Latest_V1_13_To_Latest_V1_14 {}, + &Latest_V1_14_To_Latest_V1_15 {}, + &Latest_V1_15_To_V1_16_0 {}, + // This is the last upgrade function, it will be called when the index is up to date. + // any other upgrade function should be added before this one. + &ToCurrentNoOp {}, +]; + +/// Causes a compile-time error if the argument is not in range of `0..UPGRADE_FUNCTIONS.len()` +macro_rules! function_index { + ($start:expr) => {{ + const _CHECK_INDEX: () = { + if $start >= $crate::update::upgrade::UPGRADE_FUNCTIONS.len() { + panic!("upgrade functions out of range") + } + }; + + $start + }}; +} + +const fn start(from: (u32, u32, u32)) -> Option { + let start = match from { + (1, 12, 0..=2) => function_index!(0), + (1, 12, 3..) 
=> function_index!(1), + (1, 13, 0) => function_index!(2), + (1, 13, _) => function_index!(4), + (1, 14, _) => function_index!(5), + // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. + (1, 15, _) => function_index!(6), + (1, 16, _) => function_index!(7), + // We deliberately don't add a placeholder with (VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH) here to force manually + // considering dumpless upgrade. + (_major, _minor, _patch) => return None, + }; + + Some(start) +} + +/// Causes a compile-time error if the latest package cannot be upgraded. +/// +/// This serves as a reminder to consider the proper dumpless upgrade implementation when changing the package version. +const _CHECK_PACKAGE_CAN_UPGRADE: () = { + if start((VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH)).is_none() { + panic!("cannot upgrade from latest package version") + } +}; + /// Return true if the cached stats of the index must be regenerated pub fn upgrade( wtxn: &mut RwTxn, @@ -36,33 +91,12 @@ where MSP: Fn() -> bool + Sync, { let from = index.get_version(wtxn)?.unwrap_or(db_version); - let upgrade_functions: &[&dyn UpgradeIndex] = &[ - &V1_12_To_V1_12_3 {}, - &V1_12_3_To_V1_13_0 {}, - &V1_13_0_To_V1_13_1 {}, - &V1_13_1_To_Latest_V1_13 {}, - &Latest_V1_13_To_Latest_V1_14 {}, - &Latest_V1_14_To_Latest_V1_15 {}, - // This is the last upgrade function, it will be called when the index is up to date. - // any other upgrade function should be added before this one. - &ToCurrentNoOp {}, - ]; - let start = match from { - (1, 12, 0..=2) => 0, - (1, 12, 3..) => 1, - (1, 13, 0) => 2, - (1, 13, _) => 4, - (1, 14, _) => 5, - // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. 
- (1, 15, _) => 6, - (major, minor, patch) => { - return Err(InternalError::CannotUpgradeToVersion(major, minor, patch).into()) - } - }; + let start = + start(from).ok_or_else(|| InternalError::CannotUpgradeToVersion(from.0, from.1, from.2))?; enum UpgradeVersion {} - let upgrade_path = &upgrade_functions[start..]; + let upgrade_path = &UPGRADE_FUNCTIONS[start..]; let mut current_version = from; let mut regenerate_stats = false; diff --git a/crates/milli/src/update/upgrade/v1_15.rs b/crates/milli/src/update/upgrade/v1_15.rs index cea4783a1..3457e69ba 100644 --- a/crates/milli/src/update/upgrade/v1_15.rs +++ b/crates/milli/src/update/upgrade/v1_15.rs @@ -1,4 +1,6 @@ use heed::RwTxn; +use roaring::RoaringBitmap; +use serde::Deserialize; use super::UpgradeIndex; use crate::progress::Progress; @@ -26,3 +28,14 @@ impl UpgradeIndex for Latest_V1_14_To_Latest_V1_15 { (1, 15, 0) } } + +/// Parts of v1.15 `IndexingEmbeddingConfig` that are relevant for upgrade to v1.16 +/// +/// # Warning +/// +/// This object should not be rewritten to the DB, only read to get the name and `user_provided` roaring. 
+#[derive(Debug, Deserialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub user_provided: RoaringBitmap, +} diff --git a/crates/milli/src/update/upgrade/v1_16.rs b/crates/milli/src/update/upgrade/v1_16.rs new file mode 100644 index 000000000..f43efd77d --- /dev/null +++ b/crates/milli/src/update/upgrade/v1_16.rs @@ -0,0 +1,48 @@ +use heed::types::{SerdeJson, Str}; +use heed::RwTxn; + +use super::UpgradeIndex; +use crate::progress::Progress; +use crate::vector::db::{EmbedderInfo, EmbeddingStatus}; +use crate::{Index, InternalError, Result}; + +#[allow(non_camel_case_types)] +pub(super) struct Latest_V1_15_To_V1_16_0(); + +impl UpgradeIndex for Latest_V1_15_To_V1_16_0 { + fn upgrade( + &self, + wtxn: &mut RwTxn, + index: &Index, + _original: (u32, u32, u32), + _progress: Progress, + ) -> Result { + let v1_15_indexing_configs = index + .main + .remap_types::>>() + .get(wtxn, crate::index::main_key::EMBEDDING_CONFIGS)? + .unwrap_or_default(); + + let embedders = index.embedding_configs(); + for config in v1_15_indexing_configs { + let embedder_id = embedders.embedder_id(wtxn, &config.name)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + let info = EmbedderInfo { + embedder_id, + // v1.15 used not to make a difference between `user_provided` and `! regenerate`. 
+ embedding_status: EmbeddingStatus::from_user_provided(config.user_provided), + }; + embedders.put_embedder_info(wtxn, &config.name, &info)?; + } + + Ok(false) + } + + fn target_version(&self) -> (u32, u32, u32) { + (1, 16, 0) + } +} diff --git a/crates/milli/src/vector/composite.rs b/crates/milli/src/vector/composite.rs index 8314b8649..2e31da094 100644 --- a/crates/milli/src/vector/composite.rs +++ b/crates/milli/src/vector/composite.rs @@ -59,12 +59,24 @@ pub struct EmbedderOptions { impl Embedder { pub fn new( - EmbedderOptions { search, index }: EmbedderOptions, + EmbedderOptions { search: search_options, index: index_options }: EmbedderOptions, cache_cap: usize, ) -> Result { - let search = SubEmbedder::new(search, cache_cap)?; + // don't check similarity if one child is a rest embedder with fragments + // FIXME: skipping the check isn't ideal but we are unsure how to handle fragments in this context + let mut skip_similarity_check = false; + for options in [&search_options, &index_options] { + if let SubEmbedderOptions::Rest(options) = &options { + if !options.search_fragments.is_empty() || !options.indexing_fragments.is_empty() { + skip_similarity_check = true; + break; + } + } + } + + let search = SubEmbedder::new(search_options, cache_cap)?; // cache is only used at search - let index = SubEmbedder::new(index, 0)?; + let index = SubEmbedder::new(index_options, 0)?; // check dimensions if search.dimensions() != index.dimensions() { @@ -73,7 +85,12 @@ impl Embedder { index.dimensions(), )); } + // check similarity + if skip_similarity_check { + return Ok(Self { search, index }); + } + let search_embeddings = search .embed( vec![ diff --git a/crates/milli/src/vector/db.rs b/crates/milli/src/vector/db.rs index 0e890fac9..2fea75d68 100644 --- a/crates/milli/src/vector/db.rs +++ b/crates/milli/src/vector/db.rs @@ -117,6 +117,13 @@ impl EmbeddingStatus { Default::default() } + /// Create a new `EmbeddingStatus` that assumes that any `user_provided` docid is 
also skipping regenerate. + /// + /// Used for migration from v1.15 and earlier DBs. + pub(crate) fn from_user_provided(user_provided: RoaringBitmap) -> Self { + Self { user_provided, skip_regenerate_different_from_user_provided: Default::default() } + } + /// Whether the document contains user-provided vectors for that embedder. pub fn is_user_provided(&self, docid: DocumentId) -> bool { self.user_provided.contains(docid) diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index f64223e41..873693a34 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -841,6 +841,25 @@ impl EmbedderOptions { } } } + + pub fn has_fragments(&self) -> bool { + match &self { + EmbedderOptions::HuggingFace(_) + | EmbedderOptions::OpenAi(_) + | EmbedderOptions::Ollama(_) + | EmbedderOptions::UserProvided(_) => false, + EmbedderOptions::Rest(embedder_options) => { + !embedder_options.indexing_fragments.is_empty() + } + EmbedderOptions::Composite(embedder_options) => { + if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index { + !embedder_options.indexing_fragments.is_empty() + } else { + false + } + } + } + } } impl Default for EmbedderOptions {