From e74c3b692abbd64531bf11dd997f28dfe053d4e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 12 Jun 2025 16:23:48 +0200 Subject: [PATCH 001/101] Introduce a new route to export documents and enqueue the export task --- crates/dump/src/lib.rs | 14 +++ crates/index-scheduler/src/dump.rs | 14 +++ crates/index-scheduler/src/insta_snapshot.rs | 3 + crates/index-scheduler/src/processing.rs | 9 ++ .../src/scheduler/autobatcher.rs | 1 + .../src/scheduler/create_batch.rs | 29 ++++- .../src/scheduler/process_batch.rs | 24 +++- crates/index-scheduler/src/utils.rs | 9 ++ crates/meilisearch-types/src/error.rs | 5 + crates/meilisearch-types/src/keys.rs | 5 + crates/meilisearch-types/src/task_view.rs | 45 ++++++++ crates/meilisearch-types/src/tasks.rs | 47 +++++++- crates/meilisearch/src/routes/export.rs | 105 ++++++++++++++++++ crates/meilisearch/src/routes/mod.rs | 3 + 14 files changed, 303 insertions(+), 10 deletions(-) create mode 100644 crates/meilisearch/src/routes/export.rs diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 285818a87..29007e9ce 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -141,6 +141,12 @@ pub enum KindDump { instance_uid: Option, }, SnapshotCreation, + Export { + url: String, + indexes: Vec, + skip_embeddings: bool, + api_key: Option, + }, UpgradeDatabase { from: (u32, u32, u32), }, @@ -213,6 +219,14 @@ impl From for KindDump { KindDump::DumpCreation { keys, instance_uid } } KindWithContent::SnapshotCreation => KindDump::SnapshotCreation, + KindWithContent::Export { url, indexes, skip_embeddings, api_key } => { + KindDump::Export { + url, + indexes: indexes.into_iter().map(|pattern| pattern.to_string()).collect(), + skip_embeddings, + api_key, + } + } KindWithContent::UpgradeDatabase { from: version } => { KindDump::UpgradeDatabase { from: version } } diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index ca26e50c8..457d80597 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -4,6 +4,7 @@ use std::io; use dump::{KindDump, TaskDump, UpdateFile}; use meilisearch_types::batches::{Batch, BatchId}; use meilisearch_types::heed::RwTxn; +use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli; use meilisearch_types::tasks::{Kind, KindWithContent, Status, Task}; use roaring::RoaringBitmap; @@ -211,6 +212,19 @@ impl<'a> Dump<'a> { KindWithContent::DumpCreation { keys, instance_uid } } KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, + KindDump::Export { url, indexes, skip_embeddings, api_key } => { + KindWithContent::Export { + url, + indexes: indexes + .into_iter() + .map(|index| { + IndexUidPattern::try_from(index).map_err(|_| Error::CorruptedDump) + }) + .collect::, Error>>()?, + skip_embeddings, + api_key, + } + } KindDump::UpgradeDatabase { from } => KindWithContent::UpgradeDatabase { from }, }, }; diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index d01548319..d1db77b2f 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -289,6 +289,9 @@ fn snapshot_details(d: &Details) -> String { Details::IndexSwap { swaps } => { format!("{{ swaps: {swaps:?} }}") } + Details::Export { url, api_key, exported_documents, skip_embeddings } => { + format!("{{ url: {url:?}, api_key: {api_key:?}, exported_documents: {exported_documents:?}, skip_embeddings: {skip_embeddings:?} }}") + } 
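// The `Details::Export` arm above snapshots every field of the new task kind.
// Note the dump story in the hunks further up: `KindWithContent::Export` is
// lowered to `KindDump::Export` with each `IndexUidPattern` flattened to a
// `String`, and the import path re-parses every pattern with
// `IndexUidPattern::try_from`, mapping any failure to `Error::CorruptedDump`.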
Details::UpgradeDatabase { from, to } => { format!("{{ from: {from:?}, to: {to:?} }}") } diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index f23b811e5..5d4ac11c3 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -175,8 +175,17 @@ make_enum_progress! { } } +make_enum_progress! { + pub enum Export { + EnsuringCorrectnessOfTheTarget, + ExportTheSettings, + ExportTheDocuments, + } +} + make_atomic_progress!(Task alias AtomicTaskStep => "task" ); make_atomic_progress!(Document alias AtomicDocumentStep => "document" ); +make_atomic_progress!(Index alias AtomicIndexStep => "index" ); make_atomic_progress!(Batch alias AtomicBatchStep => "batch" ); make_atomic_progress!(UpdateFile alias AtomicUpdateFileStep => "update file" ); diff --git a/crates/index-scheduler/src/scheduler/autobatcher.rs b/crates/index-scheduler/src/scheduler/autobatcher.rs index b57983291..b3f7d2743 100644 --- a/crates/index-scheduler/src/scheduler/autobatcher.rs +++ b/crates/index-scheduler/src/scheduler/autobatcher.rs @@ -71,6 +71,7 @@ impl From for AutobatchKind { KindWithContent::TaskCancelation { .. } | KindWithContent::TaskDeletion { .. } | KindWithContent::DumpCreation { .. } + | KindWithContent::Export { .. } | KindWithContent::UpgradeDatabase { .. } | KindWithContent::SnapshotCreation => { panic!("The autobatcher should never be called with tasks that don't apply to an index.") diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index e3763881b..7a6fa4a9b 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -47,6 +47,9 @@ pub(crate) enum Batch { IndexSwap { task: Task, }, + Export { + task: Task, + }, UpgradeDatabase { tasks: Vec, }, @@ -103,6 +106,7 @@ impl Batch { Batch::TaskCancelation { task, .. } | Batch::Dump(task) | Batch::IndexCreation { task, .. } + | Batch::Export { task } | Batch::IndexUpdate { task, .. } => { RoaringBitmap::from_sorted_iter(std::iter::once(task.uid)).unwrap() } @@ -142,6 +146,7 @@ impl Batch { | TaskDeletions(_) | SnapshotCreation(_) | Dump(_) + | Export { .. } | UpgradeDatabase { .. } | IndexSwap { .. } => None, IndexOperation { op, .. } => Some(op.index_uid()), @@ -167,6 +172,7 @@ impl fmt::Display for Batch { Batch::IndexUpdate { .. } => f.write_str("IndexUpdate")?, Batch::IndexDeletion { .. } => f.write_str("IndexDeletion")?, Batch::IndexSwap { .. } => f.write_str("IndexSwap")?, + Batch::Export { .. } => f.write_str("Export")?, Batch::UpgradeDatabase { .. } => f.write_str("UpgradeDatabase")?, }; match index_uid { @@ -426,9 +432,10 @@ impl IndexScheduler { /// 0. We get the *last* task to cancel. /// 1. We get the tasks to upgrade. /// 2. We get the *next* task to delete. - /// 3. We get the *next* snapshot to process. - /// 4. We get the *next* dump to process. - /// 5. We get the *next* tasks to process for a specific index. + /// 3. We get the *next* export to process. + /// 4. We get the *next* snapshot to process. + /// 5. We get the *next* dump to process. + /// 6. We get the *next* tasks to process for a specific index. #[tracing::instrument(level = "trace", skip(self, rtxn), target = "indexing::scheduler")] pub(crate) fn create_next_batch( &self, @@ -500,7 +507,17 @@ impl IndexScheduler { return Ok(Some((Batch::TaskDeletions(tasks), current_batch))); } - // 3. we batch the snapshot. + // 3. we batch the export. 
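// Exports deliberately never batch with other tasks: the code below pops a
// single export task, records `BatchStopReason::TaskKindCannotBeBatched`, and
// returns immediately, so at most one export runs per batch, scheduled after
// cancelations, upgrades, and deletions but ahead of snapshots and dumps.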
+ let to_export = self.queue.tasks.get_kind(rtxn, Kind::Export)? & enqueued; + if !to_export.is_empty() { + let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_export)?; + current_batch.processing(&mut tasks); + let task = tasks.pop().expect("There must be only one export task"); + current_batch.reason(BatchStopReason::TaskKindCannotBeBatched { kind: Kind::Export }); + return Ok(Some((Batch::Export { task }, current_batch))); + } + + // 4. we batch the snapshot. let to_snapshot = self.queue.tasks.get_kind(rtxn, Kind::SnapshotCreation)? & enqueued; if !to_snapshot.is_empty() { let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_snapshot)?; @@ -510,7 +527,7 @@ impl IndexScheduler { return Ok(Some((Batch::SnapshotCreation(tasks), current_batch))); } - // 4. we batch the dumps. + // 5. we batch the dumps. let to_dump = self.queue.tasks.get_kind(rtxn, Kind::DumpCreation)? & enqueued; if let Some(to_dump) = to_dump.min() { let mut task = @@ -523,7 +540,7 @@ impl IndexScheduler { return Ok(Some((Batch::Dump(task), current_batch))); } - // 5. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. + // 6. We make a batch from the unprioritised tasks. Start by taking the next enqueued task. let task_id = if let Some(task_id) = enqueued.min() { task_id } else { return Ok(None) }; let mut task = self.queue.tasks.get_task(rtxn, task_id)?.ok_or(Error::CorruptedTaskQueue)?; diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index c349f90ad..1f6c4eb2c 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -1,6 +1,7 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::Ordering; +use std::time::Duration; use meilisearch_types::batches::{BatchEnqueuedAt, BatchId}; use meilisearch_types::heed::{RoTxn, RwTxn}; @@ -13,9 +14,9 @@ use roaring::RoaringBitmap; use super::create_batch::Batch; use crate::processing::{ - AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, FinalizingIndexStep, - InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, - UpdateIndexProgress, + AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, Export, + FinalizingIndexStep, InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, + TaskDeletionProgress, UpdateIndexProgress, }; use crate::utils::{ self, remove_n_tasks_datetime_earlier_than, remove_task_datetime, swap_index_uid_in_task, @@ -361,6 +362,23 @@ impl IndexScheduler { task.status = Status::Succeeded; Ok((vec![task], ProcessBatchInfo::default())) } + Batch::Export { mut task } => { + progress.update_progress(Export::EnsuringCorrectnessOfTheTarget); + + // TODO send check requests with the API Key + + let mut wtxn = self.env.write_txn()?; + let KindWithContent::Export { url, indexes, skip_embeddings, api_key } = &task.kind + else { + unreachable!() + }; + + eprintln!("Exporting data to {}...", url); + std::thread::sleep(Duration::from_secs(30)); + + task.status = Status::Succeeded; + Ok((vec![task], ProcessBatchInfo::default())) + } Batch::UpgradeDatabase { mut tasks } => { let KindWithContent::UpgradeDatabase { from } = tasks.last().unwrap().kind else { unreachable!(); diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 67e8fc090..7fe44d1c1 100644 --- 
a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -273,6 +273,7 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) { K::TaskCancelation { .. } | K::TaskDeletion { .. } | K::DumpCreation { .. } + | K::Export { .. } // TODO I have patterns, not index uids | K::UpgradeDatabase { .. } | K::SnapshotCreation => (), }; @@ -600,6 +601,14 @@ impl crate::IndexScheduler { Details::Dump { dump_uid: _ } => { assert_eq!(kind.as_kind(), Kind::DumpCreation); } + Details::Export { + url: _, + api_key: _, + exported_documents: _, + skip_embeddings: _, + } => { + assert_eq!(kind.as_kind(), Kind::Export); + } Details::UpgradeDatabase { from: _, to: _ } => { assert_eq!(kind.as_kind(), Kind::UpgradeDatabase); } diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index d2500b7e1..22c668d59 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -389,6 +389,11 @@ InvalidDocumentEditionContext , InvalidRequest , BAD_REQU InvalidDocumentEditionFunctionFilter , InvalidRequest , BAD_REQUEST ; EditDocumentsByFunctionError , InvalidRequest , BAD_REQUEST ; InvalidSettingsIndexChat , InvalidRequest , BAD_REQUEST ; +// Export +InvalidExportUrl , InvalidRequest , BAD_REQUEST ; +InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; +InvalidExportSkipEmbeddings , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ; UnimplementedNonStreamingChatCompletions , InvalidRequest , NOT_IMPLEMENTED ; diff --git a/crates/meilisearch-types/src/keys.rs b/crates/meilisearch-types/src/keys.rs index df2810727..3ba31c2cb 100644 --- a/crates/meilisearch-types/src/keys.rs +++ b/crates/meilisearch-types/src/keys.rs @@ -317,6 +317,9 @@ pub enum Action { #[serde(rename = "experimental.update")] #[deserr(rename = "experimental.update")] ExperimentalFeaturesUpdate, + #[serde(rename = "export")] + #[deserr(rename = "export")] + Export, #[serde(rename = "network.get")] #[deserr(rename = "network.get")] NetworkGet, @@ -438,6 +441,8 @@ pub mod actions { pub const EXPERIMENTAL_FEATURES_GET: u8 = ExperimentalFeaturesGet.repr(); pub const EXPERIMENTAL_FEATURES_UPDATE: u8 = ExperimentalFeaturesUpdate.repr(); + pub const EXPORT: u8 = Export.repr(); + pub const NETWORK_GET: u8 = NetworkGet.repr(); pub const NETWORK_UPDATE: u8 = NetworkUpdate.repr(); diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 86a00426b..06fda0835 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeMap; + use milli::Object; use serde::{Deserialize, Serialize}; use time::{Duration, OffsetDateTime}; @@ -118,6 +120,15 @@ pub struct DetailsView { pub upgrade_from: Option, #[serde(skip_serializing_if = "Option::is_none")] pub upgrade_to: Option, + // exporting + #[serde(skip_serializing_if = "Option::is_none")] + pub url: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub api_key: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub exported_documents: Option>, + #[serde(skip_serializing_if = "Option::is_none")] + pub skip_embeddings: Option, } impl DetailsView { @@ -238,6 +249,37 @@ impl DetailsView { Some(left) } }, + url: match (self.url.clone(), other.url.clone()) { + (None, None) => None, + (None, Some(url)) | (Some(url), 
None) => Some(url), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, + api_key: match (self.api_key.clone(), other.api_key.clone()) { + (None, None) => None, + (None, Some(key)) | (Some(key), None) => Some(key), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, + exported_documents: match ( + self.exported_documents.clone(), + other.exported_documents.clone(), + ) { + (None, None) => None, + (None, Some(exp)) | (Some(exp), None) => Some(exp), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, + skip_embeddings: match (self.skip_embeddings, other.skip_embeddings) { + (None, None) => None, + (None, Some(skip)) | (Some(skip), None) => Some(skip), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, // We want the earliest version upgrade_from: match (self.upgrade_from.clone(), other.upgrade_from.clone()) { (None, None) => None, @@ -327,6 +369,9 @@ impl From
for DetailsView { Details::IndexSwap { swaps } => { DetailsView { swaps: Some(swaps), ..Default::default() } } + Details::Export { url, api_key, exported_documents, skip_embeddings } => { + DetailsView { exported_documents: Some(exported_documents), ..Default::default() } + } Details::UpgradeDatabase { from, to } => DetailsView { upgrade_from: Some(format!("v{}.{}.{}", from.0, from.1, from.2)), upgrade_to: Some(format!("v{}.{}.{}", to.0, to.1, to.2)), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 95c52d9a6..e31e6062b 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -1,5 +1,5 @@ use core::fmt; -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::fmt::{Display, Write}; use std::str::FromStr; @@ -14,6 +14,7 @@ use uuid::Uuid; use crate::batches::BatchId; use crate::error::ResponseError; +use crate::index_uid_pattern::IndexUidPattern; use crate::keys::Key; use crate::settings::{Settings, Unchecked}; use crate::{versioning, InstanceUid}; @@ -50,6 +51,7 @@ impl Task { | SnapshotCreation | TaskCancelation { .. } | TaskDeletion { .. } + | Export { .. } | UpgradeDatabase { .. } | IndexSwap { .. } => None, DocumentAdditionOrUpdate { index_uid, .. } @@ -86,6 +88,7 @@ impl Task { | KindWithContent::TaskDeletion { .. } | KindWithContent::DumpCreation { .. } | KindWithContent::SnapshotCreation + | KindWithContent::Export { .. } | KindWithContent::UpgradeDatabase { .. } => None, } } @@ -152,6 +155,12 @@ pub enum KindWithContent { instance_uid: Option, }, SnapshotCreation, + Export { + url: String, + api_key: Option, + indexes: Vec, + skip_embeddings: bool, + }, UpgradeDatabase { from: (u32, u32, u32), }, @@ -180,6 +189,7 @@ impl KindWithContent { KindWithContent::TaskDeletion { .. } => Kind::TaskDeletion, KindWithContent::DumpCreation { .. } => Kind::DumpCreation, KindWithContent::SnapshotCreation => Kind::SnapshotCreation, + KindWithContent::Export { .. } => Kind::Export, KindWithContent::UpgradeDatabase { .. } => Kind::UpgradeDatabase, } } @@ -192,6 +202,7 @@ impl KindWithContent { | SnapshotCreation | TaskCancelation { .. } | TaskDeletion { .. } + | Export { .. } // TODO Should I resolve the index names? | UpgradeDatabase { .. } => vec![], DocumentAdditionOrUpdate { index_uid, .. } | DocumentEdition { index_uid, .. } @@ -269,6 +280,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + exported_documents: Default::default(), + skip_embeddings: *skip_embeddings, + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), to: ( @@ -335,6 +354,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + exported_documents: Default::default(), + skip_embeddings: skip_embeddings.clone(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -383,6 +410,14 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, + KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + exported_documents: BTreeMap::default(), + skip_embeddings: skip_embeddings.clone(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -499,6 +534,7 @@ pub enum Kind { TaskDeletion, DumpCreation, SnapshotCreation, + Export, UpgradeDatabase, } @@ -516,6 +552,7 @@ impl Kind { | Kind::TaskCancelation | Kind::TaskDeletion | Kind::DumpCreation + | Kind::Export | Kind::UpgradeDatabase | Kind::SnapshotCreation => false, } @@ -536,6 +573,7 @@ impl Display for Kind { Kind::TaskDeletion => write!(f, "taskDeletion"), Kind::DumpCreation => write!(f, "dumpCreation"), Kind::SnapshotCreation => write!(f, "snapshotCreation"), + Kind::Export => write!(f, "export"), Kind::UpgradeDatabase => write!(f, "upgradeDatabase"), } } @@ -643,6 +681,12 @@ pub enum Details { IndexSwap { swaps: Vec, }, + Export { + url: String, + api_key: Option, + exported_documents: BTreeMap, + skip_embeddings: bool, + }, UpgradeDatabase { from: (u32, u32, u32), to: (u32, u32, u32), @@ -667,6 +711,7 @@ impl Details { Self::SettingsUpdate { .. } | Self::IndexInfo { .. } | Self::Dump { .. } + | Self::Export { .. } | Self::UpgradeDatabase { .. } | Self::IndexSwap { .. } => (), } diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs new file mode 100644 index 000000000..666799273 --- /dev/null +++ b/crates/meilisearch/src/routes/export.rs @@ -0,0 +1,105 @@ +use actix_web::web::{self, Data}; +use actix_web::{HttpRequest, HttpResponse}; +use deserr::actix_web::AwebJson; +use deserr::Deserr; +use index_scheduler::IndexScheduler; +use meilisearch_types::deserr::DeserrJsonError; +use meilisearch_types::error::deserr_codes::*; +use meilisearch_types::error::ResponseError; +use meilisearch_types::index_uid_pattern::IndexUidPattern; +use meilisearch_types::keys::actions; +use meilisearch_types::tasks::KindWithContent; +use serde::Serialize; +use tracing::debug; +use utoipa::{OpenApi, ToSchema}; + +use crate::analytics::Analytics; +use crate::extractors::authentication::policies::ActionPolicy; +use crate::extractors::authentication::GuardedData; +use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; +use crate::Opt; + +#[derive(OpenApi)] +#[openapi( + paths(export), + tags(( + name = "Export", + description = "The `/export` route allows you to trigger an export process to a remote Meilisearch instance.", + external_docs(url = "https://www.meilisearch.com/docs/reference/api/export"), + )), +)] +pub struct ExportApi; + +pub fn configure(cfg: &mut web::ServiceConfig) { + cfg.service(web::resource("").route(web::post().to(export))); +} + +#[utoipa::path( + get, + path = "", + tag = "Export", + security(("Bearer" = ["export", "*"])), + responses( + (status = OK, description = "Known nodes are returned", body = Export, content_type = "application/json", example = json!( + { + "indexes": ["movie", "steam-*"], + "skip_embeddings": true, + "apiKey": "meilisearch-api-key" + })), + (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( + { + "message": "The Authorization header is missing. 
It must use the bearer authorization method.", + "code": "missing_authorization_header", + "type": "auth", + "link": "https://docs.meilisearch.com/errors#missing_authorization_header" + } + )), + ) +)] +async fn export( + index_scheduler: GuardedData, Data>, + export: AwebJson, + req: HttpRequest, + opt: web::Data, + _analytics: Data, +) -> Result { + // TODO make it experimental? + // index_scheduler.features().check_network("Using the /network route")?; + + let export = export.into_inner(); + debug!(returns = ?export, "Trigger export"); + + let Export { url, api_key, indexes, skip_embeddings } = export; + let task = KindWithContent::Export { url, api_key, indexes, skip_embeddings }; + let uid = get_task_id(&req, &opt)?; + let dry_run = is_dry_run(&req, &opt)?; + let task: SummarizedTaskView = + tokio::task::spawn_blocking(move || index_scheduler.register(task, uid, dry_run)) + .await?? + .into(); + + Ok(HttpResponse::Ok().json(task)) +} + +#[derive(Debug, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = "camelCase")] +pub struct Export { + #[schema(value_type = Option, example = json!("https://ms-1234.heaven.meilisearch.com"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub url: String, + #[schema(value_type = Option, example = json!("1234abcd"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub api_key: Option, + #[schema(value_type = Option>, example = json!(["movies", "steam-*"]))] + #[deserr(default, error = DeserrJsonError)] + #[serde(default)] + pub indexes: Vec, + #[schema(value_type = Option, example = json!("true"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub skip_embeddings: bool, +} diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index cc62e43c3..748cd5d83 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -54,6 +54,7 @@ mod api_key; pub mod batches; pub mod chats; mod dump; +mod export; pub mod features; pub mod indexes; mod logs; @@ -84,6 +85,7 @@ mod tasks_test; (path = "/multi-search", api = multi_search::MultiSearchApi), (path = "/swap-indexes", api = swap_indexes::SwapIndexesApi), (path = "/experimental-features", api = features::ExperimentalFeaturesApi), + (path = "/export", api = export::ExportApi), (path = "/network", api = network::NetworkApi), ), paths(get_health, get_version, get_stats), @@ -115,6 +117,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { .service(web::scope("/metrics").configure(metrics::configure)) .service(web::scope("/experimental-features").configure(features::configure)) .service(web::scope("/network").configure(network::configure)) + .service(web::scope("/export").configure(export::configure)) .service(web::scope("/chats").configure(chats::configure)); #[cfg(feature = "swagger")] From e023ee4b6b1a5f2a87f245579742dde43300f117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 14 Jun 2025 11:39:53 +0200 Subject: [PATCH 002/101] Working first implementation --- crates/dump/src/lib.rs | 25 ++-- crates/index-scheduler/src/dump.rs | 27 ++-- crates/index-scheduler/src/error.rs | 4 + crates/index-scheduler/src/insta_snapshot.rs | 4 +- crates/index-scheduler/src/scheduler/mod.rs | 1 + .../src/scheduler/process_batch.rs | 45 ++++-- .../src/scheduler/process_export.rs | 141 ++++++++++++++++++ .../mod.rs => process_upgrade.rs} | 0 
crates/index-scheduler/src/test_utils.rs | 1 + crates/index-scheduler/src/utils.rs | 7 +- crates/meilisearch-types/src/error.rs | 3 +- .../src/index_uid_pattern.rs | 2 +- crates/meilisearch-types/src/task_view.rs | 36 +++-- crates/meilisearch-types/src/tasks.rs | 71 +++++---- crates/meilisearch/src/routes/export.rs | 34 ++++- 15 files changed, 298 insertions(+), 103 deletions(-) create mode 100644 crates/index-scheduler/src/scheduler/process_export.rs rename crates/index-scheduler/src/scheduler/{process_upgrade/mod.rs => process_upgrade.rs} (100%) diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 29007e9ce..5c67d7a94 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -1,12 +1,16 @@ #![allow(clippy::type_complexity)] #![allow(clippy::wrong_self_convention)] +use std::collections::BTreeMap; + use meilisearch_types::batches::BatchId; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::Key; use meilisearch_types::milli::update::IndexDocumentsMethod; use meilisearch_types::settings::Unchecked; -use meilisearch_types::tasks::{Details, IndexSwap, KindWithContent, Status, Task, TaskId}; +use meilisearch_types::tasks::{ + Details, ExportIndexSettings, IndexSwap, KindWithContent, Status, Task, TaskId, +}; use meilisearch_types::InstanceUid; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -143,9 +147,8 @@ pub enum KindDump { SnapshotCreation, Export { url: String, - indexes: Vec, - skip_embeddings: bool, api_key: Option, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), @@ -219,14 +222,14 @@ impl From for KindDump { KindDump::DumpCreation { keys, instance_uid } } KindWithContent::SnapshotCreation => KindDump::SnapshotCreation, - KindWithContent::Export { url, indexes, skip_embeddings, api_key } => { - KindDump::Export { - url, - indexes: indexes.into_iter().map(|pattern| pattern.to_string()).collect(), - skip_embeddings, - api_key, - } - } + KindWithContent::Export { url, api_key, indexes } => KindDump::Export { + url, + api_key, + indexes: indexes + .into_iter() + .map(|(pattern, settings)| (pattern.to_string(), settings)) + .collect(), + }, KindWithContent::UpgradeDatabase { from: version } => { KindDump::UpgradeDatabase { from: version } } diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index 457d80597..2a99a74aa 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -212,19 +212,20 @@ impl<'a> Dump<'a> { KindWithContent::DumpCreation { keys, instance_uid } } KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, - KindDump::Export { url, indexes, skip_embeddings, api_key } => { - KindWithContent::Export { - url, - indexes: indexes - .into_iter() - .map(|index| { - IndexUidPattern::try_from(index).map_err(|_| Error::CorruptedDump) - }) - .collect::, Error>>()?, - skip_embeddings, - api_key, - } - } + KindDump::Export { url, indexes, api_key } => KindWithContent::Export { + url, + api_key, + indexes: indexes + .into_iter() + .map(|(pattern, settings)| { + Ok(( + IndexUidPattern::try_from(pattern) + .map_err(|_| Error::CorruptedDump)?, + settings, + )) + }) + .collect::>()?, + }, KindDump::UpgradeDatabase { from } => KindWithContent::UpgradeDatabase { from }, }, }; diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index cb798b385..2020ac597 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -151,6 +151,8 @@ pub enum Error { CorruptedTaskQueue, 
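// In the hunk below, the new `Export` variant boxes the inner scheduler
// `Error` raised while driving an export (see `Error::Export(Box::new(e))`
// in process_batch.rs); it maps to `Code::Internal` later in this file.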
#[error(transparent)] DatabaseUpgrade(Box), + #[error(transparent)] + Export(Box), #[error("Failed to rollback for index `{index}`: {rollback_outcome} ")] RollbackFailed { index: String, rollback_outcome: RollbackOutcome }, #[error(transparent)] @@ -221,6 +223,7 @@ impl Error { | Error::IoError(_) | Error::Persist(_) | Error::FeatureNotEnabled(_) + | Error::Export(_) | Error::Anyhow(_) => true, Error::CreateBatch(_) | Error::CorruptedTaskQueue @@ -294,6 +297,7 @@ impl ErrorCode for Error { Error::CorruptedTaskQueue => Code::Internal, Error::CorruptedDump => Code::Internal, Error::DatabaseUpgrade(_) => Code::Internal, + Error::Export(_) => Code::Internal, Error::RollbackFailed { .. } => Code::Internal, Error::UnrecoverableError(_) => Code::Internal, Error::IndexSchedulerVersionMismatch { .. } => Code::Internal, diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index d1db77b2f..138b591ff 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -289,8 +289,8 @@ fn snapshot_details(d: &Details) -> String { Details::IndexSwap { swaps } => { format!("{{ swaps: {swaps:?} }}") } - Details::Export { url, api_key, exported_documents, skip_embeddings } => { - format!("{{ url: {url:?}, api_key: {api_key:?}, exported_documents: {exported_documents:?}, skip_embeddings: {skip_embeddings:?} }}") + Details::Export { url, api_key, indexes } => { + format!("{{ url: {url:?}, api_key: {api_key:?}, indexes: {indexes:?} }}") } Details::UpgradeDatabase { from, to } => { format!("{{ from: {from:?}, to: {to:?} }}") diff --git a/crates/index-scheduler/src/scheduler/mod.rs b/crates/index-scheduler/src/scheduler/mod.rs index 0e258e27b..5ac591143 100644 --- a/crates/index-scheduler/src/scheduler/mod.rs +++ b/crates/index-scheduler/src/scheduler/mod.rs @@ -4,6 +4,7 @@ mod autobatcher_test; mod create_batch; mod process_batch; mod process_dump_creation; +mod process_export; mod process_index_operation; mod process_snapshot_creation; mod process_upgrade; diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 1f6c4eb2c..99278756d 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -1,7 +1,6 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use std::panic::{catch_unwind, AssertUnwindSafe}; use std::sync::atomic::Ordering; -use std::time::Duration; use meilisearch_types::batches::{BatchEnqueuedAt, BatchId}; use meilisearch_types::heed::{RoTxn, RwTxn}; @@ -14,9 +13,9 @@ use roaring::RoaringBitmap; use super::create_batch::Batch; use crate::processing::{ - AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, Export, - FinalizingIndexStep, InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, - TaskDeletionProgress, UpdateIndexProgress, + AtomicBatchStep, AtomicTaskStep, CreateIndexProgress, DeleteIndexProgress, FinalizingIndexStep, + InnerSwappingTwoIndexes, SwappingTheIndexes, TaskCancelationProgress, TaskDeletionProgress, + UpdateIndexProgress, }; use crate::utils::{ self, remove_n_tasks_datetime_earlier_than, remove_task_datetime, swap_index_uid_in_task, @@ -363,18 +362,32 @@ impl IndexScheduler { Ok((vec![task], ProcessBatchInfo::default())) } Batch::Export { mut task } => { - progress.update_progress(Export::EnsuringCorrectnessOfTheTarget); - - // TODO send check requests with the API Key - - let mut wtxn = 
self.env.write_txn()?; - let KindWithContent::Export { url, indexes, skip_embeddings, api_key } = &task.kind - else { + let KindWithContent::Export { url, indexes, api_key } = &task.kind else { unreachable!() }; - eprintln!("Exporting data to {}...", url); - std::thread::sleep(Duration::from_secs(30)); + let ret = catch_unwind(AssertUnwindSafe(|| { + self.process_export(url, indexes, api_key.as_deref(), progress) + })); + + match ret { + // TODO return the matched and exported documents + Ok(Ok(())) => (), + Ok(Err(Error::AbortedTask)) => return Err(Error::AbortedTask), + Ok(Err(e)) => return Err(Error::Export(Box::new(e))), + Err(e) => { + let msg = match e.downcast_ref::<&'static str>() { + Some(s) => *s, + None => match e.downcast_ref::() { + Some(s) => &s[..], + None => "Box", + }, + }; + return Err(Error::Export(Box::new(Error::ProcessBatchPanicked( + msg.to_string(), + )))); + } + } task.status = Status::Succeeded; Ok((vec![task], ProcessBatchInfo::default())) @@ -726,9 +739,11 @@ impl IndexScheduler { from.1, from.2 ); - match std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + let ret = catch_unwind(std::panic::AssertUnwindSafe(|| { self.process_rollback(from, progress) - })) { + })); + + match ret { Ok(Ok(())) => {} Ok(Err(err)) => return Err(Error::DatabaseUpgrade(Box::new(err))), Err(e) => { diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs new file mode 100644 index 000000000..e01ddf2e4 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -0,0 +1,141 @@ +use std::collections::BTreeMap; +use std::time::Duration; + +use meilisearch_types::index_uid_pattern::IndexUidPattern; +use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::{obkv_to_json, Filter}; +use meilisearch_types::settings::{self, SecretPolicy}; +use meilisearch_types::tasks::ExportIndexSettings; +use ureq::{json, Agent}; + +use crate::{Error, IndexScheduler, Result}; + +impl IndexScheduler { + pub(super) fn process_export( + &self, + url: &str, + indexes: &BTreeMap, + api_key: Option<&str>, + progress: Progress, + ) -> Result<()> { + #[cfg(test)] + self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; + + let indexes: Vec<_> = self + .index_names()? 
+ .into_iter() + .flat_map(|uid| { + indexes + .iter() + .find(|(pattern, _)| pattern.matches_str(&uid)) + .map(|(_pattern, settings)| (uid, settings)) + }) + .collect(); + + let agent: Agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); + + for (i, (uid, settings)) in indexes.iter().enumerate() { + let must_stop_processing = self.scheduler.must_stop_processing.clone(); + if must_stop_processing.get() { + return Err(Error::AbortedTask); + } + + progress.update_progress(VariableNameStep::::new( + format!("Exporting index `{uid}`"), + i as u32, + indexes.len() as u32, + )); + + let ExportIndexSettings { skip_embeddings, filter } = settings; + let index = self.index(uid)?; + let index_rtxn = index.read_txn()?; + + // Send the primary key + let primary_key = index.primary_key(&index_rtxn).unwrap(); + // TODO implement retry logic + let mut request = agent.post(&format!("{url}/indexes")); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_json(&json!({ "uid": uid, "primaryKey": primary_key })).unwrap(); + + // Send the index settings + let settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // TODO implement retry logic + // improve error reporting (get error message) + let mut request = agent.patch(&format!("{url}/indexes/{uid}/settings")); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_json(settings).unwrap(); + + let filter = filter + .as_deref() + .map(Filter::from_str) + .transpose() + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))? + .flatten(); + + let filter_universe = filter + .map(|f| f.evaluate(&index_rtxn, &index)) + .transpose() + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let whole_universe = index + .documents_ids(&index_rtxn) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + let universe = filter_universe.unwrap_or(whole_universe); + + let fields_ids_map = index.fields_ids_map(&index_rtxn)?; + let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index + .embedding_configs(&index_rtxn) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + + let limit = 50 * 1024 * 1024; // 50 MiB + let mut buffer = Vec::new(); + let mut tmp_buffer = Vec::new(); + for docid in universe { + let document = index + .document(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + let value = obkv_to_json(&all_fields, &fields_ids_map, document) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + tmp_buffer.clear(); + serde_json::to_writer(&mut tmp_buffer, &value) + .map_err(meilisearch_types::milli::InternalError::from) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + + if buffer.len() + tmp_buffer.len() > limit { + // TODO implement retry logic + post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + buffer.clear(); + } + buffer.extend_from_slice(&tmp_buffer); + } + + post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + } + + Ok(()) + } +} + +fn post_serialized_documents( + agent: &Agent, + url: &str, + uid: &str, + api_key: Option<&str>, + buffer: &[u8], +) -> Result { + let mut request = agent.post(&format!("{url}/indexes/{uid}/documents")); + request = request.set("Content-Type", "application/x-ndjson"); + if let 
Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_bytes(buffer) +} + +enum ExportIndex {} diff --git a/crates/index-scheduler/src/scheduler/process_upgrade/mod.rs b/crates/index-scheduler/src/scheduler/process_upgrade.rs similarity index 100% rename from crates/index-scheduler/src/scheduler/process_upgrade/mod.rs rename to crates/index-scheduler/src/scheduler/process_upgrade.rs diff --git a/crates/index-scheduler/src/test_utils.rs b/crates/index-scheduler/src/test_utils.rs index 5f206b55c..bfed7f53a 100644 --- a/crates/index-scheduler/src/test_utils.rs +++ b/crates/index-scheduler/src/test_utils.rs @@ -37,6 +37,7 @@ pub(crate) enum FailureLocation { InsideCreateBatch, InsideProcessBatch, PanicInsideProcessBatch, + ProcessExport, ProcessUpgrade, AcquiringWtxn, UpdatingTaskAfterProcessBatchSuccess { task_uid: u32 }, diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 7fe44d1c1..79571745b 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -601,12 +601,7 @@ impl crate::IndexScheduler { Details::Dump { dump_uid: _ } => { assert_eq!(kind.as_kind(), Kind::DumpCreation); } - Details::Export { - url: _, - api_key: _, - exported_documents: _, - skip_embeddings: _, - } => { + Details::Export { url: _, api_key: _, indexes: _ } => { assert_eq!(kind.as_kind(), Kind::Export); } Details::UpgradeDatabase { from: _, to: _ } => { diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 22c668d59..08ee803ef 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -393,7 +393,8 @@ InvalidSettingsIndexChat , InvalidRequest , BAD_REQU InvalidExportUrl , InvalidRequest , BAD_REQUEST ; InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; -InvalidExportSkipEmbeddings , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexSkipEmbeddings , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ; UnimplementedNonStreamingChatCompletions , InvalidRequest , NOT_IMPLEMENTED ; diff --git a/crates/meilisearch-types/src/index_uid_pattern.rs b/crates/meilisearch-types/src/index_uid_pattern.rs index baf0249e2..f90fc7aee 100644 --- a/crates/meilisearch-types/src/index_uid_pattern.rs +++ b/crates/meilisearch-types/src/index_uid_pattern.rs @@ -12,7 +12,7 @@ use crate::index_uid::{IndexUid, IndexUidFormatError}; /// An index uid pattern is composed of only ascii alphanumeric characters, - and _, between 1 and 400 /// bytes long and optionally ending with a *. 
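// `PartialOrd` and `Ord` join the derive list below so that `IndexUidPattern`
// can be the key of the `BTreeMap<IndexUidPattern, ExportIndexSettings>` the
// export task now carries, giving the `indexes` payload a stable, sorted
// iteration order.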
-#[derive(Serialize, Deserialize, Deserr, Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Serialize, Deserialize, Deserr, Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] #[deserr(try_from(&String) = FromStr::from_str -> IndexUidPatternFormatError)] pub struct IndexUidPattern(String); diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 06fda0835..0a8d7b8fe 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -8,7 +8,9 @@ use utoipa::ToSchema; use crate::batches::BatchId; use crate::error::ResponseError; use crate::settings::{Settings, Unchecked}; -use crate::tasks::{serialize_duration, Details, IndexSwap, Kind, Status, Task, TaskId}; +use crate::tasks::{ + serialize_duration, Details, DetailsExportIndexSettings, IndexSwap, Kind, Status, Task, TaskId, +}; #[derive(Debug, Clone, PartialEq, Serialize, ToSchema)] #[serde(rename_all = "camelCase")] @@ -126,9 +128,7 @@ pub struct DetailsView { #[serde(skip_serializing_if = "Option::is_none")] pub api_key: Option, #[serde(skip_serializing_if = "Option::is_none")] - pub exported_documents: Option>, - #[serde(skip_serializing_if = "Option::is_none")] - pub skip_embeddings: Option, + pub indexes: Option>, } impl DetailsView { @@ -263,19 +263,9 @@ impl DetailsView { // So we return the first one we encounter but that shouldn't be an issue anyway. (Some(left), Some(_right)) => Some(left), }, - exported_documents: match ( - self.exported_documents.clone(), - other.exported_documents.clone(), - ) { + indexes: match (self.indexes.clone(), other.indexes.clone()) { (None, None) => None, - (None, Some(exp)) | (Some(exp), None) => Some(exp), - // We should never be able to batch multiple exports at the same time. - // So we return the first one we encounter but that shouldn't be an issue anyway. - (Some(left), Some(_right)) => Some(left), - }, - skip_embeddings: match (self.skip_embeddings, other.skip_embeddings) { - (None, None) => None, - (None, Some(skip)) | (Some(skip), None) => Some(skip), + (None, Some(indexes)) | (Some(indexes), None) => Some(indexes), // We should never be able to batch multiple exports at the same time. // So we return the first one we encounter but that shouldn't be an issue anyway. (Some(left), Some(_right)) => Some(left), @@ -369,9 +359,17 @@ impl From
for DetailsView { Details::IndexSwap { swaps } => { DetailsView { swaps: Some(swaps), ..Default::default() } } - Details::Export { url, api_key, exported_documents, skip_embeddings } => { - DetailsView { exported_documents: Some(exported_documents), ..Default::default() } - } + Details::Export { url, api_key, indexes } => DetailsView { + url: Some(url), + api_key, + indexes: Some( + indexes + .into_iter() + .map(|(pattern, settings)| (pattern.to_string(), settings)) + .collect(), + ), + ..Default::default() + }, Details::UpgradeDatabase { from, to } => DetailsView { upgrade_from: Some(format!("v{}.{}.{}", from.0, from.1, from.2)), upgrade_to: Some(format!("v{}.{}.{}", to.0, to.1, to.2)), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index e31e6062b..1f8f7e7cb 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -9,7 +9,7 @@ use milli::Object; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize, Serializer}; use time::{Duration, OffsetDateTime}; -use utoipa::ToSchema; +use utoipa::{schema, ToSchema}; use uuid::Uuid; use crate::batches::BatchId; @@ -158,8 +158,7 @@ pub enum KindWithContent { Export { url: String, api_key: Option, - indexes: Vec, - skip_embeddings: bool, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), @@ -172,6 +171,13 @@ pub struct IndexSwap { pub indexes: (String, String), } +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[serde(rename_all = "camelCase")] +pub struct ExportIndexSettings { + pub skip_embeddings: bool, + pub filter: Option, +} + impl KindWithContent { pub fn as_kind(&self) -> Kind { match self { @@ -280,14 +286,11 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { - Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - exported_documents: Default::default(), - skip_embeddings: *skip_embeddings, - }) - } + KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), to: ( @@ -354,14 +357,11 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { - Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - exported_documents: Default::default(), - skip_embeddings: skip_embeddings.clone(), - }) - } + KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -410,14 +410,11 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes: _, skip_embeddings } => { - Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - exported_documents: BTreeMap::default(), - skip_embeddings: skip_embeddings.clone(), - }) - } + KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -684,8 +681,7 @@ pub enum Details { Export { url: String, api_key: Option, - exported_documents: BTreeMap, - skip_embeddings: bool, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), @@ -693,6 +689,23 @@ pub enum Details { }, } +#[derive(Debug, PartialEq, Clone, Serialize, Deserialize, ToSchema)] +#[schema(rename_all = "camelCase")] +pub struct DetailsExportIndexSettings { + #[serde(flatten)] + settings: ExportIndexSettings, + #[serde(skip_serializing_if = "Option::is_none")] + matched_documents: Option, + #[serde(skip_serializing_if = "Option::is_none")] + exported_documents: Option, +} + +impl From for DetailsExportIndexSettings { + fn from(settings: ExportIndexSettings) -> Self { + DetailsExportIndexSettings { settings, matched_documents: None, exported_documents: None } + } +} + impl Details { pub fn to_failed(&self) -> Self { let mut details = self.clone(); diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 666799273..7029f0ebf 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -1,3 +1,5 @@ +use std::collections::BTreeMap; + use actix_web::web::{self, Data}; use actix_web::{HttpRequest, HttpResponse}; use deserr::actix_web::AwebJson; @@ -8,7 +10,7 @@ use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::ResponseError; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::actions; -use meilisearch_types::tasks::KindWithContent; +use meilisearch_types::tasks::{ExportIndexSettings as DbExportIndexSettings, KindWithContent}; use serde::Serialize; use tracing::debug; use utoipa::{OpenApi, ToSchema}; @@ -69,8 +71,17 @@ async fn export( let export = export.into_inner(); debug!(returns = ?export, "Trigger export"); - let Export { url, api_key, indexes, skip_embeddings } = export; - let task = KindWithContent::Export { url, api_key, indexes, skip_embeddings }; + let Export { url, api_key, indexes } = export; + let task = KindWithContent::Export { + url, + api_key, + indexes: indexes + .into_iter() + .map(|(pattern, ExportIndexSettings { skip_embeddings, filter })| { + (pattern, DbExportIndexSettings { skip_embeddings, filter }) + }) + .collect(), + }; let uid = get_task_id(&req, &opt)?; let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = @@ -95,11 +106,22 @@ pub struct Export { #[deserr(default, error = DeserrJsonError)] pub api_key: Option, #[schema(value_type = Option>, example = json!(["movies", "steam-*"]))] - #[deserr(default, error = DeserrJsonError)] + #[deserr(default)] #[serde(default)] - pub indexes: Vec, + pub indexes: BTreeMap, +} + +#[derive(Debug, Deserr, ToSchema, Serialize)] +#[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] +#[serde(rename_all = "camelCase")] +#[schema(rename_all = 
"camelCase")] +pub struct ExportIndexSettings { #[schema(value_type = Option, example = json!("true"))] #[serde(default)] - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub skip_embeddings: bool, + #[schema(value_type = Option, example = json!("genres = action"))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError)] + pub filter: Option, } From e8795d2608326dff111098d64ea25b646ff4361c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 14 Jun 2025 12:43:24 +0200 Subject: [PATCH 003/101] Export embeddings --- .../src/scheduler/process_export.rs | 73 ++++++++++++++++++- 1 file changed, 70 insertions(+), 3 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e01ddf2e4..1686472ab 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -1,13 +1,17 @@ use std::collections::BTreeMap; +use std::sync::atomic; use std::time::Duration; use meilisearch_types::index_uid_pattern::IndexUidPattern; +use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, Filter}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; use ureq::{json, Agent}; +use crate::processing::AtomicDocumentStep; use crate::{Error, IndexScheduler, Result}; impl IndexScheduler { @@ -92,19 +96,77 @@ impl IndexScheduler { .embedding_configs(&index_rtxn) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + let total_documents = universe.len() as u32; + let (step, progress_step) = AtomicDocumentStep::new(total_documents); + progress.update_progress(progress_step); + let limit = 50 * 1024 * 1024; // 50 MiB let mut buffer = Vec::new(); let mut tmp_buffer = Vec::new(); - for docid in universe { + for (i, docid) in universe.into_iter().enumerate() { let document = index .document(&index_rtxn, docid) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - let value = obkv_to_json(&all_fields, &fields_ids_map, document) + let mut document = obkv_to_json(&all_fields, &fields_ids_map, document) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // TODO definitely factorize this code + if !*skip_embeddings { + 'inject_vectors: { + let embeddings = index + .embeddings(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(serde_json::Value::Object(Default::default())); + + let serde_json::Value::Object(vectors) = vectors else { + return Err(Error::from_milli( + meilisearch_types::milli::Error::UserError( + meilisearch_types::milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&index_rtxn, std::iter::once(docid)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + } + }, + value: vectors.clone(), + }, + ), + Some(uid.to_string()), + )); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| 
conf.user_provided.contains(docid)); + + let embeddings = ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + )), + regenerate: !user_provided, + }; + vectors + .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); + } + } + } + tmp_buffer.clear(); - serde_json::to_writer(&mut tmp_buffer, &value) + serde_json::to_writer(&mut tmp_buffer, &document) .map_err(meilisearch_types::milli::InternalError::from) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; @@ -114,9 +176,14 @@ impl IndexScheduler { buffer.clear(); } buffer.extend_from_slice(&tmp_buffer); + + if i % 100 == 0 { + step.fetch_add(100, atomic::Ordering::Relaxed); + } } post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap(); + step.store(total_documents, atomic::Ordering::Relaxed); } Ok(()) From acb7c0a449462d682448d5362cc189ad6410d155 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 11:35:47 +0200 Subject: [PATCH 004/101] Implement a retry strategy --- Cargo.lock | 1 + crates/index-scheduler/Cargo.toml | 1 + crates/index-scheduler/src/error.rs | 4 + .../src/scheduler/process_export.rs | 108 ++++++++++++++---- crates/meilisearch-types/src/settings.rs | 1 + 5 files changed, 91 insertions(+), 24 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7455ff1b4..a883b749f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2997,6 +2997,7 @@ name = "index-scheduler" version = "1.15.2" dependencies = [ "anyhow", + "backoff", "big_s", "bincode", "bumpalo", diff --git a/crates/index-scheduler/Cargo.toml b/crates/index-scheduler/Cargo.toml index f4901b2f2..de0d01935 100644 --- a/crates/index-scheduler/Cargo.toml +++ b/crates/index-scheduler/Cargo.toml @@ -44,6 +44,7 @@ time = { version = "0.3.41", features = [ tracing = "0.1.41" ureq = "2.12.1" uuid = { version = "1.17.0", features = ["serde", "v4"] } +backoff = "0.4.0" [dev-dependencies] big_s = "1.0.2" diff --git a/crates/index-scheduler/src/error.rs b/crates/index-scheduler/src/error.rs index 2020ac597..60669ff2d 100644 --- a/crates/index-scheduler/src/error.rs +++ b/crates/index-scheduler/src/error.rs @@ -153,6 +153,8 @@ pub enum Error { DatabaseUpgrade(Box), #[error(transparent)] Export(Box), + #[error("Failed to export documents to remote server {code} ({type}): {message} <{link}>")] + FromRemoteWhenExporting { message: String, code: String, r#type: String, link: String }, #[error("Failed to rollback for index `{index}`: {rollback_outcome} ")] RollbackFailed { index: String, rollback_outcome: RollbackOutcome }, #[error(transparent)] @@ -214,6 +216,7 @@ impl Error { | Error::BatchNotFound(_) | Error::TaskDeletionWithEmptyQuery | Error::TaskCancelationWithEmptyQuery + | Error::FromRemoteWhenExporting { .. } | Error::AbortedTask | Error::Dump(_) | Error::Heed(_) @@ -285,6 +288,7 @@ impl ErrorCode for Error { Error::Dump(e) => e.error_code(), Error::Milli { error, .. } => error.error_code(), Error::ProcessBatchPanicked(_) => Code::Internal, + Error::FromRemoteWhenExporting { .. 
} => Code::Internal, Error::Heed(e) => e.error_code(), Error::HeedTransaction(e) => e.error_code(), Error::FileStore(e) => e.error_code(), diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 1686472ab..7501c260e 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -1,14 +1,18 @@ use std::collections::BTreeMap; +use std::io; use std::sync::atomic; use std::time::Duration; +use backoff::ExponentialBackoff; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; +use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{obkv_to_json, Filter}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; +use serde::Deserialize; use ureq::{json, Agent}; use crate::processing::AtomicDocumentStep; @@ -17,7 +21,7 @@ use crate::{Error, IndexScheduler, Result}; impl IndexScheduler { pub(super) fn process_export( &self, - url: &str, + base_url: &str, indexes: &BTreeMap, api_key: Option<&str>, progress: Progress, @@ -56,24 +60,34 @@ impl IndexScheduler { // Send the primary key let primary_key = index.primary_key(&index_rtxn).unwrap(); - // TODO implement retry logic - let mut request = agent.post(&format!("{url}/indexes")); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); - } - request.send_json(&json!({ "uid": uid, "primaryKey": primary_key })).unwrap(); + let url = format!("{base_url}/indexes"); + retry(|| { + let mut request = agent.post(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + let index_param = json!({ "uid": uid, "primaryKey": primary_key }); + request.send_json(&index_param).map_err(into_backoff_error) + })?; // Send the index settings - let settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) + let mut settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - // TODO implement retry logic - // improve error reporting (get error message) - let mut request = agent.patch(&format!("{url}/indexes/{uid}/settings")); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); + // Remove the experimental chat setting if not enabled + if self.features().check_chat_completions("exporting chat settings").is_err() { + settings.chat = Setting::NotSet; } - request.send_json(settings).unwrap(); + // Retry logic for sending settings + let url = format!("{base_url}/indexes/{uid}/settings"); + retry(|| { + let mut request = agent.patch(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + request.send_json(settings.clone()).map_err(into_backoff_error) + })?; + // TODO support JSON Value objects let filter = filter .as_deref() .map(Filter::from_str) @@ -171,8 +185,7 @@ impl IndexScheduler { .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; if buffer.len() + tmp_buffer.len() > limit { - // TODO implement retry logic - post_serialized_documents(&agent, url, uid, api_key, 
&buffer).unwrap();
+                post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap();
                 buffer.clear();
             }
             buffer.extend_from_slice(&tmp_buffer);
@@ -182,7 +195,7 @@ impl IndexScheduler {
             }
         }

-        post_serialized_documents(&agent, url, uid, api_key, &buffer).unwrap();
+        post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap();
         step.store(total_documents, atomic::Ordering::Relaxed);
     }

@@ -190,19 +203,66 @@ impl IndexScheduler {
     }
 }

+fn retry<F>(send_request: F) -> Result<ureq::Response>
+where
+    F: Fn() -> Result<ureq::Response, backoff::Error<ureq::Error>>,
+{
+    match backoff::retry(ExponentialBackoff::default(), || send_request()) {
+        Ok(response) => Ok(response),
+        Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)),
+        Err(backoff::Error::Transient { err, retry_after: _ }) => Err(ureq_error_into_error(err)),
+    }
+}
+
 fn post_serialized_documents(
     agent: &Agent,
-    url: &str,
+    base_url: &str,
     uid: &str,
     api_key: Option<&str>,
     buffer: &[u8],
-) -> Result<ureq::Response, ureq::Error> {
-    let mut request = agent.post(&format!("{url}/indexes/{uid}/documents"));
-    request = request.set("Content-Type", "application/x-ndjson");
-    if let Some(api_key) = api_key {
-        request = request.set("Authorization", &format!("Bearer {api_key}"));
+) -> Result<ureq::Response> {
+    let url = format!("{base_url}/indexes/{uid}/documents");
+    retry(|| {
+        let mut request = agent.post(&url);
+        request = request.set("Content-Type", "application/x-ndjson");
+        if let Some(api_key) = api_key {
+            request = request.set("Authorization", &(format!("Bearer {api_key}")));
+        }
+        request.send_bytes(buffer).map_err(into_backoff_error)
+    })
+}
+
+fn into_backoff_error(err: ureq::Error) -> backoff::Error<ureq::Error> {
+    match err {
+        // These status codes must trigger an automatic retry
+        //
+        ureq::Error::Status(408 | 429 | 500 | 502 | 503 | 504, _) => {
+            backoff::Error::Transient { err, retry_after: None }
+        }
+        ureq::Error::Status(_, _) => backoff::Error::Permanent(err),
+        ureq::Error::Transport(_) => backoff::Error::Transient { err, retry_after: None },
+    }
+}
+
+/// Converts a `ureq::Error` into an `Error`.
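+///
+/// For HTTP status errors, we try to deserialize the remote Meilisearch
+/// error payload (`message`, `code`, `type`, `link`) into
+/// `Error::FromRemoteWhenExporting`; payloads we cannot parse, as well as
+/// transport errors, are reported as I/O errors.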
+fn ureq_error_into_error(error: ureq::Error) -> Error { + #[derive(Deserialize)] + struct MeiliError { + message: String, + code: String, + r#type: String, + link: String, + } + + match error { + ureq::Error::Status(_, response) => match response.into_json() { + Ok(MeiliError { message, code, r#type, link }) => { + Error::FromRemoteWhenExporting { message, code, r#type, link } + } + Err(e) => io::Error::from(e).into(), + }, + ureq::Error::Transport(transport) => io::Error::new(io::ErrorKind::Other, transport).into(), } - request.send_bytes(buffer) } enum ExportIndex {} diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index 1c225b355..295318f4b 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -968,6 +968,7 @@ pub fn settings( if let SecretPolicy::HideSecrets = secret_policy { settings.hide_secrets() } + Ok(settings) } From 7c448bcc003c99f125ad8e75dca590b71c984187 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 14:53:50 +0200 Subject: [PATCH 005/101] Make clippy happy --- crates/meilisearch-types/src/tasks.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 1f8f7e7cb..3ef60cacf 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -289,7 +289,7 @@ impl KindWithContent { KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), @@ -360,7 +360,7 @@ impl KindWithContent { KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, @@ -413,7 +413,7 @@ impl From<&KindWithContent> for Option
{ KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - indexes: indexes.into_iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }), KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, From 3e2f4682137159745848bee46d637dbd35cc9cc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:34:05 +0200 Subject: [PATCH 006/101] Support task cancelation --- .../src/scheduler/process_export.rs | 54 ++++++++++--------- 1 file changed, 28 insertions(+), 26 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 7501c260e..ceac18632 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -15,6 +15,7 @@ use meilisearch_types::tasks::ExportIndexSettings; use serde::Deserialize; use ureq::{json, Agent}; +use super::MustStopProcessing; use crate::processing::AtomicDocumentStep; use crate::{Error, IndexScheduler, Result}; @@ -41,9 +42,8 @@ impl IndexScheduler { .collect(); let agent: Agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); - + let must_stop_processing = self.scheduler.must_stop_processing.clone(); for (i, (uid, settings)) in indexes.iter().enumerate() { - let must_stop_processing = self.scheduler.must_stop_processing.clone(); if must_stop_processing.get() { return Err(Error::AbortedTask); } @@ -59,9 +59,9 @@ impl IndexScheduler { let index_rtxn = index.read_txn()?; // Send the primary key - let primary_key = index.primary_key(&index_rtxn).unwrap(); + let primary_key = index.primary_key(&index_rtxn)?; let url = format!("{base_url}/indexes"); - retry(|| { + retry(&must_stop_processing, || { let mut request = agent.post(&url); if let Some(api_key) = api_key { request = request.set("Authorization", &format!("Bearer {api_key}")); @@ -79,7 +79,7 @@ impl IndexScheduler { } // Retry logic for sending settings let url = format!("{base_url}/indexes/{uid}/settings"); - retry(|| { + retry(&must_stop_processing, || { let mut request = agent.patch(&url); if let Some(api_key) = api_key { request = request.set("Authorization", &format!("Bearer {api_key}")); @@ -115,6 +115,8 @@ impl IndexScheduler { progress.update_progress(progress_step); let limit = 50 * 1024 * 1024; // 50 MiB + let documents_url = format!("{base_url}/indexes/{uid}/documents"); + let mut buffer = Vec::new(); let mut tmp_buffer = Vec::new(); for (i, docid) in universe.into_iter().enumerate() { @@ -185,7 +187,14 @@ impl IndexScheduler { .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; if buffer.len() + tmp_buffer.len() > limit { - post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap(); + retry(&must_stop_processing, || { + let mut request = agent.post(&documents_url); + request = request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request.set("Authorization", &(format!("Bearer {api_key}"))); + } + request.send_bytes(&buffer).map_err(into_backoff_error) + })?; buffer.clear(); } buffer.extend_from_slice(&tmp_buffer); @@ -195,7 +204,14 @@ impl IndexScheduler { } } - post_serialized_documents(&agent, base_url, uid, api_key, &buffer).unwrap(); + retry(&must_stop_processing, || { + let mut request = agent.post(&documents_url); + request = 
request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request.set("Authorization", &(format!("Bearer {api_key}"))); + } + request.send_bytes(&buffer).map_err(into_backoff_error) + })?; step.store(total_documents, atomic::Ordering::Relaxed); } @@ -203,10 +219,14 @@ impl IndexScheduler { } } -fn retry(send_request: F) -> Result +fn retry(must_stop_processing: &MustStopProcessing, send_request: F) -> Result where F: Fn() -> Result>, { + if must_stop_processing.get() { + return Err(Error::AbortedTask); + } + match backoff::retry(ExponentialBackoff::default(), || send_request()) { Ok(response) => Ok(response), Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)), @@ -214,24 +234,6 @@ where } } -fn post_serialized_documents( - agent: &Agent, - base_url: &str, - uid: &str, - api_key: Option<&str>, - buffer: &[u8], -) -> Result { - let url = format!("{base_url}/indexes/{uid}/documents"); - retry(|| { - let mut request = agent.post(&url); - request = request.set("Content-Type", "application/x-ndjson"); - if let Some(api_key) = api_key { - request = request.set("Authorization", &(format!("Bearer {api_key}"))); - } - request.send_bytes(buffer).map_err(into_backoff_error) - }) -} - fn into_backoff_error(err: ureq::Error) -> backoff::Error { match err { // Those code status must trigger an automatic retry From bc08cd0deb8805b126c64dc384b18d2ee203f508 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:37:15 +0200 Subject: [PATCH 007/101] Make clippy happy again --- .../index-scheduler/src/scheduler/process_export.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index ceac18632..e10c468fc 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -59,7 +59,10 @@ impl IndexScheduler { let index_rtxn = index.read_txn()?; // Send the primary key - let primary_key = index.primary_key(&index_rtxn)?; + let primary_key = index + .primary_key(&index_rtxn) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + let url = format!("{base_url}/indexes"); retry(&must_stop_processing, || { let mut request = agent.post(&url); @@ -108,7 +111,7 @@ impl IndexScheduler { let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); let embedding_configs = index .embedding_configs(&index_rtxn) - .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; let total_documents = universe.len() as u32; let (step, progress_step) = AtomicDocumentStep::new(total_documents); @@ -227,7 +230,7 @@ where return Err(Error::AbortedTask); } - match backoff::retry(ExponentialBackoff::default(), || send_request()) { + match backoff::retry(ExponentialBackoff::default(), send_request) { Ok(response) => Ok(response), Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)), Err(backoff::Error::Transient { err, retry_after: _ }) => Err(ureq_error_into_error(err)), @@ -261,7 +264,7 @@ fn ureq_error_into_error(error: ureq::Error) -> Error { Ok(MeiliError { message, code, r#type, link }) => { Error::FromRemoteWhenExporting { message, code, r#type, link } } - Err(e) => io::Error::from(e).into(), + Err(e) => e.into(), }, ureq::Error::Transport(transport) => io::Error::new(io::ErrorKind::Other, transport).into(), } From 
3329248a8448cc1ea8b2356dac803f38b8972287 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:50:32 +0200 Subject: [PATCH 008/101] Support no pattern when exporting --- .../src/scheduler/process_export.rs | 89 +++++++++---------- crates/meilisearch-types/src/tasks.rs | 3 +- crates/meilisearch/src/routes/export.rs | 21 +++-- 3 files changed, 54 insertions(+), 59 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e10c468fc..5c65ca51e 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -54,7 +54,7 @@ impl IndexScheduler { indexes.len() as u32, )); - let ExportIndexSettings { skip_embeddings, filter } = settings; + let ExportIndexSettings { filter } = settings; let index = self.index(uid)?; let index_rtxn = index.read_txn()?; @@ -131,56 +131,53 @@ impl IndexScheduler { .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; // TODO definitely factorize this code - if !*skip_embeddings { - 'inject_vectors: { - let embeddings = index - .embeddings(&index_rtxn, docid) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + 'inject_vectors: { + let embeddings = index + .embeddings(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - if embeddings.is_empty() { - break 'inject_vectors; - } + if embeddings.is_empty() { + break 'inject_vectors; + } - let vectors = document - .entry(RESERVED_VECTORS_FIELD_NAME) - .or_insert(serde_json::Value::Object(Default::default())); + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(serde_json::Value::Object(Default::default())); - let serde_json::Value::Object(vectors) = vectors else { - return Err(Error::from_milli( - meilisearch_types::milli::Error::UserError( - meilisearch_types::milli::UserError::InvalidVectorsMapType { - document_id: { - if let Ok(Some(Ok(index))) = index - .external_id_of(&index_rtxn, std::iter::once(docid)) - .map(|it| it.into_iter().next()) - { - index - } else { - format!("internal docid={docid}") - } - }, - value: vectors.clone(), + let serde_json::Value::Object(vectors) = vectors else { + return Err(Error::from_milli( + meilisearch_types::milli::Error::UserError( + meilisearch_types::milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of(&index_rtxn, std::iter::once(docid)) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + } }, - ), - Some(uid.to_string()), - )); + value: vectors.clone(), + }, + ), + Some(uid.to_string()), + )); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(docid)); + + let embeddings = ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + )), + regenerate: !user_provided, }; - - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(docid)); - - let embeddings = ExplicitVectors { - embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, - )), - regenerate: !user_provided, - }; - vectors - .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); - } + vectors.insert(embedder_name, 
serde_json::to_value(embeddings).unwrap()); } } diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 3ef60cacf..b5e2581fc 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -171,10 +171,9 @@ pub struct IndexSwap { pub indexes: (String, String), } -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct ExportIndexSettings { - pub skip_embeddings: bool, pub filter: Option, } diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 7029f0ebf..40ef20008 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -72,16 +72,19 @@ async fn export( debug!(returns = ?export, "Trigger export"); let Export { url, api_key, indexes } = export; - let task = KindWithContent::Export { - url, - api_key, - indexes: indexes + + let indexes = if indexes.is_empty() { + BTreeMap::from([(IndexUidPattern::new_unchecked("*"), DbExportIndexSettings::default())]) + } else { + indexes .into_iter() - .map(|(pattern, ExportIndexSettings { skip_embeddings, filter })| { - (pattern, DbExportIndexSettings { skip_embeddings, filter }) + .map(|(pattern, ExportIndexSettings { filter })| { + (pattern, DbExportIndexSettings { filter }) }) - .collect(), + .collect() }; + + let task = KindWithContent::Export { url, api_key, indexes }; let uid = get_task_id(&req, &opt)?; let dry_run = is_dry_run(&req, &opt)?; let task: SummarizedTaskView = @@ -116,10 +119,6 @@ pub struct Export { #[serde(rename_all = "camelCase")] #[schema(rename_all = "camelCase")] pub struct ExportIndexSettings { - #[schema(value_type = Option, example = json!("true"))] - #[serde(default)] - #[deserr(default, error = DeserrJsonError)] - pub skip_embeddings: bool, #[schema(value_type = Option, example = json!("genres = action"))] #[serde(default)] #[deserr(default, error = DeserrJsonError)] From ee812b31c4ef73305fb417869e6ca0d89b856642 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 15:56:26 +0200 Subject: [PATCH 009/101] Support JSON value as filters --- crates/index-scheduler/src/scheduler/process_export.rs | 5 ++--- crates/meilisearch-types/src/tasks.rs | 7 ++++--- crates/meilisearch/src/routes/export.rs | 3 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 5c65ca51e..e6c09e58a 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -90,10 +90,9 @@ impl IndexScheduler { request.send_json(settings.clone()).map_err(into_backoff_error) })?; - // TODO support JSON Value objects let filter = filter - .as_deref() - .map(Filter::from_str) + .as_ref() + .map(Filter::from_json) .transpose() .map_err(|e| Error::from_milli(e, Some(uid.to_string())))? 
.flatten(); diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index b5e2581fc..86951192c 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -8,6 +8,7 @@ use milli::update::IndexDocumentsMethod; use milli::Object; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize, Serializer}; +use serde_json::Value; use time::{Duration, OffsetDateTime}; use utoipa::{schema, ToSchema}; use uuid::Uuid; @@ -111,11 +112,11 @@ pub enum KindWithContent { }, DocumentDeletionByFilter { index_uid: String, - filter_expr: serde_json::Value, + filter_expr: Value, }, DocumentEdition { index_uid: String, - filter_expr: Option, + filter_expr: Option, context: Option, function: String, }, @@ -174,7 +175,7 @@ pub struct IndexSwap { #[derive(Debug, Default, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct ExportIndexSettings { - pub filter: Option, + pub filter: Option, } impl KindWithContent { diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 40ef20008..de1fe2c38 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -12,6 +12,7 @@ use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::keys::actions; use meilisearch_types::tasks::{ExportIndexSettings as DbExportIndexSettings, KindWithContent}; use serde::Serialize; +use serde_json::Value; use tracing::debug; use utoipa::{OpenApi, ToSchema}; @@ -122,5 +123,5 @@ pub struct ExportIndexSettings { #[schema(value_type = Option, example = json!("genres = action"))] #[serde(default)] #[deserr(default, error = DeserrJsonError)] - pub filter: Option, + pub filter: Option, } From 2d4f7c635eedc00e3ecf4c07cb5c14f300379103 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 16:18:31 +0200 Subject: [PATCH 010/101] Make tests happy --- crates/index-scheduler/src/scheduler/test.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index 06bc14051..fb309f882 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -732,6 +732,7 @@ fn basic_get_stats() { "documentDeletion": 0, "documentEdition": 0, "dumpCreation": 0, + "export": 0, "indexCreation": 3, "indexDeletion": 0, "indexSwap": 0, From c6216517c7243809ae7b886eb8e07cecf34ab5b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 16 Jun 2025 16:30:35 +0200 Subject: [PATCH 011/101] Parallelize document upload --- .../src/scheduler/process_export.rs | 189 ++++++++++-------- crates/index-scheduler/src/scheduler/test.rs | 3 + crates/milli/src/thread_pool_no_abort.rs | 18 +- .../src/update/index_documents/extract/mod.rs | 2 +- .../milli/src/update/index_documents/mod.rs | 1 + crates/milli/src/update/mod.rs | 2 +- 6 files changed, 133 insertions(+), 82 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e6c09e58a..3054c919b 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -7,9 +7,9 @@ use backoff::ExponentialBackoff; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, 
VariableNameStep}; -use meilisearch_types::milli::update::Setting; +use meilisearch_types::milli::update::{request_threads, Setting}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; -use meilisearch_types::milli::{obkv_to_json, Filter}; +use meilisearch_types::milli::{self, obkv_to_json, Filter, InternalError}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; use serde::Deserialize; @@ -112,6 +112,10 @@ impl IndexScheduler { .embedding_configs(&index_rtxn) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // We don't need to keep this one alive as we will + // spawn many threads to process the documents + drop(index_rtxn); + let total_documents = universe.len() as u32; let (step, progress_step) = AtomicDocumentStep::new(total_documents); progress.update_progress(progress_step); @@ -119,73 +123,107 @@ impl IndexScheduler { let limit = 50 * 1024 * 1024; // 50 MiB let documents_url = format!("{base_url}/indexes/{uid}/documents"); - let mut buffer = Vec::new(); - let mut tmp_buffer = Vec::new(); - for (i, docid) in universe.into_iter().enumerate() { - let document = index - .document(&index_rtxn, docid) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + request_threads() + .broadcast(|ctx| { + let index_rtxn = index + .read_txn() + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - let mut document = obkv_to_json(&all_fields, &fields_ids_map, document) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let mut buffer = Vec::new(); + let mut tmp_buffer = Vec::new(); + for (i, docid) in universe.iter().enumerate() { + if i % ctx.num_threads() != ctx.index() { + continue; + } - // TODO definitely factorize this code - 'inject_vectors: { - let embeddings = index - .embeddings(&index_rtxn, docid) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + let document = index + .document(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - if embeddings.is_empty() { - break 'inject_vectors; + let mut document = obkv_to_json(&all_fields, &fields_ids_map, document) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + // TODO definitely factorize this code + 'inject_vectors: { + let embeddings = index + .embeddings(&index_rtxn, docid) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + + if embeddings.is_empty() { + break 'inject_vectors; + } + + let vectors = document + .entry(RESERVED_VECTORS_FIELD_NAME) + .or_insert(serde_json::Value::Object(Default::default())); + + let serde_json::Value::Object(vectors) = vectors else { + return Err(Error::from_milli( + milli::Error::UserError( + milli::UserError::InvalidVectorsMapType { + document_id: { + if let Ok(Some(Ok(index))) = index + .external_id_of( + &index_rtxn, + std::iter::once(docid), + ) + .map(|it| it.into_iter().next()) + { + index + } else { + format!("internal docid={docid}") + } + }, + value: vectors.clone(), + }, + ), + Some(uid.to_string()), + )); + }; + + for (embedder_name, embeddings) in embeddings { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(docid)); + + let embeddings = ExplicitVectors { + embeddings: Some( + VectorOrArrayOfVectors::from_array_of_vectors(embeddings), + ), + regenerate: !user_provided, + }; + vectors.insert( + embedder_name, + serde_json::to_value(embeddings).unwrap(), + ); + } + } + + 
tmp_buffer.clear(); + serde_json::to_writer(&mut tmp_buffer, &document) + .map_err(milli::InternalError::from) + .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; + + if buffer.len() + tmp_buffer.len() > limit { + retry(&must_stop_processing, || { + let mut request = agent.post(&documents_url); + request = request.set("Content-Type", "application/x-ndjson"); + if let Some(api_key) = api_key { + request = request + .set("Authorization", &(format!("Bearer {api_key}"))); + } + request.send_bytes(&buffer).map_err(into_backoff_error) + })?; + buffer.clear(); + } + buffer.extend_from_slice(&tmp_buffer); + + if i % 100 == 0 { + step.fetch_add(100, atomic::Ordering::Relaxed); + } } - let vectors = document - .entry(RESERVED_VECTORS_FIELD_NAME) - .or_insert(serde_json::Value::Object(Default::default())); - - let serde_json::Value::Object(vectors) = vectors else { - return Err(Error::from_milli( - meilisearch_types::milli::Error::UserError( - meilisearch_types::milli::UserError::InvalidVectorsMapType { - document_id: { - if let Ok(Some(Ok(index))) = index - .external_id_of(&index_rtxn, std::iter::once(docid)) - .map(|it| it.into_iter().next()) - { - index - } else { - format!("internal docid={docid}") - } - }, - value: vectors.clone(), - }, - ), - Some(uid.to_string()), - )); - }; - - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(docid)); - - let embeddings = ExplicitVectors { - embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, - )), - regenerate: !user_provided, - }; - vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); - } - } - - tmp_buffer.clear(); - serde_json::to_writer(&mut tmp_buffer, &document) - .map_err(meilisearch_types::milli::InternalError::from) - .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - - if buffer.len() + tmp_buffer.len() > limit { retry(&must_stop_processing, || { let mut request = agent.post(&documents_url); request = request.set("Content-Type", "application/x-ndjson"); @@ -194,23 +232,16 @@ impl IndexScheduler { } request.send_bytes(&buffer).map_err(into_backoff_error) })?; - buffer.clear(); - } - buffer.extend_from_slice(&tmp_buffer); - if i % 100 == 0 { - step.fetch_add(100, atomic::Ordering::Relaxed); - } - } + Ok(()) + }) + .map_err(|e| { + Error::from_milli( + milli::Error::InternalError(InternalError::PanicInThreadPool(e)), + Some(uid.to_string()), + ) + })?; - retry(&must_stop_processing, || { - let mut request = agent.post(&documents_url); - request = request.set("Content-Type", "application/x-ndjson"); - if let Some(api_key) = api_key { - request = request.set("Authorization", &(format!("Bearer {api_key}"))); - } - request.send_bytes(&buffer).map_err(into_backoff_error) - })?; step.store(total_documents, atomic::Ordering::Relaxed); } diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index fb309f882..ee26165c7 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -766,6 +766,7 @@ fn basic_get_stats() { "documentDeletion": 0, "documentEdition": 0, "dumpCreation": 0, + "export": 0, "indexCreation": 3, "indexDeletion": 0, "indexSwap": 0, @@ -806,6 +807,7 @@ fn basic_get_stats() { "documentDeletion": 0, "documentEdition": 0, "dumpCreation": 0, + "export": 0, "indexCreation": 3, "indexDeletion": 0, "indexSwap": 0, @@ -847,6 +849,7 
@@ fn basic_get_stats() {
         "documentDeletion": 0,
         "documentEdition": 0,
         "dumpCreation": 0,
+        "export": 0,
         "indexCreation": 3,
         "indexDeletion": 0,
         "indexSwap": 0,
diff --git a/crates/milli/src/thread_pool_no_abort.rs b/crates/milli/src/thread_pool_no_abort.rs
index 0c2fbb30d..66380ff36 100644
--- a/crates/milli/src/thread_pool_no_abort.rs
+++ b/crates/milli/src/thread_pool_no_abort.rs
@@ -1,7 +1,7 @@
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::Arc;

-use rayon::{ThreadPool, ThreadPoolBuilder};
+use rayon::{BroadcastContext, ThreadPool, ThreadPoolBuilder};
 use thiserror::Error;

 /// A rayon ThreadPool wrapper that can catch panics in the pool
@@ -32,6 +32,22 @@ impl ThreadPoolNoAbort {
         }
     }

+    pub fn broadcast<OP, R>(&self, op: OP) -> Result<Vec<R>, PanicCatched>
+    where
+        OP: Fn(BroadcastContext<'_>) -> R + Sync,
+        R: Send,
+    {
+        self.active_operations.fetch_add(1, Ordering::Relaxed);
+        let output = self.thread_pool.broadcast(op);
+        self.active_operations.fetch_sub(1, Ordering::Relaxed);
+        // While resetting the pool panic catcher, we return an error if we caught one.
+        if self.pool_catched_panic.swap(false, Ordering::SeqCst) {
+            Err(PanicCatched)
+        } else {
+            Ok(output)
+        }
+    }
+
     pub fn current_num_threads(&self) -> usize {
         self.thread_pool.current_num_threads()
     }
diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs
index 8cd664a2f..cb4ac03a6 100644
--- a/crates/milli/src/update/index_documents/extract/mod.rs
+++ b/crates/milli/src/update/index_documents/extract/mod.rs
@@ -210,7 +210,7 @@ fn run_extraction_task(
     })
 }

-fn request_threads() -> &'static ThreadPoolNoAbort {
+pub fn request_threads() -> &'static ThreadPoolNoAbort {
     static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();

     REQUEST_THREADS.get_or_init(|| {
diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs
index f547c68d4..dd0238fcb 100644
--- a/crates/milli/src/update/index_documents/mod.rs
+++ b/crates/milli/src/update/index_documents/mod.rs
@@ -12,6 +12,7 @@ use std::sync::Arc;

 use crossbeam_channel::{Receiver, Sender};
 use enrich::enrich_documents_batch;
+pub use extract::request_threads;
 use grenad::{Merger, MergerBuilder};
 use hashbrown::HashMap;
 use heed::types::Str;
diff --git a/crates/milli/src/update/mod.rs b/crates/milli/src/update/mod.rs
index 04ce68fc7..64eb9f1d3 100644
--- a/crates/milli/src/update/mod.rs
+++ b/crates/milli/src/update/mod.rs
@@ -4,7 +4,7 @@ pub use self::clear_documents::ClearDocuments;
 pub use self::concurrent_available_ids::ConcurrentAvailableIds;
 pub use self::facet::bulk::FacetsUpdateBulk;
 pub use self::facet::incremental::FacetsUpdateIncrementalInner;
-pub use self::index_documents::*;
+pub use self::index_documents::{request_threads, *};
 pub use self::indexer_config::{default_thread_pool_and_threads, IndexerConfig};
 pub use self::new::ChannelCongestion;
 pub use self::settings::{validate_embedding_settings, Setting, Settings};

From a743da30618850e6e6e302b1c7e009d932d7a8b6 Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Wed, 25 Jun 2025 12:29:14 +0200
Subject: [PATCH 012/101] Gzip-compress the content

---
 .../src/scheduler/process_export.rs           | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs
index 3054c919b..180162eda 100644
--- a/crates/index-scheduler/src/scheduler/process_export.rs
+++ 
b/crates/index-scheduler/src/scheduler/process_export.rs @@ -1,9 +1,11 @@ use std::collections::BTreeMap; -use std::io; +use std::io::{self, Write as _}; use std::sync::atomic; use std::time::Duration; use backoff::ExponentialBackoff; +use flate2::write::GzEncoder; +use flate2::Compression; use meilisearch_types::index_uid_pattern::IndexUidPattern; use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME; use meilisearch_types::milli::progress::{Progress, VariableNameStep}; @@ -131,6 +133,7 @@ impl IndexScheduler { let mut buffer = Vec::new(); let mut tmp_buffer = Vec::new(); + let mut compressed_buffer = Vec::new(); for (i, docid) in universe.iter().enumerate() { if i % ctx.num_threads() != ctx.index() { continue; @@ -205,17 +208,31 @@ impl IndexScheduler { .map_err(milli::InternalError::from) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - if buffer.len() + tmp_buffer.len() > limit { + // Make sure we put at least one document in the buffer even + // though we might go above the buffer limit before sending + if !buffer.is_empty() && buffer.len() + tmp_buffer.len() > limit { + // We compress the documents before sending them + let mut encoder = + GzEncoder::new(&mut compressed_buffer, Compression::default()); + encoder + .write_all(&buffer) + .map_err(|e| Error::from_milli(e.into(), Some(uid.clone())))?; + encoder + .finish() + .map_err(|e| Error::from_milli(e.into(), Some(uid.clone())))?; + retry(&must_stop_processing, || { let mut request = agent.post(&documents_url); request = request.set("Content-Type", "application/x-ndjson"); + request = request.set("Content-Encoding", "gzip"); if let Some(api_key) = api_key { request = request .set("Authorization", &(format!("Bearer {api_key}"))); } - request.send_bytes(&buffer).map_err(into_backoff_error) + request.send_bytes(&compressed_buffer).map_err(into_backoff_error) })?; buffer.clear(); + compressed_buffer.clear(); } buffer.extend_from_slice(&tmp_buffer); From 63031219c543318258aaf4bb268b9e29bebf4968 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 25 Jun 2025 18:24:50 +0200 Subject: [PATCH 013/101] Add the payload size to the parameters --- Cargo.lock | 1 + crates/dump/src/lib.rs | 5 +- crates/index-scheduler/src/dump.rs | 31 ++++++----- crates/index-scheduler/src/insta_snapshot.rs | 4 +- .../src/scheduler/process_batch.rs | 11 +++- .../src/scheduler/process_export.rs | 6 ++- crates/index-scheduler/src/utils.rs | 2 +- crates/meilisearch-types/Cargo.toml | 1 + crates/meilisearch-types/src/error.rs | 1 + crates/meilisearch-types/src/lib.rs | 2 +- crates/meilisearch-types/src/task_view.rs | 14 ++++- crates/meilisearch-types/src/tasks.rs | 42 +++++++++------ crates/meilisearch/src/routes/export.rs | 51 ++++++++++++++++++- 13 files changed, 130 insertions(+), 41 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a883b749f..be6aa4b21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3855,6 +3855,7 @@ dependencies = [ "anyhow", "bumpalo", "bumparaw-collections", + "byte-unit", "convert_case 0.8.0", "csv", "deserr", diff --git a/crates/dump/src/lib.rs b/crates/dump/src/lib.rs index 5c67d7a94..7fd0ea376 100644 --- a/crates/dump/src/lib.rs +++ b/crates/dump/src/lib.rs @@ -4,6 +4,7 @@ use std::collections::BTreeMap; use meilisearch_types::batches::BatchId; +use meilisearch_types::byte_unit::Byte; use meilisearch_types::error::ResponseError; use meilisearch_types::keys::Key; use meilisearch_types::milli::update::IndexDocumentsMethod; @@ -148,6 +149,7 @@ pub enum KindDump { Export { url: String, api_key: Option, + 
payload_size: Option, indexes: BTreeMap, }, UpgradeDatabase { @@ -222,9 +224,10 @@ impl From for KindDump { KindDump::DumpCreation { keys, instance_uid } } KindWithContent::SnapshotCreation => KindDump::SnapshotCreation, - KindWithContent::Export { url, api_key, indexes } => KindDump::Export { + KindWithContent::Export { url, api_key, payload_size, indexes } => KindDump::Export { url, api_key, + payload_size, indexes: indexes .into_iter() .map(|(pattern, settings)| (pattern.to_string(), settings)) diff --git a/crates/index-scheduler/src/dump.rs b/crates/index-scheduler/src/dump.rs index 2a99a74aa..1e681c8e8 100644 --- a/crates/index-scheduler/src/dump.rs +++ b/crates/index-scheduler/src/dump.rs @@ -212,20 +212,23 @@ impl<'a> Dump<'a> { KindWithContent::DumpCreation { keys, instance_uid } } KindDump::SnapshotCreation => KindWithContent::SnapshotCreation, - KindDump::Export { url, indexes, api_key } => KindWithContent::Export { - url, - api_key, - indexes: indexes - .into_iter() - .map(|(pattern, settings)| { - Ok(( - IndexUidPattern::try_from(pattern) - .map_err(|_| Error::CorruptedDump)?, - settings, - )) - }) - .collect::>()?, - }, + KindDump::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { + url, + api_key, + payload_size, + indexes: indexes + .into_iter() + .map(|(pattern, settings)| { + Ok(( + IndexUidPattern::try_from(pattern) + .map_err(|_| Error::CorruptedDump)?, + settings, + )) + }) + .collect::>()?, + } + } KindDump::UpgradeDatabase { from } => KindWithContent::UpgradeDatabase { from }, }, }; diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 138b591ff..f48821520 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -289,8 +289,8 @@ fn snapshot_details(d: &Details) -> String { Details::IndexSwap { swaps } => { format!("{{ swaps: {swaps:?} }}") } - Details::Export { url, api_key, indexes } => { - format!("{{ url: {url:?}, api_key: {api_key:?}, indexes: {indexes:?} }}") + Details::Export { url, api_key, payload_size, indexes } => { + format!("{{ url: {url:?}, api_key: {api_key:?}, payload_size: {payload_size:?}, indexes: {indexes:?} }}") } Details::UpgradeDatabase { from, to } => { format!("{{ from: {from:?}, to: {to:?} }}") diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index 99278756d..e56b8e13a 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -362,12 +362,19 @@ impl IndexScheduler { Ok((vec![task], ProcessBatchInfo::default())) } Batch::Export { mut task } => { - let KindWithContent::Export { url, indexes, api_key } = &task.kind else { + let KindWithContent::Export { url, api_key, payload_size, indexes } = &task.kind + else { unreachable!() }; let ret = catch_unwind(AssertUnwindSafe(|| { - self.process_export(url, indexes, api_key.as_deref(), progress) + self.process_export( + url, + api_key.as_deref(), + payload_size.as_ref(), + indexes, + progress, + ) })); match ret { diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 180162eda..e777809fd 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -4,6 +4,7 @@ use std::sync::atomic; use std::time::Duration; use backoff::ExponentialBackoff; +use byte_unit::Byte; use 
flate2::write::GzEncoder; use flate2::Compression; use meilisearch_types::index_uid_pattern::IndexUidPattern; @@ -25,8 +26,9 @@ impl IndexScheduler { pub(super) fn process_export( &self, base_url: &str, - indexes: &BTreeMap, api_key: Option<&str>, + payload_size: Option<&Byte>, + indexes: &BTreeMap, progress: Progress, ) -> Result<()> { #[cfg(test)] @@ -122,7 +124,7 @@ impl IndexScheduler { let (step, progress_step) = AtomicDocumentStep::new(total_documents); progress.update_progress(progress_step); - let limit = 50 * 1024 * 1024; // 50 MiB + let limit = payload_size.map(|ps| ps.as_u64() as usize).unwrap_or(50 * 1024 * 1024); // defaults to 50 MiB let documents_url = format!("{base_url}/indexes/{uid}/documents"); request_threads() diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 79571745b..594023145 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -601,7 +601,7 @@ impl crate::IndexScheduler { Details::Dump { dump_uid: _ } => { assert_eq!(kind.as_kind(), Kind::DumpCreation); } - Details::Export { url: _, api_key: _, indexes: _ } => { + Details::Export { url: _, api_key: _, payload_size: _, indexes: _ } => { assert_eq!(kind.as_kind(), Kind::Export); } Details::UpgradeDatabase { from: _, to: _ } => { diff --git a/crates/meilisearch-types/Cargo.toml b/crates/meilisearch-types/Cargo.toml index f76044078..faf59643f 100644 --- a/crates/meilisearch-types/Cargo.toml +++ b/crates/meilisearch-types/Cargo.toml @@ -15,6 +15,7 @@ actix-web = { version = "4.11.0", default-features = false } anyhow = "1.0.98" bumpalo = "3.18.1" bumparaw-collections = "0.1.4" +byte-unit = { version = "5.1.6", features = ["serde"] } convert_case = "0.8.0" csv = "1.3.1" deserr = { version = "0.6.3", features = ["actix-web"] } diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 08ee803ef..a8f45b4ef 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -392,6 +392,7 @@ InvalidSettingsIndexChat , InvalidRequest , BAD_REQU // Export InvalidExportUrl , InvalidRequest , BAD_REQUEST ; InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; +InvalidExportPayloadSize , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; InvalidExportIndexSkipEmbeddings , InvalidRequest , BAD_REQUEST ; InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ; diff --git a/crates/meilisearch-types/src/lib.rs b/crates/meilisearch-types/src/lib.rs index a1a57b7e6..fe69da526 100644 --- a/crates/meilisearch-types/src/lib.rs +++ b/crates/meilisearch-types/src/lib.rs @@ -18,7 +18,7 @@ pub mod versioning; pub use milli::{heed, Index}; use uuid::Uuid; pub use versioning::VERSION_FILE_NAME; -pub use {milli, serde_cs}; +pub use {byte_unit, milli, serde_cs}; pub type Document = serde_json::Map; pub type InstanceUid = Uuid; diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 0a8d7b8fe..1dbd5637b 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -1,5 +1,6 @@ use std::collections::BTreeMap; +use byte_unit::UnitType; use milli::Object; use serde::{Deserialize, Serialize}; use time::{Duration, OffsetDateTime}; @@ -128,6 +129,8 @@ pub struct DetailsView { #[serde(skip_serializing_if = "Option::is_none")] pub api_key: Option, #[serde(skip_serializing_if = "Option::is_none")] + pub payload_size: Option, + #[serde(skip_serializing_if = 
"Option::is_none")] pub indexes: Option>, } @@ -263,6 +266,13 @@ impl DetailsView { // So we return the first one we encounter but that shouldn't be an issue anyway. (Some(left), Some(_right)) => Some(left), }, + payload_size: match (self.payload_size.clone(), other.payload_size.clone()) { + (None, None) => None, + (None, Some(size)) | (Some(size), None) => Some(size), + // We should never be able to batch multiple exports at the same time. + // So we return the first one we encounter but that shouldn't be an issue anyway. + (Some(left), Some(_right)) => Some(left), + }, indexes: match (self.indexes.clone(), other.indexes.clone()) { (None, None) => None, (None, Some(indexes)) | (Some(indexes), None) => Some(indexes), @@ -359,9 +369,11 @@ impl From
for DetailsView { Details::IndexSwap { swaps } => { DetailsView { swaps: Some(swaps), ..Default::default() } } - Details::Export { url, api_key, indexes } => DetailsView { + Details::Export { url, api_key, payload_size, indexes } => DetailsView { url: Some(url), api_key, + payload_size: payload_size + .map(|ps| ps.get_appropriate_unit(UnitType::Both).to_string()), indexes: Some( indexes .into_iter() diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 86951192c..508035bb7 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet}; use std::fmt::{Display, Write}; use std::str::FromStr; +use byte_unit::Byte; use enum_iterator::Sequence; use milli::update::IndexDocumentsMethod; use milli::Object; @@ -159,6 +160,7 @@ pub enum KindWithContent { Export { url: String, api_key: Option, + payload_size: Option, indexes: BTreeMap, }, UpgradeDatabase { @@ -286,11 +288,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), - }), + KindWithContent::Export { url, api_key, payload_size, indexes } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + payload_size: payload_size.clone(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: (from.0, from.1, from.2), to: ( @@ -357,11 +362,14 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, indexes } => Some(Details::Export { - url: url.clone(), - api_key: api_key.clone(), - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), - }), + KindWithContent::Export { url, api_key, payload_size, indexes } => { + Some(Details::Export { + url: url.clone(), + api_key: api_key.clone(), + payload_size: payload_size.clone(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + }) + } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { from: *from, to: ( @@ -410,11 +418,14 @@ impl From<&KindWithContent> for Option
{
             }),
             KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }),
             KindWithContent::SnapshotCreation => None,
-            KindWithContent::Export { url, api_key, indexes } => Some(Details::Export {
-                url: url.clone(),
-                api_key: api_key.clone(),
-                indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(),
-            }),
+            KindWithContent::Export { url, api_key, payload_size, indexes } => {
+                Some(Details::Export {
+                    url: url.clone(),
+                    api_key: api_key.clone(),
+                    payload_size: payload_size.clone(),
+                    indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(),
+                })
+            }
             KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase {
                 from: *from,
                 to: (
@@ -681,6 +692,7 @@ pub enum Details {
     Export {
         url: String,
         api_key: Option<String>,
+        payload_size: Option<Byte>,
        indexes: BTreeMap,
     },
     UpgradeDatabase {
diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs
index de1fe2c38..1c519224c 100644
--- a/crates/meilisearch/src/routes/export.rs
+++ b/crates/meilisearch/src/routes/export.rs
@@ -1,7 +1,10 @@
 use std::collections::BTreeMap;
+use std::convert::Infallible;
+use std::str::FromStr as _;

 use actix_web::web::{self, Data};
 use actix_web::{HttpRequest, HttpResponse};
+use byte_unit::Byte;
 use deserr::actix_web::AwebJson;
 use deserr::Deserr;
 use index_scheduler::IndexScheduler;
@@ -72,7 +75,7 @@ async fn export(
     let export = export.into_inner();
     debug!(returns = ?export, "Trigger export");

-    let Export { url, api_key, indexes } = export;
+    let Export { url, api_key, payload_size, indexes } = export;

     let indexes = if indexes.is_empty() {
         BTreeMap::from([(IndexUidPattern::new_unchecked("*"), DbExportIndexSettings::default())])
@@ -85,7 +88,12 @@ async fn export(
             .collect()
     };

-    let task = KindWithContent::Export { url, api_key, indexes };
+    let task = KindWithContent::Export {
+        url,
+        api_key,
+        payload_size: payload_size.map(|ByteWithDeserr(bytes)| bytes),
+        indexes,
+    };
     let uid = get_task_id(&req, &opt)?;
     let dry_run = is_dry_run(&req, &opt)?;
     let task: SummarizedTaskView =
@@ -109,12 +117,51 @@ pub struct Export {
     #[serde(default)]
     #[deserr(default, error = DeserrJsonError<InvalidExportApiKey>)]
     pub api_key: Option<String>,
+    #[schema(value_type = Option<String>, example = json!("24MiB"))]
+    #[serde(default)]
+    #[deserr(default, error = DeserrJsonError<InvalidExportPayloadSize>)]
+    pub payload_size: Option<ByteWithDeserr>,
     #[schema(value_type = Option<BTreeMap<String, ExportIndexSettings>>, example = json!(["movies", "steam-*"]))]
     #[deserr(default)]
     #[serde(default)]
     pub indexes: BTreeMap<IndexUidPattern, ExportIndexSettings>,
 }

+/// A wrapper around the `Byte` type that implements `Deserr`.
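+///
+/// It accepts either a raw integer, interpreted as a number of bytes, or a
+/// human-readable string such as `"24MiB"`, which is parsed with
+/// `Byte::from_str`; any other JSON value kind is rejected at
+/// deserialization time.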
+#[derive(Debug, Serialize)] +#[serde(transparent)] +pub struct ByteWithDeserr(pub Byte); + +impl deserr::Deserr for ByteWithDeserr +where + E: deserr::DeserializeError, +{ + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + use deserr::{ErrorKind, Value, ValueKind}; + match value { + Value::Integer(integer) => Ok(ByteWithDeserr(Byte::from_u64(integer))), + Value::String(string) => Byte::from_str(&string).map(ByteWithDeserr).map_err(|e| { + deserr::take_cf_content(E::error::( + None, + ErrorKind::Unexpected { msg: e.to_string() }, + location, + )) + }), + actual => Err(deserr::take_cf_content(E::error( + None, + ErrorKind::IncorrectValueKind { + actual, + accepted: &[ValueKind::Integer, ValueKind::String], + }, + location, + ))), + } + } +} + #[derive(Debug, Deserr, ToSchema, Serialize)] #[deserr(error = DeserrJsonError, rename_all = camelCase, deny_unknown_fields)] #[serde(rename_all = "camelCase")] From e6e9a033aa153250b9fe96addb13701d49feccd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 15:45:24 +0200 Subject: [PATCH 014/101] Introduce new analytics to the export route --- crates/meilisearch/src/routes/export.rs | 7 +- .../src/routes/export_analytics.rs | 67 +++++++++++++++++++ crates/meilisearch/src/routes/mod.rs | 1 + 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 crates/meilisearch/src/routes/export_analytics.rs diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 1c519224c..21a77ae32 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -22,6 +22,7 @@ use utoipa::{OpenApi, ToSchema}; use crate::analytics::Analytics; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::GuardedData; +use crate::routes::export_analytics::ExportAnalytics; use crate::routes::{get_task_id, is_dry_run, SummarizedTaskView}; use crate::Opt; @@ -67,7 +68,7 @@ async fn export( export: AwebJson, req: HttpRequest, opt: web::Data, - _analytics: Data, + analytics: Data, ) -> Result { // TODO make it experimental? // index_scheduler.features().check_network("Using the /network route")?; @@ -75,6 +76,8 @@ async fn export( let export = export.into_inner(); debug!(returns = ?export, "Trigger export"); + let analytics_aggregate = ExportAnalytics::from_export(&export); + let Export { url, api_key, payload_size, indexes } = export; let indexes = if indexes.is_empty() { @@ -101,6 +104,8 @@ async fn export( .await?? 
.into(); + analytics.publish(analytics_aggregate, &req); + Ok(HttpResponse::Ok().json(task)) } diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs new file mode 100644 index 000000000..7299dba8d --- /dev/null +++ b/crates/meilisearch/src/routes/export_analytics.rs @@ -0,0 +1,67 @@ +use crate::analytics::Aggregate; +use crate::routes::export::Export; + +#[derive(Default)] +pub struct ExportAnalytics { + total_received: usize, + has_api_key: bool, + total_index_patterns: usize, + total_patterns_with_filter: usize, + payload_sizes: Vec, +} + +impl ExportAnalytics { + pub fn from_export(export: &Export) -> Self { + let Export { url: _, api_key, payload_size, indexes } = export; + + let has_api_key = api_key.is_some(); + let total_index_patterns = indexes.len(); + let total_patterns_with_filter = + indexes.values().filter(|settings| settings.filter.is_some()).count(); + let payload_sizes = + if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size { + vec![byte_size.as_u64()] + } else { + vec![] + }; + + Self { + total_received: 1, + has_api_key, + total_index_patterns, + total_patterns_with_filter, + payload_sizes, + } + } +} + +impl Aggregate for ExportAnalytics { + fn event_name(&self) -> &'static str { + "Export Triggered" + } + + fn aggregate(mut self: Box, other: Box) -> Box { + self.total_received += other.total_received; + self.has_api_key |= other.has_api_key; + self.total_index_patterns += other.total_index_patterns; + self.total_patterns_with_filter += other.total_patterns_with_filter; + self.payload_sizes.extend(other.payload_sizes); + self + } + + fn into_event(self: Box) -> serde_json::Value { + let avg_payload_size = if self.payload_sizes.is_empty() { + None + } else { + Some(self.payload_sizes.iter().sum::() / self.payload_sizes.len() as u64) + }; + + serde_json::json!({ + "total_received": self.total_received, + "has_api_key": self.has_api_key, + "total_index_patterns": self.total_index_patterns, + "total_patterns_with_filter": self.total_patterns_with_filter, + "avg_payload_size": avg_payload_size, + }) + } +} diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 748cd5d83..08583d20f 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -55,6 +55,7 @@ pub mod batches; pub mod chats; mod dump; mod export; +mod export_analytics; pub mod features; pub mod indexes; mod logs; From 0bb7866f1e549c8791ac752f90af0dfcbd5fd6a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 15:48:21 +0200 Subject: [PATCH 015/101] Remove the skip embeddings boolean in the settings --- crates/meilisearch-types/src/error.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index a8f45b4ef..1c2840084 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -394,7 +394,6 @@ InvalidExportUrl , InvalidRequest , BAD_REQU InvalidExportApiKey , InvalidRequest , BAD_REQUEST ; InvalidExportPayloadSize , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; -InvalidExportIndexSkipEmbeddings , InvalidRequest , BAD_REQUEST ; InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ; From bf13268649343ad2a410ca1411b5dce4f5b0fcf9 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 16:03:13 +0200 Subject: [PATCH 016/101] Better compute aggregates --- .../src/routes/export_analytics.rs | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs index 7299dba8d..44dba2c9b 100644 --- a/crates/meilisearch/src/routes/export_analytics.rs +++ b/crates/meilisearch/src/routes/export_analytics.rs @@ -5,8 +5,8 @@ use crate::routes::export::Export; pub struct ExportAnalytics { total_received: usize, has_api_key: bool, - total_index_patterns: usize, - total_patterns_with_filter: usize, + sum_index_patterns: usize, + sum_patterns_with_filter: usize, payload_sizes: Vec<u64>, } @@ -15,8 +15,8 @@ impl ExportAnalytics { let Export { url: _, api_key, payload_size, indexes } = export; let has_api_key = api_key.is_some(); - let total_index_patterns = indexes.len(); - let total_patterns_with_filter = + let index_patterns_count = indexes.len(); + let patterns_with_filter_count = indexes.values().filter(|settings| settings.filter.is_some()).count(); let payload_sizes = if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size { @@ -28,8 +28,8 @@ impl ExportAnalytics { Self { total_received: 1, has_api_key, - total_index_patterns, - total_patterns_with_filter, + sum_index_patterns: index_patterns_count, + sum_patterns_with_filter: patterns_with_filter_count, payload_sizes, } } @@ -43,8 +43,8 @@ impl Aggregate for ExportAnalytics { fn aggregate(mut self: Box<Self>, other: Box<Self>) -> Box<Self> { self.total_received += other.total_received; self.has_api_key |= other.has_api_key; - self.total_index_patterns += other.total_index_patterns; - self.total_patterns_with_filter += other.total_patterns_with_filter; + self.sum_index_patterns += other.sum_index_patterns; + self.sum_patterns_with_filter += other.sum_patterns_with_filter; self.payload_sizes.extend(other.payload_sizes); self } @@ -56,11 +56,23 @@ impl Aggregate for ExportAnalytics { Some(self.payload_sizes.iter().sum::<u64>() / self.payload_sizes.len() as u64) }; + let avg_index_patterns = if self.total_received == 0 { + None + } else { + Some(self.sum_index_patterns as f64 / self.total_received as f64) + }; + + let avg_patterns_with_filter = if self.total_received == 0 { + None + } else { + Some(self.sum_patterns_with_filter as f64 / self.total_received as f64) + }; + serde_json::json!({ "total_received": self.total_received, "has_api_key": self.has_api_key, - "total_index_patterns": self.total_index_patterns, - "total_patterns_with_filter": self.total_patterns_with_filter, + "avg_index_patterns": avg_index_patterns, + "avg_patterns_with_filter": avg_patterns_with_filter, "avg_payload_size": avg_payload_size, }) } From e3003c1609fda6e0a2af649b8fc7bd3bff429d74 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 16:05:12 +0200 Subject: [PATCH 017/101] Improve OpenAPI schema --- crates/meilisearch/src/routes/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 08583d20f..51298411a 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -2,6 +2,7 @@ use std::collections::BTreeMap; use actix_web::web::Data; use actix_web::{web, HttpRequest, HttpResponse}; +use export::Export; use index_scheduler::IndexScheduler; use meilisearch_auth::AuthController; use
meilisearch_types::batch_view::BatchView; @@ -98,7 +99,7 @@ mod tasks_test; url = "/", description = "Local server", )), - components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures)) + components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, ExportApi, Export)) )] pub struct MeilisearchApi; From b956918c11bd66a02ca9abda1ab905aa178a0ccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 16:31:35 +0200 Subject: [PATCH 018/101] Fix clippy and more utoipa issues --- crates/meilisearch-types/src/tasks.rs | 6 +++--- crates/meilisearch/src/routes/mod.rs | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 508035bb7..3301b4320 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -292,7 +292,7 @@ impl KindWithContent { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - payload_size: payload_size.clone(), + payload_size: *payload_size, indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } @@ -366,7 +366,7 @@ impl KindWithContent { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - payload_size: payload_size.clone(), + payload_size: *payload_size, indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } @@ -422,7 +422,7 @@ impl From<&KindWithContent> for Option
{ Some(Details::Export { url: url.clone(), api_key: api_key.clone(), - payload_size: payload_size.clone(), + payload_size: *payload_size, indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } diff --git a/crates/meilisearch/src/routes/mod.rs b/crates/meilisearch/src/routes/mod.rs index 51298411a..260d973a1 100644 --- a/crates/meilisearch/src/routes/mod.rs +++ b/crates/meilisearch/src/routes/mod.rs @@ -99,7 +99,7 @@ mod tasks_test; url = "/", description = "Local server", )), - components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, ExportApi, Export)) + components(schemas(PaginationView, PaginationView, IndexView, DocumentDeletionByFilter, AllBatches, BatchStats, ProgressStepView, ProgressView, BatchView, RuntimeTogglableFeatures, SwapIndexesPayload, DocumentEditionByFunction, MergeFacets, FederationOptions, SearchQueryWithIndex, Federation, FederatedSearch, FederatedSearchResult, SearchResults, SearchResultWithIndex, SimilarQuery, SimilarResult, PaginationView, BrowseQuery, UpdateIndexRequest, IndexUid, IndexCreateRequest, KeyView, Action, CreateApiKey, UpdateStderrLogs, LogMode, GetLogs, IndexStats, Stats, HealthStatus, HealthResponse, VersionResponse, Code, ErrorType, AllTasks, TaskView, Status, DetailsView, ResponseError, Settings, Settings, TypoSettings, MinWordSizeTyposSetting, FacetingSettings, PaginationSettings, SummarizedTaskView, Kind, Network, Remote, FilterableAttributesRule, FilterableAttributesPatterns, AttributePatterns, FilterableAttributesFeatures, FilterFeatures, Export)) )] pub struct MeilisearchApi; From 0f1dd3614cc86753ca26dc10ebd2cc659659c55a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 17:51:57 +0200 Subject: [PATCH 019/101] Update tasks tests --- crates/meilisearch/src/routes/tasks_test.rs | 2 +- crates/meilisearch/tests/batches/errors.rs | 2 +- crates/meilisearch/tests/tasks/errors.rs | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/meilisearch/src/routes/tasks_test.rs b/crates/meilisearch/src/routes/tasks_test.rs index a17b80c82..b09eb0fb3 100644 --- a/crates/meilisearch/src/routes/tasks_test.rs +++ b/crates/meilisearch/src/routes/tasks_test.rs @@ -228,7 +228,7 @@ mod tests { let err = deserr_query_params::(params).unwrap_err(); snapshot!(meili_snap::json_string!(err), @r#" { - "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. 
Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `createIndex` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" diff --git a/crates/meilisearch/tests/batches/errors.rs b/crates/meilisearch/tests/batches/errors.rs index 7f5fedb6a..bfc0d9251 100644 --- a/crates/meilisearch/tests/batches/errors.rs +++ b/crates/meilisearch/tests/batches/errors.rs @@ -42,7 +42,7 @@ async fn batch_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" diff --git a/crates/meilisearch/tests/tasks/errors.rs b/crates/meilisearch/tests/tasks/errors.rs index 759531d42..9970bafa4 100644 --- a/crates/meilisearch/tests/tasks/errors.rs +++ b/crates/meilisearch/tests/tasks/errors.rs @@ -97,7 +97,7 @@ async fn task_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" @@ -108,7 +108,7 @@ async fn task_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. 
Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" @@ -119,7 +119,7 @@ async fn task_bad_types() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r#" { - "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `upgradeDatabase`.", + "message": "Invalid value in parameter `types`: `doggo` is not a valid task type. Available types are `documentAdditionOrUpdate`, `documentEdition`, `documentDeletion`, `settingsUpdate`, `indexCreation`, `indexDeletion`, `indexUpdate`, `indexSwap`, `taskCancelation`, `taskDeletion`, `dumpCreation`, `snapshotCreation`, `export`, `upgradeDatabase`.", "code": "invalid_task_types", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_task_types" From 7fa1c41190620506bd31bcd54c5e4c713903b948 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 26 Jun 2025 18:25:49 +0200 Subject: [PATCH 020/101] Fix some api key errors --- crates/meilisearch/tests/auth/api_keys.rs | 2 +- crates/meilisearch/tests/auth/errors.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/tests/auth/api_keys.rs b/crates/meilisearch/tests/auth/api_keys.rs index 5a18b4dbf..2688dd918 100644 --- a/crates/meilisearch/tests/auth/api_keys.rs +++ b/crates/meilisearch/tests/auth/api_keys.rs @@ -421,7 +421,7 @@ async fn error_add_api_key_invalid_parameters_actions() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response, { ".createdAt" => "[ignored]", ".updatedAt" => "[ignored]" }), @r###" { - "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", + "message": "Unknown value `doc.add` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, 
`tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" diff --git a/crates/meilisearch/tests/auth/errors.rs b/crates/meilisearch/tests/auth/errors.rs index ebe2e53fa..687cb67a0 100644 --- a/crates/meilisearch/tests/auth/errors.rs +++ b/crates/meilisearch/tests/auth/errors.rs @@ -93,7 +93,7 @@ async fn create_api_key_bad_actions() { snapshot!(code, @"400 Bad Request"); snapshot!(json_string!(response), @r###" { - "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", + "message": "Unknown value `doggo` at `.actions[0]`: expected one of `*`, `search`, `documents.*`, `documents.add`, `documents.get`, `documents.delete`, `indexes.*`, `indexes.create`, `indexes.get`, `indexes.update`, `indexes.delete`, `indexes.swap`, `tasks.*`, `tasks.cancel`, `tasks.delete`, `tasks.get`, `settings.*`, `settings.get`, `settings.update`, `stats.*`, `stats.get`, `metrics.*`, `metrics.get`, `dumps.*`, `dumps.create`, `snapshots.*`, `snapshots.create`, `version`, `keys.create`, `keys.get`, `keys.update`, `keys.delete`, `experimental.get`, `experimental.update`, `export`, `network.get`, `network.update`, `chatCompletions`, `chats.*`, `chats.get`, `chats.delete`, `chatsSettings.*`, `chatsSettings.get`, `chatsSettings.update`", "code": "invalid_api_key_actions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_api_key_actions" From 657bbf5d1e4f4dba0c816d94ff3ee9002fe0b880 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 27 Jun 2025 10:14:26 +0200 Subject: [PATCH 021/101] Fix more tests --- crates/meilisearch-types/src/tasks.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 3301b4320..a6ed593db 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -614,6 +614,8 @@ impl FromStr for Kind { Ok(Kind::DumpCreation) } else if kind.eq_ignore_ascii_case("snapshotCreation") { Ok(Kind::SnapshotCreation) + } else if kind.eq_ignore_ascii_case("export") { + Ok(Kind::Export) } else if kind.eq_ignore_ascii_case("upgradeDatabase") { Ok(Kind::UpgradeDatabase) } else { From 72192994363c8fc4060014eecb1905dd88cb979f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Fri, 27 Jun 2025 12:23:55 +0200 Subject: [PATCH 022/101] Better handle task abortion --- 
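The diff below moves the stop check inside the retried closure, so an abort interrupts the exponential backoff loop between attempts instead of being checked only once up front. A minimal standalone sketch of the pattern, with stand-in names (`must_stop`, `send`) rather than the real scheduler and ureq types:

    use std::sync::atomic::{AtomicBool, Ordering};
    use std::time::Duration;

    // Retry `send` with doubling delays, but bail out between attempts as
    // soon as `must_stop` is raised; the caller maps the bail-out to an abort
    // error, much as the patch maps the u16::MAX sentinel to Error::AbortedTask.
    fn retry_unless_aborted(
        must_stop: &AtomicBool,
        send: impl Fn() -> Result<String, String>,
    ) -> Result<String, String> {
        let mut delay = Duration::from_millis(100);
        for _ in 0..4 {
            if must_stop.load(Ordering::Relaxed) {
                return Err("task aborted".to_string());
            }
            match send() {
                Ok(response) => return Ok(response),
                Err(_) => std::thread::sleep(delay),
            }
            delay *= 2; // grow the delay, roughly as ExponentialBackoff::default() does
        }
        send() // one last attempt once the retry budget is exhausted
    }
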
.../src/scheduler/process_export.rs | 22 +++++++++++++------ 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index e777809fd..57f79c83f 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -16,7 +16,7 @@ use meilisearch_types::milli::{self, obkv_to_json, Filter, InternalError}; use meilisearch_types::settings::{self, SecretPolicy}; use meilisearch_types::tasks::ExportIndexSettings; use serde::Deserialize; -use ureq::{json, Agent}; +use ureq::{json, Response}; use super::MustStopProcessing; use crate::processing::AtomicDocumentStep; @@ -45,7 +45,7 @@ impl IndexScheduler { }) .collect(); - let agent: Agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); + let agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); let must_stop_processing = self.scheduler.must_stop_processing.clone(); for (i, (uid, settings)) in indexes.iter().enumerate() { if must_stop_processing.get() { @@ -272,11 +272,16 @@ fn retry<F>(must_stop_processing: &MustStopProcessing, send_request: F) -> Resul where F: Fn() -> Result<ureq::Response, backoff::Error<ureq::Error>>, { - if must_stop_processing.get() { - return Err(Error::AbortedTask); - } - - match backoff::retry(ExponentialBackoff::default(), send_request) { + match backoff::retry(ExponentialBackoff::default(), || { + if must_stop_processing.get() { + return Err(backoff::Error::Permanent(ureq::Error::Status( + u16::MAX, + // 444: Connection Closed Without Response + Response::new(444, "Abort", "Aborted task").unwrap(), + ))); + } + send_request() + }) { Ok(response) => Ok(response), Err(backoff::Error::Permanent(e)) => Err(ureq_error_into_error(e)), Err(backoff::Error::Transient { err, retry_after: _ }) => Err(ureq_error_into_error(err)), @@ -306,6 +311,9 @@ fn ureq_error_into_error(error: ureq::Error) -> Error { } match error { + // This is a workaround to handle task abortion - the error propagation path + // makes it difficult to cleanly surface the abortion at this level. + ureq::Error::Status(u16::MAX, _) => Error::AbortedTask, ureq::Error::Status(_, response) => match response.into_json() { Ok(MeiliError { message, code, r#type, link }) => { Error::FromRemoteWhenExporting { message, code, r#type, link } From 85037352b95d947151692307c1f00371fed134a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 30 Jun 2025 18:31:32 +0200 Subject: [PATCH 023/101] Fix most of the easy issues --- crates/index-scheduler/src/processing.rs | 4 ++-- .../src/scheduler/create_batch.rs | 6 ++--- .../src/scheduler/process_export.rs | 5 ++-- crates/index-scheduler/src/utils.rs | 2 +- crates/meilisearch-types/src/task_view.rs | 23 ++++++++++++++++++- crates/meilisearch/src/routes/export.rs | 15 ++++++------ 6 files changed, 39 insertions(+), 16 deletions(-) diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 5d4ac11c3..631719f73 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -178,8 +178,8 @@ make_enum_progress! { make_enum_progress! 
{ pub enum Export { EnsuringCorrectnessOfTheTarget, - ExportTheSettings, - ExportTheDocuments, + ExporingTheSettings, + ExporingTheDocuments, } } diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index 7a6fa4a9b..b08d27d48 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -510,9 +510,9 @@ impl IndexScheduler { // 3. we batch the export. let to_export = self.queue.tasks.get_kind(rtxn, Kind::Export)? & enqueued; if !to_export.is_empty() { - let mut tasks = self.queue.tasks.get_existing_tasks(rtxn, to_export)?; - current_batch.processing(&mut tasks); - let task = tasks.pop().expect("There must be only one export task"); + let task_id = to_export.iter().next().expect("There must be only one export task"); + let mut task = self.queue.tasks.get_task(rtxn, task_id)?.unwrap(); + current_batch.processing([&mut task]); current_batch.reason(BatchStopReason::TaskKindCannotBeBatched { kind: Kind::Export }); return Ok(Some((Batch::Export { task }, current_batch))); } diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 57f79c83f..b81ff0b96 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -86,10 +86,11 @@ impl IndexScheduler { } // Retry logic for sending settings let url = format!("{base_url}/indexes/{uid}/settings"); + let bearer = api_key.map(|api_key| format!("Bearer {api_key}")); retry(&must_stop_processing, || { let mut request = agent.patch(&url); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); + if let Some(bearer) = bearer.as_ref() { + request = request.set("Authorization", bearer); } request.send_json(settings.clone()).map_err(into_backoff_error) })?; diff --git a/crates/index-scheduler/src/utils.rs b/crates/index-scheduler/src/utils.rs index 594023145..2cfe63bff 100644 --- a/crates/index-scheduler/src/utils.rs +++ b/crates/index-scheduler/src/utils.rs @@ -273,7 +273,7 @@ pub fn swap_index_uid_in_task(task: &mut Task, swap: (&str, &str)) { K::TaskCancelation { .. } | K::TaskDeletion { .. } | K::DumpCreation { .. } - | K::Export { .. } // TODO I have patterns, not index uids + | K::Export { .. } | K::UpgradeDatabase { .. } | K::SnapshotCreation => (), }; diff --git a/crates/meilisearch-types/src/task_view.rs b/crates/meilisearch-types/src/task_view.rs index 1dbd5637b..7521137c0 100644 --- a/crates/meilisearch-types/src/task_view.rs +++ b/crates/meilisearch-types/src/task_view.rs @@ -371,7 +371,10 @@ impl From
for DetailsView { } Details::Export { url, api_key, payload_size, indexes } => DetailsView { url: Some(url), - api_key, + api_key: api_key.map(|mut api_key| { + hide_secret(&mut api_key); + api_key + }), payload_size: payload_size .map(|ps| ps.get_appropriate_unit(UnitType::Both).to_string()), indexes: Some( @@ -390,3 +393,21 @@ impl From
for DetailsView { } } } + +// We definitely need to factorize the code to hide the secret key +fn hide_secret(secret: &mut String) { + match secret.len() { + x if x < 10 => { + secret.replace_range(.., "XXX..."); + } + x if x < 20 => { + secret.replace_range(2.., "XXXX..."); + } + x if x < 30 => { + secret.replace_range(3.., "XXXXX..."); + } + _x => { + secret.replace_range(5.., "XXXXXX..."); + } + } +} diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 21a77ae32..1df2d271e 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -42,17 +42,18 @@ pub fn configure(cfg: &mut web::ServiceConfig) { } #[utoipa::path( - get, + post, path = "", tag = "Export", security(("Bearer" = ["export", "*"])), responses( (status = OK, description = "Known nodes are returned", body = Export, content_type = "application/json", example = json!( - { - "indexes": ["movie", "steam-*"], - "skip_embeddings": true, - "apiKey": "meilisearch-api-key" - })), + { + "taskUid": 1, + "status": "enqueued", + "type": "export", + "enqueuedAt": "2021-08-11T09:25:53.000000Z" + })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { "message": "The Authorization header is missing. It must use the bearer authorization method.", @@ -126,7 +127,7 @@ pub struct Export { #[serde(default)] #[deserr(default, error = DeserrJsonError)] pub payload_size: Option, - #[schema(value_type = Option>, example = json!(["movies", "steam-*"]))] + #[schema(value_type = Option>, example = json!({ "*": { "filter": null } }))] #[deserr(default)] #[serde(default)] pub indexes: BTreeMap, From ad03c86c4493cb1dec38897983bd0a4d6ec21631 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 30 Jun 2025 18:46:47 +0200 Subject: [PATCH 024/101] Display an accurate number of uploaded documents --- .../src/scheduler/process_batch.rs | 10 +++++---- .../src/scheduler/process_export.rs | 21 +++++++++++++------ crates/meilisearch-types/src/tasks.rs | 8 +++---- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_batch.rs b/crates/index-scheduler/src/scheduler/process_batch.rs index e56b8e13a..090ff844d 100644 --- a/crates/index-scheduler/src/scheduler/process_batch.rs +++ b/crates/index-scheduler/src/scheduler/process_batch.rs @@ -377,9 +377,8 @@ impl IndexScheduler { ) })); - match ret { - // TODO return the matched and exported documents - Ok(Ok(())) => (), + let stats = match ret { + Ok(Ok(stats)) => stats, Ok(Err(Error::AbortedTask)) => return Err(Error::AbortedTask), Ok(Err(e)) => return Err(Error::Export(Box::new(e))), Err(e) => { @@ -394,9 +393,12 @@ impl IndexScheduler { msg.to_string(), )))); } - } + }; task.status = Status::Succeeded; + if let Some(Details::Export { indexes, .. 
}) = task.details.as_mut() { + *indexes = stats; + } Ok((vec![task], ProcessBatchInfo::default())) } Batch::UpgradeDatabase { mut tasks } => { diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index b81ff0b96..bf2917b73 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -14,7 +14,7 @@ use meilisearch_types::milli::update::{request_threads, Setting}; use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use meilisearch_types::milli::{self, obkv_to_json, Filter, InternalError}; use meilisearch_types::settings::{self, SecretPolicy}; -use meilisearch_types::tasks::ExportIndexSettings; +use meilisearch_types::tasks::{DetailsExportIndexSettings, ExportIndexSettings}; use serde::Deserialize; use ureq::{json, Response}; @@ -30,7 +30,7 @@ impl IndexScheduler { payload_size: Option<&Byte>, indexes: &BTreeMap, progress: Progress, - ) -> Result<()> { + ) -> Result> { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; @@ -41,13 +41,14 @@ impl IndexScheduler { indexes .iter() .find(|(pattern, _)| pattern.matches_str(&uid)) - .map(|(_pattern, settings)| (uid, settings)) + .map(|(pattern, settings)| (pattern, uid, settings)) }) .collect(); + let mut output = BTreeMap::new(); let agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); let must_stop_processing = self.scheduler.must_stop_processing.clone(); - for (i, (uid, settings)) in indexes.iter().enumerate() { + for (i, (pattern, uid, export_settings)) in indexes.iter().enumerate() { if must_stop_processing.get() { return Err(Error::AbortedTask); } @@ -58,7 +59,7 @@ impl IndexScheduler { indexes.len() as u32, )); - let ExportIndexSettings { filter } = settings; + let ExportIndexSettings { filter } = export_settings; let index = self.index(uid)?; let index_rtxn = index.read_txn()?; @@ -125,6 +126,14 @@ impl IndexScheduler { let (step, progress_step) = AtomicDocumentStep::new(total_documents); progress.update_progress(progress_step); + output.insert( + (*pattern).clone(), + DetailsExportIndexSettings { + settings: (*export_settings).clone(), + matched_documents: Some(total_documents as u64), + }, + ); + let limit = payload_size.map(|ps| ps.as_u64() as usize).unwrap_or(50 * 1024 * 1024); // defaults to 50 MiB let documents_url = format!("{base_url}/indexes/{uid}/documents"); @@ -265,7 +274,7 @@ impl IndexScheduler { step.store(total_documents, atomic::Ordering::Relaxed); } - Ok(()) + Ok(output) } } diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index a6ed593db..cdbf6d3aa 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -707,16 +707,14 @@ pub enum Details { #[schema(rename_all = "camelCase")] pub struct DetailsExportIndexSettings { #[serde(flatten)] - settings: ExportIndexSettings, + pub settings: ExportIndexSettings, #[serde(skip_serializing_if = "Option::is_none")] - matched_documents: Option, - #[serde(skip_serializing_if = "Option::is_none")] - exported_documents: Option, + pub matched_documents: Option, } impl From for DetailsExportIndexSettings { fn from(settings: ExportIndexSettings) -> Self { - DetailsExportIndexSettings { settings, matched_documents: None, exported_documents: None } + DetailsExportIndexSettings { settings, matched_documents: None } } } From f4bb6cbca894e690e9789a7945cbf1f4f2d5d800 Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 30 Jun 2025 18:59:16 +0200 Subject: [PATCH 025/101] Better behavior when `indexes` is null --- crates/meilisearch-types/src/tasks.rs | 2 +- crates/meilisearch/src/routes/export.rs | 14 ++++++------ crates/meilisearch/src/routes/export_analytics.rs | 7 ++++--- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index cdbf6d3aa..0618fa333 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -210,7 +210,7 @@ impl KindWithContent { | SnapshotCreation | TaskCancelation { .. } | TaskDeletion { .. } - | Export { .. } // TODO Should I resolve the index names? + | Export { .. } | UpgradeDatabase { .. } => vec![], DocumentAdditionOrUpdate { index_uid, .. } | DocumentEdition { index_uid, .. } diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 1df2d271e..31f8812c7 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -81,15 +81,17 @@ async fn export( let Export { url, api_key, payload_size, indexes } = export; - let indexes = if indexes.is_empty() { - BTreeMap::from([(IndexUidPattern::new_unchecked("*"), DbExportIndexSettings::default())]) - } else { - indexes + let indexes = match indexes { + Some(indexes) => indexes .into_iter() .map(|(pattern, ExportIndexSettings { filter })| { (pattern, DbExportIndexSettings { filter }) }) - .collect() + .collect(), + None => BTreeMap::from([( + IndexUidPattern::new_unchecked("*"), + DbExportIndexSettings::default(), + )]), }; let task = KindWithContent::Export { @@ -130,7 +132,7 @@ pub struct Export { #[schema(value_type = Option>, example = json!({ "*": { "filter": null } }))] #[deserr(default)] #[serde(default)] - pub indexes: BTreeMap<IndexUidPattern, ExportIndexSettings>, + pub indexes: Option<BTreeMap<IndexUidPattern, ExportIndexSettings>>, } /// A wrapper around the `Byte` type that implements `Deserr`. 
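The hunk above makes an omitted `indexes` map behave as a catch-all. A minimal sketch of that fallback rule, with simplified stand-in types in place of the real `IndexUidPattern` and `DbExportIndexSettings`:

    use std::collections::BTreeMap;

    type Pattern = String; // stand-in for IndexUidPattern
    type Settings = Option<String>; // stand-in for DbExportIndexSettings (just the filter)

    fn resolve(indexes: Option<BTreeMap<Pattern, Settings>>) -> BTreeMap<Pattern, Settings> {
        // An omitted `indexes` field falls back to the catch-all "*" pattern,
        // so a bare `POST /export {"url": ...}` exports every index with defaults.
        indexes.unwrap_or_else(|| BTreeMap::from([("*".to_string(), None)]))
    }
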
diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs index 44dba2c9b..7ac713e9b 100644 --- a/crates/meilisearch/src/routes/export_analytics.rs +++ b/crates/meilisearch/src/routes/export_analytics.rs @@ -15,9 +15,10 @@ impl ExportAnalytics { let Export { url: _, api_key, payload_size, indexes } = export; let has_api_key = api_key.is_some(); - let index_patterns_count = indexes.len(); - let patterns_with_filter_count = - indexes.values().filter(|settings| settings.filter.is_some()).count(); + let index_patterns_count = indexes.as_ref().map_or(0, |indexes| indexes.len()); + let patterns_with_filter_count = indexes.as_ref().map_or(0, |indexes| { + indexes.values().filter(|settings| settings.filter.is_some()).count() + }); let payload_sizes = if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size { vec![byte_size.as_u64()] From efd5fd96ccc63a886005b0d42e79cd9a5aaa13f9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 11:02:42 +0200 Subject: [PATCH 026/101] Add the overrideSettings parameter --- .../src/scheduler/process_export.rs | 83 +++++++++++++------ crates/meilisearch-types/src/error.rs | 1 + crates/meilisearch-types/src/tasks.rs | 1 + crates/meilisearch/src/routes/export.rs | 8 +- 4 files changed, 65 insertions(+), 28 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index bf2917b73..19b2bf743 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -59,42 +59,73 @@ impl IndexScheduler { indexes.len() as u32, )); - let ExportIndexSettings { filter } = export_settings; + let ExportIndexSettings { filter, override_settings } = export_settings; let index = self.index(uid)?; let index_rtxn = index.read_txn()?; - // Send the primary key + let url = format!("{base_url}/indexes/{uid}"); + + // First, check if the index already exists + let response = retry(&must_stop_processing, || { + let mut request = agent.get(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + + request.send_string("").map_err(into_backoff_error) + })?; + let already_existed = response.status() == 200; + let primary_key = index .primary_key(&index_rtxn) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; - let url = format!("{base_url}/indexes"); - retry(&must_stop_processing, || { - let mut request = agent.post(&url); - if let Some(api_key) = api_key { - request = request.set("Authorization", &format!("Bearer {api_key}")); - } - let index_param = json!({ "uid": uid, "primaryKey": primary_key }); - request.send_json(&index_param).map_err(into_backoff_error) - })?; + // Create the index + if !already_existed { + let url = format!("{base_url}/indexes"); + retry(&must_stop_processing, || { + let mut request = agent.post(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + let index_param = json!({ "uid": uid, "primaryKey": primary_key }); + request.send_json(&index_param).map_err(into_backoff_error) + })?; + } + + // Patch the index primary key + if already_existed && *override_settings { + let url = format!("{base_url}/indexes/{uid}"); + retry(&must_stop_processing, || { + let mut request = agent.patch(&url); + if let Some(api_key) = api_key { + request = request.set("Authorization", &format!("Bearer {api_key}")); + } + let 
index_param = json!({ "primaryKey": primary_key }); + request.send_json(&index_param).map_err(into_backoff_error) + })?; + } // Send the index settings - let mut settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - // Remove the experimental chat setting if not enabled - if self.features().check_chat_completions("exporting chat settings").is_err() { - settings.chat = Setting::NotSet; - } - // Retry logic for sending settings - let url = format!("{base_url}/indexes/{uid}/settings"); - let bearer = api_key.map(|api_key| format!("Bearer {api_key}")); - retry(&must_stop_processing, || { - let mut request = agent.patch(&url); - if let Some(bearer) = bearer.as_ref() { - request = request.set("Authorization", bearer); + if !already_existed || *override_settings { + let mut settings = + settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) + .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; + // Remove the experimental chat setting if not enabled + if self.features().check_chat_completions("exporting chat settings").is_err() { + settings.chat = Setting::NotSet; } - request.send_json(settings.clone()).map_err(into_backoff_error) - })?; + // Retry logic for sending settings + let url = format!("{base_url}/indexes/{uid}/settings"); + let bearer = api_key.map(|api_key| format!("Bearer {api_key}")); + retry(&must_stop_processing, || { + let mut request = agent.patch(&url); + if let Some(bearer) = bearer.as_ref() { + request = request.set("Authorization", bearer); + } + request.send_json(settings.clone()).map_err(into_backoff_error) + })?; + } let filter = filter .as_ref() diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 1c2840084..30f6868f6 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -395,6 +395,7 @@ InvalidExportApiKey , InvalidRequest , BAD_REQU InvalidExportPayloadSize , InvalidRequest , BAD_REQUEST ; InvalidExportIndexesPatterns , InvalidRequest , BAD_REQUEST ; InvalidExportIndexFilter , InvalidRequest , BAD_REQUEST ; +InvalidExportIndexOverrideSettings , InvalidRequest , BAD_REQUEST ; // Experimental features - Chat Completions UnimplementedExternalFunctionCalling , InvalidRequest , NOT_IMPLEMENTED ; UnimplementedNonStreamingChatCompletions , InvalidRequest , NOT_IMPLEMENTED ; diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 0618fa333..99b04f1e3 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -178,6 +178,7 @@ pub struct IndexSwap { #[serde(rename_all = "camelCase")] pub struct ExportIndexSettings { pub filter: Option, + pub override_settings: bool, } impl KindWithContent { diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 31f8812c7..172a162c6 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -84,8 +84,8 @@ async fn export( let indexes = match indexes { Some(indexes) => indexes .into_iter() - .map(|(pattern, ExportIndexSettings { filter })| { - (pattern, DbExportIndexSettings { filter }) + .map(|(pattern, ExportIndexSettings { filter, override_settings })| { + (pattern, DbExportIndexSettings { filter, override_settings }) }) .collect(), None => BTreeMap::from([( @@ -179,4 +179,8 @@ pub struct ExportIndexSettings { #[serde(default)] #[deserr(default, error = DeserrJsonError)] pub filter: 
Option, + #[schema(value_type = Option<bool>, example = json!(true))] + #[serde(default)] + #[deserr(default, error = DeserrJsonError<InvalidExportIndexOverrideSettings>)] + pub override_settings: bool, } From 9cfbef478eb80258b1698c75abe80b5a0f92b85b Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 11:04:59 +0200 Subject: [PATCH 027/101] Add override settings to analytics --- crates/meilisearch/src/routes/export_analytics.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/crates/meilisearch/src/routes/export_analytics.rs b/crates/meilisearch/src/routes/export_analytics.rs index 7ac713e9b..b66a5133b 100644 --- a/crates/meilisearch/src/routes/export_analytics.rs +++ b/crates/meilisearch/src/routes/export_analytics.rs @@ -7,6 +7,7 @@ pub struct ExportAnalytics { has_api_key: bool, sum_index_patterns: usize, sum_patterns_with_filter: usize, + sum_patterns_with_override_settings: usize, payload_sizes: Vec<u64>, } @@ -19,6 +20,9 @@ impl ExportAnalytics { let patterns_with_filter_count = indexes.as_ref().map_or(0, |indexes| { indexes.values().filter(|settings| settings.filter.is_some()).count() }); + let patterns_with_override_settings_count = indexes.as_ref().map_or(0, |indexes| { + indexes.values().filter(|settings| settings.override_settings).count() + }); let payload_sizes = if let Some(crate::routes::export::ByteWithDeserr(byte_size)) = payload_size { vec![byte_size.as_u64()] @@ -31,6 +35,7 @@ impl ExportAnalytics { has_api_key, sum_index_patterns: index_patterns_count, sum_patterns_with_filter: patterns_with_filter_count, + sum_patterns_with_override_settings: patterns_with_override_settings_count, payload_sizes, } } @@ -46,6 +51,7 @@ impl Aggregate for ExportAnalytics { self.has_api_key |= other.has_api_key; self.sum_index_patterns += other.sum_index_patterns; self.sum_patterns_with_filter += other.sum_patterns_with_filter; + self.sum_patterns_with_override_settings += other.sum_patterns_with_override_settings; self.payload_sizes.extend(other.payload_sizes); self } @@ -69,11 +75,18 @@ impl Aggregate for ExportAnalytics { Some(self.sum_patterns_with_filter as f64 / self.total_received as f64) }; + let avg_patterns_with_override_settings = if self.total_received == 0 { + None + } else { + Some(self.sum_patterns_with_override_settings as f64 / self.total_received as f64) + }; + serde_json::json!({ "total_received": self.total_received, "has_api_key": self.has_api_key, "avg_index_patterns": avg_index_patterns, "avg_patterns_with_filter": avg_patterns_with_filter, + "avg_patterns_with_override_settings": avg_patterns_with_override_settings, "avg_payload_size": avg_payload_size, }) } From 259fc067d33ff78593ae3b842ea2aabd169f7ac5 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 11:14:59 +0200 Subject: [PATCH 028/101] Count exported documents by index name, not pattern --- .../src/scheduler/process_export.rs | 9 ++++----- crates/meilisearch-types/src/tasks.rs | 14 +++++++------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 19b2bf743..d1f5616b7 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -30,7 +30,7 @@ impl IndexScheduler { payload_size: Option<&Byte>, indexes: &BTreeMap<IndexUidPattern, ExportIndexSettings>, progress: Progress, - ) -> Result<BTreeMap<IndexUidPattern, DetailsExportIndexSettings>> { + ) -> Result<BTreeMap<String, DetailsExportIndexSettings>> { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; @@ -48,7 +48,7 @@ impl IndexScheduler { let mut output = BTreeMap::new(); 
let agent = ureq::AgentBuilder::new().timeout(Duration::from_secs(5)).build(); let must_stop_processing = self.scheduler.must_stop_processing.clone(); - for (i, (pattern, uid, export_settings)) in indexes.iter().enumerate() { + for (i, (_pattern, uid, export_settings)) in indexes.iter().enumerate() { if must_stop_processing.get() { return Err(Error::AbortedTask); } @@ -63,9 +63,8 @@ impl IndexScheduler { let index = self.index(uid)?; let index_rtxn = index.read_txn()?; - let url = format!("{base_url}/indexes/{uid}"); - // First, check if the index already exists + let url = format!("{base_url}/indexes/{uid}"); let response = retry(&must_stop_processing, || { let mut request = agent.get(&url); if let Some(api_key) = api_key { @@ -158,7 +157,7 @@ impl IndexScheduler { progress.update_progress(progress_step); output.insert( - (*pattern).clone(), + uid.clone(), DetailsExportIndexSettings { settings: (*export_settings).clone(), matched_documents: Some(total_documents as u64), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 99b04f1e3..423cf539e 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -289,12 +289,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: BTreeMap::new(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -363,12 +363,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: BTreeMap::new(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -419,12 +419,12 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes } => { + KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), + indexes: BTreeMap::new(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -696,7 +696,7 @@ pub enum Details { url: String, api_key: Option, payload_size: Option, - indexes: BTreeMap, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), From d439a3cb9d05f6b69a41a7a1fd4370c0cd1ce128 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:39:24 +0200 Subject: [PATCH 029/101] Fix progress names --- crates/index-scheduler/src/processing.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 631719f73..2aa7cf859 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -178,8 +178,8 @@ make_enum_progress! { make_enum_progress! { pub enum Export { EnsuringCorrectnessOfTheTarget, - ExporingTheSettings, - ExporingTheDocuments, + ExportingTheSettings, + ExportingTheDocuments, } } From 074d509d9280cdc277b80950dec111737126c375 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:39:52 +0200 Subject: [PATCH 030/101] Fix expect message --- crates/index-scheduler/src/scheduler/create_batch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/scheduler/create_batch.rs b/crates/index-scheduler/src/scheduler/create_batch.rs index b08d27d48..693275c32 100644 --- a/crates/index-scheduler/src/scheduler/create_batch.rs +++ b/crates/index-scheduler/src/scheduler/create_batch.rs @@ -510,7 +510,7 @@ impl IndexScheduler { // 3. we batch the export. let to_export = self.queue.tasks.get_kind(rtxn, Kind::Export)? 
& enqueued; if !to_export.is_empty() { - let task_id = to_export.iter().next().expect("There must be only one export task"); + let task_id = to_export.iter().next().expect("There must be at least one export task"); let mut task = self.queue.tasks.get_task(rtxn, task_id)?.unwrap(); current_batch.processing([&mut task]); current_batch.reason(BatchStopReason::TaskKindCannotBeBatched { kind: Kind::Export }); From 9dac91efe056d17eeabe18aaafdd1da401b44416 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:40:39 +0200 Subject: [PATCH 031/101] Fix utoipa response --- crates/meilisearch/src/routes/export.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 172a162c6..97356f7eb 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -47,7 +47,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { tag = "Export", security(("Bearer" = ["export", "*"])), responses( - (status = OK, description = "Known nodes are returned", body = Export, content_type = "application/json", example = json!( + (status = 202, description = "Export successfully enqueued", body = SummarizedTaskView, content_type = "application/json", example = json!( { "taskUid": 1, "status": "enqueued", From c078efd730ffec4a4f2d9670437287d080269ca9 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:40:59 +0200 Subject: [PATCH 032/101] Remove experimental todo --- crates/meilisearch/src/routes/export.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/meilisearch/src/routes/export.rs b/crates/meilisearch/src/routes/export.rs index 97356f7eb..a4b6720d1 100644 --- a/crates/meilisearch/src/routes/export.rs +++ b/crates/meilisearch/src/routes/export.rs @@ -71,9 +71,6 @@ async fn export( opt: web::Data, analytics: Data, ) -> Result { - // TODO make it experimental? 
- // index_scheduler.features().check_network("Using the /network route")?; - let export = export.into_inner(); debug!(returns = ?export, "Trigger export"); From 25c19a306b1fa4967b013066c693012293347272 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:42:44 +0200 Subject: [PATCH 033/101] Rename variable Co-authored-by: Kero --- crates/index-scheduler/src/scheduler/process_export.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index d1f5616b7..b5134deb9 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -71,16 +71,16 @@ impl IndexScheduler { request = request.set("Authorization", &format!("Bearer {api_key}")); } - request.send_string("").map_err(into_backoff_error) + request.send_bytes(Default::default()).map_err(into_backoff_error) })?; - let already_existed = response.status() == 200; + let index_exists = response.status() == 200; let primary_key = index .primary_key(&index_rtxn) .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?; // Create the index - if !already_existed { + if !index_exists { let url = format!("{base_url}/indexes"); retry(&must_stop_processing, || { let mut request = agent.post(&url); @@ -93,7 +93,7 @@ impl IndexScheduler { } // Patch the index primary key - if already_existed && *override_settings { + if index_exists && *override_settings { let url = format!("{base_url}/indexes/{uid}"); retry(&must_stop_processing, || { let mut request = agent.patch(&url); @@ -106,7 +106,7 @@ impl IndexScheduler { } // Send the index settings - if !already_existed || *override_settings { + if !index_exists || *override_settings { let mut settings = settings::settings(&index, &index_rtxn, SecretPolicy::RevealSecrets) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; From 37a692f942253c128980e31e6d3be75b94a12a0e Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 14:47:43 +0200 Subject: [PATCH 034/101] Keep `IndexUidPattern` --- .../src/scheduler/process_export.rs | 4 ++-- crates/meilisearch-types/src/tasks.rs | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index b5134deb9..eaad7aa34 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -30,7 +30,7 @@ impl IndexScheduler { payload_size: Option<&Byte>, indexes: &BTreeMap, progress: Progress, - ) -> Result> { + ) -> Result> { #[cfg(test)] self.maybe_fail(crate::test_utils::FailureLocation::ProcessExport)?; @@ -157,7 +157,7 @@ impl IndexScheduler { progress.update_progress(progress_step); output.insert( - uid.clone(), + IndexUidPattern::new_unchecked(uid.clone()), DetailsExportIndexSettings { settings: (*export_settings).clone(), matched_documents: Some(total_documents as u64), diff --git a/crates/meilisearch-types/src/tasks.rs b/crates/meilisearch-types/src/tasks.rs index 423cf539e..99b04f1e3 100644 --- a/crates/meilisearch-types/src/tasks.rs +++ b/crates/meilisearch-types/src/tasks.rs @@ -289,12 +289,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. 
} => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { + KindWithContent::Export { url, api_key, payload_size, indexes } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: BTreeMap::new(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -363,12 +363,12 @@ impl KindWithContent { }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { + KindWithContent::Export { url, api_key, payload_size, indexes } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: BTreeMap::new(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -419,12 +419,12 @@ impl From<&KindWithContent> for Option
{ }), KindWithContent::DumpCreation { .. } => Some(Details::Dump { dump_uid: None }), KindWithContent::SnapshotCreation => None, - KindWithContent::Export { url, api_key, payload_size, indexes: _ } => { + KindWithContent::Export { url, api_key, payload_size, indexes } => { Some(Details::Export { url: url.clone(), api_key: api_key.clone(), payload_size: *payload_size, - indexes: BTreeMap::new(), + indexes: indexes.iter().map(|(p, s)| (p.clone(), s.clone().into())).collect(), }) } KindWithContent::UpgradeDatabase { from } => Some(Details::UpgradeDatabase { @@ -696,7 +696,7 @@ pub enum Details { url: String, api_key: Option, payload_size: Option, - indexes: BTreeMap, + indexes: BTreeMap, }, UpgradeDatabase { from: (u32, u32, u32), From b7bebe9bbb33b4ba87408362068f732281f609ea Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 15:03:04 +0200 Subject: [PATCH 035/101] Fix export when index already exists --- crates/index-scheduler/src/scheduler/process_export.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index eaad7aa34..676481319 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -72,8 +72,12 @@ impl IndexScheduler { } request.send_bytes(Default::default()).map_err(into_backoff_error) - })?; - let index_exists = response.status() == 200; + }); + let index_exists = match response { + Ok(response) => response.status() == 200, + Err(Error::FromRemoteWhenExporting { code, .. }) if code == "index_not_found" => false, + Err(e) => return Err(e), + }; let primary_key = index .primary_key(&index_rtxn) From 9211e94c4f019a890175a109b1ce78a43c10bb5f Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 15:03:20 +0200 Subject: [PATCH 036/101] Format --- crates/index-scheduler/src/scheduler/process_export.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 676481319..30721065e 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -75,7 +75,9 @@ impl IndexScheduler { }); let index_exists = match response { Ok(response) => response.status() == 200, - Err(Error::FromRemoteWhenExporting { code, .. }) if code == "index_not_found" => false, + Err(Error::FromRemoteWhenExporting { code, .. 
}) if code == "index_not_found" => { + false + } Err(e) => return Err(e), }; From d2776efb11f85f1df9501eb6079d98aa4013ba29 Mon Sep 17 00:00:00 2001 From: Mubelotix Date: Tue, 1 Jul 2025 15:14:56 +0200 Subject: [PATCH 037/101] Fix flaky last_error test --- crates/meilisearch/tests/vector/rest.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 6e781e525..87296c36a 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -2183,6 +2183,7 @@ async fn last_error_stats() { snapshot!(json_string!(response["results"][0], { ".progress" => "[ignored]", ".stats.embedderRequests.total" => "[ignored]", + ".stats.embedderRequests.failed" => "[ignored]", ".startedAt" => "[ignored]" }), @r#" { @@ -2205,7 +2206,7 @@ async fn last_error_stats() { }, "embedderRequests": { "total": "[ignored]", - "failed": 5, + "failed": "[ignored]", "lastError": "runtime error: received internal error HTTP 500 from embedding server\n - server replied with `Service Unavailable`" } }, From c2d5b20a424a2b34fa19a14fc7654464c2c37e95 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 1 Jul 2025 17:23:08 +0000 Subject: [PATCH 038/101] Bump Swatinem/rust-cache from 2.7.8 to 2.8.0 Bumps [Swatinem/rust-cache](https://github.com/swatinem/rust-cache) from 2.7.8 to 2.8.0. - [Release notes](https://github.com/swatinem/rust-cache/releases) - [Changelog](https://github.com/Swatinem/rust-cache/blob/master/CHANGELOG.md) - [Commits](https://github.com/swatinem/rust-cache/compare/v2.7.8...v2.8.0) --- updated-dependencies: - dependency-name: Swatinem/rust-cache dependency-version: 2.8.0 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/test-suite.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test-suite.yml b/.github/workflows/test-suite.yml index 6cf8bfa0f..2924a07bc 100644 --- a/.github/workflows/test-suite.yml +++ b/.github/workflows/test-suite.yml @@ -29,7 +29,7 @@ jobs: - name: Setup test with Rust stable uses: dtolnay/rust-toolchain@1.85 - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.8 + uses: Swatinem/rust-cache@v2.8.0 - name: Run cargo check without any default features uses: actions-rs/cargo@v1 with: @@ -51,7 +51,7 @@ jobs: steps: - uses: actions/checkout@v3 - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.8 + uses: Swatinem/rust-cache@v2.8.0 - uses: dtolnay/rust-toolchain@1.85 - name: Run cargo check without any default features uses: actions-rs/cargo@v1 @@ -155,7 +155,7 @@ jobs: apt-get install build-essential -y - uses: dtolnay/rust-toolchain@1.85 - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.8 + uses: Swatinem/rust-cache@v2.8.0 - name: Run tests in debug uses: actions-rs/cargo@v1 with: @@ -172,7 +172,7 @@ jobs: profile: minimal components: clippy - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.8 + uses: Swatinem/rust-cache@v2.8.0 - name: Run cargo clippy uses: actions-rs/cargo@v1 with: @@ -191,7 +191,7 @@ jobs: override: true components: rustfmt - name: Cache dependencies - uses: Swatinem/rust-cache@v2.7.8 + uses: Swatinem/rust-cache@v2.8.0 - name: Run cargo fmt # Since we never ran the `build.rs` script in the benchmark directory we are missing one auto-generated import file. 
# Since we want to trigger (and fail) this action as fast as possible, instead of building the benchmark crate From 879cf850373b8e4defddbcd81b10f8d7d7bb7542 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 1 Jul 2025 17:23:13 +0000 Subject: [PATCH 039/101] Bump svenstaro/upload-release-action from 2.7.0 to 2.11.1 Bumps [svenstaro/upload-release-action](https://github.com/svenstaro/upload-release-action) from 2.7.0 to 2.11.1. - [Release notes](https://github.com/svenstaro/upload-release-action/releases) - [Changelog](https://github.com/svenstaro/upload-release-action/blob/master/CHANGELOG.md) - [Commits](https://github.com/svenstaro/upload-release-action/compare/2.7.0...2.11.1) --- updated-dependencies: - dependency-name: svenstaro/upload-release-action dependency-version: 2.11.1 dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- .github/workflows/publish-apt-brew-pkg.yml | 2 +- .github/workflows/publish-binaries.yml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/publish-apt-brew-pkg.yml b/.github/workflows/publish-apt-brew-pkg.yml index e6adfca57..5b6994dcf 100644 --- a/.github/workflows/publish-apt-brew-pkg.yml +++ b/.github/workflows/publish-apt-brew-pkg.yml @@ -32,7 +32,7 @@ jobs: - name: Build deb package run: cargo deb -p meilisearch -o target/debian/meilisearch.deb - name: Upload debian pkg to release - uses: svenstaro/upload-release-action@2.7.0 + uses: svenstaro/upload-release-action@2.11.1 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/debian/meilisearch.deb diff --git a/.github/workflows/publish-binaries.yml b/.github/workflows/publish-binaries.yml index 885a04d0d..3200e778e 100644 --- a/.github/workflows/publish-binaries.yml +++ b/.github/workflows/publish-binaries.yml @@ -51,7 +51,7 @@ jobs: # No need to upload binaries for dry run (cron) - name: Upload binaries to release if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.7.0 + uses: svenstaro/upload-release-action@2.11.1 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/release/meilisearch @@ -81,7 +81,7 @@ jobs: # No need to upload binaries for dry run (cron) - name: Upload binaries to release if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.7.0 + uses: svenstaro/upload-release-action@2.11.1 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/release/${{ matrix.artifact_name }} @@ -113,7 +113,7 @@ jobs: - name: Upload the binary to release # No need to upload binaries for dry run (cron) if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.7.0 + uses: svenstaro/upload-release-action@2.11.1 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/${{ matrix.target }}/release/meilisearch @@ -178,7 +178,7 @@ jobs: - name: Upload the binary to release # No need to upload binaries for dry run (cron) if: github.event_name == 'release' - uses: svenstaro/upload-release-action@2.7.0 + uses: svenstaro/upload-release-action@2.11.1 with: repo_token: ${{ secrets.MEILI_BOT_GH_PAT }} file: target/${{ matrix.target }}/release/meilisearch From d2e4d6dd8ae78273fe7644262fbdf86116273276 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:11:40 +0200 Subject: [PATCH 040/101] prompt: Publishes some types --- crates/milli/src/prompt/mod.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git 
a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index a8288f83d..f1b4ddf89 100644 --- a/crates/milli/src/prompt/mod.rs +++ b/crates/milli/src/prompt/mod.rs @@ -9,12 +9,11 @@ use std::fmt::Debug; use std::num::NonZeroUsize; use bumpalo::Bump; -use document::ParseableDocument; +pub(crate) use document::{Document, ParseableDocument}; use error::{NewPromptError, RenderPromptError}; -use fields::{BorrowedFields, OwnedFields}; +pub use fields::{BorrowedFields, OwnedFields}; -use self::context::Context; -use self::document::Document; +pub use self::context::Context; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::del_add::DelAdd; use crate::GlobalFieldsIdsMap; From 76ca44b2141e18e7ba399e031ae3bb6b468cf36f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:47:49 +0200 Subject: [PATCH 041/101] Expand `json_template` module --- .../injectable_value.rs} | 180 ++++------- crates/milli/src/vector/json_template/mod.rs | 283 ++++++++++++++++++ 2 files changed, 342 insertions(+), 121 deletions(-) rename crates/milli/src/vector/{json_template.rs => json_template/injectable_value.rs} (84%) create mode 100644 crates/milli/src/vector/json_template/mod.rs diff --git a/crates/milli/src/vector/json_template.rs b/crates/milli/src/vector/json_template/injectable_value.rs similarity index 84% rename from crates/milli/src/vector/json_template.rs rename to crates/milli/src/vector/json_template/injectable_value.rs index 179cbe9af..ec7d900db 100644 --- a/crates/milli/src/vector/json_template.rs +++ b/crates/milli/src/vector/json_template/injectable_value.rs @@ -1,20 +1,17 @@ -//! Module to manipulate JSON templates. +//! Module to manipulate JSON values containing placeholder strings. //! //! This module allows two main operations: -//! 1. Render JSON values from a template and a context value. -//! 2. Retrieve data from a template and JSON values. - -#![warn(rustdoc::broken_intra_doc_links)] -#![warn(missing_docs)] +//! 1. Render JSON values from a template value containing placeholders and a value to inject. +//! 2. Extract data from a template value containing placeholders and a concrete JSON value that fits the template value. use serde::Deserialize; use serde_json::{Map, Value}; -type ValuePath = Vec; +use super::{format_value, inject_value, path_with_root, PathComponent, ValuePath}; /// Encapsulates a JSON template and allows injecting and extracting values from it. #[derive(Debug)] -pub struct ValueTemplate { +pub struct InjectableValue { template: Value, value_kind: ValueKind, } @@ -32,34 +29,13 @@ struct ArrayPath { value_path_in_array: ValuePath, } -/// Component of a path to a Value -#[derive(Debug, Clone)] -pub enum PathComponent { - /// A key inside of an object - MapKey(String), - /// An index inside of an array - ArrayIndex(usize), -} - -impl PartialEq for PathComponent { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (Self::MapKey(l0), Self::MapKey(r0)) => l0 == r0, - (Self::ArrayIndex(l0), Self::ArrayIndex(r0)) => l0 == r0, - _ => false, - } - } -} - -impl Eq for PathComponent {} - -/// Error that occurs when no few value was provided to a template for injection. +/// Error that occurs when no value was provided to a template for injection. 
#[derive(Debug)] pub struct MissingValue; -/// Error that occurs when trying to parse a template in [`ValueTemplate::new`] +/// Error that occurs when trying to parse a template in [`InjectableValue::new`] #[derive(Debug)] -pub enum TemplateParsingError { +pub enum InjectableParsingError { /// A repeat string appears inside a repeated value NestedRepeatString(ValuePath), /// A repeat string appears outside of an array @@ -85,42 +61,42 @@ pub enum TemplateParsingError { }, } -impl TemplateParsingError { +impl InjectableParsingError { /// Produce an error message from the error kind, the name of the root object, the placeholder string and the repeat string pub fn error_message(&self, root: &str, placeholder: &str, repeat: &str) -> String { match self { - TemplateParsingError::NestedRepeatString(path) => { + InjectableParsingError::NestedRepeatString(path) => { format!( r#"in {}: "{repeat}" appears nested inside of a value that is itself repeated"#, path_with_root(root, path) ) } - TemplateParsingError::RepeatStringNotInArray(path) => format!( + InjectableParsingError::RepeatStringNotInArray(path) => format!( r#"in {}: "{repeat}" appears outside of an array"#, path_with_root(root, path) ), - TemplateParsingError::BadIndexForRepeatString(path, index) => format!( + InjectableParsingError::BadIndexForRepeatString(path, index) => format!( r#"in {}: "{repeat}" expected at position #1, but found at position #{index}"#, path_with_root(root, path) ), - TemplateParsingError::MissingPlaceholderInRepeatedValue(path) => format!( + InjectableParsingError::MissingPlaceholderInRepeatedValue(path) => format!( r#"in {}: Expected "{placeholder}" inside of the repeated value"#, path_with_root(root, path) ), - TemplateParsingError::MultipleRepeatString(current, previous) => format!( + InjectableParsingError::MultipleRepeatString(current, previous) => format!( r#"in {}: Found "{repeat}", but it was already present in {}"#, path_with_root(root, current), path_with_root(root, previous) ), - TemplateParsingError::MultiplePlaceholderString(current, previous) => format!( + InjectableParsingError::MultiplePlaceholderString(current, previous) => format!( r#"in {}: Found "{placeholder}", but it was already present in {}"#, path_with_root(root, current), path_with_root(root, previous) ), - TemplateParsingError::MissingPlaceholderString => { + InjectableParsingError::MissingPlaceholderString => { format!(r#"in `{root}`: "{placeholder}" not found"#) } - TemplateParsingError::BothArrayAndSingle { + InjectableParsingError::BothArrayAndSingle { single_path, path_to_array, array_to_placeholder, @@ -140,41 +116,41 @@ impl TemplateParsingError { fn prepend_path(self, mut prepended_path: ValuePath) -> Self { match self { - TemplateParsingError::NestedRepeatString(mut path) => { + InjectableParsingError::NestedRepeatString(mut path) => { prepended_path.append(&mut path); - TemplateParsingError::NestedRepeatString(prepended_path) + InjectableParsingError::NestedRepeatString(prepended_path) } - TemplateParsingError::RepeatStringNotInArray(mut path) => { + InjectableParsingError::RepeatStringNotInArray(mut path) => { prepended_path.append(&mut path); - TemplateParsingError::RepeatStringNotInArray(prepended_path) + InjectableParsingError::RepeatStringNotInArray(prepended_path) } - TemplateParsingError::BadIndexForRepeatString(mut path, index) => { + InjectableParsingError::BadIndexForRepeatString(mut path, index) => { prepended_path.append(&mut path); - TemplateParsingError::BadIndexForRepeatString(prepended_path, index) + 
InjectableParsingError::BadIndexForRepeatString(prepended_path, index) } - TemplateParsingError::MissingPlaceholderInRepeatedValue(mut path) => { + InjectableParsingError::MissingPlaceholderInRepeatedValue(mut path) => { prepended_path.append(&mut path); - TemplateParsingError::MissingPlaceholderInRepeatedValue(prepended_path) + InjectableParsingError::MissingPlaceholderInRepeatedValue(prepended_path) } - TemplateParsingError::MultipleRepeatString(mut path, older_path) => { + InjectableParsingError::MultipleRepeatString(mut path, older_path) => { let older_prepended_path = prepended_path.iter().cloned().chain(older_path).collect(); prepended_path.append(&mut path); - TemplateParsingError::MultipleRepeatString(prepended_path, older_prepended_path) + InjectableParsingError::MultipleRepeatString(prepended_path, older_prepended_path) } - TemplateParsingError::MultiplePlaceholderString(mut path, older_path) => { + InjectableParsingError::MultiplePlaceholderString(mut path, older_path) => { let older_prepended_path = prepended_path.iter().cloned().chain(older_path).collect(); prepended_path.append(&mut path); - TemplateParsingError::MultiplePlaceholderString( + InjectableParsingError::MultiplePlaceholderString( prepended_path, older_prepended_path, ) } - TemplateParsingError::MissingPlaceholderString => { - TemplateParsingError::MissingPlaceholderString + InjectableParsingError::MissingPlaceholderString => { + InjectableParsingError::MissingPlaceholderString } - TemplateParsingError::BothArrayAndSingle { + InjectableParsingError::BothArrayAndSingle { single_path, mut path_to_array, array_to_placeholder, @@ -184,7 +160,7 @@ impl TemplateParsingError { prepended_path.iter().cloned().chain(single_path).collect(); prepended_path.append(&mut path_to_array); // we don't prepend the array_to_placeholder path as it is the array path that is prepended - TemplateParsingError::BothArrayAndSingle { + InjectableParsingError::BothArrayAndSingle { single_path: single_prepended_path, path_to_array: prepended_path, array_to_placeholder, @@ -194,7 +170,7 @@ impl TemplateParsingError { } } -/// Error that occurs when [`ValueTemplate::extract`] fails. +/// Error that occurs when [`InjectableValue::extract`] fails. #[derive(Debug)] pub struct ExtractionError { /// The cause of the failure @@ -336,27 +312,6 @@ enum LastNamedObject<'a> { NestedArrayInsideObject { object_name: &'a str, index: usize, nesting_level: usize }, } -/// Builds a string representation of a path, preprending the name of the root value. -pub fn path_with_root<'a>( - root: &str, - path: impl IntoIterator + 'a, -) -> String { - use std::fmt::Write as _; - let mut res = format!("`{root}"); - for component in path.into_iter() { - match component { - PathComponent::MapKey(key) => { - let _ = write!(&mut res, ".{key}"); - } - PathComponent::ArrayIndex(index) => { - let _ = write!(&mut res, "[{index}]"); - } - } - } - res.push('`'); - res -} - /// Context where an extraction failure happened /// /// The operation that failed @@ -405,7 +360,7 @@ enum ArrayParsingContext<'a> { NotNested(&'a mut Option), } -impl ValueTemplate { +impl InjectableValue { /// Prepare a template for injection or extraction. 
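    ///
    /// A minimal sketch of the expected call shape (illustrative), reusing the
    /// placeholder and repeat strings from this module's tests:
    ///
    /// ```ignore
    /// let template = serde_json::json!({ "input": ["{{text}}", "{{..}}"] });
    /// let injectable = InjectableValue::new(template, "{{text}}", "{{..}}").unwrap();
    /// ```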
/// /// # Parameters @@ -419,12 +374,12 @@ impl ValueTemplate { /// /// # Errors /// - /// - [`TemplateParsingError`]: refer to the documentation of this type + /// - [`InjectableParsingError`]: refer to the documentation of this type pub fn new( template: Value, placeholder_string: &str, repeat_string: &str, - ) -> Result { + ) -> Result { let mut value_path = None; let mut array_path = None; let mut current_path = Vec::new(); @@ -438,11 +393,11 @@ impl ValueTemplate { )?; let value_kind = match (array_path, value_path) { - (None, None) => return Err(TemplateParsingError::MissingPlaceholderString), + (None, None) => return Err(InjectableParsingError::MissingPlaceholderString), (None, Some(value_path)) => ValueKind::Single(value_path), (Some(array_path), None) => ValueKind::Array(array_path), (Some(array_path), Some(value_path)) => { - return Err(TemplateParsingError::BothArrayAndSingle { + return Err(InjectableParsingError::BothArrayAndSingle { single_path: value_path, path_to_array: array_path.path_to_array, array_to_placeholder: array_path.value_path_in_array, @@ -564,29 +519,29 @@ impl ValueTemplate { value_path: &mut Option, mut array_path: &mut ArrayParsingContext, current_path: &mut ValuePath, - ) -> Result<(), TemplateParsingError> { + ) -> Result<(), InjectableParsingError> { // two modes for parsing array. match array { // 1. array contains a repeat string in second position [first, second, rest @ ..] if second == repeat_string => { let ArrayParsingContext::NotNested(array_path) = &mut array_path else { - return Err(TemplateParsingError::NestedRepeatString(current_path.clone())); + return Err(InjectableParsingError::NestedRepeatString(current_path.clone())); }; if let Some(array_path) = array_path { - return Err(TemplateParsingError::MultipleRepeatString( + return Err(InjectableParsingError::MultipleRepeatString( current_path.clone(), array_path.path_to_array.clone(), )); } if first == repeat_string { - return Err(TemplateParsingError::BadIndexForRepeatString( + return Err(InjectableParsingError::BadIndexForRepeatString( current_path.clone(), 0, )); } if let Some(position) = rest.iter().position(|value| value == repeat_string) { let position = position + 2; - return Err(TemplateParsingError::BadIndexForRepeatString( + return Err(InjectableParsingError::BadIndexForRepeatString( current_path.clone(), position, )); @@ -609,7 +564,9 @@ impl ValueTemplate { value_path.ok_or_else(|| { let mut repeated_value_path = current_path.clone(); repeated_value_path.push(PathComponent::ArrayIndex(0)); - TemplateParsingError::MissingPlaceholderInRepeatedValue(repeated_value_path) + InjectableParsingError::MissingPlaceholderInRepeatedValue( + repeated_value_path, + ) })? }; **array_path = Some(ArrayPath { @@ -621,7 +578,7 @@ impl ValueTemplate { // 2. 
array does not contain a repeat string array => { if let Some(position) = array.iter().position(|value| value == repeat_string) { - return Err(TemplateParsingError::BadIndexForRepeatString( + return Err(InjectableParsingError::BadIndexForRepeatString( current_path.clone(), position, )); @@ -650,7 +607,7 @@ impl ValueTemplate { value_path: &mut Option, array_path: &mut ArrayParsingContext, current_path: &mut ValuePath, - ) -> Result<(), TemplateParsingError> { + ) -> Result<(), InjectableParsingError> { for (key, value) in object.iter() { current_path.push(PathComponent::MapKey(key.to_owned())); Self::parse_value( @@ -673,12 +630,12 @@ impl ValueTemplate { value_path: &mut Option, array_path: &mut ArrayParsingContext, current_path: &mut ValuePath, - ) -> Result<(), TemplateParsingError> { + ) -> Result<(), InjectableParsingError> { match value { Value::String(str) => { if placeholder_string == str { if let Some(value_path) = value_path { - return Err(TemplateParsingError::MultiplePlaceholderString( + return Err(InjectableParsingError::MultiplePlaceholderString( current_path.clone(), value_path.clone(), )); @@ -687,7 +644,9 @@ impl ValueTemplate { *value_path = Some(current_path.clone()); } if repeat_string == str { - return Err(TemplateParsingError::RepeatStringNotInArray(current_path.clone())); + return Err(InjectableParsingError::RepeatStringNotInArray( + current_path.clone(), + )); } } Value::Null | Value::Bool(_) | Value::Number(_) => {} @@ -712,27 +671,6 @@ impl ValueTemplate { } } -fn inject_value(rendered: &mut Value, injection_path: &Vec, injected_value: Value) { - let mut current_value = rendered; - for injection_component in injection_path { - current_value = match injection_component { - PathComponent::MapKey(key) => current_value.get_mut(key).unwrap(), - PathComponent::ArrayIndex(index) => current_value.get_mut(index).unwrap(), - } - } - *current_value = injected_value; -} - -fn format_value(value: &Value) -> String { - match value { - Value::Array(array) => format!("an array of size {}", array.len()), - Value::Object(object) => { - format!("an object with {} field(s)", object.len()) - } - value => value.to_string(), - } -} - fn extract_value( extraction_path: &[PathComponent], initial_value: &mut Value, @@ -838,10 +776,10 @@ impl ExtractionResultErrorContext for Result { mod test { use serde_json::{json, Value}; - use super::{PathComponent, TemplateParsingError, ValueTemplate}; + use super::{InjectableParsingError, InjectableValue, PathComponent}; - fn new_template(template: Value) -> Result { - ValueTemplate::new(template, "{{text}}", "{{..}}") + fn new_template(template: Value) -> Result { + InjectableValue::new(template, "{{text}}", "{{..}}") } #[test] @@ -853,7 +791,7 @@ mod test { }); let error = new_template(template.clone()).unwrap_err(); - assert!(matches!(error, TemplateParsingError::MissingPlaceholderString)) + assert!(matches!(error, InjectableParsingError::MissingPlaceholderString)) } #[test] @@ -887,7 +825,7 @@ mod test { }); match new_template(template.clone()) { - Err(TemplateParsingError::MultiplePlaceholderString(left, right)) => { + Err(InjectableParsingError::MultiplePlaceholderString(left, right)) => { assert_eq!( left, vec![PathComponent::MapKey("titi".into()), PathComponent::ArrayIndex(3)] diff --git a/crates/milli/src/vector/json_template/mod.rs b/crates/milli/src/vector/json_template/mod.rs new file mode 100644 index 000000000..57a3b67b1 --- /dev/null +++ b/crates/milli/src/vector/json_template/mod.rs @@ -0,0 +1,283 @@ +//! 
Exposes types to manipulate JSON values +//! +//! - [`JsonTemplate`]: renders JSON values by rendering its strings as [`Template`]s. +//! - [`InjectableValue`]: Describes a JSON value containing placeholders, +//! then allows to inject values instead of the placeholder to produce new concrete JSON values, +//! or extract sub-values at the placeholder location from concrete JSON values. +//! +//! The module also exposes foundational types to work with JSON paths: +//! +//! - [`ValuePath`] is made of [`PathComponent`]s to indicate the location of a sub-value inside of a JSON value. +//! - [`inject_value`] is a primitive that replaces the sub-value at the described location by an injected value. + +#![warn(rustdoc::broken_intra_doc_links)] +#![warn(missing_docs)] + +use bumpalo::Bump; +use liquid::{Parser, Template}; +use serde_json::{Map, Value}; + +use crate::prompt::ParseableDocument; +use crate::update::new::document::Document; + +mod injectable_value; + +pub use injectable_value::InjectableValue; + +/// Represents a JSON [`Value`] where each string is rendered as a [`Template`]. +#[derive(Debug)] +pub struct JsonTemplate { + value: Value, + templates: Vec, +} + +impl Clone for JsonTemplate { + fn clone(&self) -> Self { + Self::new(self.value.clone()).unwrap() + } +} + +struct TemplateAtPath { + template: Template, + path: ValuePath, +} + +impl std::fmt::Debug for TemplateAtPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TemplateAtPath") + .field("template", &&"template") + .field("path", &self.path) + .finish() + } +} + +/// Error that can occur either when parsing the templates in the value, or when trying to render them. +#[derive(Debug)] +pub struct Error { + template_error: liquid::Error, + path: ValuePath, +} + +impl Error { + /// Produces an error message when the error happened at rendering time. + pub fn rendering_error(&self, root: &str) -> String { + format!( + "in `{}`, error while rendering template: {}", + path_with_root(root, self.path.iter()), + &self.template_error + ) + } + + /// Produces an error message when the error happened at parsing time. + pub fn parsing(&self, root: &str) -> String { + format!( + "in `{}`, error while parsing template: {}", + path_with_root(root, self.path.iter()), + &self.template_error + ) + } +} + +impl JsonTemplate { + /// Creates a new `JsonTemplate` by parsing all strings inside the value as templates. + /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be parsed. + pub fn new(value: Value) -> Result { + let templates = build_templates(&value)?; + Ok(Self { value, templates }) + } + + /// Renders this value by replacing all its strings with the rendered version of the template they represent from the given context. + /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be rendered with the given context. + pub fn render(&self, context: &dyn liquid::ObjectView) -> Result { + let mut rendered = self.value.clone(); + for TemplateAtPath { template, path } in &self.templates { + let injected_value = + template.render(context).map_err(|err| error_with_path(err, path.clone()))?; + inject_value(&mut rendered, path, Value::String(injected_value)); + } + Ok(rendered) + } + + /// Renders this value by replacing all its strings with the rendered version of the template they represent from the contents of the given document. 
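+    ///
+    /// (Illustrative, assuming the prompt context exposes the document as `doc`)
+    /// for a template value such as `{"text": "A document titled {{doc.title}}"}`,
+    /// each string is rendered as a Liquid template against the given document.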
+ /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be rendered with the given document. + pub fn render_document<'a, 'doc, D: Document<'a> + std::fmt::Debug>( + &self, + document: D, + doc_alloc: &'doc Bump, + ) -> Result { + let document = ParseableDocument::new(document, doc_alloc); + let v: Vec = vec![]; + let context = crate::prompt::Context::new(&document, &v); + self.render(&context) + } + + /// Renders this value by replacing all its strings with the rendered version of the template they represent from the contents of the search query. + /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be rendered from the contents of the search query + pub fn render_search(&self, q: Option<&str>, media: Option<&Value>) -> Result { + let search_data = match (q, media) { + (None, None) => liquid::object!({}), + (None, Some(media)) => liquid::object!({ "media": media }), + (Some(q), None) => liquid::object!({"q": q}), + (Some(q), Some(media)) => liquid::object!({"q": q, "media": media}), + }; + self.render(&search_data) + } + + /// The JSON value representing the underlying template + pub fn template(&self) -> &Value { + &self.value + } +} + +fn build_templates(value: &Value) -> Result, Error> { + let mut current_path = ValuePath::new(); + let mut templates = Vec::new(); + let compiler = liquid::ParserBuilder::with_stdlib().build().unwrap(); + parse_value(value, &mut current_path, &mut templates, &compiler)?; + Ok(templates) +} + +fn error_with_path(template_error: liquid::Error, path: ValuePath) -> Error { + Error { template_error, path } +} + +fn parse_value( + value: &Value, + current_path: &mut ValuePath, + templates: &mut Vec, + compiler: &Parser, +) -> Result<(), Error> { + match value { + Value::String(template) => { + let template = compiler + .parse(template) + .map_err(|err| error_with_path(err, current_path.clone()))?; + templates.push(TemplateAtPath { template, path: current_path.clone() }); + } + Value::Array(values) => { + parse_array(values, current_path, templates, compiler)?; + } + Value::Object(map) => { + parse_object(map, current_path, templates, compiler)?; + } + _ => {} + } + Ok(()) +} + +fn parse_object( + map: &Map, + current_path: &mut ValuePath, + templates: &mut Vec, + compiler: &Parser, +) -> Result<(), Error> { + for (key, value) in map { + current_path.push(PathComponent::MapKey(key.clone())); + parse_value(value, current_path, templates, compiler)?; + current_path.pop(); + } + Ok(()) +} + +fn parse_array( + values: &[Value], + current_path: &mut ValuePath, + templates: &mut Vec, + compiler: &Parser, +) -> Result<(), Error> { + for (index, value) in values.iter().enumerate() { + current_path.push(PathComponent::ArrayIndex(index)); + parse_value(value, current_path, templates, compiler)?; + current_path.pop(); + } + Ok(()) +} + +/// A list of [`PathComponent`]s describing a path to a value inside a JSON value. +/// +/// The empty list refers to the root value. 
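+///
+/// For example (illustrative), in `{"a": ["b"]}` the string `"b"` sits at the
+/// path `[MapKey("a"), ArrayIndex(0)]`.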
+pub type ValuePath = Vec; + +/// Component of a path to a Value +#[derive(Debug, Clone)] +pub enum PathComponent { + /// A key inside of an object + MapKey(String), + /// An index inside of an array + ArrayIndex(usize), +} + +impl PartialEq for PathComponent { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::MapKey(l0), Self::MapKey(r0)) => l0 == r0, + (Self::ArrayIndex(l0), Self::ArrayIndex(r0)) => l0 == r0, + _ => false, + } + } +} + +impl Eq for PathComponent {} + +/// Builds a string representation of a path, preprending the name of the root value. +pub fn path_with_root<'a>( + root: &str, + path: impl IntoIterator + 'a, +) -> String { + use std::fmt::Write as _; + let mut res = format!("`{root}"); + for component in path.into_iter() { + match component { + PathComponent::MapKey(key) => { + let _ = write!(&mut res, ".{key}"); + } + PathComponent::ArrayIndex(index) => { + let _ = write!(&mut res, "[{index}]"); + } + } + } + res.push('`'); + res +} + +/// Modifies `rendered` to replace the sub-value at the `injection_path` location by the `injected_value`. +/// +/// # Panics +/// +/// - if the provided `injection_path` cannot be traversed in `rendered`. +pub fn inject_value( + rendered: &mut Value, + injection_path: &Vec, + injected_value: Value, +) { + let mut current_value = rendered; + for injection_component in injection_path { + current_value = match injection_component { + PathComponent::MapKey(key) => current_value.get_mut(key).unwrap(), + PathComponent::ArrayIndex(index) => current_value.get_mut(index).unwrap(), + } + } + *current_value = injected_value; +} + +fn format_value(value: &Value) -> String { + match value { + Value::Array(array) => format!("an array of size {}", array.len()), + Value::Object(object) => { + format!("an object with {} field(s)", object.len()) + } + value => value.to_string(), + } +} From 17a94c40dc63c70bf66162a9fcd71fcad1d8ebfc Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:48:38 +0200 Subject: [PATCH 042/101] Add `vector::db` module --- crates/milli/src/vector/db.rs | 443 +++++++++++++++++++++++++++++++++ crates/milli/src/vector/mod.rs | 1 + 2 files changed, 444 insertions(+) create mode 100644 crates/milli/src/vector/db.rs diff --git a/crates/milli/src/vector/db.rs b/crates/milli/src/vector/db.rs new file mode 100644 index 000000000..0e890fac9 --- /dev/null +++ b/crates/milli/src/vector/db.rs @@ -0,0 +1,443 @@ +//! 
Module containing types and methods to store meta-information about the embedders and fragments + +use std::borrow::Cow; + +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use heed::types::{SerdeJson, Str, U8}; +use heed::{BytesEncode, Database, RoTxn, RwTxn, Unspecified}; +use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; + +use crate::vector::settings::RemoveFragments; +use crate::vector::EmbeddingConfig; +use crate::{CboRoaringBitmapCodec, DocumentId, UserError}; + +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub config: EmbeddingConfig, + #[serde(default)] + pub fragments: FragmentConfigs, +} + +#[derive(Debug, Clone, Deserialize, Serialize, Default)] +pub struct FragmentConfigs(Vec); + +impl FragmentConfigs { + pub fn new() -> Self { + Default::default() + } + pub fn as_slice(&self) -> &[FragmentConfig] { + self.0.as_slice() + } + + pub fn into_inner(self) -> Vec { + self.0 + } + + pub fn remove_fragments<'a>( + &mut self, + fragments: impl IntoIterator, + ) -> Option { + let mut remove_fragments = Vec::new(); + for fragment in fragments { + let Ok(index_to_remove) = self.0.binary_search_by_key(&fragment, |f| &f.name) else { + continue; + }; + let fragment = self.0.swap_remove(index_to_remove); + remove_fragments.push(fragment.id); + } + (!remove_fragments.is_empty()).then_some(RemoveFragments { fragment_ids: remove_fragments }) + } + + pub fn add_new_fragments( + &mut self, + new_fragments: impl IntoIterator, + ) -> crate::Result<()> { + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + + for FragmentConfig { id, name: _ } in self.0.iter() { + free_indices[*id as usize] = false; + } + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + + let mut new_fragments = new_fragments.into_iter(); + + for name in &mut new_fragments { + let id = match find_free_index() { + Some(id) => id, + None => { + let more = (&mut new_fragments).count(); + return Err(UserError::TooManyFragments(u8::MAX as usize + more + 1).into()); + } + }; + self.0.push(FragmentConfig { id, name }); + } + Ok(()) + } +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct FragmentConfig { + pub id: u8, + pub name: String, +} + +pub struct IndexEmbeddingConfigs { + main: Database, + embedder_info: Database, +} + +pub struct EmbedderInfo { + pub embedder_id: u8, + pub embedding_status: EmbeddingStatus, +} + +impl EmbedderInfo { + pub fn to_bytes(&self) -> Result, heed::BoxedError> { + EmbedderInfoCodec::bytes_encode(self) + } +} + +/// Optimized struct to hold the list of documents that are `user_provided` and `must_regenerate`. +/// +/// Because most documents have the same value for `user_provided` and `must_regenerate`, we store only +/// the `user_provided` and a list of the documents for which `must_regenerate` assumes the other value +/// than `user_provided`. +#[derive(Default)] +pub struct EmbeddingStatus { + user_provided: RoaringBitmap, + skip_regenerate_different_from_user_provided: RoaringBitmap, +} + +impl EmbeddingStatus { + pub fn new() -> Self { + Default::default() + } + + /// Whether the document contains user-provided vectors for that embedder. + pub fn is_user_provided(&self, docid: DocumentId) -> bool { + self.user_provided.contains(docid) + } + /// Whether vectors should be regenerated for that document and that embedder. 
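+    ///
+    /// Truth table (illustrative), writing "inverted" for membership in
+    /// `skip_regenerate_different_from_user_provided`:
+    ///
+    /// user_provided | inverted | must_regenerate
+    /// true          | false    | false
+    /// true          | true     | true
+    /// false         | false    | true
+    /// false         | true     | false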
+ pub fn must_regenerate(&self, docid: DocumentId) -> bool { + let invert = self.skip_regenerate_different_from_user_provided.contains(docid); + let user_provided = self.user_provided.contains(docid); + !(user_provided ^ invert) + } + + pub fn is_user_provided_must_regenerate(&self, docid: DocumentId) -> (bool, bool) { + let invert = self.skip_regenerate_different_from_user_provided.contains(docid); + let user_provided = self.user_provided.contains(docid); + (user_provided, !(user_provided ^ invert)) + } + + pub fn user_provided_docids(&self) -> &RoaringBitmap { + &self.user_provided + } + + pub fn skip_regenerate_docids(&self) -> RoaringBitmap { + &self.user_provided ^ &self.skip_regenerate_different_from_user_provided + } + + pub(crate) fn into_user_provided(self) -> RoaringBitmap { + self.user_provided + } +} + +#[derive(Default)] +pub struct EmbeddingStatusDelta { + del_status: EmbeddingStatus, + add_status: EmbeddingStatus, +} + +impl EmbeddingStatusDelta { + pub fn new() -> Self { + Self::default() + } + + pub fn needs_change( + old_is_user_provided: bool, + old_must_regenerate: bool, + new_is_user_provided: bool, + new_must_regenerate: bool, + ) -> bool { + let old_skip_regenerate_different_user_provided = + old_is_user_provided == old_must_regenerate; + let new_skip_regenerate_different_user_provided = + new_is_user_provided == new_must_regenerate; + + old_is_user_provided != new_is_user_provided + || old_skip_regenerate_different_user_provided + != new_skip_regenerate_different_user_provided + } + + pub fn needs_clear(is_user_provided: bool, must_regenerate: bool) -> bool { + Self::needs_change(is_user_provided, must_regenerate, false, true) + } + + pub fn clear_docid( + &mut self, + docid: DocumentId, + is_user_provided: bool, + must_regenerate: bool, + ) { + self.push_delta(docid, is_user_provided, must_regenerate, false, true); + } + + pub fn push_delta( + &mut self, + docid: DocumentId, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_is_user_provided: bool, + new_must_regenerate: bool, + ) { + // must_regenerate == !skip_regenerate + let old_skip_regenerate_different_user_provided = + old_is_user_provided == old_must_regenerate; + let new_skip_regenerate_different_user_provided = + new_is_user_provided == new_must_regenerate; + + match (old_is_user_provided, new_is_user_provided) { + (true, true) | (false, false) => { /* no change */ } + (true, false) => { + self.del_status.user_provided.insert(docid); + } + (false, true) => { + self.add_status.user_provided.insert(docid); + } + } + + match ( + old_skip_regenerate_different_user_provided, + new_skip_regenerate_different_user_provided, + ) { + (true, true) | (false, false) => { /* no change */ } + (true, false) => { + self.del_status.skip_regenerate_different_from_user_provided.insert(docid); + } + (false, true) => { + self.add_status.skip_regenerate_different_from_user_provided.insert(docid); + } + } + } + + pub fn push_new(&mut self, docid: DocumentId, is_user_provided: bool, must_regenerate: bool) { + self.push_delta( + docid, + !is_user_provided, + !must_regenerate, + is_user_provided, + must_regenerate, + ); + } + + pub fn apply_to(&self, status: &mut EmbeddingStatus) { + status.user_provided -= &self.del_status.user_provided; + status.user_provided |= &self.add_status.user_provided; + + status.skip_regenerate_different_from_user_provided -= + &self.del_status.skip_regenerate_different_from_user_provided; + status.skip_regenerate_different_from_user_provided |= + 
&self.add_status.skip_regenerate_different_from_user_provided; + } +} + +struct EmbedderInfoCodec; + +impl<'a> heed::BytesDecode<'a> for EmbedderInfoCodec { + type DItem = EmbedderInfo; + + fn bytes_decode(mut bytes: &'a [u8]) -> Result { + let embedder_id = bytes.read_u8()?; + // Support all version that didn't store the embedding status + if bytes.is_empty() { + return Ok(EmbedderInfo { embedder_id, embedding_status: EmbeddingStatus::new() }); + } + let first_bitmap_size = bytes.read_u32::()?; + let first_bitmap_bytes = &bytes[..first_bitmap_size as usize]; + let user_provided = CboRoaringBitmapCodec::bytes_decode(first_bitmap_bytes)?; + let skip_regenerate_different_from_user_provided = + CboRoaringBitmapCodec::bytes_decode(&bytes[first_bitmap_size as usize..])?; + Ok(EmbedderInfo { + embedder_id, + embedding_status: EmbeddingStatus { + user_provided, + skip_regenerate_different_from_user_provided, + }, + }) + } +} + +impl<'a> heed::BytesEncode<'a> for EmbedderInfoCodec { + type EItem = EmbedderInfo; + + fn bytes_encode(item: &'a Self::EItem) -> Result, heed::BoxedError> { + let first_bitmap_size = + CboRoaringBitmapCodec::serialized_size(&item.embedding_status.user_provided); + let second_bitmap_size = CboRoaringBitmapCodec::serialized_size( + &item.embedding_status.skip_regenerate_different_from_user_provided, + ); + + let mut bytes = Vec::with_capacity(1 + 4 + first_bitmap_size + second_bitmap_size); + bytes.write_u8(item.embedder_id)?; + bytes.write_u32::(first_bitmap_size.try_into()?)?; + CboRoaringBitmapCodec::serialize_into_writer( + &item.embedding_status.user_provided, + &mut bytes, + )?; + CboRoaringBitmapCodec::serialize_into_writer( + &item.embedding_status.skip_regenerate_different_from_user_provided, + &mut bytes, + )?; + Ok(bytes.into()) + } +} + +impl IndexEmbeddingConfigs { + pub(crate) fn new( + main: Database, + embedder_info: Database, + ) -> Self { + Self { main, embedder_info: embedder_info.remap_types() } + } + + pub(crate) fn put_embedding_configs( + &self, + wtxn: &mut RwTxn<'_>, + configs: Vec, + ) -> heed::Result<()> { + self.main.remap_types::>>().put( + wtxn, + crate::index::main_key::EMBEDDING_CONFIGS, + &configs, + ) + } + + pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { + self.main.remap_key_type::().delete(wtxn, crate::index::main_key::EMBEDDING_CONFIGS) + } + + pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> heed::Result> { + Ok(self + .main + .remap_types::>>() + .get(rtxn, crate::index::main_key::EMBEDDING_CONFIGS)? + .unwrap_or_default()) + } + + pub fn embedder_id(&self, rtxn: &RoTxn<'_>, name: &str) -> heed::Result> { + self.embedder_info.remap_data_type::().get(rtxn, name) + } + + pub fn put_fresh_embedder_id( + &self, + wtxn: &mut RwTxn<'_>, + name: &str, + embedder_id: u8, + ) -> heed::Result<()> { + let info = EmbedderInfo { embedder_id, embedding_status: EmbeddingStatus::new() }; + self.put_embedder_info(wtxn, name, &info) + } + + /// Iterate through the passed list of embedder names, associating a fresh embedder id to any new names. + /// + /// Passing the name of a currently existing embedder is not an error, and will not modify its embedder id, + /// so it is not necessary to differentiate between new and existing embedders before calling this function. 
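+    ///
+    /// (Illustrative) if embedders `a` and `b` already have ids, calling this
+    /// with `["b", "c"]` leaves `b` untouched and assigns the lowest free id
+    /// to `c`.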
+ pub fn add_new_embedders<'a>( + &self, + wtxn: &mut RwTxn<'_>, + embedder_names: impl IntoIterator, + total_embedder_count: usize, + ) -> crate::Result<()> { + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + + for res in self.embedder_info.iter(wtxn)? { + let (_name, EmbedderInfo { embedder_id, embedding_status: _ }) = res?; + free_indices[embedder_id as usize] = false; + } + + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + + for embedder_name in embedder_names { + if self.embedder_id(wtxn, embedder_name)?.is_some() { + continue; + } + let embedder_id = find_free_index() + .ok_or(crate::UserError::TooManyEmbedders(total_embedder_count))?; + tracing::debug!( + embedder = embedder_name, + embedder_id, + "assigning free id to new embedder" + ); + self.put_fresh_embedder_id(wtxn, embedder_name, embedder_id)?; + } + Ok(()) + } + + pub fn embedder_info( + &self, + rtxn: &RoTxn<'_>, + name: &str, + ) -> heed::Result> { + self.embedder_info.get(rtxn, name) + } + + /// Clear the list of docids that are `user_provided` or `must_regenerate` across all embedders. + pub fn clear_embedder_info_docids(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<()> { + let mut it = self.embedder_info.iter_mut(wtxn)?; + while let Some(res) = it.next() { + let (embedder_name, info) = res?; + let embedder_name = embedder_name.to_owned(); + // SAFETY: we copied the `embedder_name` so are not using the reference while using put + unsafe { + it.put_current( + &embedder_name, + &EmbedderInfo { + embedder_id: info.embedder_id, + embedding_status: EmbeddingStatus::new(), + }, + )?; + } + } + Ok(()) + } + + pub fn iter_embedder_info<'a>( + &self, + rtxn: &'a RoTxn<'_>, + ) -> heed::Result>> { + self.embedder_info.iter(rtxn) + } + + pub fn iter_embedder_id<'a>( + &self, + rtxn: &'a RoTxn<'_>, + ) -> heed::Result>> { + self.embedder_info.remap_data_type::().iter(rtxn) + } + + pub fn remove_embedder( + &self, + wtxn: &mut RwTxn<'_>, + name: &str, + ) -> heed::Result> { + let info = self.embedder_info.get(wtxn, name)?; + self.embedder_info.delete(wtxn, name)?; + Ok(info) + } + + pub fn put_embedder_info( + &self, + wtxn: &mut RwTxn<'_>, + name: &str, + info: &EmbedderInfo, + ) -> heed::Result<()> { + self.embedder_info.put(wtxn, name, info) + } +} diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 065beb5fb..ec4ee2ccd 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -18,6 +18,7 @@ use crate::prompt::{Prompt, PromptData}; use crate::ThreadPoolNoAbort; pub mod composite; +pub mod db; pub mod error; pub mod hf; pub mod json_template; From 0114796d2aaba9b638e188541dd1edba5ddd06e6 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:56:44 +0200 Subject: [PATCH 043/101] Index uses the vector::db stuff --- crates/milli/src/index.rs | 70 +++++++++++++-------------------------- 1 file changed, 23 insertions(+), 47 deletions(-) diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index e9e63a853..b2ec992ba 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -30,7 +30,8 @@ use crate::order_by_map::OrderByMap; use crate::prompt::PromptData; use crate::proximity::ProximityPrecision; use crate::update::new::StdResult; -use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig}; +use crate::vector::db::IndexEmbeddingConfigs; +use crate::vector::{ArroyStats, 
ArroyWrapper, Embedding}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, @@ -177,7 +178,7 @@ pub struct Index { pub field_id_docid_facet_strings: Database, /// Maps an embedder name to its id in the arroy store. - pub embedder_category_id: Database, + pub(crate) embedder_category_id: Database, /// Vector store based on arroy™. pub vector_arroy: arroy::Database, @@ -1745,34 +1746,6 @@ impl Index { self.main.remap_key_type::().delete(txn, main_key::LOCALIZED_ATTRIBUTES_RULES) } - /// Put the embedding configs: - /// 1. The name of the embedder - /// 2. The configuration option for this embedder - /// 3. The list of documents with a user provided embedding - pub(crate) fn put_embedding_configs( - &self, - wtxn: &mut RwTxn<'_>, - configs: Vec, - ) -> heed::Result<()> { - self.main.remap_types::>>().put( - wtxn, - main_key::EMBEDDING_CONFIGS, - &configs, - ) - } - - pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { - self.main.remap_key_type::().delete(wtxn, main_key::EMBEDDING_CONFIGS) - } - - pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result> { - Ok(self - .main - .remap_types::>>() - .get(rtxn, main_key::EMBEDDING_CONFIGS)? - .unwrap_or_default()) - } - pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> { self.main.remap_types::().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff) } @@ -1785,19 +1758,29 @@ impl Index { self.main.remap_key_type::().delete(wtxn, main_key::SEARCH_CUTOFF) } + pub fn embedding_configs(&self) -> IndexEmbeddingConfigs { + IndexEmbeddingConfigs::new(self.main, self.embedder_category_id) + } + pub fn embeddings( &self, rtxn: &RoTxn<'_>, docid: DocumentId, - ) -> Result>> { + ) -> Result, bool)>> { let mut res = BTreeMap::new(); - let embedding_configs = self.embedding_configs(rtxn)?; - for config in embedding_configs { - let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); - let reader = - ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); + let embedders = self.embedding_configs(); + for config in embedders.embedding_configs(rtxn)? { + let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap(); + let reader = ArroyWrapper::new( + self.vector_arroy, + embedder_info.embedder_id, + config.config.quantized(), + ); let embeddings = reader.item_vectors(rtxn, docid)?; - res.insert(config.name.to_owned(), embeddings); + res.insert( + config.name.to_owned(), + (embeddings, embedder_info.embedding_status.must_regenerate(docid)), + ); } Ok(res) } @@ -1809,9 +1792,9 @@ impl Index { pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result { let mut stats = ArroyStats::default(); - let embedding_configs = self.embedding_configs(rtxn)?; - for config in embedding_configs { - let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); + let embedding_configs = self.embedding_configs(); + for config in embedding_configs.embedding_configs(rtxn)? 
{ + let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap(); let reader = ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); reader.aggregate_stats(rtxn, &mut stats)?; @@ -1936,13 +1919,6 @@ impl Index { } } -#[derive(Debug, Deserialize, Serialize)] -pub struct IndexEmbeddingConfig { - pub name: String, - pub config: EmbeddingConfig, - pub user_provided: RoaringBitmap, -} - #[derive(Debug, Default, Deserialize, Serialize)] pub struct ChatConfig { pub description: String, From c16c60b5998e88a835d20505799b8f0c779d1922 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:48:53 +0200 Subject: [PATCH 044/101] Add `vector::extractor` module --- crates/milli/src/vector/extractor.rs | 214 +++++++++++++++++++++++++++ crates/milli/src/vector/mod.rs | 1 + 2 files changed, 215 insertions(+) create mode 100644 crates/milli/src/vector/extractor.rs diff --git a/crates/milli/src/vector/extractor.rs b/crates/milli/src/vector/extractor.rs new file mode 100644 index 000000000..cbfc62ee1 --- /dev/null +++ b/crates/milli/src/vector/extractor.rs @@ -0,0 +1,214 @@ +use std::cell::RefCell; +use std::collections::BTreeMap; +use std::fmt::Debug; + +use bumpalo::Bump; +use serde_json::Value; + +use super::json_template::{self, JsonTemplate}; +use crate::prompt::error::RenderPromptError; +use crate::prompt::Prompt; +use crate::update::new::document::Document; +use crate::vector::RuntimeFragment; +use crate::GlobalFieldsIdsMap; + +pub trait Extractor<'doc> { + type DocumentMetadata; + type Input: PartialEq; + type Error; + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + meta: &Self::DocumentMetadata, + ) -> Result, Self::Error>; + + fn extractor_id(&self) -> u8; + + fn diff_documents<'a, OD: Document<'a> + Debug, ND: Document<'a> + Debug>( + &self, + old: OD, + new: ND, + meta: &Self::DocumentMetadata, + ) -> Result, Self::Error> + where + 'doc: 'a, + { + let old_input = self.extract(old, meta); + let new_input = self.extract(new, meta); + to_diff(old_input, new_input) + } + + fn diff_settings<'a, D: Document<'a> + Debug>( + &self, + doc: D, + meta: &Self::DocumentMetadata, + old: Option<&Self>, + ) -> Result, Self::Error> { + let old_input = if let Some(old) = old { old.extract(&doc, meta) } else { Ok(None) }; + let new_input = self.extract(&doc, meta); + + to_diff(old_input, new_input) + } + + fn ignore_errors(self) -> IgnoreErrorExtractor + where + Self: Sized, + { + IgnoreErrorExtractor(self) + } +} + +fn to_diff( + old_input: Result, E>, + new_input: Result, E>, +) -> Result, E> { + let old_input = old_input.ok().unwrap_or(None); + let new_input = new_input?; + Ok(match (old_input, new_input) { + (Some(old), Some(new)) if old == new => ExtractorDiff::Unchanged, + (None, None) => ExtractorDiff::Unchanged, + (None, Some(input)) => ExtractorDiff::Added(input), + (Some(_), None) => ExtractorDiff::Removed, + (Some(_), Some(input)) => ExtractorDiff::Updated(input), + }) +} + +pub enum ExtractorDiff { + Removed, + Added(Input), + Updated(Input), + Unchanged, +} + +impl ExtractorDiff { + pub fn into_input(self) -> Option { + match self { + ExtractorDiff::Removed => None, + ExtractorDiff::Added(input) => Some(input), + ExtractorDiff::Updated(input) => Some(input), + ExtractorDiff::Unchanged => None, + } + } + + pub fn needs_change(&self) -> bool { + match self { + ExtractorDiff::Removed => true, + ExtractorDiff::Added(_) => true, + ExtractorDiff::Updated(_) => true, + ExtractorDiff::Unchanged => false, + } + } + + pub fn 
into_list_of_changes( + named_diffs: impl IntoIterator, + ) -> BTreeMap> { + named_diffs + .into_iter() + .filter(|(_, diff)| diff.needs_change()) + .map(|(name, diff)| (name, diff.into_input())) + .collect() + } +} + +pub struct DocumentTemplateExtractor<'a, 'b, 'c> { + doc_alloc: &'a Bump, + field_id_map: &'a RefCell>, + template: &'c Prompt, +} + +impl<'a, 'b, 'c> DocumentTemplateExtractor<'a, 'b, 'c> { + pub fn new( + template: &'c Prompt, + doc_alloc: &'a Bump, + field_id_map: &'a RefCell>, + ) -> Self { + Self { template, doc_alloc, field_id_map } + } +} + +impl<'doc> Extractor<'doc> for DocumentTemplateExtractor<'doc, '_, '_> { + type DocumentMetadata = &'doc str; + type Input = &'doc str; + type Error = RenderPromptError; + + fn extractor_id(&self) -> u8 { + 0 + } + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + external_docid: &Self::DocumentMetadata, + ) -> Result, Self::Error> { + Ok(Some(self.template.render_document( + external_docid, + doc, + self.field_id_map, + self.doc_alloc, + )?)) + } +} + +pub struct RequestFragmentExtractor<'a> { + fragment: &'a JsonTemplate, + extractor_id: u8, + doc_alloc: &'a Bump, +} + +impl<'a> RequestFragmentExtractor<'a> { + pub fn new(fragment: &'a RuntimeFragment, doc_alloc: &'a Bump) -> Self { + Self { fragment: &fragment.template, extractor_id: fragment.id, doc_alloc } + } +} + +impl<'doc> Extractor<'doc> for RequestFragmentExtractor<'doc> { + type DocumentMetadata = (); + type Input = Value; + type Error = json_template::Error; + + fn extractor_id(&self) -> u8 { + self.extractor_id + } + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + _meta: &Self::DocumentMetadata, + ) -> Result, Self::Error> { + Ok(Some(self.fragment.render_document(doc, self.doc_alloc)?)) + } +} + +pub struct IgnoreErrorExtractor(E); + +impl<'doc, E> Extractor<'doc> for IgnoreErrorExtractor +where + E: Extractor<'doc>, +{ + type DocumentMetadata = E::DocumentMetadata; + type Input = E::Input; + + type Error = Infallible; + + fn extractor_id(&self) -> u8 { + self.0.extractor_id() + } + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + meta: &Self::DocumentMetadata, + ) -> Result, Self::Error> { + Ok(self.0.extract(doc, meta).ok().flatten()) + } +} + +#[derive(Debug)] +pub enum Infallible {} + +impl From for crate::Error { + fn from(_: Infallible) -> Self { + unreachable!("Infallible values cannot be built") + } +} diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index ec4ee2ccd..246f824e1 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -20,6 +20,7 @@ use crate::ThreadPoolNoAbort; pub mod composite; pub mod db; pub mod error; +pub mod extractor; pub mod hf; pub mod json_template; pub mod manual; From b45059e8f202a714bf78a957cf6d1304b66325f6 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:49:10 +0200 Subject: [PATCH 045/101] Add `vector::session` module --- crates/milli/src/vector/mod.rs | 1 + crates/milli/src/vector/session.rs | 152 +++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) create mode 100644 crates/milli/src/vector/session.rs diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 246f824e1..395c5d704 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -26,6 +26,7 @@ pub mod json_template; pub mod manual; pub mod openai; pub mod parsed_vectors; +pub mod session; pub mod settings; pub mod ollama; diff --git a/crates/milli/src/vector/session.rs 
From b45059e8f202a714bf78a957cf6d1304b66325f6 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Sun, 29 Jun 2025 23:49:10 +0200
Subject: [PATCH 045/101] Add `vector::session` module

---
 crates/milli/src/vector/mod.rs     |   1 +
 crates/milli/src/vector/session.rs | 152 +++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+)
 create mode 100644 crates/milli/src/vector/session.rs

diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs
index 246f824e1..395c5d704 100644
--- a/crates/milli/src/vector/mod.rs
+++ b/crates/milli/src/vector/mod.rs
@@ -26,6 +26,7 @@ pub mod json_template;
 pub mod manual;
 pub mod openai;
 pub mod parsed_vectors;
+pub mod session;
 pub mod settings;
 
 pub mod ollama;
diff --git a/crates/milli/src/vector/session.rs b/crates/milli/src/vector/session.rs
new file mode 100644
index 000000000..b6f229779
--- /dev/null
+++ b/crates/milli/src/vector/session.rs
@@ -0,0 +1,152 @@
+use bumpalo::collections::Vec as BVec;
+use bumpalo::Bump;
+use serde_json::Value;
+
+use super::{EmbedError, Embedder, Embedding};
+use crate::{DocumentId, Result, ThreadPoolNoAbort};
+
+type ExtractorId = u8;
+
+#[derive(Clone, Copy)]
+pub struct Metadata<'doc> {
+    pub docid: DocumentId,
+    pub external_docid: &'doc str,
+    pub extractor_id: ExtractorId,
+}
+
+pub struct EmbeddingResponse<'doc> {
+    pub metadata: Metadata<'doc>,
+    pub embedding: Option<Embedding>,
+}
+
+pub trait OnEmbed<'doc> {
+    type ErrorMetadata;
+
+    fn process_embedding_response(&mut self, response: EmbeddingResponse<'doc>);
+    fn process_embedding_error(
+        &mut self,
+        error: EmbedError,
+        embedder_name: &'doc str,
+        unused_vectors_distribution: &Self::ErrorMetadata,
+        metadata: &[Metadata<'doc>],
+    ) -> crate::Error;
+
+    fn process_embeddings(&mut self, metadata: Metadata<'doc>, embeddings: Vec<Embedding>);
+}
+
+pub struct EmbedSession<'doc, C, I> {
+    // requests
+    inputs: BVec<'doc, I>,
+    metadata: BVec<'doc, Metadata<'doc>>,
+
+    threads: &'doc ThreadPoolNoAbort,
+    embedder: &'doc Embedder,
+
+    embedder_name: &'doc str,
+
+    on_embed: C,
+}
+
+pub trait Input: Sized {
+    fn embed_ref(
+        inputs: &[Self],
+        embedder: &Embedder,
+        threads: &ThreadPoolNoAbort,
+    ) -> std::result::Result<Vec<Embedding>, EmbedError>;
+}
+
+impl Input for &'_ str {
+    fn embed_ref(
+        inputs: &[Self],
+        embedder: &Embedder,
+        threads: &ThreadPoolNoAbort,
+    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
+        embedder.embed_index_ref(inputs, threads)
+    }
+}
+
+impl Input for Value {
+    fn embed_ref(
+        inputs: &[Value],
+        embedder: &Embedder,
+        threads: &ThreadPoolNoAbort,
+    ) -> std::result::Result<Vec<Embedding>, EmbedError> {
+        embedder.embed_index_ref_fragments(inputs, threads)
+    }
+}
+
+impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> {
+    #[allow(clippy::too_many_arguments)]
+    pub fn new(
+        embedder: &'doc Embedder,
+        embedder_name: &'doc str,
+        threads: &'doc ThreadPoolNoAbort,
+        doc_alloc: &'doc Bump,
+        on_embed: C,
+    ) -> Self {
+        let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
+        let texts = BVec::with_capacity_in(capacity, doc_alloc);
+        let ids = BVec::with_capacity_in(capacity, doc_alloc);
+        Self { inputs: texts, metadata: ids, embedder, threads, embedder_name, on_embed }
+    }
+
+    pub fn request_embedding(
+        &mut self,
+        metadata: Metadata<'doc>,
+        rendered: I,
+        unused_vectors_distribution: &C::ErrorMetadata,
+    ) -> Result<()> {
+        if self.inputs.len() < self.inputs.capacity() {
+            self.inputs.push(rendered);
+            self.metadata.push(metadata);
+            return Ok(());
+        }
+
+        self.embed_chunks(unused_vectors_distribution)
+    }
+
+    pub fn drain(mut self, unused_vectors_distribution: &C::ErrorMetadata) -> Result<C> {
+        self.embed_chunks(unused_vectors_distribution)?;
+        Ok(self.on_embed)
+    }
+
+    #[allow(clippy::too_many_arguments)]
+    fn embed_chunks(&mut self, unused_vectors_distribution: &C::ErrorMetadata) -> Result<()> {
+        if self.inputs.is_empty() {
+            return Ok(());
+        }
+        let res = match I::embed_ref(self.inputs.as_slice(), self.embedder, self.threads) {
+            Ok(embeddings) => {
+                for (metadata, embedding) in self.metadata.iter().copied().zip(embeddings) {
+                    self.on_embed.process_embedding_response(EmbeddingResponse {
+                        metadata,
+                        embedding: Some(embedding),
+                    });
+                }
+                Ok(())
+            }
+            Err(error) => {
+                return Err(self.on_embed.process_embedding_error(
+                    error,
+                    self.embedder_name,
+                    unused_vectors_distribution,
+                    &self.metadata,
+                ))
+            }
+        };
+        self.inputs.clear();
+        self.metadata.clear();
+        res
+    }
+
+    pub(crate) fn embedder_name(&self) -> &'doc str {
+        self.embedder_name
+    }
+
+    pub(crate) fn doc_alloc(&self) -> &'doc Bump {
+        self.inputs.bump()
+    }
+
+    pub(crate) fn on_embed_mut(&mut self) -> &mut C {
+        &mut self.on_embed
+    }
+}
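The session above batches rendered inputs and reports results through a caller-supplied `OnEmbed` callback. A sketch of the smallest useful callback, written against the trait exactly as added in this patch; the struct is invented for illustration, and the error-to-`milli::Error` conversion is deliberately elided:

    use milli::vector::error::EmbedError;
    use milli::vector::session::{EmbeddingResponse, Metadata, OnEmbed};
    use milli::vector::Embedding;

    // Illustrative only: collect each embedding together with the id of the
    // document and extractor that produced it.
    struct Collector<'doc> {
        received: Vec<(Metadata<'doc>, Embedding)>,
    }

    impl<'doc> OnEmbed<'doc> for Collector<'doc> {
        type ErrorMetadata = ();

        fn process_embedding_response(&mut self, response: EmbeddingResponse<'doc>) {
            if let Some(embedding) = response.embedding {
                self.received.push((response.metadata, embedding));
            }
        }

        fn process_embedding_error(
            &mut self,
            _error: EmbedError,
            _embedder_name: &'doc str,
            _unused_vectors_distribution: &Self::ErrorMetadata,
            _metadata: &[Metadata<'doc>],
        ) -> milli::Error {
            // A real callback would turn the failure into a user-facing error;
            // the conversion is elided from this sketch.
            unimplemented!("error conversion elided")
        }

        fn process_embeddings(&mut self, metadata: Metadata<'doc>, embeddings: Vec<Embedding>) {
            // `Metadata` is `Copy`, so it can be paired with every embedding.
            self.received.extend(embeddings.into_iter().map(|e| (metadata, e)));
        }
    }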
From 0b5bc41b792aec391ed293d9b903dfe007e06578 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Sun, 29 Jun 2025 23:50:42 +0200
Subject: [PATCH 046/101] Add new vector errors

---
 crates/milli/src/vector/error.rs | 73 ++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs
index 685022de8..00d4221e5 100644
--- a/crates/milli/src/vector/error.rs
+++ b/crates/milli/src/vector/error.rs
@@ -101,6 +101,32 @@ pub enum EmbedErrorKind {
     MissingEmbedding,
     #[error(transparent)]
     PanicInThreadPool(#[from] PanicCatched),
+    #[error("`media` requested but the configuration doesn't have source `rest`")]
+    RestMediaNotARest,
+    #[error("`media` requested, and the configuration has source `rest`, but the configuration doesn't have `searchFragments`.")]
+    RestMediaNotAFragment,
+
+    #[error("Query matches multiple search fragments.\n - Note: First matched fragment `{name}`.\n - Note: Second matched fragment `{second_name}`.\n - Note: {}",
+        {
+            serde_json::json!({
+                "q": q,
+                "media": media
+            })
+        })]
+    RestSearchMatchesMultipleFragments {
+        name: String,
+        second_name: String,
+        q: Option<String>,
+        media: Option<serde_json::Value>,
+    },
+    #[error("Query matches no search fragment.\n - Note: {}",
+        {
+            serde_json::json!({
+                "q": q,
+                "media": media
+            })
+        })]
+    RestSearchMatchesNoFragment { q: Option<String>, media: Option<serde_json::Value> },
 }
 
 fn option_info(info: Option<&str>, prefix: &str) -> String {
@@ -210,6 +236,44 @@ impl EmbedError {
     pub(crate) fn rest_extraction_error(error: String) -> EmbedError {
         Self { kind: EmbedErrorKind::RestExtractionError(error), fault: FaultSource::Runtime }
     }
+
+    pub(crate) fn rest_media_not_a_rest() -> EmbedError {
+        Self { kind: EmbedErrorKind::RestMediaNotARest, fault: FaultSource::User }
+    }
+
+    pub(crate) fn rest_media_not_a_fragment() -> EmbedError {
+        Self { kind: EmbedErrorKind::RestMediaNotAFragment, fault: FaultSource::User }
+    }
+
+    pub(crate) fn rest_search_matches_multiple_fragments(
+        name: &str,
+        second_name: &str,
+        q: Option<&str>,
+        media: Option<&serde_json::Value>,
+    ) -> EmbedError {
+        Self {
+            kind: EmbedErrorKind::RestSearchMatchesMultipleFragments {
+                name: name.to_string(),
+                second_name: second_name.to_string(),
+                q: q.map(String::from),
+                media: media.cloned(),
+            },
+            fault: FaultSource::User,
+        }
+    }
+
+    pub(crate) fn rest_search_matches_no_fragment(
+        q: Option<&str>,
+        media: Option<&serde_json::Value>,
+    ) -> EmbedError {
+        Self {
+            kind: EmbedErrorKind::RestSearchMatchesNoFragment {
+                q: q.map(String::from),
+                media: media.cloned(),
+            },
+            fault: FaultSource::User,
+        }
+    }
 }
 
 #[derive(Debug, thiserror::Error)]
@@ -382,6 +446,13 @@ impl NewEmbedderError {
             fault: FaultSource::User,
         }
     }
+
+    pub(crate) fn rest_cannot_infer_dimensions_for_fragment() -> NewEmbedderError {
+        Self {
+            kind: NewEmbedderErrorKind::RestCannotInferDimensionsForFragment,
+            fault: FaultSource::User,
+        }
+    }
 }
 
 #[derive(Debug, Clone, Copy)]
@@ -499,6 +570,8 @@ pub enum NewEmbedderErrorKind {
     CompositeEmbeddingCountMismatch { search_count: usize, index_count: usize },
     #[error("error while generating test embeddings.\n - the embeddings produced at search time and indexing time are not similar enough.\n - angular distance {distance:.2}\n - Meilisearch requires a maximum distance of {MAX_COMPOSITE_DISTANCE}.\n - Note: check that both embedders produce similar embeddings.{hint}")]
     CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace },
+    #[error("cannot infer `dimensions` for an embedder using `indexingFragments`.\n - Note: Specify `dimensions` explicitly or don't use `indexingFragments`.")]
+    RestCannotInferDimensionsForFragment,
 }
 
 pub struct PossibleEmbeddingMistakes {
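The two `RestSearchMatches*` errors above encode one rule: for a given `(q, media)` pair, exactly one search fragment may render successfully. A self-contained sketch of that selection rule, mirroring the `render_search_fragment` logic added later in this series (patch 051); names and the `String` error type are illustrative:

    use serde_json::Value;

    // Sketch: given the fragments that rendered for this query, accept exactly
    // one. Zero matches and two-or-more matches are both user errors.
    fn select_fragment<'a>(
        mut rendered: impl Iterator<Item = (&'a str, Value)>,
    ) -> Result<(&'a str, Value), String> {
        let Some((name, fragment)) = rendered.next() else {
            return Err("query matches no search fragment".into());
        };
        if let Some((second_name, _)) = rendered.next() {
            return Err(format!(
                "query matches multiple search fragments: `{name}` and `{second_name}`"
            ));
        }
        Ok((name, fragment))
    }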
From 836ae19becebd5e3dbb81fc322a724159d3fa8d7 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Sun, 29 Jun 2025 23:52:05 +0200
Subject: [PATCH 047/101] ArroyWrapper changes

---
 crates/milli/src/vector/mod.rs | 241 ++++++++++++++++++++++++---------
 1 file changed, 180 insertions(+), 61 deletions(-)

diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs
index 395c5d704..3e7dc270d 100644
--- a/crates/milli/src/vector/mod.rs
+++ b/crates/milli/src/vector/mod.rs
@@ -15,6 +15,8 @@ use utoipa::ToSchema;
 use self::error::{EmbedError, NewEmbedderError};
 use crate::progress::{EmbedderStats, Progress};
 use crate::prompt::{Prompt, PromptData};
+use crate::vector::composite::SubEmbedderOptions;
+use crate::vector::json_template::JsonTemplate;
 use crate::ThreadPoolNoAbort;
 
 pub mod composite;
@@ -63,7 +65,7 @@ impl ArroyWrapper {
         rtxn: &'a RoTxn<'a>,
         db: arroy::Database<D>,
     ) -> impl Iterator<Item = Result<arroy::Reader<'a, D>, arroy::Error>> + 'a {
-        arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| {
+        arroy_store_range_for_embedder(self.embedder_index).filter_map(move |index| {
             match arroy::Reader::open(rtxn, index, db) {
                 Ok(reader) => match reader.is_empty(rtxn) {
                     Ok(false) => Some(Ok(reader)),
                     Ok(true) => None,
                     Err(e) => Some(Err(e)),
                 },
                 Err(arroy::Error::MissingMetadata(_)) => None,
                 Err(e) => Some(Err(e)),
             }
         })
     }
 
-    pub fn dimensions(&self, rtxn: &RoTxn) -> Result<usize, arroy::Error> {
-        let first_id = arroy_db_range_for_embedder(self.embedder_index).next().unwrap();
+    /// The item ids that are present in the store specified by its id.
+    ///
+    /// The ids are accessed via a lambda to avoid lifetime shenanigans.
+    pub fn items_in_store<F, O>(
+        &self,
+        rtxn: &RoTxn,
+        store_id: u8,
+        with_items: F,
+    ) -> Result<O, arroy::Error>
+    where
+        F: FnOnce(&RoaringBitmap) -> O,
+    {
         if self.quantized {
-            Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions())
+            self._items_in_store(rtxn, self.quantized_db(), store_id, with_items)
         } else {
-            Ok(arroy::Reader::open(rtxn, first_id, self.angular_db())?.dimensions())
+            self._items_in_store(rtxn, self.angular_db(), store_id, with_items)
+        }
+    }
+
+    fn _items_in_store<D: arroy::Distance, F, O>(
+        &self,
+        rtxn: &RoTxn,
+        db: arroy::Database<D>,
+        store_id: u8,
+        with_items: F,
+    ) -> Result<O, arroy::Error>
+    where
+        F: FnOnce(&RoaringBitmap) -> O,
+    {
+        let index = arroy_store_for_embedder(self.embedder_index, store_id);
+        let reader = arroy::Reader::open(rtxn, index, db);
+        match reader {
+            Ok(reader) => Ok(with_items(reader.item_ids())),
+            Err(arroy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())),
+            Err(err) => Err(err),
+        }
+    }
+
+    pub fn dimensions(&self, rtxn: &RoTxn) -> Result<Option<usize>, arroy::Error> {
+        if self.quantized {
+            Ok(self
+                .readers(rtxn, self.quantized_db())
+                .next()
+                .transpose()?
+ .map(|reader| reader.dimensions())) } } @@ -96,13 +143,13 @@ impl ArroyWrapper { arroy_memory: Option, cancel: &(impl Fn() -> bool + Sync + Send), ) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.need_build(wtxn)? { writer.builder(rng).build(wtxn)? } else if writer.is_empty(wtxn)? { - break; + continue; } } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); @@ -127,7 +174,7 @@ impl ArroyWrapper { .cancel(cancel) .build(wtxn)?; } else if writer.is_empty(wtxn)? { - break; + continue; } } } @@ -146,7 +193,7 @@ impl ArroyWrapper { ) -> Result<(), arroy::Error> { let dimension = embeddings.dimension(); for (index, vector) in - arroy_db_range_for_embedder(self.embedder_index).zip(embeddings.iter()) + arroy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter()) { if self.quantized { arroy::Writer::new(self.quantized_db(), index, dimension) @@ -182,7 +229,7 @@ impl ArroyWrapper { ) -> Result<(), arroy::Error> { let dimension = vector.len(); - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { let writer = arroy::Writer::new(db, index, dimension); if !writer.contains_item(wtxn, item_id)? { writer.add_item(wtxn, item_id, vector)?; @@ -192,6 +239,38 @@ impl ArroyWrapper { Ok(()) } + /// Add a vector associated with a document in store specified by its id. + /// + /// Any existing vector associated with the document in the store will be replaced by the new vector. + pub fn add_item_in_store( + &self, + wtxn: &mut RwTxn, + item_id: arroy::ItemId, + store_id: u8, + vector: &[f32], + ) -> Result<(), arroy::Error> { + if self.quantized { + self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector) + } else { + self._add_item_in_store(wtxn, self.angular_db(), item_id, store_id, vector) + } + } + + fn _add_item_in_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + store_id: u8, + vector: &[f32], + ) -> Result<(), arroy::Error> { + let dimension = vector.len(); + + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimension); + writer.add_item(wtxn, item_id, vector) + } + /// Delete all embeddings from a specific `item_id` pub fn del_items( &self, @@ -199,24 +278,84 @@ impl ArroyWrapper { dimension: usize, item_id: arroy::ItemId, ) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - if !writer.del_item(wtxn, item_id)? { - break; - } + writer.del_item(wtxn, item_id)?; } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); - if !writer.del_item(wtxn, item_id)? { - break; - } + writer.del_item(wtxn, item_id)?; } } Ok(()) } - /// Delete one item. + /// Removes the item specified by its id from the store specified by its id. + /// + /// Returns whether the item was removed. + /// + /// # Warning + /// + /// - This function will silently fail to remove the item if used against an arroy database that was never built. 
+ pub fn del_item_in_store( + &self, + wtxn: &mut RwTxn, + item_id: arroy::ItemId, + store_id: u8, + dimensions: usize, + ) -> Result { + if self.quantized { + self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions) + } else { + self._del_item_in_store(wtxn, self.angular_db(), item_id, store_id, dimensions) + } + } + + fn _del_item_in_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + store_id: u8, + dimensions: usize, + ) -> Result { + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimensions); + writer.del_item(wtxn, item_id) + } + + /// Removes all items from the store specified by its id. + /// + /// # Warning + /// + /// - This function will silently fail to remove the items if used against an arroy database that was never built. + pub fn clear_store( + &self, + wtxn: &mut RwTxn, + store_id: u8, + dimensions: usize, + ) -> Result<(), arroy::Error> { + if self.quantized { + self._clear_store(wtxn, self.quantized_db(), store_id, dimensions) + } else { + self._clear_store(wtxn, self.angular_db(), store_id, dimensions) + } + } + + fn _clear_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + store_id: u8, + dimensions: usize, + ) -> Result<(), arroy::Error> { + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimensions); + writer.clear(wtxn) + } + + /// Delete one item from its value. pub fn del_item( &self, wtxn: &mut RwTxn, @@ -238,54 +377,31 @@ impl ArroyWrapper { vector: &[f32], ) -> Result { let dimension = vector.len(); - let mut deleted_index = None; - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { let writer = arroy::Writer::new(db, index, dimension); let Some(candidate) = writer.item_vector(wtxn, item_id)? else { - // uses invariant: vectors are packed in the first writers. - break; + continue; }; if candidate == vector { - writer.del_item(wtxn, item_id)?; - deleted_index = Some(index); + return writer.del_item(wtxn, item_id); } } - - // 🥲 enforce invariant: vectors are packed in the first writers. - if let Some(deleted_index) = deleted_index { - let mut last_index_with_a_vector = None; - for index in - arroy_db_range_for_embedder(self.embedder_index).skip(deleted_index as usize) - { - let writer = arroy::Writer::new(db, index, dimension); - let Some(candidate) = writer.item_vector(wtxn, item_id)? else { - break; - }; - last_index_with_a_vector = Some((index, candidate)); - } - if let Some((last_index, vector)) = last_index_with_a_vector { - let writer = arroy::Writer::new(db, last_index, dimension); - writer.del_item(wtxn, item_id)?; - let writer = arroy::Writer::new(db, deleted_index, dimension); - writer.add_item(wtxn, item_id, &vector)?; - } - } - Ok(deleted_index.is_some()) + Ok(false) } pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.is_empty(wtxn)? { - break; + continue; } writer.clear(wtxn)?; } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); if writer.is_empty(wtxn)? 
{ - break; + continue; } writer.clear(wtxn)?; } @@ -299,17 +415,17 @@ impl ArroyWrapper { dimension: usize, item: arroy::ItemId, ) -> Result { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { let contains = if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.is_empty(rtxn)? { - break; + continue; } writer.contains_item(rtxn, item)? } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); if writer.is_empty(rtxn)? { - break; + continue; } writer.contains_item(rtxn, item)? }; @@ -348,13 +464,14 @@ impl ArroyWrapper { let reader = reader?; let mut searcher = reader.nns(limit); if let Some(filter) = filter { + if reader.item_ids().is_disjoint(filter) { + continue; + } searcher.candidates(filter); } if let Some(mut ret) = searcher.by_item(rtxn, item)? { results.append(&mut ret); - } else { - break; } } results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); @@ -389,6 +506,9 @@ impl ArroyWrapper { let reader = reader?; let mut searcher = reader.nns(limit); if let Some(filter) = filter { + if reader.item_ids().is_disjoint(filter) { + continue; + } searcher.candidates(filter); } @@ -407,16 +527,12 @@ impl ArroyWrapper { for reader in self.readers(rtxn, self.quantized_db()) { if let Some(vec) = reader?.item_vector(rtxn, item_id)? { vectors.push(vec); - } else { - break; } } } else { for reader in self.readers(rtxn, self.angular_db()) { if let Some(vec) = reader?.item_vector(rtxn, item_id)? { vectors.push(vec); - } else { - break; } } } @@ -989,8 +1105,11 @@ pub const fn is_cuda_enabled() -> bool { cfg!(feature = "cuda") } -pub fn arroy_db_range_for_embedder(embedder_id: u8) -> impl Iterator { - let embedder_id = (embedder_id as u16) << 8; - - (0..=u8::MAX).map(move |k| embedder_id | (k as u16)) +fn arroy_store_range_for_embedder(embedder_id: u8) -> impl Iterator { + (0..=u8::MAX).map(move |store_id| arroy_store_for_embedder(embedder_id, store_id)) +} + +fn arroy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 { + let embedder_id = (embedder_id as u16) << 8; + embedder_id | (store_id as u16) } From 422a786ffdaef07639191f9ec4fedc868be6c7ee Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:52:36 +0200 Subject: [PATCH 048/101] RuntimeEmbedder and RuntimeFragments --- crates/milli/src/vector/mod.rs | 37 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 3e7dc270d..37ade8f81 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -584,6 +584,7 @@ pub struct ArroyStats { pub documents: RoaringBitmap, } /// One or multiple embeddings stored consecutively in a flat vector. +#[derive(Debug, PartialEq)] pub struct Embeddings { data: Vec, dimension: usize, @@ -734,15 +735,26 @@ impl EmbeddingConfig { } } -/// Map of embedder configurations. -/// -/// Each configuration is mapped to a name. +/// Map of runtime embedder data. 
#[derive(Clone, Default)] -pub struct EmbeddingConfigs(HashMap, Arc, bool)>); +pub struct RuntimeEmbedders(HashMap>); -impl EmbeddingConfigs { +pub struct RuntimeEmbedder { + pub embedder: Arc, + pub document_template: Prompt, + pub fragments: Vec, + pub is_quantized: bool, +} + +pub struct RuntimeFragment { + pub name: String, + pub id: u8, + pub template: JsonTemplate, +} + +impl RuntimeEmbedders { /// Create the map from its internal component.s - pub fn new(data: HashMap, Arc, bool)>) -> Self { + pub fn new(data: HashMap>) -> Self { Self(data) } @@ -751,24 +763,23 @@ impl EmbeddingConfigs { } /// Get an embedder configuration and template from its name. - pub fn get(&self, name: &str) -> Option<(Arc, Arc, bool)> { + pub fn get(&self, name: &str) -> Option> { self.0.get(name).cloned() } - pub fn inner_as_ref(&self) -> &HashMap, Arc, bool)> { + pub fn inner_as_ref(&self) -> &HashMap> { &self.0 } - pub fn into_inner(self) -> HashMap, Arc, bool)> { + pub fn into_inner(self) -> HashMap> { self.0 } } -impl IntoIterator for EmbeddingConfigs { - type Item = (String, (Arc, Arc, bool)); +impl IntoIterator for RuntimeEmbedders { + type Item = (String, Arc); - type IntoIter = - std::collections::hash_map::IntoIter, Arc, bool)>; + type IntoIter = std::collections::hash_map::IntoIter>; fn into_iter(self) -> Self::IntoIter { self.0.into_iter() From 5716ab70f38c521f768f98f64aa32399f0fedb54 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:07:32 +0200 Subject: [PATCH 049/101] EmbeddingConfigs -> RuntimeEmbedders --- crates/benchmarks/benches/indexing.rs | 64 +++++++++---------- crates/benchmarks/benches/utils.rs | 4 +- crates/fuzzers/src/bin/fuzz-indexing.rs | 4 +- crates/index-scheduler/src/lib.rs | 58 ++++++++++++----- .../src/scheduler/process_dump_creation.rs | 11 +--- .../src/scheduler/process_index_operation.rs | 9 ++- crates/index-scheduler/src/scheduler/test.rs | 2 +- crates/meilisearch/src/lib.rs | 2 +- .../src/routes/indexes/documents.rs | 13 +--- crates/meilisearch/src/search/mod.rs | 19 ++---- crates/meilitool/src/main.rs | 10 +-- .../milli/src/search/new/tests/integration.rs | 4 +- crates/milli/src/search/new/vector_sort.rs | 4 +- crates/milli/src/search/similar.rs | 11 ++-- crates/milli/src/test_index.rs | 4 +- .../milli/tests/search/facet_distribution.rs | 4 +- crates/milli/tests/search/mod.rs | 4 +- crates/milli/tests/search/query_criteria.rs | 4 +- crates/milli/tests/search/typo_tolerance.rs | 4 +- 19 files changed, 118 insertions(+), 117 deletions(-) diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 16e7a2f81..4083b69dd 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -11,7 +11,7 @@ use milli::heed::{EnvOpenOptions, RwTxn}; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{FilterableAttributesRule, Index}; use rand::seq::SliceRandom; use rand_chacha::rand_core::SeedableRng; @@ -166,7 +166,7 @@ fn indexing_songs_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -233,7 +233,7 @@ fn reindexing_songs_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), 
&Default::default(), @@ -278,7 +278,7 @@ fn reindexing_songs_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -347,7 +347,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -424,7 +424,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -469,7 +469,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -510,7 +510,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -578,7 +578,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -645,7 +645,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -712,7 +712,7 @@ fn indexing_wiki(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -778,7 +778,7 @@ fn reindexing_wiki(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -823,7 +823,7 @@ fn reindexing_wiki(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -891,7 +891,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -968,7 +968,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1014,7 +1014,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1056,7 +1056,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1123,7 +1123,7 @@ fn indexing_movies_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - 
EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1189,7 +1189,7 @@ fn reindexing_movies_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1234,7 +1234,7 @@ fn reindexing_movies_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1302,7 +1302,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1351,7 +1351,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec Index { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), diff --git a/crates/fuzzers/src/bin/fuzz-indexing.rs b/crates/fuzzers/src/bin/fuzz-indexing.rs index 0632b7846..ec1f96fd5 100644 --- a/crates/fuzzers/src/bin/fuzz-indexing.rs +++ b/crates/fuzzers/src/bin/fuzz-indexing.rs @@ -13,7 +13,7 @@ use milli::heed::EnvOpenOptions; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::IndexerConfig; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::Index; use serde_json::Value; use tempfile::TempDir; @@ -89,7 +89,7 @@ fn main() { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut operations = Vec::new(); diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 505ce23f8..f551652c1 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -57,12 +57,15 @@ use meilisearch_types::features::{ use meilisearch_types::heed::byteorder::BE; use meilisearch_types::heed::types::{DecodeIgnore, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, RoTxn, WithoutTls}; -use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; -use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; +use meilisearch_types::milli::vector::json_template::JsonTemplate; +use meilisearch_types::milli::vector::{ + Embedder, EmbedderOptions, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment, +}; use meilisearch_types::milli::{self, Index}; use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{KindWithContent, Task}; +use milli::vector::db::IndexEmbeddingConfig; use processing::ProcessingTasks; pub use queue::Query; use queue::Queue; @@ -851,29 +854,42 @@ impl IndexScheduler { &self, index_uid: String, embedding_configs: Vec, - ) -> Result { + ) -> Result { let res: Result<_> = embedding_configs .into_iter() .map( |IndexEmbeddingConfig { name, config: milli::vector::EmbeddingConfig { embedder_options, prompt, quantized }, - .. 
- }| { - let prompt = Arc::new( - prompt - .try_into() - .map_err(meilisearch_types::milli::Error::from) - .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, - ); + fragments, + }| + -> Result<(String, Arc)> { + let document_template = prompt + .try_into() + .map_err(meilisearch_types::milli::Error::from) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + + let fragments = fragments + .into_inner() + .into_iter() + .map(|fragment| { + let value = embedder_options.fragment(&fragment.name).unwrap(); + let template = JsonTemplate::new(value.clone()).unwrap(); + RuntimeFragment { name: fragment.name, id: fragment.id, template } + }) + .collect(); // optimistically return existing embedder { let embedders = self.embedders.read().unwrap(); if let Some(embedder) = embedders.get(&embedder_options) { - return Ok(( - name, - (embedder.clone(), prompt, quantized.unwrap_or_default()), - )); + let runtime = Arc::new(RuntimeEmbedder { + embedder: embedder.clone(), + document_template, + fragments, + is_quantized: quantized.unwrap_or_default(), + }); + + return Ok((name, runtime)); } } @@ -889,11 +905,19 @@ impl IndexScheduler { let mut embedders = self.embedders.write().unwrap(); embedders.insert(embedder_options, embedder.clone()); } - Ok((name, (embedder, prompt, quantized.unwrap_or_default()))) + + let runtime = Arc::new(RuntimeEmbedder { + embedder: embedder.clone(), + document_template, + fragments, + is_quantized: quantized.unwrap_or_default(), + }); + + Ok((name, runtime)) }, ) .collect(); - res.map(EmbeddingConfigs::new) + res.map(RuntimeEmbedders::new) } pub fn chat_settings(&self, uid: &str) -> Result> { diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index a6d785b2f..ec1be0e93 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -165,9 +165,6 @@ impl IndexScheduler { let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index - .embedding_configs(&rtxn) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; let nb_documents = index .number_of_documents(&rtxn) @@ -221,16 +218,12 @@ impl IndexScheduler { return Err(Error::from_milli(user_err, Some(uid.to_string()))); }; - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(id)); + for (embedder_name, (embeddings, regenerate)) in embeddings { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate: !user_provided, + regenerate, }; vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); } diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs index 04aaf9a84..62d0e6545 100644 --- a/crates/index-scheduler/src/scheduler/process_index_operation.rs +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -89,8 +89,9 @@ impl IndexScheduler { let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(); let embedders = index + .embedding_configs() .embedding_configs(index_wtxn) - .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + .map_err(|e| 
Error::from_milli(e.into(), Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; for operation in operations { match operation { @@ -274,8 +275,9 @@ impl IndexScheduler { }) .unwrap()?; let embedders = index + .embedding_configs() .embedding_configs(index_wtxn) - .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; progress.update_progress(DocumentEditionProgress::Indexing); @@ -423,8 +425,9 @@ impl IndexScheduler { indexer.delete_documents_by_docids(to_delete); let document_changes = indexer.into_changes(&indexer_alloc, primary_key); let embedders = index + .embedding_configs() .embedding_configs(index_wtxn) - .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; progress.update_progress(DocumentDeletionProgress::Indexing); diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index ee26165c7..2c492525f 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -3,11 +3,11 @@ use std::collections::BTreeMap; use big_s::S; use meili_snap::{json_string, snapshot}; use meilisearch_auth::AuthFilter; -use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexDocumentsMethod::*; use meilisearch_types::milli::{self}; use meilisearch_types::settings::SettingEmbeddingSettings; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; +use milli::vector::db::IndexEmbeddingConfig; use roaring::RoaringBitmap; use crate::insta_snapshot::snapshot_index_scheduler; diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 871bd688e..e1acef2ce 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -563,7 +563,7 @@ fn import_dump( let reader = BufReader::new(file); let reader = DocumentsBatchReader::from_reader(reader)?; - let embedder_configs = index.embedding_configs(&wtxn)?; + let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?; let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?; let builder = milli::update::IndexDocuments::new( diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 50eec46fe..a93d736f7 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1452,7 +1452,6 @@ fn some_documents<'a, 't: 'a>( ) -> Result> + 'a, ResponseError> { let fields_ids_map = index.fields_ids_map(rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index.embedding_configs(rtxn)?; Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { @@ -1468,15 +1467,9 @@ fn some_documents<'a, 't: 'a>( Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, vector) in index.embeddings(rtxn, key)? 
{ - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == name) - .is_some_and(|conf| conf.user_provided.contains(key)); - let embeddings = ExplicitVectors { - embeddings: Some(vector.into()), - regenerate: !user_provided, - }; + for (name, (vector, regenerate)) in index.embeddings(rtxn, key)? { + let embeddings = + ExplicitVectors { embeddings: Some(vector.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 5e543c53f..61ef3f813 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -399,10 +399,10 @@ impl SearchKind { route: Route, ) -> Result<(String, Arc, bool), ResponseError> { let rtxn = index.read_txn()?; - let embedder_configs = index.embedding_configs(&rtxn)?; + let embedder_configs = index.embedding_configs().embedding_configs(&rtxn)?; let embedders = index_scheduler.embedders(index_uid, embedder_configs)?; - let (embedder, _, quantized) = embedders + let (embedder, quantized) = embedders .get(embedder_name) .ok_or(match route { Route::Search | Route::MultiSearch => { @@ -412,6 +412,7 @@ impl SearchKind { milli::UserError::InvalidSimilarEmbedder(embedder_name.to_owned()) } }) + .map(|runtime| (runtime.embedder.clone(), runtime.is_quantized)) .map_err(milli::Error::from)?; if let Some(vector_len) = vector_len { @@ -1328,7 +1329,6 @@ struct HitMaker<'a> { vectors_fid: Option, retrieve_vectors: RetrieveVectors, to_retrieve_ids: BTreeSet, - embedding_configs: Vec, formatter_builder: MatcherBuilder<'a>, formatted_options: BTreeMap, show_ranking_score: bool, @@ -1443,8 +1443,6 @@ impl<'a> HitMaker<'a> { &displayed_ids, ); - let embedding_configs = index.embedding_configs(rtxn)?; - Ok(Self { index, rtxn, @@ -1453,7 +1451,6 @@ impl<'a> HitMaker<'a> { vectors_fid, retrieve_vectors, to_retrieve_ids, - embedding_configs, formatter_builder, formatted_options, show_ranking_score: format.show_ranking_score, @@ -1499,14 +1496,8 @@ impl<'a> HitMaker<'a> { Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, vector) in self.index.embeddings(self.rtxn, id)? { - let user_provided = self - .embedding_configs - .iter() - .find(|conf| conf.name == name) - .is_some_and(|conf| conf.user_provided.contains(id)); - let embeddings = - ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; + for (name, (vector, regenerate)) in self.index.embeddings(self.rtxn, id)? 
{ + let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?, diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index dd1213782..b967e620c 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -545,7 +545,6 @@ fn export_documents( let rtxn = index.read_txn()?; let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index.embedding_configs(&rtxn)?; if let Some(offset) = offset { eprintln!("Skipping {offset} documents"); @@ -592,17 +591,12 @@ fn export_documents( .into()); }; - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(id)); - + for (embedder_name, (embeddings, regenerate)) in embeddings { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate: !user_provided, + regenerate, }; vectors .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index 9e2afca97..38f39e18b 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -8,7 +8,7 @@ use maplit::{btreemap, hashset}; use crate::progress::Progress; use crate::update::new::indexer; use crate::update::{IndexerConfig, Settings}; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::{db_snap, Criterion, FilterableAttributesRule, Index}; pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson"); use crate::constants::RESERVED_GEO_FIELD_NAME; @@ -55,7 +55,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); diff --git a/crates/milli/src/search/new/vector_sort.rs b/crates/milli/src/search/new/vector_sort.rs index 834f97384..2c201e899 100644 --- a/crates/milli/src/search/new/vector_sort.rs +++ b/crates/milli/src/search/new/vector_sort.rs @@ -32,8 +32,8 @@ impl VectorSort { ) -> Result { let embedder_index = ctx .index - .embedder_category_id - .get(ctx.txn, embedder_name)? + .embedding_configs() + .embedder_id(ctx.txn, embedder_name)? .ok_or_else(|| crate::UserError::InvalidSearchEmbedder(embedder_name.to_owned()))?; Ok(Self { diff --git a/crates/milli/src/search/similar.rs b/crates/milli/src/search/similar.rs index 759940f9c..903b5fcf9 100644 --- a/crates/milli/src/search/similar.rs +++ b/crates/milli/src/search/similar.rs @@ -64,10 +64,13 @@ impl<'a> Similar<'a> { let universe = universe; - let embedder_index = - self.index.embedder_category_id.get(self.rtxn, &self.embedder_name)?.ok_or_else( - || crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()), - )?; + let embedder_index = self + .index + .embedding_configs() + .embedder_id(self.rtxn, &self.embedder_name)? 
+ .ok_or_else(|| { + crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()) + })?; let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized); let results = reader.nns_by_item( diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index f2e34c615..cfd8c8492 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -18,7 +18,7 @@ use crate::update::{ self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, Settings, }; use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::{db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult}; pub(crate) struct TempIndex { @@ -223,7 +223,7 @@ fn aborting_indexation() { let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let payload = documents!([ { "id": 1, "name": "kevin" }, diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index d04db425e..cc1b85369 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -5,7 +5,7 @@ use milli::documents::mmap_from_objects; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{FacetDistribution, FilterableAttributesRule, Index, Object, OrderBy}; use serde_json::{from_value, json}; @@ -35,7 +35,7 @@ fn test_facet_distribution_with_no_facet_values() { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let doc1: Object = from_value( diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 3ee78561d..fa03f1cc1 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -10,7 +10,7 @@ use maplit::{btreemap, hashset}; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{ AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member, TermsMatchingStrategy, }; @@ -74,7 +74,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index cb0c23e42..3f8134085 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -8,7 +8,7 @@ use maplit::hashset; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use 
milli::vector::RuntimeEmbedders; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy}; use rand::Rng; use Criterion::*; @@ -288,7 +288,7 @@ fn criteria_ascdesc() { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs index 49c9c7b5d..95ff85165 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -6,7 +6,7 @@ use milli::documents::mmap_from_objects; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{Criterion, Index, Object, Search, TermsMatchingStrategy}; use serde_json::from_value; use tempfile::tempdir; @@ -123,7 +123,7 @@ fn test_typo_disabled_on_word() { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.replace_documents(&documents).unwrap(); From e7b9b8f00230429831fe5467f3a1d5161465e6e2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:53:06 +0200 Subject: [PATCH 050/101] Change embedder API --- crates/milli/src/vector/mod.rs | 75 ++++++++++++++++++++++++++++++++-- 1 file changed, 71 insertions(+), 4 deletions(-) diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 37ade8f81..87ecd2414 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -797,6 +797,27 @@ pub enum EmbedderOptions { Composite(composite::EmbedderOptions), } +impl EmbedderOptions { + pub fn fragment(&self, name: &str) -> Option<&serde_json::Value> { + match &self { + EmbedderOptions::HuggingFace(_) + | EmbedderOptions::OpenAi(_) + | EmbedderOptions::Ollama(_) + | EmbedderOptions::UserProvided(_) => None, + EmbedderOptions::Rest(embedder_options) => { + embedder_options.indexing_fragments.get(name) + } + EmbedderOptions::Composite(embedder_options) => { + if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index { + embedder_options.indexing_fragments.get(name) + } else { + None + } + } + } + } +} + impl Default for EmbedderOptions { fn default() -> Self { Self::HuggingFace(Default::default()) @@ -837,6 +858,17 @@ impl Embedder { #[tracing::instrument(level = "debug", skip_all, target = "search")] pub fn embed_search( + &self, + query: SearchQuery<'_>, + deadline: Option<Instant>, + ) -> std::result::Result<Embedding, EmbedError> { + match query { + SearchQuery::Text(text) => self.embed_search_text(text, deadline), + SearchQuery::Media { q, media } => self.embed_search_media(q, media, deadline), + } + } + + pub fn embed_search_text( &self, text: &str, deadline: Option<Instant>, ) -> std::result::Result<Embedding, EmbedError> { ... } match self { Embedder::HuggingFace(embedder) => embedder.embed_one(text), Embedder::OpenAi(embedder) => embedder .embed(&[text], deadline, None)? .pop() .ok_or_else(EmbedError::missing_embedding), Embedder::Ollama(embedder) => embedder .embed(&[text], deadline, None)? .pop() .ok_or_else(EmbedError::missing_embedding), Embedder::UserProvided(embedder) => embedder.embed_one(text), - Embedder::Rest(embedder) => embedder - .embed_ref(&[text], deadline, None)?
- .pop() - .ok_or_else(EmbedError::missing_embedding), + Embedder::Rest(embedder) => embedder.embed_one(SearchQuery::Text(text), deadline, None), Embedder::Composite(embedder) => embedder.search.embed_one(text, deadline, None), }?; @@ -872,6 +901,18 @@ impl Embedder { Ok(embedding) } + pub fn embed_search_media( + &self, + q: Option<&str>, + media: Option<&serde_json::Value>, + deadline: Option, + ) -> std::result::Result { + let Embedder::Rest(embedder) = self else { + return Err(EmbedError::rest_media_not_a_rest()); + }; + embedder.embed_one(SearchQuery::Media { q, media }, deadline, None) + } + /// Embed multiple chunks of texts. /// /// Each chunk is composed of one or multiple texts. @@ -916,6 +957,26 @@ impl Embedder { } } + pub fn embed_index_ref_fragments( + &self, + fragments: &[serde_json::Value], + threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, + ) -> std::result::Result, EmbedError> { + if let Embedder::Rest(embedder) = self { + embedder.embed_index_ref(fragments, threads, embedder_stats) + } else { + let Embedder::Composite(embedder) = self else { + unimplemented!("embedding fragments is only available for rest embedders") + }; + let crate::vector::composite::SubEmbedder::Rest(embedder) = &embedder.index else { + unimplemented!("embedding fragments is only available for rest embedders") + }; + + embedder.embed_index_ref(fragments, threads, embedder_stats) + } + } + /// Indicates the preferred number of chunks to pass to [`Self::embed_chunks`] pub fn chunk_count_hint(&self) -> usize { match self { @@ -987,6 +1048,12 @@ impl Embedder { } } +#[derive(Clone, Copy)] +pub enum SearchQuery<'a> { + Text(&'a str), + Media { q: Option<&'a str>, media: Option<&'a serde_json::Value> }, +} + /// Describes the mean and sigma of distribution of embedding similarity in the embedding space. /// /// The intended use is to make the similarity score more comparable to the regular ranking score. 
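Between patches, a sketch of how search-side code is expected to drive the reworked entry point above. The `SearchQuery` variants are as defined in this patch; the `deadline` type is assumed to be `Option<std::time::Instant>` as in the pre-existing search code, and the routing function itself is illustrative:

    use std::time::Instant;

    use milli::vector::error::EmbedError;
    use milli::vector::{Embedder, Embedding, SearchQuery};

    // Sketch: a plain text query takes the historical text path; anything
    // involving media goes through the media path, which is only valid for a
    // `rest` embedder with `searchFragments` (otherwise `embed_search_media`
    // rejects it with the new `RestMediaNotARest` error).
    fn embed_for_search(
        embedder: &Embedder,
        q: Option<&str>,
        media: Option<&serde_json::Value>,
        deadline: Option<Instant>,
    ) -> Result<Embedding, EmbedError> {
        match (q, media) {
            (Some(q), None) => embedder.embed_search(SearchQuery::Text(q), deadline),
            (q, media) => embedder.embed_search(SearchQuery::Media { q, media }, deadline),
        }
    }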
From 4235a82dcfab23f5bab89cee819c49f91dd68712 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:54:06 +0200 Subject: [PATCH 051/101] REST embedder supports fragments --- crates/milli/src/vector/rest.rs | 231 +++++++++++++++++++++++++++----- 1 file changed, 197 insertions(+), 34 deletions(-) diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index fbe3c1129..9477959ad 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -6,11 +6,13 @@ use rand::Rng; use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; use rayon::slice::ParallelSlice as _; use serde::{Deserialize, Serialize}; +use serde_json::Value; use super::error::EmbedErrorKind; -use super::json_template::ValueTemplate; +use super::json_template::{InjectableValue, JsonTemplate}; use super::{ - DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, REQUEST_PARALLELISM, + DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, SearchQuery, + REQUEST_PARALLELISM, }; use crate::error::FaultSource; use crate::progress::EmbedderStats; @@ -88,19 +90,54 @@ struct EmbedderData { bearer: Option, headers: BTreeMap, url: String, - request: Request, + request: RequestData, response: Response, configuration_source: ConfigurationSource, } +#[derive(Debug)] +pub enum RequestData { + Single(Request), + FromFragments(RequestFromFragments), +} + +impl RequestData { + pub fn new( + request: Value, + indexing_fragments: BTreeMap, + search_fragments: BTreeMap, + ) -> Result { + Ok(if indexing_fragments.is_empty() && search_fragments.is_empty() { + RequestData::Single(Request::new(request)?) + } else { + RequestData::FromFragments(RequestFromFragments::new(request, search_fragments)?) 
+ }) + } + + fn input_type(&self) -> InputType { + match self { + RequestData::Single(request) => request.input_type(), + RequestData::FromFragments(request_from_fragments) => { + request_from_fragments.input_type() + } + } + } + + fn has_fragments(&self) -> bool { + matches!(self, RequestData::FromFragments(_)) + } +} + #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] pub struct EmbedderOptions { pub api_key: Option, pub distribution: Option, pub dimensions: Option, pub url: String, - pub request: serde_json::Value, - pub response: serde_json::Value, + pub request: Value, + pub search_fragments: BTreeMap, + pub indexing_fragments: BTreeMap, + pub response: Value, pub headers: BTreeMap, } @@ -138,7 +175,12 @@ impl Embedder { .timeout(std::time::Duration::from_secs(30)) .build(); - let request = Request::new(options.request)?; + let request = RequestData::new( + options.request, + options.indexing_fragments, + options.search_fragments, + )?; + let response = Response::new(options.response, &request)?; let data = EmbedderData { @@ -188,7 +230,7 @@ impl Embedder { embedder_stats: Option<&EmbedderStats>, ) -> Result, EmbedError> where - S: AsRef + Serialize, + S: Serialize, { embed(&self.data, texts, texts.len(), Some(self.dimensions), deadline, embedder_stats) } @@ -231,9 +273,9 @@ impl Embedder { } } - pub(crate) fn embed_index_ref( + pub(crate) fn embed_index_ref( &self, - texts: &[&str], + texts: &[S], threads: &ThreadPoolNoAbort, embedder_stats: &EmbedderStats, ) -> Result, EmbedError> { @@ -287,9 +329,44 @@ impl Embedder { pub(super) fn cache(&self) -> &EmbeddingCache { &self.cache } + + pub(crate) fn embed_one( + &self, + query: SearchQuery, + deadline: Option, + embedder_stats: Option<&EmbedderStats>, + ) -> Result { + let mut embeddings = match (&self.data.request, query) { + (RequestData::Single(_), SearchQuery::Text(text)) => { + embed(&self.data, &[text], 1, Some(self.dimensions), deadline, embedder_stats) + } + (RequestData::Single(_), SearchQuery::Media { q: _, media: _ }) => { + return Err(EmbedError::rest_media_not_a_fragment()) + } + (RequestData::FromFragments(request_from_fragments), SearchQuery::Text(q)) => { + let fragment = request_from_fragments.render_search_fragment(Some(q), None)?; + + embed(&self.data, &[fragment], 1, Some(self.dimensions), deadline, embedder_stats) + } + ( + RequestData::FromFragments(request_from_fragments), + SearchQuery::Media { q, media }, + ) => { + let fragment = request_from_fragments.render_search_fragment(q, media)?; + + embed(&self.data, &[fragment], 1, Some(self.dimensions), deadline, embedder_stats) + } + }?; + + // unwrap: checked by `expected_count` + Ok(embeddings.pop().unwrap()) + } } fn infer_dimensions(data: &EmbedderData) -> Result { + if data.request.has_fragments() { + return Err(NewEmbedderError::rest_cannot_infer_dimensions_for_fragment()); + } let v = embed(data, ["test"].as_slice(), 1, None, None, None) .map_err(NewEmbedderError::could_not_determine_dimension)?; // unwrap: guaranteed that v.len() == 1, otherwise the previous line terminated in error @@ -307,6 +384,13 @@ fn embed( where S: Serialize, { + if inputs.is_empty() { + if expected_count != 0 { + return Err(EmbedError::rest_response_embedding_count(expected_count, 0)); + } + return Ok(Vec::new()); + } + let request = data.client.post(&data.url); let request = if let Some(bearer) = &data.bearer { request.set("Authorization", bearer) @@ -318,7 +402,12 @@ where request = request.set(header.as_str(), value.as_str()); } - let body = 
data.request.inject_texts(inputs); + let body = match &data.request { + RequestData::Single(request) => request.inject_texts(inputs), + RequestData::FromFragments(request_from_fragments) => { + request_from_fragments.request_from_fragments(inputs).expect("inputs was empty") + } + }; for attempt in 0..10 { if let Some(embedder_stats) = &embedder_stats { @@ -426,7 +515,7 @@ fn response_to_embedding( expected_count: usize, expected_dimensions: Option, ) -> Result, Retry> { - let response: serde_json::Value = response + let response: Value = response .into_json() .map_err(EmbedError::rest_response_deserialization) .map_err(Retry::retry_later)?; @@ -455,17 +544,19 @@ fn response_to_embedding( } pub(super) const REQUEST_PLACEHOLDER: &str = "{{text}}"; +pub(super) const REQUEST_FRAGMENT_PLACEHOLDER: &str = "{{fragment}}"; pub(super) const RESPONSE_PLACEHOLDER: &str = "{{embedding}}"; pub(super) const REPEAT_PLACEHOLDER: &str = "{{..}}"; #[derive(Debug)] pub struct Request { - template: ValueTemplate, + template: InjectableValue, } impl Request { - pub fn new(template: serde_json::Value) -> Result { - let template = match ValueTemplate::new(template, REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER) { + pub fn new(template: Value) -> Result { + let template = match InjectableValue::new(template, REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER) + { Ok(template) => template, Err(error) => { let message = @@ -485,42 +576,114 @@ impl Request { } } - pub fn inject_texts( - &self, - texts: impl IntoIterator, - ) -> serde_json::Value { + pub fn inject_texts(&self, texts: impl IntoIterator) -> Value { self.template.inject(texts.into_iter().map(|s| serde_json::json!(s))).unwrap() } } +#[derive(Debug)] +pub struct RequestFromFragments { + search_fragments: BTreeMap, + request: InjectableValue, +} + +impl RequestFromFragments { + pub fn new( + request: Value, + search_fragments: impl IntoIterator, + ) -> Result { + let request = + match InjectableValue::new(request, REQUEST_FRAGMENT_PLACEHOLDER, REPEAT_PLACEHOLDER) { + Ok(template) => template, + Err(error) => { + let message = + error.error_message("request", REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER); + return Err(NewEmbedderError::rest_could_not_parse_template(message)); + } + }; + + let search_fragments: Result<_, NewEmbedderError> = search_fragments + .into_iter() + .map(|(name, value)| { + Ok(( + name, + JsonTemplate::new(value).map_err(|error| { + NewEmbedderError::rest_could_not_parse_template( + error.parsing("searchFragments"), + ) + })?, + )) + }) + .collect(); + + Ok(Self { request, search_fragments: search_fragments? 
}) + } + + fn input_type(&self) -> InputType { + if self.request.has_array_value() { + InputType::TextArray + } else { + InputType::Text + } + } + + pub fn render_search_fragment( + &self, + q: Option<&str>, + media: Option<&Value>, + ) -> Result { + let mut it = self.search_fragments.iter().filter_map(|(name, template)| { + let render = template.render_search(q, media).ok()?; + Some((name, render)) + }); + let Some((name, fragment)) = it.next() else { + return Err(EmbedError::rest_search_matches_no_fragment(q, media)); + }; + if let Some((second_name, _)) = it.next() { + return Err(EmbedError::rest_search_matches_multiple_fragments( + name, + second_name, + q, + media, + )); + } + + Ok(fragment) + } + + pub fn request_from_fragments<'a, S: Serialize + 'a>( + &self, + fragments: impl IntoIterator, + ) -> Option { + self.request.inject(fragments.into_iter().map(|fragment| serde_json::json!(fragment))).ok() + } +} + #[derive(Debug)] pub struct Response { - template: ValueTemplate, + template: InjectableValue, } impl Response { - pub fn new(template: serde_json::Value, request: &Request) -> Result { - let template = match ValueTemplate::new(template, RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER) - { - Ok(template) => template, - Err(error) => { - let message = - error.error_message("response", RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER); - return Err(NewEmbedderError::rest_could_not_parse_template(message)); - } - }; + pub fn new(template: Value, request: &RequestData) -> Result { + let template = + match InjectableValue::new(template, RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER) { + Ok(template) => template, + Err(error) => { + let message = + error.error_message("response", RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER); + return Err(NewEmbedderError::rest_could_not_parse_template(message)); + } + }; - match (template.has_array_value(), request.template.has_array_value()) { + match (template.has_array_value(), request.input_type() == InputType::TextArray) { (true, true) | (false, false) => Ok(Self {template}), (true, false) => Err(NewEmbedderError::rest_could_not_parse_template("in `response`: `response` has multiple embeddings, but `request` has only one text to embed".to_string())), (false, true) => Err(NewEmbedderError::rest_could_not_parse_template("in `response`: `response` has a single embedding, but `request` has multiple texts to embed".to_string())), } } - pub fn extract_embeddings( - &self, - response: serde_json::Value, - ) -> Result, EmbedError> { + pub fn extract_embeddings(&self, response: Value) -> Result, EmbedError> { let extracted_values: Vec = match self.template.extract(response) { Ok(extracted_values) => extracted_values, Err(error) => { From c45ede44a80a75a03d20d624ef12c1d693381b5f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:01:55 +0200 Subject: [PATCH 052/101] Add new parameters to openai and rest embedders --- crates/milli/src/vector/ollama.rs | 2 ++ crates/milli/src/vector/openai.rs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/ollama.rs index d4329a2de..feec92cc0 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/ollama.rs @@ -71,6 +71,8 @@ impl EmbedderOptions { request, response, headers: Default::default(), + indexing_fragments: Default::default(), + search_fragments: Default::default(), }) } } diff --git a/crates/milli/src/vector/openai.rs b/crates/milli/src/vector/openai.rs index 0159d5c76..bf6c92978 100644 --- a/crates/milli/src/vector/openai.rs +++ 
b/crates/milli/src/vector/openai.rs @@ -201,6 +201,8 @@ impl Embedder { ] }), headers: Default::default(), + indexing_fragments: Default::default(), + search_fragments: Default::default(), }, cache_cap, super::rest::ConfigurationSource::OpenAi, From d48baece51081434b879c35a72d6052a227599a9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:56:15 +0200 Subject: [PATCH 053/101] New error when too many fragments in settings --- crates/milli/src/error.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 2136ec97e..f8886da8e 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -288,6 +288,8 @@ and can not be more than 511 bytes.", .document_id.to_string() InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError), #[error("Too many embedders in the configuration. Found {0}, but limited to 256.")] TooManyEmbedders(usize), + #[error("Too many fragments in the configuration. Found {0}, but limited to 256.")] + TooManyFragments(usize), #[error("Cannot find embedder with name `{0}`.")] InvalidSearchEmbedder(String), #[error("Cannot find embedder with name `{0}`.")] From f3d5c74c02ef8e82b17fa68c7ab833d0f33e20ca Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:55:28 +0200 Subject: [PATCH 054/101] Vector settings to add `indexingFragments` and `searchFragments` --- crates/milli/src/vector/settings.rs | 373 +++++++++++++++++++++++++++- 1 file changed, 361 insertions(+), 12 deletions(-) diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 712c1faa5..93de37290 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -2,6 +2,8 @@ use std::collections::BTreeMap; use std::num::NonZeroUsize; use deserr::Deserr; +use either::Either; +use itertools::Itertools; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; @@ -229,6 +231,35 @@ pub struct EmbeddingSettings { /// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated pub url: Setting, + /// Template fragments that will be reassembled and sent to the remote embedder at indexing time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. + /// + /// # 🔄 Reindexing + /// + /// - 🏗️ When a fragment is deleted by passing `null` to its name, the corresponding embeddings are removed from documents. + /// - 🏗️ When a fragment is modified, the corresponding embeddings are regenerated if their rendered version changes. + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option>)] + pub indexing_fragments: Setting>>, + + /// Template fragments that will be reassembled and sent to the remote embedder at search time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. 
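+    ///
+    /// At search time, exactly one search fragment must successfully render for
+    /// a given query; matching none, or more than one, is an error (see
+    /// `render_search_fragment`).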
+ /// + /// # 🔄 Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option>)] + pub search_fragments: Setting>>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] #[schema(value_type = Option)] @@ -483,6 +514,36 @@ pub struct SubEmbeddingSettings { /// - 🌱 When modified for source `openAi`, embeddings are never regenerated /// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated pub url: Setting, + + /// Template fragments that will be reassembled and sent to the remote embedder at indexing time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. + /// + /// # 🔄 Reindexing + /// + /// - 🏗️ When a fragment is deleted by passing `null` to its name, the corresponding embeddings are removed from documents. + /// - 🏗️ When a fragment is modified, the corresponding embeddings are regenerated if their rendered version changes. + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option>)] + pub indexing_fragments: Setting>>, + + /// Template fragments that will be reassembled and sent to the remote embedder at search time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. + /// + /// # 🔄 Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option>)] + pub search_fragments: Setting>>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] #[schema(value_type = Option)] @@ -555,16 +616,24 @@ pub struct SubEmbeddingSettings { } /// Indicates what action should take place during a reindexing operation for an embedder -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum ReindexAction { /// An indexing operation should take place for this embedder, keeping existing vectors /// and checking whether the document template changed or not RegeneratePrompts, + RegenerateFragments(Vec<(String, RegenerateFragment)>), /// An indexing operation should take place for all documents for this embedder, removing existing vectors /// (except userProvided ones) FullReindex, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum RegenerateFragment { + Update, + Remove, + Add, +} + pub enum SettingsDiff { Remove, Reindex { action: ReindexAction, updated_settings: EmbeddingSettings, quantize: bool }, @@ -577,6 +646,12 @@ pub struct EmbedderAction { pub is_being_quantized: bool, pub write_back: Option, pub reindex: Option, + pub remove_fragments: Option, +} + +#[derive(Debug)] +pub struct RemoveFragments { + pub fragment_ids: Vec, } impl EmbedderAction { @@ -592,6 +667,10 @@ impl EmbedderAction { self.reindex.as_ref() } + pub fn remove_fragments(&self) -> Option<&RemoveFragments> { + self.remove_fragments.as_ref() + } + pub fn with_is_being_quantized(mut self, quantize: bool) -> Self { self.is_being_quantized = quantize; self @@ -603,11 +682,23 @@ impl EmbedderAction { is_being_quantized: false, write_back: Some(write_back), reindex: None, + remove_fragments: None, } } pub fn with_reindex(reindex: ReindexAction, was_quantized: bool) -> Self { - Self { was_quantized, is_being_quantized: false, write_back: None, reindex: Some(reindex) } + 
Self { + was_quantized, + is_being_quantized: false, + write_back: None, + reindex: Some(reindex), + remove_fragments: None, + } + } + + pub fn with_remove_fragments(mut self, remove_fragments: RemoveFragments) -> Self { + self.remove_fragments = Some(remove_fragments); + self } } @@ -634,6 +725,8 @@ impl SettingsDiff { mut dimensions, mut document_template, mut url, + mut indexing_fragments, + mut search_fragments, mut request, mut response, mut search_embedder, @@ -653,6 +746,8 @@ impl SettingsDiff { dimensions: new_dimensions, document_template: new_document_template, url: new_url, + indexing_fragments: new_indexing_fragments, + search_fragments: new_search_fragments, request: new_request, response: new_response, search_embedder: new_search_embedder, @@ -684,6 +779,8 @@ impl SettingsDiff { &mut document_template, &mut document_template_max_bytes, &mut url, + &mut indexing_fragments, + &mut search_fragments, &mut request, &mut response, &mut headers, @@ -696,6 +793,8 @@ impl SettingsDiff { new_document_template, new_document_template_max_bytes, new_url, + new_indexing_fragments, + new_search_fragments, new_request, new_response, new_headers, @@ -722,6 +821,8 @@ impl SettingsDiff { dimensions, document_template, url, + indexing_fragments, + search_fragments, request, response, search_embedder, @@ -769,6 +870,8 @@ impl SettingsDiff { mut document_template, mut document_template_max_bytes, mut url, + mut indexing_fragments, + mut search_fragments, mut request, mut response, mut headers, @@ -794,6 +897,8 @@ impl SettingsDiff { document_template: new_document_template, document_template_max_bytes: new_document_template_max_bytes, url: new_url, + indexing_fragments: new_indexing_fragments, + search_fragments: new_search_fragments, request: new_request, response: new_response, headers: new_headers, @@ -814,6 +919,8 @@ impl SettingsDiff { &mut document_template, &mut document_template_max_bytes, &mut url, + &mut indexing_fragments, + &mut search_fragments, &mut request, &mut response, &mut headers, @@ -826,6 +933,8 @@ impl SettingsDiff { new_document_template, new_document_template_max_bytes, new_url, + new_indexing_fragments, + new_search_fragments, new_request, new_response, new_headers, @@ -846,6 +955,8 @@ impl SettingsDiff { dimensions, document_template, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -875,6 +986,8 @@ impl SettingsDiff { document_template: &mut Setting, document_template_max_bytes: &mut Setting, url: &mut Setting, + indexing_fragments: &mut Setting>>, + search_fragments: &mut Setting>>, request: &mut Setting, response: &mut Setting, headers: &mut Setting>, @@ -887,6 +1000,8 @@ impl SettingsDiff { new_document_template: Setting, new_document_template_max_bytes: Setting, new_url: Setting, + new_indexing_fragments: Setting>>, + new_search_fragments: Setting>>, new_request: Setting, new_response: Setting, new_headers: Setting>, @@ -902,6 +1017,8 @@ impl SettingsDiff { pooling, dimensions, url, + indexing_fragments, + search_fragments, request, response, document_template, @@ -941,6 +1058,104 @@ impl SettingsDiff { } } } + + *search_fragments = match (std::mem::take(search_fragments), new_search_fragments) { + (Setting::Set(search_fragments), Setting::Set(new_search_fragments)) => { + Setting::Set( + search_fragments + .into_iter() + .merge_join_by(new_search_fragments, |(left, _), (right, _)| { + left.cmp(right) + }) + .map(|eob| { + match eob { + // merge fragments + itertools::EitherOrBoth::Both((name, _), (_, right)) => { + (name, right) + } + 
// unchanged fragment + itertools::EitherOrBoth::Left(left) => left, + // new fragment + itertools::EitherOrBoth::Right(right) => right, + } + }) + .collect(), + ) + } + (_, Setting::Reset) => Setting::Reset, + (left, Setting::NotSet) => left, + (Setting::NotSet | Setting::Reset, Setting::Set(new_search_fragments)) => { + Setting::Set(new_search_fragments) + } + }; + + let mut regenerate_fragments = Vec::new(); + *indexing_fragments = match (std::mem::take(indexing_fragments), new_indexing_fragments) { + (Setting::Set(fragments), Setting::Set(new_fragments)) => { + Setting::Set( + fragments + .into_iter() + .merge_join_by(new_fragments, |(left, _), (right, _)| left.cmp(right)) + .map(|eob| { + match eob { + // merge fragments + itertools::EitherOrBoth::Both( + (name, left), + (other_name, right), + ) => { + if left == right { + (name, left) + } else { + match right { + Some(right) => { + regenerate_fragments + .push((other_name, RegenerateFragment::Update)); + (name, Some(right)) + } + None => { + regenerate_fragments + .push((other_name, RegenerateFragment::Remove)); + (name, None) + } + } + } + } + // unchanged fragment + itertools::EitherOrBoth::Left(left) => left, + // new fragment + itertools::EitherOrBoth::Right((name, right)) => { + if right.is_some() { + regenerate_fragments + .push((name.clone(), RegenerateFragment::Add)); + } + (name, right) + } + } + }) + .collect(), + ) + } + // remove all fragments => move to document template + (_, Setting::Reset) => { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + Setting::Reset + } + // add all fragments + (Setting::NotSet | Setting::Reset, Setting::Set(new_fragments)) => { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + + Setting::Set(new_fragments) + } + // no change + (left, Setting::NotSet) => left, + }; + if !regenerate_fragments.is_empty() { + ReindexAction::push_action( + reindex_action, + ReindexAction::RegenerateFragments(regenerate_fragments), + ); + } + if request.apply(new_request) { ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); } @@ -972,10 +1187,16 @@ impl SettingsDiff { impl ReindexAction { fn push_action(this: &mut Option, other: Self) { - *this = match (*this, other) { - (_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex), - (Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex), - (_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts), + use ReindexAction::*; + *this = match (this.take(), other) { + (_, FullReindex) => Some(FullReindex), + (Some(FullReindex), _) => Some(FullReindex), + (_, RegenerateFragments(fragments)) => Some(RegenerateFragments(fragments)), + (Some(RegenerateFragments(fragments)), RegeneratePrompts) => { + Some(RegenerateFragments(fragments)) + } + (Some(RegeneratePrompts), RegeneratePrompts) => Some(RegeneratePrompts), + (None, RegeneratePrompts) => Some(RegeneratePrompts), } } } @@ -988,6 +1209,8 @@ fn apply_default_for_source( pooling: &mut Setting, dimensions: &mut Setting, url: &mut Setting, + indexing_fragments: &mut Setting>>, + search_fragments: &mut Setting>>, request: &mut Setting, response: &mut Setting, document_template: &mut Setting, @@ -1003,6 +1226,8 @@ fn apply_default_for_source( *pooling = Setting::Reset; *dimensions = Setting::NotSet; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; @@ -1015,6 +1240,8 @@ 
fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; @@ -1027,6 +1254,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::NotSet; *url = Setting::Reset; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; @@ -1039,6 +1268,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::Reset; + *indexing_fragments = Setting::Reset; + *search_fragments = Setting::Reset; *request = Setting::Reset; *response = Setting::Reset; *headers = Setting::Reset; @@ -1051,6 +1282,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *document_template = Setting::NotSet; @@ -1065,6 +1298,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::NotSet; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *document_template = Setting::NotSet; @@ -1131,6 +1366,8 @@ pub enum MetaEmbeddingSetting { DocumentTemplate, DocumentTemplateMaxBytes, Url, + IndexingFragments, + SearchFragments, Request, Response, Headers, @@ -1153,6 +1390,8 @@ impl MetaEmbeddingSetting { DocumentTemplate => "documentTemplate", DocumentTemplateMaxBytes => "documentTemplateMaxBytes", Url => "url", + IndexingFragments => "indexingFragments", + SearchFragments => "searchFragments", Request => "request", Response => "response", Headers => "headers", @@ -1176,6 +1415,8 @@ impl EmbeddingSettings { dimensions: &Setting, api_key: &Setting, url: &Setting, + indexing_fragments: &Setting>>, + search_fragments: &Setting>>, request: &Setting, response: &Setting, document_template: &Setting, @@ -1210,6 +1451,20 @@ impl EmbeddingSettings { )?; Self::check_setting(embedder_name, source, MetaEmbeddingSetting::ApiKey, context, api_key)?; Self::check_setting(embedder_name, source, MetaEmbeddingSetting::Url, context, url)?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::IndexingFragments, + context, + indexing_fragments, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::SearchFragments, + context, + search_fragments, + )?; Self::check_setting( embedder_name, source, @@ -1348,8 +1603,8 @@ impl EmbeddingSettings { ) => FieldStatus::Allowed, ( OpenAi, - Revision | Pooling | Request | Response | Headers | SearchEmbedder - | IndexingEmbedder, + Revision | Pooling | IndexingFragments | SearchFragments | Request | Response + | Headers | SearchEmbedder | IndexingEmbedder, _, ) => FieldStatus::Disallowed, ( @@ -1359,8 +1614,8 @@ impl EmbeddingSettings { ) => FieldStatus::Allowed, ( HuggingFace, - ApiKey | Dimensions | Url | Request | Response | Headers | SearchEmbedder - | IndexingEmbedder, + ApiKey | Dimensions | Url | IndexingFragments | SearchFragments | Request + | Response | Headers | SearchEmbedder | IndexingEmbedder, _, ) => FieldStatus::Disallowed, (Ollama, Model, _) => FieldStatus::Mandatory, @@ -1371,8 +1626,8 @@ impl EmbeddingSettings { ) => FieldStatus::Allowed, ( 
Ollama, - Revision | Pooling | Request | Response | Headers | SearchEmbedder - | IndexingEmbedder, + Revision | Pooling | IndexingFragments | SearchFragments | Request | Response + | Headers | SearchEmbedder | IndexingEmbedder, _, ) => FieldStatus::Disallowed, (UserProvided, Dimensions, _) => FieldStatus::Mandatory, @@ -1386,6 +1641,8 @@ impl EmbeddingSettings { | DocumentTemplate | DocumentTemplateMaxBytes | Url + | IndexingFragments + | SearchFragments | Request | Response | Headers @@ -1404,6 +1661,10 @@ impl EmbeddingSettings { | Headers, _, ) => FieldStatus::Allowed, + (Rest, IndexingFragments, NotNested | Indexing) => FieldStatus::Allowed, + (Rest, IndexingFragments, Search) => FieldStatus::Disallowed, + (Rest, SearchFragments, NotNested | Search) => FieldStatus::Allowed, + (Rest, SearchFragments, Indexing) => FieldStatus::Disallowed, (Rest, Model | Revision | Pooling | SearchEmbedder | IndexingEmbedder, _) => { FieldStatus::Disallowed } @@ -1419,6 +1680,8 @@ impl EmbeddingSettings { | DocumentTemplate | DocumentTemplateMaxBytes | Url + | IndexingFragments + | SearchFragments | Request | Response | Headers, @@ -1512,6 +1775,11 @@ impl std::fmt::Display for EmbedderSource { } } +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] +pub struct Fragment { + pub value: serde_json::Value, +} + impl EmbeddingSettings { fn from_hugging_face( super::hf::EmbedderOptions { @@ -1534,6 +1802,8 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1566,6 +1836,8 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::some_or_not_set(url), + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1598,6 +1870,8 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::some_or_not_set(url), + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1622,6 +1896,8 @@ impl EmbeddingSettings { document_template: Setting::NotSet, document_template_max_bytes: Setting::NotSet, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1638,6 +1914,8 @@ impl EmbeddingSettings { dimensions, url, request, + indexing_fragments, + search_fragments, response, distribution, headers, @@ -1656,6 +1934,26 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::Set(url), + indexing_fragments: if indexing_fragments.is_empty() { + Setting::NotSet + } else { + Setting::Set( + indexing_fragments + .into_iter() + .map(|(name, fragment)| (name, Some(Fragment { value: fragment }))) + .collect(), + ) + }, + search_fragments: if search_fragments.is_empty() { + Setting::NotSet + } else { + Setting::Set( + search_fragments + .into_iter() + .map(|(name, fragment)| (name, Some(Fragment { value: fragment }))) + .collect(), + ) + }, request: Setting::Set(request), response: Setting::Set(response), distribution: Setting::some_or_not_set(distribution), @@ -1714,6 +2012,8 @@ impl From for EmbeddingSettings { document_template: Setting::NotSet, document_template_max_bytes: 
Setting::NotSet, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1786,6 +2086,8 @@ impl From for SubEmbeddingSettings { document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -1804,6 +2106,8 @@ impl From for SubEmbeddingSettings { document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -1828,6 +2132,8 @@ impl From for EmbeddingConfig { document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, distribution, @@ -1879,6 +2185,8 @@ impl From for EmbeddingConfig { EmbedderSource::Rest => SubEmbedderOptions::rest( url.set().unwrap(), api_key, + indexing_fragments, + search_fragments, request.set().unwrap(), response.set().unwrap(), headers, @@ -1922,6 +2230,8 @@ impl SubEmbedderOptions { document_template: _, document_template_max_bytes: _, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -1944,6 +2254,8 @@ impl SubEmbedderOptions { EmbedderSource::Rest => Self::rest( url.set().unwrap(), api_key, + indexing_fragments, + search_fragments, request.set().unwrap(), response.set().unwrap(), headers, @@ -2010,9 +2322,13 @@ impl SubEmbedderOptions { distribution: distribution.set(), }) } + + #[allow(clippy::too_many_arguments)] fn rest( url: String, api_key: Setting, + indexing_fragments: Setting>>, + search_fragments: Setting>>, request: serde_json::Value, response: serde_json::Value, headers: Setting>, @@ -2027,6 +2343,22 @@ impl SubEmbedderOptions { response, distribution: distribution.set(), headers: headers.set().unwrap_or_default(), + search_fragments: search_fragments + .set() + .unwrap_or_default() + .into_iter() + .filter_map(|(name, fragment)| { + Some((name, fragment.map(|fragment| fragment.value)?)) + }) + .collect(), + indexing_fragments: indexing_fragments + .set() + .unwrap_or_default() + .into_iter() + .filter_map(|(name, fragment)| { + Some((name, fragment.map(|fragment| fragment.value)?)) + }) + .collect(), }) } fn ollama( @@ -2066,3 +2398,20 @@ impl From for EmbedderOptions { } } } + +pub(crate) fn fragments_from_settings( + setting: &Setting, +) -> impl Iterator + '_ { + let Some(setting) = setting.as_ref().set() else { return Either::Left(None.into_iter()) }; + if let Some(setting) = setting.indexing_fragments.as_ref().set() { + Either::Right(setting.keys().cloned()) + } else { + let Some(setting) = setting.indexing_embedder.as_ref().set() else { + return Either::Left(None.into_iter()); + }; + let Some(setting) = setting.indexing_fragments.as_ref().set() else { + return Either::Left(None.into_iter()); + }; + Either::Right(setting.keys().cloned()) + } +} From 41620d53254a46a58763592f310ce94ca1f567d4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:58:16 +0200 Subject: [PATCH 055/101] Support `indexingFragments` and `searchFragments` in settings --- crates/meilisearch-types/src/settings.rs | 4 +- crates/milli/src/update/settings.rs | 314 +++++++++++++++-------- 2 files changed, 213 insertions(+), 105 deletions(-) diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index 7d64440ce..d7b163448 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -9,10 +9,11 @@ use std::str::FromStr; use deserr::{DeserializeError, 
Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; use milli::disabled_typos_terms::DisabledTyposTerms; -use milli::index::{IndexEmbeddingConfig, PrefixSearch}; +use milli::index::PrefixSearch; use milli::proximity::ProximityPrecision; pub use milli::update::ChatSettings; use milli::update::Setting; +use milli::vector::db::IndexEmbeddingConfig; use milli::{Criterion, CriterionError, FilterableAttributesRule, Index, DEFAULT_VALUES_PER_FACET}; use serde::{Deserialize, Serialize, Serializer}; use utoipa::ToSchema; @@ -911,6 +912,7 @@ pub fn settings( }; let embedders: BTreeMap<_, _> = index + .embedding_configs() .embedding_configs(rtxn)? .into_iter() .map(|IndexEmbeddingConfig { name, config, .. }| { diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index c6ede7a1d..3dae4f57c 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -7,7 +7,6 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; use itertools::{merge_join_by, EitherOrBoth, Itertools}; -use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; @@ -23,22 +22,25 @@ use crate::error::UserError::{self, InvalidChatSettingsDocumentTemplateMaxBytes} use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::filterable_attributes_rules::match_faceted_field; use crate::index::{ - ChatConfig, IndexEmbeddingConfig, PrefixSearch, SearchParameters, - DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, + ChatConfig, PrefixSearch, SearchParameters, DEFAULT_MIN_WORD_LEN_ONE_TYPO, + DEFAULT_MIN_WORD_LEN_TWO_TYPOS, }; use crate::order_by_map::OrderByMap; -use crate::progress::EmbedderStats; -use crate::progress::Progress; +use crate::progress::{EmbedderStats, Progress}; use crate::prompt::{default_max_bytes, default_template_text, PromptData}; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::new::indexer::reindex; use crate::update::{IndexDocuments, UpdateIndexingStep}; +use crate::vector::db::{FragmentConfigs, IndexEmbeddingConfig}; +use crate::vector::json_template::JsonTemplate; use crate::vector::settings::{ EmbedderAction, EmbedderSource, EmbeddingSettings, NestingContext, ReindexAction, SubEmbeddingSettings, WriteBackToDocuments, }; -use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; +use crate::vector::{ + Embedder, EmbeddingConfig, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment, +}; use crate::{ ChannelCongestion, FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result, }; @@ -1044,22 +1046,27 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { match std::mem::take(&mut self.embedder_settings) { Setting::Set(configs) => self.update_embedding_configs_set(configs), Setting::Reset => { + let embedders = self.index.embedding_configs(); // all vectors should be written back to documents - let old_configs = self.index.embedding_configs(self.wtxn)?; + let old_configs = embedders.embedding_configs(self.wtxn)?; let remove_all: Result> = old_configs .into_iter() - .map(|IndexEmbeddingConfig { name, config, user_provided }| -> Result<_> { - let embedder_id = - self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( - crate::InternalError::DatabaseMissingEntry { - db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, - key: None, - }, - )?; + .map(|IndexEmbeddingConfig { name, 
config, fragments: _ }| -> Result<_> { + let embedder_info = embedders.embedder_info(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; Ok(( name, EmbedderAction::with_write_back( - WriteBackToDocuments { embedder_id, user_provided }, + WriteBackToDocuments { + embedder_id: embedder_info.embedder_id, + user_provided: embedder_info + .embedding_status + .into_user_provided(), + }, config.quantized(), ), )) @@ -1069,7 +1076,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let remove_all = remove_all?; self.index.embedder_category_id.clear(self.wtxn)?; - self.index.delete_embedding_configs(self.wtxn)?; + embedders.delete_embedding_configs(self.wtxn)?; Ok(remove_all) } Setting::NotSet => Ok(Default::default()), @@ -1081,12 +1088,12 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { configs: BTreeMap>, ) -> Result> { use crate::vector::settings::SettingsDiff; - - let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap = old_configs + let embedders = self.index.embedding_configs(); + let old_configs = embedders.embedding_configs(self.wtxn)?; + let old_configs: BTreeMap = old_configs .into_iter() - .map(|IndexEmbeddingConfig { name, config, user_provided }| { - (name, (config.into(), user_provided)) + .map(|IndexEmbeddingConfig { name, config, fragments }| { + (name, (config.into(), fragments)) }) .collect(); let mut updated_configs = BTreeMap::new(); @@ -1097,55 +1104,88 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { match joined { // updated config - EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => { + EitherOrBoth::Both((name, (old, mut fragments)), (_, new)) => { let was_quantized = old.binary_quantized.set().unwrap_or_default(); let settings_diff = SettingsDiff::from_settings(&name, old, new)?; match settings_diff { SettingsDiff::Remove => { + let info = embedders.remove_embedder(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; tracing::debug!( embedder = name, - user_provided = user_provided.len(), + user_provided = info.embedding_status.user_provided_docids().len(), "removing embedder" ); - let embedder_id = - self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( - crate::InternalError::DatabaseMissingEntry { - db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, - key: None, - }, - )?; - // free id immediately - self.index.embedder_category_id.delete(self.wtxn, &name)?; embedder_actions.insert( name, EmbedderAction::with_write_back( - WriteBackToDocuments { embedder_id, user_provided }, + WriteBackToDocuments { + embedder_id: info.embedder_id, + user_provided: info.embedding_status.into_user_provided(), + }, was_quantized, ), ); } SettingsDiff::Reindex { action, updated_settings, quantize } => { - tracing::debug!( - embedder = name, - user_provided = user_provided.len(), - ?action, - "reindex embedder" - ); - embedder_actions.insert( - name.clone(), + let mut remove_fragments = None; + let updated_settings = Setting::Set(updated_settings); + if let ReindexAction::RegenerateFragments(regenerate_fragments) = + &action + { + let it = regenerate_fragments + .iter() + .filter(|(_, action)| { + matches!( + action, + crate::vector::settings::RegenerateFragment::Remove + ) + }) + .map(|(name, _)| name.as_str()); + + remove_fragments = fragments.remove_fragments(it); + + let it = regenerate_fragments + .iter() + .filter(|(_, action)| { 
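+                                    // keep only the fragments newly added by this update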
+ matches!( + action, + crate::vector::settings::RegenerateFragment::Add + ) + }) + .map(|(name, _)| name.clone()); + fragments.add_new_fragments(it)?; + } else { + // needs full reindex of fragments + fragments = FragmentConfigs::new(); + fragments.add_new_fragments( + crate::vector::settings::fragments_from_settings( + &updated_settings, + ), + )?; + } + tracing::debug!(embedder = name, ?action, "reindex embedder"); + + let embedder_action = EmbedderAction::with_reindex(action, was_quantized) - .with_is_being_quantized(quantize), - ); - let new = - validate_embedding_settings(Setting::Set(updated_settings), &name)?; - updated_configs.insert(name, (new, user_provided)); + .with_is_being_quantized(quantize); + + let embedder_action = if let Some(remove_fragments) = remove_fragments { + embedder_action.with_remove_fragments(remove_fragments) + } else { + embedder_action + }; + + embedder_actions.insert(name.clone(), embedder_action); + let new = validate_embedding_settings(updated_settings, &name)?; + updated_configs.insert(name, (new, fragments)); } SettingsDiff::UpdateWithoutReindex { updated_settings, quantize } => { - tracing::debug!( - embedder = name, - user_provided = user_provided.len(), - "update without reindex embedder" - ); + tracing::debug!(embedder = name, "update without reindex embedder"); let new = validate_embedding_settings(Setting::Set(updated_settings), &name)?; if quantize { @@ -1154,14 +1194,14 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { EmbedderAction::default().with_is_being_quantized(true), ); } - updated_configs.insert(name, (new, user_provided)); + updated_configs.insert(name, (new, fragments)); } } } // unchanged config - EitherOrBoth::Left((name, (setting, user_provided))) => { + EitherOrBoth::Left((name, (setting, fragments))) => { tracing::debug!(embedder = name, "unchanged embedder"); - updated_configs.insert(name, (Setting::Set(setting), user_provided)); + updated_configs.insert(name, (Setting::Set(setting), fragments)); } // new config EitherOrBoth::Right((name, mut setting)) => { @@ -1176,47 +1216,42 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { name.clone(), EmbedderAction::with_reindex(ReindexAction::FullReindex, false), ); - updated_configs.insert(name, (setting, RoaringBitmap::new())); + let mut fragments = FragmentConfigs::new(); + fragments.add_new_fragments( + crate::vector::settings::fragments_from_settings(&setting), + )?; + updated_configs.insert(name, (setting, fragments)); } } } - let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; - for res in self.index.embedder_category_id.iter(self.wtxn)? 
{ - let (_name, id) = res?; - free_indices[id as usize] = false; - } - let mut free_indices = free_indices.iter_mut().enumerate(); - let mut find_free_index = - move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); - for (name, action) in embedder_actions.iter() { - // ignore actions that are not possible for a new embedder - if matches!(action.reindex(), Some(ReindexAction::FullReindex)) - && self.index.embedder_category_id.get(self.wtxn, name)?.is_none() - { - let id = - find_free_index().ok_or(UserError::TooManyEmbedders(updated_configs.len()))?; - tracing::debug!(embedder = name, id, "assigning free id to new embedder"); - self.index.embedder_category_id.put(self.wtxn, name, &id)?; - } - } + embedders.add_new_embedders( + self.wtxn, + embedder_actions + .iter() + // ignore actions that are not possible for a new embedder, most critically deleted embedders + .filter(|(_, action)| matches!(action.reindex(), Some(ReindexAction::FullReindex))) + .map(|(name, _)| name.as_str()), + updated_configs.len(), + )?; + let updated_configs: Vec = updated_configs .into_iter() - .filter_map(|(name, (config, user_provided))| match config { + .filter_map(|(name, (config, fragments))| match config { Setting::Set(config) => { - Some(IndexEmbeddingConfig { name, config: config.into(), user_provided }) + Some(IndexEmbeddingConfig { name, config: config.into(), fragments }) } Setting::Reset => None, Setting::NotSet => Some(IndexEmbeddingConfig { name, config: EmbeddingSettings::default().into(), - user_provided, + fragments: Default::default(), }), }) .collect(); if updated_configs.is_empty() { - self.index.delete_embedding_configs(self.wtxn)?; + embedders.delete_embedding_configs(self.wtxn)?; } else { - self.index.put_embedding_configs(self.wtxn, updated_configs)?; + embedders.put_embedding_configs(self.wtxn, updated_configs)?; } Ok(embedder_actions) } @@ -1611,13 +1646,13 @@ impl InnerIndexSettingsDiff { // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { - for (embedder_name, (config, _, _quantized)) in - new_settings.embedding_configs.inner_as_ref() - { - let was_quantized = - old_settings.embedding_configs.get(embedder_name).is_some_and(|conf| conf.2); + for (embedder_name, runtime) in new_settings.embedding_configs.inner_as_ref() { + let was_quantized = old_settings + .embedding_configs + .get(embedder_name) + .is_some_and(|conf| conf.is_quantized); // skip embedders that don't use document templates - if !config.uses_document_template() { + if !runtime.embedder.uses_document_template() { continue; } @@ -1630,13 +1665,31 @@ impl InnerIndexSettingsDiff { was_quantized, )); } - std::collections::btree_map::Entry::Occupied(entry) => { + std::collections::btree_map::Entry::Occupied(mut entry) => { + // future-proofing, make sure to destructure here so that any new field is taken into account in this case + // case in point: adding `remove_fragments` was detected. 
let EmbedderAction { was_quantized: _, is_being_quantized: _, - write_back: _, // We are deleting this embedder, so no point in regeneration - reindex: _, // We are already fully reindexing - } = entry.get(); + write_back, // We are deleting this embedder, so no point in regeneration + reindex, + remove_fragments: _, + } = entry.get_mut(); + + // fixup reindex to make sure we regenerate all fragments + *reindex = match reindex.take() { + Some(ReindexAction::RegenerateFragments(_)) => { + Some(ReindexAction::RegeneratePrompts) + } + Some(reindex) => Some(reindex), // We are at least regenerating prompts + None => { + if write_back.is_none() { + Some(ReindexAction::RegeneratePrompts) // quantization case + } else { + None + } + } + }; } }; } @@ -1790,7 +1843,7 @@ pub(crate) struct InnerIndexSettings { pub exact_attributes: HashSet, pub disabled_typos_terms: DisabledTyposTerms, pub proximity_precision: ProximityPrecision, - pub embedding_configs: EmbeddingConfigs, + pub embedding_configs: RuntimeEmbedders, pub embedder_category_id: HashMap, pub geo_fields_ids: Option<(FieldId, FieldId)>, pub prefix_search: PrefixSearch, @@ -1801,7 +1854,7 @@ impl InnerIndexSettings { pub fn from_index( index: &Index, rtxn: &heed::RoTxn<'_>, - embedding_configs: Option, + embedding_configs: Option, ) -> Result { let stop_words = index.stop_words(rtxn)?; let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); @@ -1812,7 +1865,7 @@ impl InnerIndexSettings { let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = match embedding_configs { Some(embedding_configs) => embedding_configs, - None => embedders(index.embedding_configs(rtxn)?)?, + None => embedders(index.embedding_configs().embedding_configs(rtxn)?)?, }; let embedder_category_id = index .embedder_category_id @@ -1900,28 +1953,49 @@ impl InnerIndexSettings { } } -fn embedders(embedding_configs: Vec) -> Result { +fn embedders(embedding_configs: Vec) -> Result { let res: Result<_> = embedding_configs .into_iter() .map( |IndexEmbeddingConfig { name, config: EmbeddingConfig { embedder_options, prompt, quantized }, - .. 
+ fragments, }| { - let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); + let document_template = prompt.try_into().map_err(crate::Error::from)?; - let embedder = Arc::new( + let embedder = // cache_cap: no cache needed for indexing purposes - Embedder::new(embedder_options.clone(), 0) + Arc::new(Embedder::new(embedder_options.clone(), 0) .map_err(crate::vector::Error::from) - .map_err(crate::Error::from)?, - ); - Ok((name, (embedder, prompt, quantized.unwrap_or_default()))) + .map_err(crate::Error::from)?); + + let fragments = fragments + .into_inner() + .into_iter() + .map(|fragment| { + let template = JsonTemplate::new( + embedder_options.fragment(&fragment.name).unwrap().clone(), + ) + .unwrap(); + + RuntimeFragment { name: fragment.name, id: fragment.id, template } + }) + .collect(); + + Ok(( + name, + Arc::new(RuntimeEmbedder { + embedder, + document_template, + fragments, + is_quantized: quantized.unwrap_or_default(), + }), + )) }, ) .collect(); - res.map(EmbeddingConfigs::new) + res.map(RuntimeEmbedders::new) } fn validate_prompt( @@ -1970,6 +2044,8 @@ pub fn validate_embedding_settings( document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, search_embedder, @@ -1997,8 +2073,28 @@ pub fn validate_embedding_settings( } if let Some(request) = request.as_ref().set() { - let request = crate::vector::rest::Request::new(request.to_owned()) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + let request = crate::vector::rest::RequestData::new( + request.to_owned(), + indexing_fragments + .as_ref() + .set() + .iter() + .flat_map(|map| map.iter()) + .filter_map(|(name, fragment)| { + Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) + }) + .collect(), + search_fragments + .as_ref() + .set() + .iter() + .flat_map(|map| map.iter()) + .filter_map(|(name, fragment)| { + Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) + }) + .collect(), + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; if let Some(response) = response.as_ref().set() { crate::vector::rest::Response::new(response.to_owned(), &request) .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; @@ -2017,6 +2113,8 @@ pub fn validate_embedding_settings( document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, search_embedder, @@ -2036,6 +2134,8 @@ pub fn validate_embedding_settings( &dimensions, &api_key, &url, + &indexing_fragments, + &search_fragments, &request, &response, &document_template, @@ -2114,6 +2214,8 @@ pub fn validate_embedding_settings( &embedder.dimensions, &embedder.api_key, &embedder.url, + &embedder.indexing_fragments, + &embedder.search_fragments, &embedder.request, &embedder.response, &embedder.document_template, @@ -2169,6 +2271,8 @@ pub fn validate_embedding_settings( &embedder.dimensions, &embedder.api_key, &embedder.url, + &embedder.indexing_fragments, + &embedder.search_fragments, &embedder.request, &embedder.response, &embedder.document_template, @@ -2201,6 +2305,8 @@ pub fn validate_embedding_settings( document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, search_embedder, From 22d363c05ad44f68a24de047c832de67aae7d966 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:59:35 +0200 Subject: [PATCH 056/101] Clear DB on clear documents --- 
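Note: before this change, clearing documents loaded every embedding config,
emptied its `user_provided` bitmap, and wrote all the configs back; the
embedding status now lives in its own database, so a single
`clear_embedder_info_docids` call is enough. A minimal sketch of the
difference, using a simplified in-memory stand-in for the embedder-info
database (the type and method names below are illustrative, not the real API):

    use std::collections::BTreeMap;
    use roaring::RoaringBitmap;

    /// Hypothetical stand-in: embedder name -> user-provided docids.
    struct EmbedderInfos(BTreeMap<String, RoaringBitmap>);

    impl EmbedderInfos {
        /// Old shape: rewrite every config with an emptied bitmap.
        fn clear_per_config(&mut self) {
            for docids in self.0.values_mut() {
                docids.clear();
            }
        }

        /// New shape: drop all per-embedder docid info in one call.
        fn clear_all(&mut self) {
            self.0.clear();
        }
    }
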
crates/milli/src/update/clear_documents.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/crates/milli/src/update/clear_documents.rs b/crates/milli/src/update/clear_documents.rs index b0ae070de..01631e9a3 100644 --- a/crates/milli/src/update/clear_documents.rs +++ b/crates/milli/src/update/clear_documents.rs @@ -64,11 +64,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { self.index.delete_geo_faceted_documents_ids(self.wtxn)?; // Remove all user-provided bits from the configs - let mut configs = self.index.embedding_configs(self.wtxn)?; - for config in configs.iter_mut() { - config.user_provided.clear(); - } - self.index.put_embedding_configs(self.wtxn, configs)?; + self.index.embedding_configs().clear_embedder_info_docids(self.wtxn)?; // Clear the other databases. external_documents_ids.clear(self.wtxn)?; From f8232976eda21fa869dd1679e0c86c1126011c6c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:00:22 +0200 Subject: [PATCH 057/101] Implement in new document indexer --- crates/milli/src/update/new/channel.rs | 156 +++- crates/milli/src/update/new/document.rs | 105 +++ .../milli/src/update/new/document_change.rs | 8 +- .../milli/src/update/new/extract/documents.rs | 9 +- .../src/update/new/extract/vectors/mod.rs | 842 +++++++++++------- .../milli/src/update/new/indexer/extract.rs | 25 +- crates/milli/src/update/new/indexer/mod.rs | 23 +- crates/milli/src/update/new/indexer/write.rs | 52 +- .../milli/src/update/new/vector_document.rs | 29 +- crates/milli/src/vector/session.rs | 28 +- 10 files changed, 886 insertions(+), 391 deletions(-) diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 4fff31a35..aec192ace 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -138,6 +138,7 @@ pub enum ReceiverAction { WakeUp, LargeEntry(LargeEntry), LargeVectors(LargeVectors), + LargeVector(LargeVector), } /// An entry that cannot fit in the BBQueue buffers has been @@ -174,6 +175,24 @@ impl LargeVectors { } } +#[derive(Debug)] +pub struct LargeVector { + /// The document id associated to the large embedding. + pub docid: DocumentId, + /// The embedder id in which to insert the large embedding. + pub embedder_id: u8, + /// The extractor id in which to insert the large embedding. + pub extractor_id: u8, + /// The large embedding that must be written. + pub embedding: Mmap, +} + +impl LargeVector { + pub fn read_embedding(&self, dimensions: usize) -> &[f32] { + self.embedding.chunks_exact(dimensions).map(bytemuck::cast_slice).next().unwrap() + } +} + impl<'a> WriterBbqueueReceiver<'a> { /// Tries to receive an action to do until the timeout occurs /// and if it does, consider it as a spurious wake up. 
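Note on the hunks above and below: an embedding too large for a bbqueue grant
is spilled to a tempfile and handed to the receiver as a memory map
(`ReceiverAction::LargeVector`), mirroring the existing `LargeVectors` path.
A self-contained sketch of that spill pattern, with error handling trimmed;
`memmap2`, `tempfile`, and `bytemuck` are the crates this series already
uses, but `spill_embedding` itself is illustrative:

    use std::io::{self, BufWriter};
    use memmap2::Mmap;

    fn spill_embedding(embedding: &[f32]) -> io::Result<Mmap> {
        // Write the raw f32 payload through a buffered tempfile...
        let mut file = BufWriter::new(tempfile::tempfile()?);
        let mut bytes: &[u8] = bytemuck::cast_slice(embedding);
        io::copy(&mut bytes, &mut file)?;
        let file = file.into_inner().map_err(|e| e.into_error())?;
        // ...then map it so the receiver reads it without another copy.
        unsafe { Mmap::map(&file) }
    }
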
@@ -238,6 +257,7 @@ pub enum EntryHeader { DbOperation(DbOperation), ArroyDeleteVector(ArroyDeleteVector), ArroySetVectors(ArroySetVectors), + ArroySetVector(ArroySetVector), } impl EntryHeader { @@ -250,6 +270,7 @@ impl EntryHeader { EntryHeader::DbOperation(_) => 0, EntryHeader::ArroyDeleteVector(_) => 1, EntryHeader::ArroySetVectors(_) => 2, + EntryHeader::ArroySetVector(_) => 3, } } @@ -274,11 +295,17 @@ impl EntryHeader { Self::variant_size() + mem::size_of::() + embedding_size * count } + fn total_set_vector_size(dimensions: usize) -> usize { + let embedding_size = dimensions * mem::size_of::(); + Self::variant_size() + mem::size_of::() + embedding_size + } + fn header_size(&self) -> usize { let payload_size = match self { EntryHeader::DbOperation(op) => mem::size_of_val(op), EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), + EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv), }; Self::variant_size() + payload_size } @@ -301,6 +328,11 @@ impl EntryHeader { let header = checked::pod_read_unaligned(header_bytes); EntryHeader::ArroySetVectors(header) } + 3 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroySetVector(header) + } id => panic!("invalid variant id: {id}"), } } @@ -311,6 +343,7 @@ impl EntryHeader { EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), + EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv), }; *first = self.variant_id(); remaining.copy_from_slice(payload_bytes); @@ -379,6 +412,37 @@ impl ArroySetVectors { } } +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// The embeddings are in the remaining space and represents +/// non-aligned [f32] each with dimensions f32s. +pub struct ArroySetVector { + pub docid: DocumentId, + pub embedder_id: u8, + pub extractor_id: u8, + _padding: [u8; 2], +} + +impl ArroySetVector { + fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { + let skip = EntryHeader::variant_size() + mem::size_of::(); + &frame[skip..] + } + + /// Read the embedding and write it into an aligned `f32` Vec. + pub fn read_all_embeddings_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + vec: &'v mut Vec, + ) -> &'v [f32] { + let embeddings_bytes = Self::embeddings_bytes(frame); + let embeddings_count = embeddings_bytes.len() / mem::size_of::(); + vec.resize(embeddings_count, 0.0); + bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes); + &vec[..] 
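+        // (the frame payload is unaligned, so we copy into the caller's
+        // Vec<f32>; reusing that Vec avoids a fresh allocation per frame)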
+ } +} + #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] #[repr(u16)] pub enum Database { @@ -398,6 +462,7 @@ pub enum Database { FacetIdStringDocids, FieldIdDocidFacetStrings, FieldIdDocidFacetF64s, + VectorEmbedderCategoryId, } impl Database { @@ -419,6 +484,7 @@ impl Database { Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(), Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(), + Database::VectorEmbedderCategoryId => index.embedder_category_id.remap_types(), } } @@ -440,6 +506,7 @@ impl Database { Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS, Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS, Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S, + Database::VectorEmbedderCategoryId => db_name::VECTOR_EMBEDDER_CATEGORY_ID, } } } @@ -568,6 +635,82 @@ impl<'b> ExtractorBbqueueSender<'b> { Ok(()) } + fn set_vector_for_extractor( + &self, + docid: u32, + embedder_id: u8, + extractor_id: u8, + embedding: Option, + ) -> crate::Result<()> { + let max_grant = self.max_grant; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + // If there are no vectors we specify the dimensions + // to zero to allocate no extra space at all + let dimensions = embedding.as_ref().map_or(0, |emb| emb.len()); + + let arroy_set_vector = + ArroySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] }; + let payload_header = EntryHeader::ArroySetVector(arroy_set_vector); + let total_length = EntryHeader::total_set_vector_size(dimensions); + if total_length > max_grant { + let mut value_file = tempfile::tempfile().map(BufWriter::new)?; + let embedding = embedding.expect("set_vector without a vector does not fit in RAM"); + + let mut embedding_bytes = bytemuck::cast_slice(&embedding); + io::copy(&mut embedding_bytes, &mut value_file)?; + + let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; + let embedding = unsafe { Mmap::map(&value_file)? }; + + let large_vectors = LargeVector { docid, embedder_id, extractor_id, embedding }; + self.sender.send(ReceiverAction::LargeVector(large_vectors)).unwrap(); + + return Ok(()); + } + + // Spin loop to have a frame the size we requested. 
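+        // (the closure below serializes the header and then the raw f32
+        // payload directly into the granted frame)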
+ reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + + if dimensions != 0 { + let output_iter = + remaining.chunks_exact_mut(dimensions * mem::size_of::()); + + for (embedding, output) in embedding.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } + } + + Ok(()) + }, + )?; + + Ok(()) + } + + fn embedding_status( + &self, + name: &str, + infos: crate::vector::db::EmbedderInfo, + ) -> crate::Result<()> { + let bytes = infos.to_bytes().map_err(|_| { + InternalError::Serialization(crate::SerializationError::Encoding { + db_name: Some(Database::VectorEmbedderCategoryId.database_name()), + }) + })?; + self.write_key_value(Database::VectorEmbedderCategoryId, name.as_bytes(), &bytes) + } + fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { InternalError::StorePut { @@ -942,9 +1085,18 @@ impl EmbeddingSender<'_, '_> { &self, docid: DocumentId, embedder_id: u8, - embedding: Embedding, + extractor_id: u8, + embedding: Option, ) -> crate::Result<()> { - self.0.set_vectors(docid, embedder_id, &[embedding]) + self.0.set_vector_for_extractor(docid, embedder_id, extractor_id, embedding) + } + + pub(crate) fn embedding_status( + &self, + name: &str, + infos: crate::vector::db::EmbedderInfo, + ) -> crate::Result<()> { + self.0.embedding_status(name, infos) } } diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index b07cc0298..d520bb952 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -12,6 +12,7 @@ use super::vector_document::VectorDocument; use super::{KvReaderFieldId, KvWriterFieldId}; use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::documents::FieldIdMapper; +use crate::update::del_add::KvReaderDelAdd; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::new::vector_document::VectorDocumentFromDb; use crate::vector::settings::EmbedderAction; @@ -469,6 +470,110 @@ impl<'doc> Versions<'doc> { } } +#[derive(Debug)] +pub struct KvDelAddDocument<'a, Mapper: FieldIdMapper> { + document: &'a obkv::KvReaderU16, + side: crate::update::del_add::DelAdd, + fields_ids_map: &'a Mapper, +} + +impl<'a, Mapper: FieldIdMapper> KvDelAddDocument<'a, Mapper> { + pub fn new( + document: &'a obkv::KvReaderU16, + side: crate::update::del_add::DelAdd, + fields_ids_map: &'a Mapper, + ) -> Self { + Self { document, side, fields_ids_map } + } + + fn get(&self, k: &str) -> Result> { + let Some(id) = self.fields_ids_map.id(k) else { return Ok(None) }; + let Some(value) = self.document.get(id) else { return Ok(None) }; + let Some(value) = KvReaderDelAdd::from_slice(value).get(self.side) else { return Ok(None) }; + + let value = serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?; + + Ok(Some(value)) + } +} + +impl<'a, Mapper: FieldIdMapper> Document<'a> for KvDelAddDocument<'a, Mapper> { + fn iter_top_level_fields(&self) -> impl Iterator> { + let mut it = self.document.iter(); + + std::iter::from_fn(move || loop { + let (fid, value) = it.next()?; + let Some(value) = 
KvReaderDelAdd::from_slice(value).get(self.side) else { + continue; + }; + let name = match self.fields_ids_map.name(fid).ok_or( + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "getting current document", + }), + ) { + Ok(name) => name, + Err(error) => return Some(Err(error.into())), + }; + + if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME { + continue; + } + + let res = (|| { + let value = + serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?; + + Ok((name, value)) + })(); + + return Some(res); + }) + } + + fn top_level_fields_count(&self) -> usize { + let mut it = self.document.iter(); + + std::iter::from_fn(move || loop { + let (fid, value) = it.next()?; + let Some(_) = KvReaderDelAdd::from_slice(value).get(self.side) else { + continue; + }; + let name = match self.fields_ids_map.name(fid).ok_or( + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "getting current document", + }), + ) { + Ok(name) => name, + Err(_) => return Some(()), + }; + + if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME { + continue; + } + + return Some(()); + }) + .count() + } + + fn top_level_field(&self, k: &str) -> Result> { + if k == RESERVED_VECTORS_FIELD_NAME || k == RESERVED_GEO_FIELD_NAME { + return Ok(None); + } + self.get(k) + } + + fn vectors_field(&self) -> Result> { + self.get(RESERVED_VECTORS_FIELD_NAME) + } + + fn geo_field(&self) -> Result> { + self.get(RESERVED_GEO_FIELD_NAME) + } +} + pub struct DocumentIdentifiers<'doc> { docid: DocumentId, external_document_id: &'doc str, diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 2b9161319..1a40615e7 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -11,7 +11,7 @@ use super::vector_document::{ use crate::attribute_patterns::PatternMatch; use crate::documents::FieldIdMapper; use crate::update::new::document::DocumentIdentifiers; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::{DocumentId, Index, InternalError, Result}; pub enum DocumentChange<'doc> { @@ -70,7 +70,7 @@ impl<'doc> Insertion<'doc> { pub fn inserted_vectors( &self, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result>> { VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders) } @@ -241,7 +241,7 @@ impl<'doc> Update<'doc> { pub fn only_changed_vectors( &self, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result>> { VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders) } @@ -252,7 +252,7 @@ impl<'doc> Update<'doc> { index: &'doc Index, mapper: &'doc Mapper, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result>> { if self.from_scratch { MergedVectorDocument::without_db( diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 5c1a1927a..31d2ada0f 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -7,8 +7,7 @@ use hashbrown::HashMap; use super::DelAddRoaringBitmap; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::update::new::channel::{DocumentsSender, 
ExtractorBbqueueSender}; -use crate::update::new::document::{write_to_obkv, Document}; -use crate::update::new::document::{DocumentContext, DocumentIdentifiers}; +use crate::update::new::document::{write_to_obkv, Document, DocumentContext, DocumentIdentifiers}; use crate::update::new::indexer::document_changes::{Extractor, IndexingContext}; use crate::update::new::indexer::settings_changes::{ settings_change_extract, DocumentsIndentifiers, SettingsChangeExtractor, @@ -19,16 +18,16 @@ use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; use crate::update::settings::SettingsDelta; use crate::vector::settings::EmbedderAction; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::Result; pub struct DocumentsExtractor<'a, 'b> { document_sender: DocumentsSender<'a, 'b>, - embedders: &'a EmbeddingConfigs, + embedders: &'a RuntimeEmbedders, } impl<'a, 'b> DocumentsExtractor<'a, 'b> { - pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self { + pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a RuntimeEmbedders) -> Self { Self { document_sender, embedders } } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 4d308018a..3b8f5fa58 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -1,30 +1,35 @@ use std::cell::RefCell; use std::collections::BTreeMap; +use std::fmt::Debug; use bumpalo::collections::Vec as BVec; use bumpalo::Bump; use hashbrown::{DefaultHashBuilder, HashMap}; -use super::cache::DelAddRoaringBitmap; use crate::error::FaultSource; use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; -use crate::update::new::document::{DocumentContext, DocumentIdentifiers}; +use crate::update::new::document::{Document, DocumentContext, DocumentIdentifiers}; use crate::update::new::indexer::document_changes::Extractor; use crate::update::new::indexer::settings_changes::SettingsChangeExtractor; use crate::update::new::thread_local::MostlySend; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; +use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta}; use crate::vector::error::{ EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistributionBump, }; +use crate::vector::extractor::{ + DocumentTemplateExtractor, Extractor as VectorExtractor, RequestFragmentExtractor, +}; +use crate::vector::session::{EmbedSession, Input, Metadata, OnEmbed}; use crate::vector::settings::{EmbedderAction, ReindexAction}; -use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; +use crate::vector::{Embedding, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment}; use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; pub struct EmbeddingExtractor<'a, 'b> { - embedders: &'a EmbeddingConfigs, + embedders: &'a RuntimeEmbedders, sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, embedder_stats: &'a EmbedderStats, @@ -33,7 +38,7 @@ pub struct EmbeddingExtractor<'a, 'b> { impl<'a, 'b> EmbeddingExtractor<'a, 'b> { pub fn new( - embedders: &'a EmbeddingConfigs, + embedders: &'a RuntimeEmbedders, sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, embedder_stats: &'a EmbedderStats, @@ -45,7 +50,7 @@ impl<'a, 'b> 
EmbeddingExtractor<'a, 'b> { } pub struct EmbeddingExtractorData<'extractor>( - pub HashMap, + pub HashMap, ); unsafe impl MostlySend for EmbeddingExtractorData<'_> {} @@ -67,19 +72,18 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { UnusedVectorsDistributionBump::new_in(&context.doc_alloc); let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); - for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { - let embedder_id = - context.index.embedder_category_id.get(&context.rtxn, embedder_name)?.ok_or_else( - || InternalError::DatabaseMissingEntry { - db_name: "embedder_category_id", - key: None, - }, - )?; + let embedder_db = context.index.embedding_configs(); + for (embedder_name, runtime) in embedders { + let embedder_info = embedder_db + .embedder_info(&context.rtxn, embedder_name)? + .ok_or_else(|| InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + })?; all_chunks.push(Chunks::new( - embedder, - embedder_id, + runtime, + embedder_info, embedder_name, - prompt, context.data, &self.possible_embedding_mistakes, self.embedder_stats, @@ -94,19 +98,14 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { match change { DocumentChange::Deletion(deletion) => { // vector deletion is handled by document sender, - // we still need to accomodate deletion from user_provided + // we still need to accomodate deletion from embedding_status for chunks in &mut all_chunks { - // regenerate: true means we delete from user_provided - chunks.set_regenerate(deletion.docid(), true); + let (is_user_provided, must_regenerate) = + chunks.is_user_provided_must_regenerate(deletion.docid()); + chunks.clear_status(deletion.docid(), is_user_provided, must_regenerate); } } DocumentChange::Update(update) => { - let old_vectors = update.current_vectors( - &context.rtxn, - context.index, - context.db_fields_ids_map, - &context.doc_alloc, - )?; let new_vectors = update.only_changed_vectors(&context.doc_alloc, self.embedders)?; @@ -115,19 +114,16 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { } for chunks in &mut all_chunks { - let embedder_name = chunks.embedder_name(); - let prompt = chunks.prompt(); + let (old_is_user_provided, old_must_regenerate) = + chunks.is_user_provided_must_regenerate(update.docid()); - let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap(); + let embedder_name = chunks.embedder_name(); // case where we have a `_vectors` field in the updated document if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { new_vectors.vectors_for_key(embedder_name).transpose() }) { let new_vectors = new_vectors?; - if old_vectors.regenerate != new_vectors.regenerate { - chunks.set_regenerate(update.docid(), new_vectors.regenerate); - } // do we have set embeddings? if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( @@ -139,97 +135,62 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { document_id: update.external_document_id().to_string(), error: error.to_string(), })?, + old_is_user_provided, + old_must_regenerate, + new_vectors.regenerate, )?; // regenerate if the new `_vectors` fields is set to. 
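+                        // In the `regenerate` branch below, the old inline prompt rendering
+                        // is replaced by a single `update_autogenerated` call: it renders
+                        // the old and the new document with every extractor and only
+                        // requests a fresh embedding when the rendered input changed.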
} else if new_vectors.regenerate { - let new_rendered = prompt.render_document( - update.external_document_id(), - update.merged( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - let must_regenerate = if !old_vectors.regenerate { - // we just enabled `regenerate` - true - } else { - let old_rendered = prompt.render_document( - update.external_document_id(), - update.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - ); - - if let Ok(old_rendered) = old_rendered { - // must regenerate if the rendered changed - new_rendered != old_rendered - } else { - // cannot check previous rendered, better regenerate - true - } - }; - - if must_regenerate { - chunks.set_autogenerated( - update.docid(), - update.external_document_id(), - new_rendered, - &unused_vectors_distribution, - )?; - } - } - // no `_vectors` field, so only regenerate if the document is already set to in the DB. - } else if old_vectors.regenerate { - let new_rendered = prompt.render_document( - update.external_document_id(), - update.merged( + let new_document = update.merged( &context.rtxn, context.index, context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - - let must_regenerate = { - let old_rendered = prompt.render_document( - update.external_document_id(), - update.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - ); - if let Ok(old_rendered) = old_rendered { - // regenerate if the rendered version changed - new_rendered != old_rendered - } else { - // if we cannot render the previous version of the documents, let's regenerate - true - } - }; - - if must_regenerate { - chunks.set_autogenerated( + )?; + let old_document = update.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?; + chunks.update_autogenerated( update.docid(), update.external_document_id(), - new_rendered, + old_document, + new_document, + context.new_fields_ids_map, &unused_vectors_distribution, + old_is_user_provided, + old_must_regenerate, + true, )?; } + // no `_vectors` field, so only regenerate if the document is already set to in the DB. 
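+                        // Note: the branch below keys off `old_must_regenerate`, which now
+                        // comes from the per-embedder `EmbeddingStatus` bitmaps instead of
+                        // the `regenerate` flag of the stored `_vectors` entry, so the
+                        // update path no longer needs to load `update.current_vectors()`.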
+ } else if old_must_regenerate { + let new_document = update.merged( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?; + let old_document = update.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?; + chunks.update_autogenerated( + update.docid(), + update.external_document_id(), + old_document, + new_document, + context.new_fields_ids_map, + &unused_vectors_distribution, + old_is_user_provided, + old_must_regenerate, + true, + )?; } } } DocumentChange::Insertion(insertion) => { + let (default_is_user_provided, default_must_regenerate) = (false, true); let new_vectors = insertion.inserted_vectors(&context.doc_alloc, self.embedders)?; if let Some(new_vectors) = &new_vectors { @@ -238,13 +199,11 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { for chunks in &mut all_chunks { let embedder_name = chunks.embedder_name(); - let prompt = chunks.prompt(); // if no inserted vectors, then regenerate: true + no embeddings => autogenerate if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { new_vectors.vectors_for_key(embedder_name).transpose() }) { let new_vectors = new_vectors?; - chunks.set_regenerate(insertion.docid(), new_vectors.regenerate); if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( insertion.external_document_id(), @@ -257,33 +216,36 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { .to_string(), error: error.to_string(), })?, + default_is_user_provided, + default_must_regenerate, + new_vectors.regenerate, )?; } else if new_vectors.regenerate { - let rendered = prompt.render_document( + chunks.insert_autogenerated( + insertion.docid(), insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, - &context.doc_alloc, - )?; - chunks.set_autogenerated( - insertion.docid(), - insertion.external_document_id(), - rendered, &unused_vectors_distribution, + true, )?; + } else { + chunks.set_status( + insertion.docid(), + default_is_user_provided, + default_must_regenerate, + false, + false, + ); } } else { - let rendered = prompt.render_document( + chunks.insert_autogenerated( + insertion.docid(), insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, - &context.doc_alloc, - )?; - chunks.set_autogenerated( - insertion.docid(), - insertion.external_document_id(), - rendered, &unused_vectors_distribution, + true, )?; } } @@ -501,156 +463,74 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding } } -// **Warning**: the destructor of this struct is not normally run, make sure that all its fields: -// 1. don't have side effects tied to they destructors -// 2. if allocated, are allocated inside of the bumpalo -// -// Currently this is the case as: -// 1. BVec are inside of the bumaplo -// 2. All other fields are either trivial (u8) or references. 
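+// The removed warning above is no longer needed: the manual buffering, and the
+// `std::mem::forget(self)` trick in the old `drain`, move into `EmbedSession`
+// (see the session.rs hunks below), which owns the bump-allocated buffers.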
-struct Chunks<'a, 'b, 'extractor> { - texts: BVec<'a, &'a str>, - ids: BVec<'a, DocumentId>, - - embedder: &'a Embedder, +pub struct OnEmbeddingDocumentUpdates<'doc, 'b> { embedder_id: u8, - embedder_name: &'a str, - dimensions: usize, - prompt: &'a Prompt, - possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - embedder_stats: &'a EmbedderStats, - user_provided: &'a RefCell>, - threads: &'a ThreadPoolNoAbort, - sender: EmbeddingSender<'a, 'b>, - has_manual_generation: Option<&'a str>, + sender: EmbeddingSender<'doc, 'b>, + possible_embedding_mistakes: &'doc PossibleEmbeddingMistakes, } -impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { - #[allow(clippy::too_many_arguments)] - pub fn new( - embedder: &'a Embedder, - embedder_id: u8, - embedder_name: &'a str, - prompt: &'a Prompt, - user_provided: &'a RefCell>, - possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - embedder_stats: &'a EmbedderStats, - threads: &'a ThreadPoolNoAbort, - sender: EmbeddingSender<'a, 'b>, - doc_alloc: &'a Bump, - ) -> Self { - let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); - let texts = BVec::with_capacity_in(capacity, doc_alloc); - let ids = BVec::with_capacity_in(capacity, doc_alloc); - let dimensions = embedder.dimensions(); - Self { - texts, - ids, - embedder, - prompt, - possible_embedding_mistakes, - embedder_stats, - threads, - sender, - embedder_id, - embedder_name, - user_provided, - has_manual_generation: None, - dimensions, - } +impl OnEmbeddingDocumentUpdates<'_, '_> { + fn clear_vectors(&self, docid: DocumentId) { + self.sender.set_vectors(docid, self.embedder_id, vec![]).unwrap(); } +} - pub fn set_autogenerated( +impl<'doc> OnEmbed<'doc> for OnEmbeddingDocumentUpdates<'doc, '_> { + type ErrorMetadata = UnusedVectorsDistributionBump<'doc>; + fn process_embedding_response( &mut self, - docid: DocumentId, - external_docid: &'a str, - rendered: &'a str, - unused_vectors_distribution: &UnusedVectorsDistributionBump, - ) -> Result<()> { - let is_manual = matches!(&self.embedder, &Embedder::UserProvided(_)); - if is_manual { - self.has_manual_generation.get_or_insert(external_docid); - } - - if self.texts.len() < self.texts.capacity() { - self.texts.push(rendered); - self.ids.push(docid); - return Ok(()); - } - - Self::embed_chunks( - &mut self.texts, - &mut self.ids, - self.embedder, - self.embedder_id, - self.embedder_name, - self.possible_embedding_mistakes, - self.embedder_stats, - unused_vectors_distribution, - self.threads, - self.sender, - self.has_manual_generation.take(), - ) + response: crate::vector::session::EmbeddingResponse<'doc>, + ) { + self.sender + .set_vector( + response.metadata.docid, + self.embedder_id, + response.metadata.extractor_id, + response.embedding, + ) + .unwrap(); } - pub fn drain( - mut self, - unused_vectors_distribution: &UnusedVectorsDistributionBump, - ) -> Result<()> { - let res = Self::embed_chunks( - &mut self.texts, - &mut self.ids, - self.embedder, - self.embedder_id, - self.embedder_name, - self.possible_embedding_mistakes, - self.embedder_stats, - unused_vectors_distribution, - self.threads, - self.sender, - self.has_manual_generation, - ); - // optimization: don't run bvec dtors as they only contain bumpalo allocated stuff - std::mem::forget(self); - res + fn process_embeddings(&mut self, metadata: Metadata<'doc>, embeddings: Vec) { + self.sender.set_vectors(metadata.docid, self.embedder_id, embeddings).unwrap(); } - #[allow(clippy::too_many_arguments)] - pub fn embed_chunks( - texts: &mut BVec<'a, 
&'a str>, - ids: &mut BVec<'a, DocumentId>, - embedder: &Embedder, - embedder_id: u8, - embedder_name: &str, - possible_embedding_mistakes: &PossibleEmbeddingMistakes, - embedder_stats: &EmbedderStats, + fn process_embedding_error( + &mut self, + error: crate::vector::hf::EmbedError, + embedder_name: &'doc str, unused_vectors_distribution: &UnusedVectorsDistributionBump, - threads: &ThreadPoolNoAbort, - sender: EmbeddingSender<'a, 'b>, - has_manual_generation: Option<&'a str>, - ) -> Result<()> { - if let Some(external_docid) = has_manual_generation { - let mut msg = format!( - r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{}", - external_docid, - if ids.len() > 1 { - format!(" and at least {} other document(s)", ids.len() - 1) - } else { - "".to_string() - } - ); - - msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); + metadata: &[Metadata<'doc>], + ) -> crate::Error { + if let FaultSource::Bug = error.fault { + crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(error.into())) + } else { + let mut msg = if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + format!( + r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{} +- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.", + if let Some(first) = metadata.first() { first.external_docid } else { "???" }, + if metadata.len() > 1 { + format!(" and at least {} other document(s)", metadata.len() - 1) + } else { + "".to_string() + } + ) + } else { + format!(r"While embedding documents for embedder `{embedder_name}`: {error}") + }; let mut hint_count = 0; - for (vector_misspelling, count) in possible_embedding_mistakes.vector_mistakes().take(2) + for (vector_misspelling, count) in + self.possible_embedding_mistakes.vector_mistakes().take(2) { msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); hint_count += 1; } - for (embedder_misspelling, count) in possible_embedding_mistakes + for (embedder_misspelling, count) in self + .possible_embedding_mistakes .embedder_mistakes_bump(embedder_name, unused_vectors_distribution) .take(2) { @@ -659,107 +539,413 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { } if hint_count == 0 { - msg += &format!( - "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" - ); - } - - return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))); - } - - let res = match embedder.embed_index_ref(texts.as_slice(), threads, embedder_stats) { - Ok(embeddings) => { - for (docid, embedding) in ids.into_iter().zip(embeddings) { - sender.set_vector(*docid, embedder_id, embedding).unwrap(); - } - Ok(()) - } - Err(error) => { - if let FaultSource::Bug = error.fault { - Err(crate::Error::InternalError(crate::InternalError::VectorEmbeddingError( - error.into(), - ))) - } else { - let mut msg = format!( - r"While embedding documents for embedder `{embedder_name}`: {error}" + if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + msg += &format!( + "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" ); - - if let EmbedErrorKind::ManualEmbed(_) = &error.kind { - msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); - 
} - - let mut hint_count = 0; - - for (vector_misspelling, count) in - possible_embedding_mistakes.vector_mistakes().take(2) - { - msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); - hint_count += 1; - } - - for (embedder_misspelling, count) in possible_embedding_mistakes - .embedder_mistakes_bump(embedder_name, unused_vectors_distribution) - .take(2) - { - msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); - hint_count += 1; - } - - if hint_count == 0 { - if let EmbedErrorKind::ManualEmbed(_) = &error.kind { - msg += &format!( - "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" - ); - } - } - - Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))) } } + + crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)) + } + } +} + +struct Chunks<'a, 'b, 'extractor> { + dimensions: usize, + status_delta: &'a RefCell>, + status: EmbeddingStatus, + kind: ChunkType<'a, 'b>, +} + +enum ChunkType<'a, 'b> { + DocumentTemplate { + document_template: &'a Prompt, + session: EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, &'a str>, + }, + Fragments { + fragments: &'a [RuntimeFragment], + session: EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, serde_json::Value>, + }, +} + +impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { + #[allow(clippy::too_many_arguments)] + pub fn new( + runtime: &'a RuntimeEmbedder, + embedder_info: EmbedderInfo, + embedder_name: &'a str, + status_delta: &'a RefCell>, + possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, + embedder_stats: &'a EmbedderStats, + threads: &'a ThreadPoolNoAbort, + sender: EmbeddingSender<'a, 'b>, + doc_alloc: &'a Bump, + ) -> Self { + let embedder = &runtime.embedder; + let dimensions = embedder.dimensions(); + + let fragments = runtime.fragments.as_slice(); + let kind = if fragments.is_empty() { + ChunkType::DocumentTemplate { + document_template: &runtime.document_template, + session: EmbedSession::new( + &runtime.embedder, + embedder_name, + threads, + doc_alloc, + embedder_stats, + OnEmbeddingDocumentUpdates { + embedder_id: embedder_info.embedder_id, + sender, + possible_embedding_mistakes, + }, + ), + } + } else { + ChunkType::Fragments { + fragments, + session: EmbedSession::new( + &runtime.embedder, + embedder_name, + threads, + doc_alloc, + embedder_stats, + OnEmbeddingDocumentUpdates { + embedder_id: embedder_info.embedder_id, + sender, + possible_embedding_mistakes, + }, + ), + } }; - texts.clear(); - ids.clear(); - res + + Self { dimensions, status: embedder_info.embedding_status, status_delta, kind } } - pub fn prompt(&self) -> &'a Prompt { - self.prompt + pub fn is_user_provided_must_regenerate(&self, docid: DocumentId) -> (bool, bool) { + self.status.is_user_provided_must_regenerate(docid) + } + + #[allow(clippy::too_many_arguments)] + pub fn update_autogenerated<'doc, OD: Document<'doc> + Debug, ND: Document<'doc> + Debug>( + &mut self, + docid: DocumentId, + external_docid: &'a str, + old_document: OD, + new_document: ND, + new_fields_ids_map: &'a RefCell, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_must_regenerate: bool, + ) -> Result<()> + where + 'a: 'doc, + { + let extracted = match &mut self.kind { + ChunkType::DocumentTemplate { document_template, session } => { + let doc_alloc = session.doc_alloc(); + let ex = 
DocumentTemplateExtractor::new( + document_template, + doc_alloc, + new_fields_ids_map, + ); + + if old_is_user_provided { + session.on_embed_mut().clear_vectors(docid); + } + + update_autogenerated( + docid, + external_docid, + [ex], + old_document, + new_document, + &external_docid, + old_must_regenerate, + session, + unused_vectors_distribution, + )? + } + ChunkType::Fragments { fragments, session } => { + let doc_alloc = session.doc_alloc(); + let extractors = fragments.iter().map(|fragment| { + RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors() + }); + + if old_is_user_provided { + session.on_embed_mut().clear_vectors(docid); + } + + update_autogenerated( + docid, + external_docid, + extractors, + old_document, + new_document, + &(), + old_must_regenerate, + session, + unused_vectors_distribution, + )? + } + }; + + self.set_status( + docid, + old_is_user_provided, + old_must_regenerate, + old_is_user_provided && !extracted, + new_must_regenerate, + ); + + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + pub fn insert_autogenerated + Debug>( + &mut self, + docid: DocumentId, + external_docid: &'a str, + new_document: D, + new_fields_ids_map: &'a RefCell, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, + new_must_regenerate: bool, + ) -> Result<()> { + let (default_is_user_provided, default_must_regenerate) = (false, true); + self.set_status( + docid, + default_is_user_provided, + default_must_regenerate, + false, + new_must_regenerate, + ); + + match &mut self.kind { + ChunkType::DocumentTemplate { document_template, session } => { + let doc_alloc = session.doc_alloc(); + let ex = DocumentTemplateExtractor::new( + document_template, + doc_alloc, + new_fields_ids_map, + ); + + insert_autogenerated( + docid, + external_docid, + [ex], + new_document, + &external_docid, + session, + unused_vectors_distribution, + )?; + } + ChunkType::Fragments { fragments, session } => { + let doc_alloc = session.doc_alloc(); + let extractors = fragments.iter().map(|fragment| { + RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors() + }); + + insert_autogenerated( + docid, + external_docid, + extractors, + new_document, + &(), + session, + unused_vectors_distribution, + )?; + } + } + Ok(()) + } + + pub fn drain(self, unused_vectors_distribution: &UnusedVectorsDistributionBump) -> Result<()> { + match self.kind { + ChunkType::DocumentTemplate { document_template: _, session } => { + session.drain(unused_vectors_distribution)?; + } + ChunkType::Fragments { fragments: _, session } => { + session.drain(unused_vectors_distribution)?; + } + } + Ok(()) } pub fn embedder_name(&self) -> &'a str { - self.embedder_name - } - - fn set_regenerate(&self, docid: DocumentId, regenerate: bool) { - let mut user_provided = self.user_provided.borrow_mut(); - let user_provided = user_provided.0.entry_ref(self.embedder_name).or_default(); - if regenerate { - // regenerate == !user_provided - user_provided.insert_del_u32(docid); - } else { - user_provided.insert_add_u32(docid); + match &self.kind { + ChunkType::DocumentTemplate { document_template: _, session } => { + session.embedder_name() + } + ChunkType::Fragments { fragments: _, session } => session.embedder_name(), } } - fn set_vectors( + fn set_status( &self, + docid: DocumentId, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_is_user_provided: bool, + new_must_regenerate: bool, + ) { + if EmbeddingStatusDelta::needs_change( + old_is_user_provided, + old_must_regenerate, + new_is_user_provided, + 
new_must_regenerate, + ) { + let mut status_delta = self.status_delta.borrow_mut(); + let status_delta = status_delta.0.entry_ref(self.embedder_name()).or_default(); + status_delta.push_delta( + docid, + old_is_user_provided, + old_must_regenerate, + new_is_user_provided, + new_must_regenerate, + ); + } + } + + pub fn clear_status(&self, docid: DocumentId, is_user_provided: bool, must_regenerate: bool) { + // these value ensure both roaring are at 0. + if EmbeddingStatusDelta::needs_clear(is_user_provided, must_regenerate) { + let mut status_delta = self.status_delta.borrow_mut(); + let status_delta = status_delta.0.entry_ref(self.embedder_name()).or_default(); + status_delta.clear_docid(docid, is_user_provided, must_regenerate); + } + } + + pub fn set_vectors( + &mut self, external_docid: &'a str, docid: DocumentId, embeddings: Vec, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_must_regenerate: bool, ) -> Result<()> { + self.set_status( + docid, + old_is_user_provided, + old_must_regenerate, + true, + new_must_regenerate, + ); for (embedding_index, embedding) in embeddings.iter().enumerate() { if embedding.len() != self.dimensions { return Err(UserError::InvalidIndexingVectorDimensions { expected: self.dimensions, found: embedding.len(), - embedder_name: self.embedder_name.to_string(), + embedder_name: self.embedder_name().to_string(), document_id: external_docid.to_string(), embedding_index, } .into()); } } - self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap(); + match &mut self.kind { + ChunkType::DocumentTemplate { document_template: _, session } => { + session.on_embed_mut().process_embeddings( + Metadata { docid, external_docid, extractor_id: 0 }, + embeddings, + ); + } + ChunkType::Fragments { fragments: _, session } => { + session.on_embed_mut().process_embeddings( + Metadata { docid, external_docid, extractor_id: 0 }, + embeddings, + ); + } + } + Ok(()) } } + +#[allow(clippy::too_many_arguments)] +fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>( + docid: DocumentId, + external_docid: &'a str, + extractors: impl IntoIterator, + old_document: OD, + new_document: ND, + meta: &E::DocumentMetadata, + old_must_regenerate: bool, + session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, +) -> Result +where + OD: Document<'doc> + Debug, + ND: Document<'doc> + Debug, + E: VectorExtractor<'a>, + E::Input: Input, + crate::Error: From, +{ + let mut extracted = false; + for extractor in extractors { + let new_rendered = extractor.extract(&new_document, meta)?; + let must_regenerate = if !old_must_regenerate { + // we just enabled `regenerate` + true + } else { + let old_rendered = extractor.extract(&old_document, meta); + + if let Ok(old_rendered) = old_rendered { + // must regenerate if the rendered changed + new_rendered != old_rendered + } else { + // cannot check previous rendered, better regenerate + true + } + }; + + if must_regenerate { + extracted = true; + let metadata = + Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; + + if let Some(new_rendered) = new_rendered { + session.request_embedding(metadata, new_rendered, unused_vectors_distribution)? 
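+                // `new_rendered` is `None` when an extractor produces no input for
+                // this document (e.g. a request fragment that does not apply to it);
+                // in that case the branch below clears any previously stored vector.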
+ } else { + // remove any existing embedding + OnEmbed::process_embedding_response( + session.on_embed_mut(), + crate::vector::session::EmbeddingResponse { metadata, embedding: None }, + ); + } + } + } + + Ok(extracted) +} + +fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>( + docid: DocumentId, + external_docid: &'a str, + extractors: impl IntoIterator, + new_document: D, + meta: &E::DocumentMetadata, + session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, +) -> Result<()> +where + E: VectorExtractor<'a>, + E::Input: Input, + crate::Error: From, +{ + for extractor in extractors { + let new_rendered = extractor.extract(&new_document, meta)?; + + if let Some(new_rendered) = new_rendered { + session.request_embedding( + Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }, + new_rendered, + unused_vectors_distribution, + )?; + } + } + + Ok(()) +} diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index bb275d8aa..a3e7842c2 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -13,21 +13,17 @@ use super::super::thread_local::{FullySend, ThreadLocal}; use super::super::FacetFieldIdsDelta; use super::document_changes::{extract, DocumentChanges, IndexingContext}; use super::settings_changes::settings_change_extract; -use crate::documents::FieldIdMapper; -use crate::documents::PrimaryKey; -use crate::index::IndexEmbeddingConfig; -use crate::progress::EmbedderStats; -use crate::progress::MergingWordCache; +use crate::documents::{FieldIdMapper, PrimaryKey}; +use crate::progress::{EmbedderStats, MergingWordCache}; use crate::proximity::ProximityPrecision; use crate::update::new::extract::EmbeddingExtractor; use crate::update::new::indexer::settings_changes::DocumentsIndentifiers; use crate::update::new::merger::merge_and_send_rtree; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::SettingsDelta; -use crate::vector::EmbeddingConfigs; -use crate::Index; -use crate::InternalError; -use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; +use crate::vector::db::IndexEmbeddingConfig; +use crate::vector::RuntimeEmbedders; +use crate::{Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; #[allow(clippy::too_many_arguments)] pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( @@ -35,7 +31,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( indexing_context: IndexingContext, indexer_span: Span, extractor_sender: ExtractorBbqueueSender, - embedders: &EmbeddingConfigs, + embedders: &RuntimeEmbedders, extractor_allocs: &'extractor mut ThreadLocal>, finished_extraction: &AtomicBool, field_distribution: &mut BTreeMap, @@ -275,14 +271,19 @@ where let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors"); let _entered = span.enter(); + let embedder_configs = index.embedding_configs(); for config in &mut index_embeddings { + let mut infos = embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap(); + 'data: for data in datastore.iter_mut() { let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { + let Some(delta) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided, modified_docids); + delta.apply_to(&mut infos.embedding_status); } + + 
extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap(); } } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 0efef48fd..507d1a650 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -24,7 +24,7 @@ use crate::progress::{EmbedderStats, Progress}; use crate::update::settings::SettingsDelta; use crate::update::GrenadParameters; use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; -use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs}; +use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort}; pub(crate) mod de; @@ -54,7 +54,7 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>( new_fields_ids_map: FieldsIdsMap, new_primary_key: Option>, document_changes: &DC, - embedders: EmbeddingConfigs, + embedders: RuntimeEmbedders, must_stop_processing: &'indexer MSP, progress: &'indexer Progress, embedder_stats: &'indexer EmbedderStats, @@ -93,7 +93,7 @@ where grenad_parameters: &grenad_parameters, }; - let index_embeddings = index.embedding_configs(wtxn)?; + let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?; let mut modified_docids = roaring::RoaringBitmap::new(); @@ -133,20 +133,21 @@ where let arroy_writers: Result> = embedders .inner_as_ref() .iter() - .map(|(embedder_name, (embedder, _, was_quantized))| { - let embedder_index = index.embedder_category_id.get(wtxn, embedder_name)?.ok_or( - InternalError::DatabaseMissingEntry { + .map(|(embedder_name, runtime)| { + let embedder_index = index + .embedding_configs() + .embedder_id(wtxn, embedder_name)? + .ok_or(InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None, - }, - )?; + })?; - let dimensions = embedder.dimensions(); - let writer = ArroyWrapper::new(vector_arroy, embedder_index, *was_quantized); + let dimensions = runtime.embedder.dimensions(); + let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized); Ok(( embedder_index, - (embedder_name.as_str(), embedder.as_ref(), writer, dimensions), + (embedder_name.as_str(), &*runtime.embedder, writer, dimensions), )) }) .collect(); diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index fa48ff589..b8e3685f8 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -11,11 +11,11 @@ use super::super::channel::*; use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; -use crate::index::IndexEmbeddingConfig; use crate::progress::Progress; use crate::update::settings::InnerIndexSettings; +use crate::vector::db::IndexEmbeddingConfig; use crate::vector::settings::EmbedderAction; -use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings}; +use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders}; use crate::{Error, Index, InternalError, Result, UserError}; pub fn write_to_db( @@ -64,6 +64,14 @@ pub fn write_to_db( writer.del_items(wtxn, *dimensions, docid)?; writer.add_items(wtxn, docid, &embeddings)?; } + ReceiverAction::LargeVector( + large_vector @ LargeVector { docid, embedder_id, extractor_id, .. 
}, + ) => { + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let embedding = large_vector.read_embedding(*dimensions); + writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?; + } } // Every time the is a message in the channel we search @@ -137,7 +145,7 @@ where )?; } - index.put_embedding_configs(wtxn, index_embeddings)?; + index.embedding_configs().put_embedding_configs(wtxn, index_embeddings)?; Ok(()) } @@ -147,7 +155,7 @@ pub(super) fn update_index( wtxn: &mut RwTxn<'_>, new_fields_ids_map: FieldIdMapWithMetadata, new_primary_key: Option>, - embedders: EmbeddingConfigs, + embedders: RuntimeEmbedders, field_distribution: std::collections::BTreeMap, document_ids: roaring::RoaringBitmap, ) -> Result<()> { @@ -226,14 +234,36 @@ pub fn write_from_bbqueue( arroy_writers.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); - if embeddings.append(all_embeddings.to_vec()).is_err() { - return Err(Error::UserError(UserError::InvalidVectorDimensions { - expected: *dimensions, - found: all_embeddings.len(), - })); - } writer.del_items(wtxn, *dimensions, docid)?; - writer.add_items(wtxn, docid, &embeddings)?; + if !all_embeddings.is_empty() { + if embeddings.append(all_embeddings.to_vec()).is_err() { + return Err(Error::UserError(UserError::InvalidVectorDimensions { + expected: *dimensions, + found: all_embeddings.len(), + })); + } + writer.add_items(wtxn, docid, &embeddings)?; + } + } + EntryHeader::ArroySetVector( + asv @ ArroySetVector { docid, embedder_id, extractor_id, .. }, + ) => { + let frame = frame_with_header.frame(); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding); + + if embedding.is_empty() { + writer.del_item_in_store(wtxn, docid, extractor_id, *dimensions)?; + } else { + if embedding.len() != *dimensions { + return Err(Error::UserError(UserError::InvalidVectorDimensions { + expected: *dimensions, + found: embedding.len(), + })); + } + writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?; + } } } } diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index a52dab6a1..b59984248 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -12,9 +12,9 @@ use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions}; use super::indexer::de::DeserrRawValue; use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::documents::FieldIdMapper; -use crate::index::IndexEmbeddingConfig; +use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig}; use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors}; -use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs}; +use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders}; use crate::{DocumentId, Index, InternalError, Result, UserError}; #[derive(Serialize)] @@ -109,7 +109,7 @@ impl<'t> VectorDocumentFromDb<'t> { None => None, }; - let embedding_config = index.embedding_configs(rtxn)?; + let embedding_config = index.embedding_configs().embedding_configs(rtxn)?; Ok(Some(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc })) } @@ -118,6 +118,7 @@ impl<'t> VectorDocumentFromDb<'t> { &self, embedder_id: 
u8, config: &IndexEmbeddingConfig, + status: &EmbeddingStatus, ) -> Result> { let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized()); @@ -126,7 +127,7 @@ impl<'t> VectorDocumentFromDb<'t> { Ok(VectorEntry { has_configured_embedder: true, embeddings: Some(Embeddings::FromDb(vectors)), - regenerate: !config.user_provided.contains(self.docid), + regenerate: status.must_regenerate(self.docid), implicit: false, }) } @@ -137,9 +138,9 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { self.embedding_config .iter() .map(|config| { - let embedder_id = - self.index.embedder_category_id.get(self.rtxn, &config.name)?.unwrap(); - let entry = self.entry_from_db(embedder_id, config)?; + let info = + self.index.embedding_configs().embedder_info(self.rtxn, &config.name)?.unwrap(); + let entry = self.entry_from_db(info.embedder_id, config, &info.embedding_status)?; let config_name = self.doc_alloc.alloc_str(config.name.as_str()); Ok((&*config_name, entry)) }) @@ -156,11 +157,11 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { } fn vectors_for_key(&self, key: &str) -> Result>> { - Ok(match self.index.embedder_category_id.get(self.rtxn, key)? { - Some(embedder_id) => { + Ok(match self.index.embedding_configs().embedder_info(self.rtxn, key)? { + Some(info) => { let config = self.embedding_config.iter().find(|config| config.name == key).unwrap(); - Some(self.entry_from_db(embedder_id, config)?) + Some(self.entry_from_db(info.embedder_id, config, &info.embedding_status)?) } None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) { Some(embedding_from_doc) => { @@ -222,7 +223,7 @@ fn entry_from_raw_value( pub struct VectorDocumentFromVersions<'doc> { external_document_id: &'doc str, vectors: RawMap<'doc, FxBuildHasher>, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, } impl<'doc> VectorDocumentFromVersions<'doc> { @@ -230,7 +231,7 @@ impl<'doc> VectorDocumentFromVersions<'doc> { external_document_id: &'doc str, versions: &Versions<'doc>, bump: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result> { let document = DocumentFromVersions::new(versions); if let Some(vectors_field) = document.vectors_field()? { @@ -283,7 +284,7 @@ impl<'doc> MergedVectorDocument<'doc> { db_fields_ids_map: &'doc Mapper, versions: &Versions<'doc>, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result> { let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?; let new_doc = @@ -295,7 +296,7 @@ impl<'doc> MergedVectorDocument<'doc> { external_document_id: &'doc str, versions: &Versions<'doc>, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result> { let Some(new_doc) = VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)? 
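Taken together, the hunks above swap the old `EmbeddingConfigs` map of `(embedder, prompt, quantized)` triples for `RuntimeEmbedders`. The crate's actual definitions are not shown in this patch; the sketch below only reconstructs the shape implied by the call sites (`runtime.embedder`, `runtime.document_template`, `runtime.fragments`, `runtime.is_quantized`, `inner_as_ref()`), with stub types standing in for `Embedder` and `Prompt`, and the map type is a guess:

    use std::collections::BTreeMap;
    use std::sync::Arc;

    // Stand-ins for the crate's real types, only so the sketch is self-contained.
    pub struct Embedder;
    pub struct Prompt;

    pub struct RuntimeFragment {
        pub name: String,
        pub id: u8, // becomes the `extractor_id` stored next to each vector
    }

    // One bundle per configured embedder, replacing the old triple.
    pub struct RuntimeEmbedder {
        pub embedder: Arc<Embedder>,
        pub document_template: Prompt,       // used when `fragments` is empty
        pub fragments: Vec<RuntimeFragment>, // kept sorted by fragment name
        pub is_quantized: bool,
    }

    pub struct RuntimeEmbedders(BTreeMap<String, Arc<RuntimeEmbedder>>);

    impl RuntimeEmbedders {
        pub fn inner_as_ref(&self) -> &BTreeMap<String, Arc<RuntimeEmbedder>> {
            &self.0
        }
    }

Keeping `fragments` sorted by name is what allows the `binary_search_by_key` lookups in the settings-change path further down.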
diff --git a/crates/milli/src/vector/session.rs b/crates/milli/src/vector/session.rs index b6f229779..dd005e993 100644 --- a/crates/milli/src/vector/session.rs +++ b/crates/milli/src/vector/session.rs @@ -3,6 +3,7 @@ use bumpalo::Bump; use serde_json::Value; use super::{EmbedError, Embedder, Embedding}; +use crate::progress::EmbedderStats; use crate::{DocumentId, Result, ThreadPoolNoAbort}; type ExtractorId = u8; @@ -43,6 +44,8 @@ pub struct EmbedSession<'doc, C, I> { embedder_name: &'doc str, + embedder_stats: &'doc EmbedderStats, + on_embed: C, } @@ -51,6 +54,7 @@ pub trait Input: Sized { inputs: &[Self], embedder: &Embedder, threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, ) -> std::result::Result, EmbedError>; } @@ -59,8 +63,9 @@ impl Input for &'_ str { inputs: &[Self], embedder: &Embedder, threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, ) -> std::result::Result, EmbedError> { - embedder.embed_index_ref(inputs, threads) + embedder.embed_index_ref(inputs, threads, embedder_stats) } } @@ -69,8 +74,9 @@ impl Input for Value { inputs: &[Value], embedder: &Embedder, threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, ) -> std::result::Result, EmbedError> { - embedder.embed_index_ref_fragments(inputs, threads) + embedder.embed_index_ref_fragments(inputs, threads, embedder_stats) } } @@ -81,12 +87,21 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> { embedder_name: &'doc str, threads: &'doc ThreadPoolNoAbort, doc_alloc: &'doc Bump, + embedder_stats: &'doc EmbedderStats, on_embed: C, ) -> Self { let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); let texts = BVec::with_capacity_in(capacity, doc_alloc); let ids = BVec::with_capacity_in(capacity, doc_alloc); - Self { inputs: texts, metadata: ids, embedder, threads, embedder_name, on_embed } + Self { + inputs: texts, + metadata: ids, + embedder, + threads, + embedder_name, + embedder_stats, + on_embed, + } } pub fn request_embedding( @@ -114,7 +129,12 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> { if self.inputs.is_empty() { return Ok(()); } - let res = match I::embed_ref(self.inputs.as_slice(), self.embedder, self.threads) { + let res = match I::embed_ref( + self.inputs.as_slice(), + self.embedder, + self.threads, + self.embedder_stats, + ) { Ok(embeddings) => { for (metadata, embedding) in self.metadata.iter().copied().zip(embeddings) { self.on_embed.process_embedding_response(EmbeddingResponse { From cab5e35ff7b133b0743a852a76fdeac92c4b3f3f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:01:05 +0200 Subject: [PATCH 058/101] Implement in old settings indexer and old dump import indexer --- .../extract/extract_vector_points.rs | 771 ++++++++++++++---- .../src/update/index_documents/extract/mod.rs | 53 +- .../milli/src/update/index_documents/mod.rs | 100 ++- .../src/update/index_documents/transform.rs | 41 +- .../src/update/index_documents/typed_chunk.rs | 93 ++- crates/milli/src/vector/parsed_vectors.rs | 12 +- 6 files changed, 824 insertions(+), 246 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index e1981a615..0a179cfa5 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -1,4 +1,5 @@ use std::cmp::Ordering; +use std::collections::{BTreeMap, VecDeque}; use 
std::convert::{TryFrom, TryInto};
 use std::fs::File;
 use std::io::{self, BufReader, BufWriter};
@@ -6,25 +7,29 @@ use std::mem::size_of;
 use std::str::from_utf8;
 use std::sync::Arc;
 
+use bumpalo::Bump;
 use bytemuck::cast_slice;
+use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 use grenad::Writer;
+use obkv::KvReaderU16;
 use ordered_float::OrderedFloat;
-use roaring::RoaringBitmap;
 use serde_json::Value;
 
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::constants::RESERVED_VECTORS_FIELD_NAME;
 use crate::error::FaultSource;
 use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
-use crate::index::IndexEmbeddingConfig;
 use crate::progress::EmbedderStats;
 use crate::prompt::Prompt;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::settings::InnerIndexSettingsDiff;
+use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta};
 use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
+use crate::vector::extractor::{Extractor, ExtractorDiff, RequestFragmentExtractor};
 use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState};
+use crate::vector::session::{EmbedSession, Metadata, OnEmbed};
 use crate::vector::settings::ReindexAction;
-use crate::vector::{Embedder, Embedding};
+use crate::vector::{Embedder, Embedding, RuntimeEmbedder, RuntimeFragment};
 use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort};
 
 /// The length of the elements that are always in the buffer when inserting new values.
@@ -37,12 +42,13 @@ pub struct ExtractedVectorPoints {
     pub remove_vectors: grenad::Reader<BufReader<File>>,
     // docid -> prompt
     pub prompts: grenad::Reader<BufReader<File>>,
+    // docid, extractor_id -> Option<json>
+    pub inputs: grenad::Reader<BufReader<File>>,
 
     // embedder
     pub embedder_name: String,
-    pub embedder: Arc<Embedder>,
+    pub runtime: Arc<RuntimeEmbedder>,
 
-    pub add_to_user_provided: RoaringBitmap,
-    pub remove_from_user_provided: RoaringBitmap,
+    pub embedding_status_delta: EmbeddingStatusDelta,
 }
 
 enum VectorStateDelta {
@@ -56,46 +62,74 @@ enum VectorStateDelta {
     // Remove any previous vector
     // Note: changing the value of the prompt **does require** recording this delta
     NowGenerated(String),
+
+    // Add and remove the vectors computed from the fragments.
+    UpdateGeneratedFromFragments(Vec<(String, ExtractorDiff<Value>)>),
+
+    /// Wasn't generated from fragments, but now is.
+ /// Delete any previous vectors and add the new vectors + NowGeneratedFromFragments(Vec<(String, Value)>), } impl VectorStateDelta { - fn into_values(self) -> (bool, String, Vec>) { + fn into_values(self) -> (bool, String, BTreeMap>, Vec>) { match self { VectorStateDelta::NoChange => Default::default(), - VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), - // We always delete the previous vectors - VectorStateDelta::NowManual(add) => (true, Default::default(), add), - VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), + VectorStateDelta::NowRemoved => { + (true, Default::default(), Default::default(), Default::default()) + } + VectorStateDelta::NowManual(add) => (true, Default::default(), Default::default(), add), + VectorStateDelta::NowGenerated(prompt) => { + (true, prompt, Default::default(), Default::default()) + } + VectorStateDelta::UpdateGeneratedFromFragments(fragments) => ( + false, + Default::default(), + ExtractorDiff::into_list_of_changes(fragments), + Default::default(), + ), + VectorStateDelta::NowGeneratedFromFragments(items) => ( + true, + Default::default(), + ExtractorDiff::into_list_of_changes( + items.into_iter().map(|(name, value)| (name, ExtractorDiff::Added(value))), + ), + Default::default(), + ), } } } -struct EmbedderVectorExtractor { +struct EmbedderVectorExtractor<'a> { embedder_name: String, - embedder: Arc, - prompt: Arc, + embedder_info: &'a EmbedderInfo, + runtime: Arc, // (docid) -> (prompt) prompts_writer: Writer>, + // (docid, extractor_id) -> (Option) + inputs_writer: Writer>, // (docid) -> () remove_vectors_writer: Writer>, // (docid, _index) -> KvWriterDelAdd -> Vector manual_vectors_writer: Writer>, - // The docids of the documents that contains a user defined embedding - add_to_user_provided: RoaringBitmap, + embedding_status_delta: EmbeddingStatusDelta, action: ExtractionAction, } -struct DocumentOperation { - // The docids of the documents that contains an auto-generated embedding - remove_from_user_provided: RoaringBitmap, -} - enum ExtractionAction { SettingsFullReindex, - SettingsRegeneratePrompts { old_prompt: Arc }, - DocumentOperation(DocumentOperation), + SettingsRegeneratePrompts { + old_runtime: Arc, + }, + /// List of fragments to update/add + SettingsRegenerateFragments { + // name and indices, respectively in old and new runtime, of the fragments to examine. 
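+        // A `None` old index means the fragment did not exist before and is
+        // treated as an addition; both indices are found by `binary_search`
+        // over the runtimes' fragment lists, which are sorted by name.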
+ must_regenerate_fragments: BTreeMap, usize)>, + old_runtime: Arc, + }, + DocumentOperation, } struct ManualEmbedderErrors { @@ -183,8 +217,8 @@ impl ManualEmbedderErrors { pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, - embedders_configs: &[IndexEmbeddingConfig], settings_diff: &InnerIndexSettingsDiff, + embedder_info: &[(String, EmbedderInfo)], possible_embedding_mistakes: &PossibleEmbeddingMistakes, ) -> Result<(Vec, UnusedVectorsDistribution)> { let mut unused_vectors_distribution = UnusedVectorsDistribution::new(); @@ -204,13 +238,13 @@ pub fn extract_vector_points( let mut configs = settings_diff.new.embedding_configs.clone().into_inner(); let old_configs = &settings_diff.old.embedding_configs; - if reindex_vectors { for (name, action) in settings_diff.embedding_config_updates.iter() { if let Some(action) = action.reindex() { - let Some((embedder_name, (embedder, prompt, _quantized))) = - configs.remove_entry(name) - else { + let (_, embedder_info) = + embedder_info.iter().find(|(embedder_name, _)| embedder_name == name).unwrap(); + + let Some((embedder_name, runtime)) = configs.remove_entry(name) else { tracing::error!(embedder = name, "Requested embedder config not found"); continue; }; @@ -229,6 +263,12 @@ pub fn extract_vector_points( tempfile::tempfile()?, ); + let inputs_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + // (docid) -> () let remove_vectors_writer = create_writer( indexer.chunk_compression_type, @@ -238,24 +278,66 @@ pub fn extract_vector_points( let action = match action { ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex, - ReindexAction::RegeneratePrompts => { - let Some((_, old_prompt, _quantized)) = old_configs.get(name) else { + ReindexAction::RegenerateFragments(regenerate_fragments) => { + let Some(old_runtime) = old_configs.get(name) else { tracing::error!(embedder = name, "Old embedder config not found"); continue; }; - ExtractionAction::SettingsRegeneratePrompts { old_prompt } + let fragments = regenerate_fragments + .iter() + .filter_map(|(name, fragment)| match fragment { + crate::vector::settings::RegenerateFragment::Update => { + let old_value = old_runtime + .fragments + .binary_search_by_key(&name, |fragment| &fragment.name) + .ok(); + let Ok(new_value) = runtime + .fragments + .binary_search_by_key(&name, |fragment| &fragment.name) + else { + return None; + }; + Some((name.clone(), (old_value, new_value))) + } + // was already handled in transform + crate::vector::settings::RegenerateFragment::Remove => None, + crate::vector::settings::RegenerateFragment::Add => { + let Ok(new_value) = runtime + .fragments + .binary_search_by_key(&name, |fragment| &fragment.name) + else { + return None; + }; + Some((name.clone(), (None, new_value))) + } + }) + .collect(); + ExtractionAction::SettingsRegenerateFragments { + old_runtime, + must_regenerate_fragments: fragments, + } + } + + ReindexAction::RegeneratePrompts => { + let Some(old_runtime) = old_configs.get(name) else { + tracing::error!(embedder = name, "Old embedder config not found"); + continue; + }; + + ExtractionAction::SettingsRegeneratePrompts { old_runtime } } }; extractors.push(EmbedderVectorExtractor { embedder_name, - embedder, - prompt, + runtime, + embedder_info, prompts_writer, + inputs_writer, remove_vectors_writer, manual_vectors_writer, - add_to_user_provided: RoaringBitmap::new(), + embedding_status_delta: Default::default(), action, }); } else { @@ 
-264,8 +346,12 @@ pub fn extract_vector_points( } } else { // document operation + for (embedder_name, runtime) in configs.into_iter() { + let (_, embedder_info) = embedder_info + .iter() + .find(|(name, _)| embedder_name.as_str() == name.as_str()) + .unwrap(); - for (embedder_name, (embedder, prompt, _quantized)) in configs.into_iter() { // (docid, _index) -> KvWriterDelAdd -> Vector let manual_vectors_writer = create_writer( indexer.chunk_compression_type, @@ -280,6 +366,12 @@ pub fn extract_vector_points( tempfile::tempfile()?, ); + let inputs_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + // (docid) -> () let remove_vectors_writer = create_writer( indexer.chunk_compression_type, @@ -289,22 +381,23 @@ pub fn extract_vector_points( extractors.push(EmbedderVectorExtractor { embedder_name, - embedder, - prompt, + runtime, + embedder_info, prompts_writer, + inputs_writer, remove_vectors_writer, manual_vectors_writer, - add_to_user_provided: RoaringBitmap::new(), - action: ExtractionAction::DocumentOperation(DocumentOperation { - remove_from_user_provided: RoaringBitmap::new(), - }), + embedding_status_delta: Default::default(), + action: ExtractionAction::DocumentOperation, }); } } let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; + let mut doc_alloc = Bump::new(); while let Some((key, value)) = cursor.move_on_next()? { + doc_alloc.reset(); // this must always be serialized as (docid, external_docid); const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::(); let (docid_bytes, external_id_bytes) = @@ -320,9 +413,12 @@ pub fn extract_vector_points( // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; + let regenerate_for_embedders = embedder_info + .iter() + .filter(|&(_, infos)| infos.embedding_status.must_regenerate(docid)) + .map(|(name, _)| name.clone()); let mut parsed_vectors = ParsedVectorsDiff::new( - docid, - embedders_configs, + regenerate_for_embedders, obkv, old_vectors_fid, new_vectors_fid, @@ -331,44 +427,40 @@ pub fn extract_vector_points( for EmbedderVectorExtractor { embedder_name, - embedder, - prompt, + runtime, + embedder_info, prompts_writer, + inputs_writer, remove_vectors_writer, manual_vectors_writer, - add_to_user_provided, + embedding_status_delta, action, } in extractors.iter_mut() { - let embedder_is_manual = matches!(**embedder, Embedder::UserProvided(_)); + let embedder_is_manual = matches!(*runtime.embedder, Embedder::UserProvided(_)); let (old, new) = parsed_vectors.remove(embedder_name); + let new_must_regenerate = new.must_regenerate(); let delta = match action { ExtractionAction::SettingsFullReindex => match old { // A full reindex can be triggered either by: // 1. a new embedder // 2. an existing embedder changed so that it must regenerate all generated embeddings. 
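                // Per document, `regenerate_for_embedders` (computed above from
                // `embedding_status.must_regenerate(docid)`) decides whether this
                // embedder's old vectors count as generated and may be recomputed,
                // or were user-provided and must be left untouched.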
                    // For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
-                    VectorState::Inline(vectors) => {
-                        if !vectors.must_regenerate() {
-                            add_to_user_provided.insert(docid);
-                        }
-
-                        match vectors.into_array_of_vectors() {
-                            Some(add_vectors) => {
-                                if add_vectors.len() > usize::from(u8::MAX) {
-                                    return Err(crate::Error::UserError(
-                                        crate::UserError::TooManyVectors(
-                                            document_id().to_string(),
-                                            add_vectors.len(),
-                                        ),
-                                    ));
-                                }
-                                VectorStateDelta::NowManual(add_vectors)
+                    VectorState::Inline(vectors) => match vectors.into_array_of_vectors() {
+                        Some(add_vectors) => {
+                            if add_vectors.len() > usize::from(u8::MAX) {
+                                return Err(crate::Error::UserError(
+                                    crate::UserError::TooManyVectors(
+                                        document_id().to_string(),
+                                        add_vectors.len(),
+                                    ),
+                                ));
                             }
-                            None => VectorStateDelta::NoChange,
+                            VectorStateDelta::NowManual(add_vectors)
                         }
-                    }
+                        None => VectorStateDelta::NoChange,
+                    },
                     // this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
                     VectorState::Manual => VectorStateDelta::NoChange,
                     // generated vectors must be regenerated
@@ -381,11 +473,79 @@ pub fn extract_vector_points(
                             );
                             continue;
                         }
-                        regenerate_prompt(obkv, prompt, new_fields_ids_map)?
+                        let has_fragments = !runtime.fragments.is_empty();
+
+                        if has_fragments {
+                            regenerate_all_fragments(
+                                &runtime.fragments,
+                                &doc_alloc,
+                                new_fields_ids_map,
+                                obkv,
+                            )
+                        } else {
+                            regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)?
+                        }
                     }
                 },
+                ExtractionAction::SettingsRegenerateFragments {
+                    must_regenerate_fragments,
+                    old_runtime,
+                } => {
+                    if old.must_regenerate() {
+                        let has_fragments = !runtime.fragments.is_empty();
+                        let old_has_fragments = !old_runtime.fragments.is_empty();
+
+                        let is_adding_fragments = has_fragments && !old_has_fragments;
+
+                        if is_adding_fragments {
+                            regenerate_all_fragments(
+                                &runtime.fragments,
+                                &doc_alloc,
+                                new_fields_ids_map,
+                                obkv,
+                            )
+                        } else if !has_fragments {
+                            // removing fragments
+                            regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)?
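                        // Dichotomy used throughout this function: when `runtime.fragments`
                        // is non-empty, embedder inputs are produced by rendering every
                        // fragment for the document (`regenerate_all_fragments`); otherwise a
                        // single prompt is rendered from `runtime.document_template`. The two
                        // input modes are mutually exclusive for a given embedder.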
+                        } else {
+                            let mut fragment_diff = Vec::new();
+                            let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map();
+
+                            let obkv_document = crate::update::new::document::KvDelAddDocument::new(
+                                obkv,
+                                DelAdd::Addition,
+                                new_fields_ids_map,
+                            );
+                            for (name, (old_index, new_index)) in must_regenerate_fragments {
+                                let Some(new) = runtime.fragments.get(*new_index) else { continue };
+
+                                let new =
+                                    RequestFragmentExtractor::new(new, &doc_alloc).ignore_errors();
+
+                                let diff = {
+                                    let old = old_index.as_ref().and_then(|old| {
+                                        let old = old_runtime.fragments.get(*old)?;
+                                        Some(
+                                            RequestFragmentExtractor::new(old, &doc_alloc)
+                                                .ignore_errors(),
+                                        )
+                                    });
+                                    let old = old.as_ref();
+                                    Extractor::diff_settings(&new, &obkv_document, &(), old)
+                                }
+                                .expect("ignoring errors so this cannot fail");
+                                fragment_diff.push((name.clone(), diff));
+                            }
+                            VectorStateDelta::UpdateGeneratedFromFragments(fragment_diff)
+                        }
+                    } else {
+                        // we can simply ignore user provided vectors as they are not regenerated and are
+                        // already in the DB since this is an existing embedder
+                        VectorStateDelta::NoChange
+                    }
+                }
                 // prompt regeneration is only triggered for existing embedders
-                ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
+                ExtractionAction::SettingsRegeneratePrompts { old_runtime } => {
                     if old.must_regenerate() {
                         if embedder_is_manual {
                             ManualEmbedderErrors::push_error(
@@ -395,24 +555,32 @@
                             );
                             continue;
                         }
-                        regenerate_if_prompt_changed(
-                            obkv,
-                            (old_prompt, prompt),
-                            (old_fields_ids_map, new_fields_ids_map),
-                        )?
+                        let has_fragments = !runtime.fragments.is_empty();
+
+                        if has_fragments {
+                            regenerate_all_fragments(
+                                &runtime.fragments,
+                                &doc_alloc,
+                                new_fields_ids_map,
+                                obkv,
+                            )
+                        } else {
+                            regenerate_if_prompt_changed(
+                                obkv,
+                                (&old_runtime.document_template, &runtime.document_template),
+                                (old_fields_ids_map, new_fields_ids_map),
+                            )?
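                        // `ignore_errors()` downgrades fragment rendering failures to an
                        // absent input instead of propagating them; that is what makes the
                        // `.expect("ignoring errors so this cannot fail")` calls around the
                        // diffing sound: with errors ignored, the extractor cannot fail.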
+ } } else { // we can simply ignore user provided vectors as they are not regenerated and are // already in the DB since this is an existing embedder VectorStateDelta::NoChange } } - ExtractionAction::DocumentOperation(DocumentOperation { - remove_from_user_provided, - }) => extract_vector_document_diff( - docid, + ExtractionAction::DocumentOperation => extract_vector_document_diff( obkv, - prompt, - (add_to_user_provided, remove_from_user_provided), + runtime, + &doc_alloc, (old, new), (old_fields_ids_map, new_fields_ids_map), document_id, @@ -421,13 +589,25 @@ pub fn extract_vector_points( &mut manual_errors, )?, }; + + // update the embedding status + push_embedding_status_delta( + embedding_status_delta, + docid, + &delta, + new_must_regenerate, + &embedder_info.embedding_status, + ); + // and we finally push the unique vectors into the writer push_vectors_diff( remove_vectors_writer, prompts_writer, + inputs_writer, manual_vectors_writer, &mut key_buffer, delta, + &runtime.fragments, )?; } @@ -444,45 +624,65 @@ pub fn extract_vector_points( for EmbedderVectorExtractor { embedder_name, - embedder, - prompt: _, + runtime, + embedder_info: _, prompts_writer, + inputs_writer, remove_vectors_writer, - action, + action: _, manual_vectors_writer, - add_to_user_provided, + embedding_status_delta, } in extractors { - let remove_from_user_provided = - if let ExtractionAction::DocumentOperation(DocumentOperation { - remove_from_user_provided, - }) = action - { - remove_from_user_provided - } else { - Default::default() - }; - results.push(ExtractedVectorPoints { manual_vectors: writer_into_reader(manual_vectors_writer)?, remove_vectors: writer_into_reader(remove_vectors_writer)?, prompts: writer_into_reader(prompts_writer)?, - embedder, + inputs: writer_into_reader(inputs_writer)?, + runtime, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, }) } Ok((results, unused_vectors_distribution)) } +fn push_embedding_status_delta( + embedding_status_delta: &mut EmbeddingStatusDelta, + docid: DocumentId, + delta: &VectorStateDelta, + new_must_regenerate: bool, + embedding_status: &EmbeddingStatus, +) { + let (old_is_user_provided, old_must_regenerate) = + embedding_status.is_user_provided_must_regenerate(docid); + let new_is_user_provided = match delta { + VectorStateDelta::NoChange => old_is_user_provided, + VectorStateDelta::NowRemoved => { + embedding_status_delta.clear_docid(docid, old_is_user_provided, old_must_regenerate); + return; + } + VectorStateDelta::NowManual(_) => true, + VectorStateDelta::NowGenerated(_) + | VectorStateDelta::UpdateGeneratedFromFragments(_) + | VectorStateDelta::NowGeneratedFromFragments(_) => false, + }; + + embedding_status_delta.push_delta( + docid, + old_is_user_provided, + old_must_regenerate, + new_is_user_provided, + new_must_regenerate, + ); +} + #[allow(clippy::too_many_arguments)] // feel free to find efficient way to factor arguments fn extract_vector_document_diff( - docid: DocumentId, obkv: &obkv::KvReader, - prompt: &Prompt, - (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), + runtime: &RuntimeEmbedder, + doc_alloc: &Bump, (old, new): (VectorState, VectorState), (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), document_id: impl Fn() -> Value, @@ -490,16 +690,6 @@ fn extract_vector_document_diff( embedder_is_manual: bool, manual_errors: &mut Option, ) -> Result { - match (old.must_regenerate(), new.must_regenerate()) { - (true, 
true) | (false, false) => {} - (true, false) => { - add_to_user_provided.insert(docid); - } - (false, true) => { - remove_from_user_provided.insert(docid); - } - } - let delta = match (old, new) { // regardless of the previous state, if a document now contains inline _vectors, they must // be extracted manually @@ -530,19 +720,52 @@ fn extract_vector_document_diff( ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id); return Ok(VectorStateDelta::NoChange); } - // Don't give up if the old prompt was failing - let old_prompt = Some(&prompt).map(|p| { - p.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) - .unwrap_or_default() - }); - let new_prompt = - prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt.as_ref() != Some(&new_prompt) { - let old_prompt = old_prompt.unwrap_or_default(); - tracing::trace!( - "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" - ); - VectorStateDelta::NowGenerated(new_prompt) + let has_fragments = !runtime.fragments.is_empty(); + if has_fragments { + let prompt = &runtime.document_template; + // Don't give up if the old prompt was failing + let old_prompt = Some(&prompt).map(|p| { + p.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) + .unwrap_or_default() + }); + let new_prompt = + prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); + tracing::trace!( + "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" + ); + VectorStateDelta::NowGenerated(new_prompt) + } else { + let mut fragment_diff = Vec::new(); + let old_fields_ids_map = old_fields_ids_map.as_fields_ids_map(); + let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); + + let old_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Deletion, + old_fields_ids_map, + ); + + let new_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Addition, + new_fields_ids_map, + ); + + for new in &runtime.fragments { + let name = &new.name; + let fragment = + RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); + + let diff = fragment + .diff_documents(&old_document, &new_document, &()) + .expect("ignoring errors so this cannot fail"); + + fragment_diff.push((name.clone(), diff)); + } + VectorStateDelta::UpdateGeneratedFromFragments(fragment_diff) + } } else { tracing::trace!("⏭️ Prompt unmodified, skipping"); VectorStateDelta::NoChange @@ -567,15 +790,25 @@ fn extract_vector_document_diff( ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id); return Ok(VectorStateDelta::NoChange); } - // becomes autogenerated - VectorStateDelta::NowGenerated(prompt.render_kvdeladd( - obkv, - DelAdd::Addition, - new_fields_ids_map, - )?) + + let has_fragments = !runtime.fragments.is_empty(); + + if has_fragments { + regenerate_all_fragments( + &runtime.fragments, + doc_alloc, + new_fields_ids_map, + obkv, + ) + } else { + // becomes autogenerated + VectorStateDelta::NowGenerated(runtime.document_template.render_kvdeladd( + obkv, + DelAdd::Addition, + new_fields_ids_map, + )?) 
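                    // Note how the fragment diffing above reads the same obkv twice through
                    // `KvDelAddDocument`: the `DelAdd::Deletion` view is the document before
                    // the update, the `DelAdd::Addition` view the document after it, and a
                    // fragment whose rendered input is identical in both views is skipped
                    // instead of being re-embedded.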
+ } } else { - // make sure the document is always removed from user provided on removal - remove_from_user_provided.insert(docid); VectorStateDelta::NowRemoved } } @@ -593,8 +826,6 @@ fn extract_vector_document_diff( // then they are user-provided and nothing possibly changed VectorStateDelta::NoChange } else { - // make sure the document is always removed from user provided on removal - remove_from_user_provided.insert(docid); VectorStateDelta::NowRemoved } } @@ -629,16 +860,45 @@ fn regenerate_prompt( Ok(VectorStateDelta::NowGenerated(prompt)) } +fn regenerate_all_fragments<'a>( + fragments: impl IntoIterator, + doc_alloc: &Bump, + new_fields_ids_map: &FieldIdMapWithMetadata, + obkv: &KvReaderU16, +) -> VectorStateDelta { + let mut fragment_diff = Vec::new(); + let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); + + let obkv_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Addition, + new_fields_ids_map, + ); + for new in fragments { + let name = &new.name; + let new = RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); + + let diff = + { new.extract(&obkv_document, &()) }.expect("ignoring errors so this cannot fail"); + if let Some(value) = diff { + fragment_diff.push((name.clone(), value)); + } + } + VectorStateDelta::NowGeneratedFromFragments(fragment_diff) +} + /// We cannot compute the diff between both Del and Add vectors. /// We'll push every vector and compute the difference later in TypedChunk. fn push_vectors_diff( remove_vectors_writer: &mut Writer>, prompts_writer: &mut Writer>, + inputs_writer: &mut Writer>, manual_vectors_writer: &mut Writer>, key_buffer: &mut Vec, delta: VectorStateDelta, + fragments: &[RuntimeFragment], ) -> Result<()> { - let (must_remove, prompt, mut add_vectors) = delta.into_values(); + let (must_remove, prompt, mut fragment_delta, mut add_vectors) = delta.into_values(); if must_remove { key_buffer.truncate(TRUNCATE_SIZE); remove_vectors_writer.insert(&key_buffer, [])?; @@ -648,23 +908,49 @@ fn push_vectors_diff( prompts_writer.insert(&key_buffer, prompt.as_bytes())?; } - // We sort and dedup the vectors - add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); - add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); + if !fragment_delta.is_empty() { + let mut scratch = Vec::new(); + let mut fragment_delta: Vec<_> = fragments + .iter() + .filter_map(|fragment| { + let delta = fragment_delta.remove(&fragment.name)?; + Some((fragment.id, delta)) + }) + .collect(); - // insert vectors into the writer - for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { - // Generate the key by extending the unique index to it. - key_buffer.truncate(TRUNCATE_SIZE); - let index = u16::try_from(i).unwrap(); - key_buffer.extend_from_slice(&index.to_be_bytes()); + fragment_delta.sort_unstable_by_key(|(id, _)| *id); + for (id, value) in fragment_delta { + key_buffer.truncate(TRUNCATE_SIZE); + key_buffer.push(id); + if let Some(value) = value { + scratch.clear(); + serde_json::to_writer(&mut scratch, &value).unwrap(); + inputs_writer.insert(&key_buffer, &scratch)?; + } else { + inputs_writer.insert(&key_buffer, [])?; + } + } + } - // We insert only the Add part of the Obkv to inform - // that we only want to remove all those vectors. 
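    // Key layout used for the fragment inputs written above: `key_buffer` is
    // truncated back to the docid prefix and one byte is appended per entry, so
    // `inputs_writer` is keyed as [docid: u32 BE][fragment id: u8] and maps to the
    // serialized JSON input, with an empty value meaning "this fragment's
    // embedding must be removed".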
- let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Addition, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; + if !add_vectors.is_empty() { + // We sort and dedup the vectors + add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); + add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); + + // insert vectors into the writer + for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { + // Generate the key by extending the unique index to it. + key_buffer.truncate(TRUNCATE_SIZE); + let index = u16::try_from(i).unwrap(); + key_buffer.extend_from_slice(&index.to_be_bytes()); + + // We insert only the Add part of the Obkv to inform + // that we only want to remove all those vectors. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + manual_vectors_writer.insert(&key_buffer, bytes)?; + } } Ok(()) @@ -677,17 +963,18 @@ fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering { #[allow(clippy::too_many_arguments)] #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] -pub fn extract_embeddings( +pub fn extract_embeddings_from_prompts( // docid, prompt prompt_reader: grenad::Reader, indexer: GrenadParameters, - embedder: Arc, + runtime: Arc, embedder_name: &str, possible_embedding_mistakes: &PossibleEmbeddingMistakes, embedder_stats: &EmbedderStats, unused_vectors_distribution: &UnusedVectorsDistribution, request_threads: &ThreadPoolNoAbort, ) -> Result>> { + let embedder = &runtime.embedder; let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk @@ -723,7 +1010,7 @@ pub fn extract_embeddings( if chunks.len() == chunks.capacity() { let chunked_embeds = embed_chunks( - &embedder, + embedder, std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)), embedder_name, possible_embedding_mistakes, @@ -746,7 +1033,7 @@ pub fn extract_embeddings( // send last chunk if !chunks.is_empty() { let chunked_embeds = embed_chunks( - &embedder, + embedder, std::mem::take(&mut chunks), embedder_name, possible_embedding_mistakes, @@ -765,7 +1052,7 @@ pub fn extract_embeddings( if !current_chunk.is_empty() { let embeds = embed_chunks( - &embedder, + embedder, vec![std::mem::take(&mut current_chunk)], embedder_name, possible_embedding_mistakes, @@ -838,3 +1125,183 @@ fn embed_chunks( } } } + +#[allow(clippy::too_many_arguments)] +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] +pub fn extract_embeddings_from_fragments( + // (docid, extractor_id) -> (Option) + inputs_reader: grenad::Reader, + indexer: GrenadParameters, + runtime: Arc, + embedder_name: &str, + possible_embedding_mistakes: &PossibleEmbeddingMistakes, + embedder_stats: &EmbedderStats, + unused_vectors_distribution: &UnusedVectorsDistribution, + request_threads: &ThreadPoolNoAbort, +) -> Result>> { + let doc_alloc = Bump::new(); + + // (docid, extractor_id) -> (Option) + let vector_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + if inputs_reader.is_empty() { + return writer_into_reader(vector_writer); + } + + let on_embed = WriteGrenadOnEmbed { + waiting_responses: Default::default(), + vector_writer, + scratch: Default::default(), + possible_embedding_mistakes, + }; + + let mut session = EmbedSession::new( + 
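    // `EmbedSession` batches inputs and hands every embedding response back to the
    // `on_embed` value passed below (the `WriteGrenadOnEmbed` built above): queued
    // deletions are interleaved with the incoming embeddings so that the grenad
    // writer sees (docid, extractor_id) keys in increasing order.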
&runtime.embedder, + embedder_name, + request_threads, + &doc_alloc, + embedder_stats, + on_embed, + ); + + let mut cursor = inputs_reader.into_cursor()?; + + while let Some((mut key, value)) = cursor.move_on_next()? { + let docid = key.read_u32::().unwrap(); + let extractor_id = key.read_u8().unwrap(); + + if value.is_empty() { + // no value => removed fragment + session.on_embed_mut().push_response(docid, extractor_id); + } else { + // unwrap: the grenad value was saved as a serde_json::Value + let value: Value = serde_json::from_slice(value).unwrap(); + session.request_embedding( + Metadata { docid, external_docid: "", extractor_id }, + value, + unused_vectors_distribution, + )?; + } + } + + // send last chunk + let on_embed = session.drain(unused_vectors_distribution)?; + on_embed.finish() +} + +struct WriteGrenadOnEmbed<'a> { + // list of (document_id, extractor_id) for which vectors should be removed. + // these are written whenever a response arrives that has a larger (docid, extractor_id). + waiting_responses: VecDeque<(DocumentId, u8)>, + + // grenad of (docid, extractor_id) -> (Option) + vector_writer: Writer>, + + possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, + + // scratch buffer used to write keys + scratch: Vec, +} + +impl WriteGrenadOnEmbed<'_> { + pub fn push_response(&mut self, docid: DocumentId, extractor_id: u8) { + self.waiting_responses.push_back((docid, extractor_id)); + } + + pub fn finish(mut self) -> Result>> { + for (docid, extractor_id) in self.waiting_responses { + self.scratch.clear(); + self.scratch.write_u32::(docid).unwrap(); + self.scratch.write_u8(extractor_id).unwrap(); + self.vector_writer.insert(&self.scratch, []).unwrap(); + } + writer_into_reader(self.vector_writer) + } +} + +impl<'doc> OnEmbed<'doc> for WriteGrenadOnEmbed<'_> { + type ErrorMetadata = UnusedVectorsDistribution; + fn process_embedding_response( + &mut self, + response: crate::vector::session::EmbeddingResponse<'doc>, + ) { + let (docid, extractor_id) = (response.metadata.docid, response.metadata.extractor_id); + while let Some(waiting_response) = self.waiting_responses.pop_front() { + if (docid, extractor_id) > waiting_response { + self.scratch.clear(); + self.scratch.write_u32::(docid).unwrap(); + self.scratch.write_u8(extractor_id).unwrap(); + self.vector_writer.insert(&self.scratch, []).unwrap(); + } else { + self.waiting_responses.push_front(waiting_response); + break; + } + } + + if let Some(embedding) = response.embedding { + self.scratch.clear(); + self.scratch.write_u32::(docid).unwrap(); + self.scratch.write_u8(extractor_id).unwrap(); + self.vector_writer.insert(&self.scratch, cast_slice(embedding.as_slice())).unwrap(); + } + } + + fn process_embedding_error( + &mut self, + error: crate::vector::error::EmbedError, + embedder_name: &'doc str, + unused_vectors_distribution: &crate::vector::error::UnusedVectorsDistribution, + _metadata: &[crate::vector::session::Metadata<'doc>], + ) -> crate::Error { + if let FaultSource::Bug = error.fault { + crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(error.into())) + } else { + let mut msg = + format!(r"While embedding documents for embedder `{embedder_name}`: {error}"); + + if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); + } + + let mut hint_count = 0; + + for (vector_misspelling, count) in + 
self.possible_embedding_mistakes.vector_mistakes().take(2) + { + msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); + hint_count += 1; + } + + for (embedder_misspelling, count) in self + .possible_embedding_mistakes + .embedder_mistakes(embedder_name, unused_vectors_distribution) + .take(2) + { + msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); + hint_count += 1; + } + + if hint_count == 0 { + if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + msg += &format!( + "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" + ); + } + } + + crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)) + } + } + + fn process_embeddings( + &mut self, + _metadata: crate::vector::session::Metadata<'doc>, + _embeddings: Vec, + ) { + unimplemented!("unused") + } +} diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs index d640bc075..cbf4ceba2 100644 --- a/crates/milli/src/update/index_documents/extract/mod.rs +++ b/crates/milli/src/update/index_documents/extract/mod.rs @@ -23,16 +23,17 @@ use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, Extra use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_vector_points::{ - extract_embeddings, extract_vector_points, ExtractedVectorPoints, + extract_embeddings_from_prompts, extract_vector_points, ExtractedVectorPoints, }; use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; -use crate::index::IndexEmbeddingConfig; use crate::progress::EmbedderStats; +use crate::update::index_documents::extract::extract_vector_points::extract_embeddings_from_fragments; use crate::update::settings::InnerIndexSettingsDiff; +use crate::vector::db::EmbedderInfo; use crate::vector::error::PossibleEmbeddingMistakes; use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; @@ -46,9 +47,9 @@ pub(crate) fn data_from_obkv_documents( indexer: GrenadParameters, lmdb_writer_sx: Sender>, primary_key_id: FieldId, - embedders_configs: Arc>, settings_diff: Arc, max_positions_per_attributes: Option, + embedder_info: Arc>, possible_embedding_mistakes: Arc, embedder_stats: &Arc, ) -> Result<()> { @@ -61,8 +62,8 @@ pub(crate) fn data_from_obkv_documents( original_documents_chunk, indexer, lmdb_writer_sx.clone(), - embedders_configs.clone(), settings_diff.clone(), + embedder_info.clone(), possible_embedding_mistakes.clone(), embedder_stats.clone(), ) @@ -231,8 +232,8 @@ fn send_original_documents_data( original_documents_chunk: Result>>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, - embedders_configs: Arc>, settings_diff: Arc, + embedder_info: Arc>, possible_embedding_mistakes: Arc, embedder_stats: Arc, ) -> Result<()> { @@ -245,7 +246,6 @@ fn send_original_documents_data( if index_vectors { let settings_diff = settings_diff.clone(); - let embedders_configs = embedders_configs.clone(); let original_documents_chunk = original_documents_chunk.clone(); let lmdb_writer_sx = lmdb_writer_sx.clone(); @@ -253,8 +253,8 @@ fn send_original_documents_data( match 
extract_vector_points( original_documents_chunk.clone(), indexer, - &embedders_configs, &settings_diff, + embedder_info.as_slice(), &possible_embedding_mistakes, ) { Ok((extracted_vectors, unused_vectors_distribution)) => { @@ -262,16 +262,16 @@ fn send_original_documents_data( manual_vectors, remove_vectors, prompts, + inputs, embedder_name, - embedder, - add_to_user_provided, - remove_from_user_provided, + runtime, + embedding_status_delta, } in extracted_vectors { - let embeddings = match extract_embeddings( + let embeddings_from_prompts = match extract_embeddings_from_prompts( prompts, indexer, - embedder.clone(), + runtime.clone(), &embedder_name, &possible_embedding_mistakes, &embedder_stats, @@ -284,18 +284,37 @@ fn send_original_documents_data( None } }; + + let embeddings_from_fragments = match extract_embeddings_from_fragments( + inputs, + indexer, + runtime.clone(), + &embedder_name, + &possible_embedding_mistakes, + &embedder_stats, + &unused_vectors_distribution, + request_threads(), + ) { + Ok(results) => Some(results), + Err(error) => { + let _ = lmdb_writer_sx.send(Err(error)); + None + } + }; + if !(remove_vectors.is_empty() && manual_vectors.is_empty() - && embeddings.as_ref().is_none_or(|e| e.is_empty())) + && embeddings_from_prompts.as_ref().is_none_or(|e| e.is_empty()) + && embeddings_from_fragments.as_ref().is_none_or(|e| e.is_empty())) { let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints { remove_vectors, - embeddings, - expected_dimension: embedder.dimensions(), + embeddings_from_prompts, + embeddings_from_fragments, + expected_dimension: runtime.embedder.dimensions(), manual_vectors, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, })); } } diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 5ec6910f7..055b8bbad 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -38,7 +38,8 @@ pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; -use crate::vector::{ArroyWrapper, EmbeddingConfigs}; +use crate::vector::db::EmbedderInfo; +use crate::vector::{ArroyWrapper, RuntimeEmbedders}; use crate::{CboRoaringBitmapCodec, Index, Result, UserError}; static MERGED_DATABASE_COUNT: usize = 7; @@ -81,7 +82,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> { should_abort: FA, added_documents: u64, deleted_documents: u64, - embedders: EmbeddingConfigs, + embedders: RuntimeEmbedders, embedder_stats: &'t Arc, } @@ -172,7 +173,7 @@ where Ok((self, Ok(indexed_documents))) } - pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self { + pub fn with_embedders(mut self, embedders: RuntimeEmbedders) -> Self { self.embedders = embedders; self } @@ -226,7 +227,13 @@ where settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let settings_diff = Arc::new(settings_diff); - let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?); + let embedder_infos: heed::Result> = self + .index + .embedding_configs() + .iter_embedder_info(self.wtxn)? 
+ .map(|res| res.map(|(name, info)| (name.to_owned(), info))) + .collect(); + let embedder_infos = Arc::new(embedder_infos?); let possible_embedding_mistakes = crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution); @@ -328,9 +335,9 @@ where pool_params, lmdb_writer_sx.clone(), primary_key_id, - embedders_configs.clone(), settings_diff_cloned, max_positions_per_attributes, + embedder_infos, Arc::new(possible_embedding_mistakes), &embedder_stats ) @@ -430,21 +437,21 @@ where TypedChunk::VectorPoints { expected_dimension, remove_vectors, - embeddings, + embeddings_from_prompts, + embeddings_from_fragments, manual_vectors, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { remove_vectors, - embeddings, + embeddings_from_prompts, + embeddings_from_fragments, expected_dimension, manual_vectors, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, } } otherwise => otherwise, @@ -480,7 +487,7 @@ where // we should insert it in `dimension` for (name, action) in settings_diff.embedding_config_updates.iter() { if action.is_being_quantized && !dimension.contains_key(name.as_str()) { - let index = self.index.embedder_category_id.get(self.wtxn, name)?.ok_or( + let index = self.index.embedding_configs().embedder_id(self.wtxn, name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None, @@ -488,7 +495,9 @@ where )?; let reader = ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized); - let dim = reader.dimensions(self.wtxn)?; + let Some(dim) = reader.dimensions(self.wtxn)? else { + continue; + }; dimension.insert(name.to_string(), dim); } } @@ -498,12 +507,19 @@ where let vector_arroy = self.index.vector_arroy; let cancel = &self.should_abort; - let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, - )?; + let embedder_index = + self.index.embedding_configs().embedder_id(wtxn, &embedder_name)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + }, + )?; let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name); - let was_quantized = - settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2); + let was_quantized = settings_diff + .old + .embedding_configs + .get(&embedder_name) + .is_some_and(|conf| conf.is_quantized); let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized); pool.install(|| { @@ -773,11 +789,11 @@ mod tests { use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::documents::mmap_from_objects; use crate::index::tests::TempIndex; - use crate::index::IndexEmbeddingConfig; use crate::progress::Progress; use crate::search::TermsMatchingStrategy; use crate::update::new::indexer; use crate::update::Setting; + use crate::vector::db::IndexEmbeddingConfig; use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError}; #[test] @@ -2028,7 +2044,7 @@ mod tests { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -2116,7 +2132,7 @@ mod tests { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, 
&Progress::default(), &Default::default(), @@ -2277,7 +2293,7 @@ mod tests { ]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.replace_documents(&documents).unwrap(); indexer.delete_documents(&["2"]); @@ -2343,7 +2359,7 @@ mod tests { indexer.delete_documents(&["1", "2"]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( &indexer_alloc, @@ -2394,7 +2410,7 @@ mod tests { { "id": 3, "name": "jean", "age": 25 }, ]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.update_documents(&documents).unwrap(); @@ -2446,7 +2462,7 @@ mod tests { { "id": 3, "legs": 4 }, ]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.update_documents(&documents).unwrap(); indexer.delete_documents(&["1", "2"]); @@ -2496,7 +2512,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1", "2"]); @@ -2552,7 +2568,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1", "2", "1", "2"]); @@ -2611,7 +2627,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let documents = documents!([ @@ -2661,7 +2677,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1"]); @@ -2775,6 +2791,8 @@ mod tests { document_template: Setting::NotSet, document_template_max_bytes: Setting::NotSet, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, distribution: Setting::NotSet, @@ -2801,17 +2819,27 @@ mod tests { .unwrap(); let rtxn = index.read_txn().unwrap(); - let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } = + let embedders = index.embedding_configs(); + let mut embedding_configs = embedders.embedding_configs(&rtxn).unwrap(); + let IndexEmbeddingConfig { name: embedder_name, config: embedder, fragments } = embedding_configs.pop().unwrap(); + let info = embedders.embedder_info(&rtxn, &embedder_name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0, 1, 2]>"); + 
insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0, 1, 2]>"); insta::assert_snapshot!(embedder_name, @"manual"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); + let embedder = std::sync::Arc::new( crate::vector::Embedder::new(embedder.embedder_options, 0).unwrap(), ); let res = index .search(&rtxn) - .semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec())) + .semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec()), None) .execute() .unwrap(); assert_eq!(res.documents_ids.len(), 3); @@ -2860,7 +2888,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); // OP @@ -2921,7 +2949,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1"]); @@ -2980,7 +3008,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let documents = documents!([ diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index e17625ad4..e07483aff 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -31,7 +31,7 @@ use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableIds, UpdateIndexingStep}; use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; -use crate::vector::settings::WriteBackToDocuments; +use crate::vector::settings::{RemoveFragments, WriteBackToDocuments}; use crate::vector::ArroyWrapper; use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result}; @@ -933,10 +933,47 @@ impl<'a, 'i> Transform<'a, 'i> { // delete all vectors from the embedders that need removal for (_, (reader, _)) in readers { - let dimensions = reader.dimensions(wtxn)?; + let Some(dimensions) = reader.dimensions(wtxn)? else { + continue; + }; reader.clear(wtxn, dimensions)?; } + // remove all vectors for the specified fragments + for (embedder_name, RemoveFragments { fragment_ids }, was_quantized) in + settings_diff.embedding_config_updates.iter().filter_map(|(name, action)| { + action.remove_fragments().map(|fragments| (name, fragments, action.was_quantized)) + }) + { + let Some(infos) = self.index.embedding_configs().embedder_info(wtxn, embedder_name)? + else { + continue; + }; + let arroy = + ArroyWrapper::new(self.index.vector_arroy, infos.embedder_id, was_quantized); + let Some(dimensions) = arroy.dimensions(wtxn)? 
else { + continue; + }; + for fragment_id in fragment_ids { + // we must keep the user provided embeddings that ended up in this store + + if infos.embedding_status.user_provided_docids().is_empty() { + // no user provided: clear store + arroy.clear_store(wtxn, *fragment_id, dimensions)?; + continue; + } + + // some user provided, remove only the ids that are not user provided + let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| { + items - infos.embedding_status.user_provided_docids() + })?; + + for to_delete in to_delete { + arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?; + } + } + } + let grenad_params = GrenadParameters { chunk_compression_type: self.indexer_settings.chunk_compression_type, chunk_compression_level: self.indexer_settings.chunk_compression_level, diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index 6d575a98b..370579a6c 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -4,6 +4,7 @@ use std::fs::File; use std::io::{self, BufReader}; use bytemuck::allocation::pod_collect_to_vec; +use byteorder::{BigEndian, ReadBytesExt as _}; use grenad::{MergeFunction, Merger, MergerBuilder}; use heed::types::Bytes; use heed::{BytesDecode, RwTxn}; @@ -18,7 +19,6 @@ use super::helpers::{ use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; -use crate::index::IndexEmbeddingConfig; use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; @@ -26,6 +26,7 @@ use crate::update::index_documents::helpers::{ as_cloneable_grenad, try_split_array_at, KeepLatestObkv, }; use crate::update::settings::InnerIndexSettingsDiff; +use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig}; use crate::vector::ArroyWrapper; use crate::{ lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError, @@ -86,12 +87,14 @@ pub(crate) enum TypedChunk { GeoPoints(grenad::Reader>), VectorPoints { remove_vectors: grenad::Reader>, - embeddings: Option>>, + // docid -> vector + embeddings_from_prompts: Option>>, + // docid, extractor_id -> Option, + embeddings_from_fragments: Option>>, expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, - add_to_user_provided: RoaringBitmap, - remove_from_user_provided: RoaringBitmap, + embedding_status_delta: EmbeddingStatusDelta, }, } @@ -155,6 +158,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut iter = merger.into_stream_merger_iter()?; let embedders: BTreeSet<_> = index + .embedding_configs() .embedding_configs(wtxn)? .into_iter() .map(|IndexEmbeddingConfig { name, .. 
}| name) @@ -614,57 +618,66 @@ pub(crate) fn write_typed_chunk_into_index( let span = tracing::trace_span!(target: "indexing::write_db", "vector_points"); let _entered = span.enter(); + let embedders = index.embedding_configs(); + let mut remove_vectors_builder = MergerBuilder::new(KeepFirst); let mut manual_vectors_builder = MergerBuilder::new(KeepFirst); - let mut embeddings_builder = MergerBuilder::new(KeepFirst); - let mut add_to_user_provided = RoaringBitmap::new(); - let mut remove_from_user_provided = RoaringBitmap::new(); + let mut embeddings_from_prompts_builder = MergerBuilder::new(KeepFirst); + let mut embeddings_from_fragments_builder = MergerBuilder::new(KeepFirst); let mut params = None; + let mut infos = None; for typed_chunk in typed_chunks { let TypedChunk::VectorPoints { remove_vectors, manual_vectors, - embeddings, + embeddings_from_prompts, + embeddings_from_fragments, expected_dimension, embedder_name, - add_to_user_provided: aud, - remove_from_user_provided: rud, + embedding_status_delta, } = typed_chunk else { unreachable!(); }; + if infos.is_none() { + infos = Some(embedders.embedder_info(wtxn, &embedder_name)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + }, + )?); + } + params = Some((expected_dimension, embedder_name)); remove_vectors_builder.push(remove_vectors.into_cursor()?); manual_vectors_builder.push(manual_vectors.into_cursor()?); - if let Some(embeddings) = embeddings { - embeddings_builder.push(embeddings.into_cursor()?); + if let Some(embeddings) = embeddings_from_prompts { + embeddings_from_prompts_builder.push(embeddings.into_cursor()?); + } + if let Some(embeddings) = embeddings_from_fragments { + embeddings_from_fragments_builder.push(embeddings.into_cursor()?); + } + + if let Some(infos) = &mut infos { + embedding_status_delta.apply_to(&mut infos.embedding_status); } - add_to_user_provided |= aud; - remove_from_user_provided |= rud; } // typed chunks has always at least 1 chunk. let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; + let Some(infos) = infos else { unreachable!() }; - let mut embedding_configs = index.embedding_configs(wtxn)?; - let index_embedder_config = embedding_configs - .iter_mut() - .find(|IndexEmbeddingConfig { name, .. 
}| name == &embedder_name) - .unwrap(); - index_embedder_config.user_provided -= remove_from_user_provided; - index_embedder_config.user_provided |= add_to_user_provided; + embedders.put_embedder_info(wtxn, &embedder_name, &infos)?; - index.put_embedding_configs(wtxn, embedding_configs)?; - - let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, - )?; - let binary_quantized = - settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2); + let binary_quantized = settings_diff + .old + .embedding_configs + .get(&embedder_name) + .is_some_and(|conf| conf.is_quantized); // FIXME: allow customizing distance - let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized); + let writer = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, binary_quantized); // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); @@ -674,8 +687,8 @@ pub(crate) fn write_typed_chunk_into_index( writer.del_items(wtxn, expected_dimension, docid)?; } - // add generated embeddings - let merger = embeddings_builder.build(); + // add generated embeddings -- from prompts + let merger = embeddings_from_prompts_builder.build(); let mut iter = merger.into_stream_merger_iter()?; while let Some((key, value)) = iter.next()? { let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); @@ -702,6 +715,24 @@ pub(crate) fn write_typed_chunk_into_index( writer.add_items(wtxn, docid, &embeddings)?; } + // add generated embeddings -- from fragments + let merger = embeddings_from_fragments_builder.build(); + let mut iter = merger.into_stream_merger_iter()?; + while let Some((mut key, value)) = iter.next()? 
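        // Dispatch implemented just below for each merged (docid, extractor_id)
        // entry: an empty value deletes that fragment's embedding from the store,
        // otherwise the value is reinterpreted as a raw f32 slice
        // (`pod_collect_to_vec`) and upserted; a length differing from
        // `expected_dimension` is a code bug, hence a panic rather than a user error.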
{ + let docid = key.read_u32::().unwrap(); + let extractor_id = key.read_u8().unwrap(); + if value.is_empty() { + writer.del_item_in_store(wtxn, docid, extractor_id, expected_dimension)?; + } else { + let data = pod_collect_to_vec(value); + // it is a code error to have embeddings and not expected_dimension + if data.len() != expected_dimension { + panic!("wrong dimensions") + } + writer.add_item_in_store(wtxn, docid, extractor_id, &data)?; + } + } + // perform the manual diff let merger = manual_vectors_builder.build(); let mut iter = merger.into_stream_merger_iter()?; diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index 5fcb2912b..36e80677a 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -6,9 +6,8 @@ use serde_json::value::RawValue; use serde_json::{from_slice, Value}; use super::Embedding; -use crate::index::IndexEmbeddingConfig; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; -use crate::{DocumentId, FieldId, InternalError, UserError}; +use crate::{FieldId, InternalError, UserError}; #[derive(serde::Serialize, Debug)] #[serde(untagged)] @@ -374,8 +373,7 @@ pub struct ParsedVectorsDiff { impl ParsedVectorsDiff { pub fn new( - docid: DocumentId, - embedders_configs: &[IndexEmbeddingConfig], + regenerate_for_embedders: impl Iterator, documents_diff: &KvReader, old_vectors_fid: Option, new_vectors_fid: Option, @@ -396,10 +394,8 @@ impl ParsedVectorsDiff { } } .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); - for embedding_config in embedders_configs { - if embedding_config.user_provided.contains(docid) { - old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual); - } + for name in regenerate_for_embedders { + old.entry(name).or_insert(VectorState::Generated); } let new = 'new: { From 46bceb91f19cea95fc902ca8ff9482d53ea41359 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:08:48 +0200 Subject: [PATCH 059/101] New search errors --- crates/meilisearch-types/src/error.rs | 3 +++ crates/meilisearch/src/error.rs | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 30f6868f6..c57e2d042 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -301,6 +301,7 @@ InvalidFacetSearchQuery , InvalidRequest , BAD_REQU InvalidFacetSearchName , InvalidRequest , BAD_REQUEST ; FacetSearchDisabled , InvalidRequest , BAD_REQUEST ; InvalidSearchVector , InvalidRequest , BAD_REQUEST ; +InvalidSearchMedia , InvalidRequest , BAD_REQUEST ; InvalidSearchShowMatchesPosition , InvalidRequest , BAD_REQUEST ; InvalidSearchShowRankingScore , InvalidRequest , BAD_REQUEST ; InvalidSimilarShowRankingScore , InvalidRequest , BAD_REQUEST ; @@ -308,6 +309,7 @@ InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQU InvalidSimilarShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ; InvalidSearchSort , InvalidRequest , BAD_REQUEST ; InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ; +InvalidSearchMediaAndVector , InvalidRequest , BAD_REQUEST ; InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ; InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ; @@ -464,6 +466,7 @@ impl ErrorCode for milli::Error { | UserError::MissingSourceForNested { .. 
} | UserError::InvalidSettingsEmbedder { .. } => Code::InvalidSettingsEmbedders, UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders, + UserError::TooManyFragments(_) => Code::InvalidSettingsEmbedders, UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders, UserError::NoPrimaryKeyCandidateFound => Code::IndexPrimaryKeyNoCandidateFound, UserError::MultiplePrimaryKeyCandidatesFound { .. } => { diff --git a/crates/meilisearch/src/error.rs b/crates/meilisearch/src/error.rs index b13eb8d7c..91c6c23fa 100644 --- a/crates/meilisearch/src/error.rs +++ b/crates/meilisearch/src/error.rs @@ -76,8 +76,10 @@ pub enum MeilisearchHttpError { DocumentFormat(#[from] DocumentFormatError), #[error(transparent)] Join(#[from] JoinError), - #[error("Invalid request: missing `hybrid` parameter when `vector` is present.")] + #[error("Invalid request: missing `hybrid` parameter when `vector` or `media` are present.")] MissingSearchHybrid, + #[error("Invalid request: both `media` and `vector` parameters are present.")] + MediaAndVector, } impl MeilisearchHttpError { @@ -111,6 +113,7 @@ impl ErrorCode for MeilisearchHttpError { MeilisearchHttpError::DocumentFormat(e) => e.error_code(), MeilisearchHttpError::Join(_) => Code::Internal, MeilisearchHttpError::MissingSearchHybrid => Code::MissingSearchHybrid, + MeilisearchHttpError::MediaAndVector => Code::InvalidSearchMediaAndVector, MeilisearchHttpError::FederationOptionsInNonFederatedRequest(_) => { Code::InvalidMultiSearchFederationOptions } From d14184f4da8114d532b5f8a7b13c955e204c5ebf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:10:46 +0200 Subject: [PATCH 060/101] Add `media` to search --- .../src/routes/indexes/facet_search.rs | 6 +++ .../meilisearch/src/routes/indexes/search.rs | 2 + crates/meilisearch/src/search/mod.rs | 36 ++++++++++++-- crates/milli/src/search/hybrid.rs | 27 +++++++---- crates/milli/src/search/mod.rs | 48 +++++++++++-------- 5 files changed, 86 insertions(+), 33 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/facet_search.rs b/crates/meilisearch/src/routes/indexes/facet_search.rs index 41f306746..18ad54ccf 100644 --- a/crates/meilisearch/src/routes/indexes/facet_search.rs +++ b/crates/meilisearch/src/routes/indexes/facet_search.rs @@ -56,6 +56,8 @@ pub struct FacetSearchQuery { pub q: Option, #[deserr(default, error = DeserrJsonError)] pub vector: Option>, + #[deserr(default, error = DeserrJsonError)] + pub media: Option, #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default, error = DeserrJsonError)] @@ -94,6 +96,7 @@ impl FacetSearchAggregator { facet_name, vector, q, + media, filter, matching_strategy, attributes_to_search_on, @@ -108,6 +111,7 @@ impl FacetSearchAggregator { facet_names: Some(facet_name.clone()).into_iter().collect(), additional_search_parameters_provided: q.is_some() || vector.is_some() + || media.is_some() || filter.is_some() || *matching_strategy != MatchingStrategy::default() || attributes_to_search_on.is_some() @@ -291,6 +295,7 @@ impl From for SearchQuery { facet_name: _, q, vector, + media, filter, matching_strategy, attributes_to_search_on, @@ -312,6 +317,7 @@ impl From for SearchQuery { SearchQuery { q, + media, offset: DEFAULT_SEARCH_OFFSET(), limit: DEFAULT_SEARCH_LIMIT(), page, diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 333ae1944..035ba71d8 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ 
b/crates/meilisearch/src/routes/indexes/search.rs @@ -205,6 +205,8 @@ impl TryFrom for SearchQuery { Ok(Self { q: other.q, + // `media` not supported for `GET` + media: None, vector: other.vector.map(CS::into_inner), offset: other.offset.0, limit: other.limit.0, diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 61ef3f813..6d8639504 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -64,6 +64,8 @@ pub struct SearchQuery { pub q: Option, #[deserr(default, error = DeserrJsonError)] pub vector: Option>, + #[deserr(default, error = DeserrJsonError)] + pub media: Option, #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError)] @@ -147,6 +149,7 @@ impl From for SearchQuery { ranking_score_threshold: ranking_score_threshold.map(RankingScoreThreshold::from), q: None, vector: None, + media: None, offset: DEFAULT_SEARCH_OFFSET(), page: None, hits_per_page: None, @@ -220,6 +223,7 @@ impl fmt::Debug for SearchQuery { let Self { q, vector, + media, hybrid, offset, limit, @@ -274,6 +278,9 @@ impl fmt::Debug for SearchQuery { ); } } + if let Some(media) = media { + debug.field("media", media); + } if let Some(hybrid) = hybrid { debug.field("hybrid", &hybrid); } @@ -482,8 +489,10 @@ pub struct SearchQueryWithIndex { pub index_uid: IndexUid, #[deserr(default, error = DeserrJsonError)] pub q: Option, - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub vector: Option>, + #[deserr(default, error = DeserrJsonError)] + pub media: Option, #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default, error = DeserrJsonError)] @@ -564,6 +573,7 @@ impl SearchQueryWithIndex { let SearchQuery { q, vector, + media, hybrid, offset, limit, @@ -594,6 +604,7 @@ impl SearchQueryWithIndex { index_uid, q, vector, + media, hybrid, offset: if offset == DEFAULT_SEARCH_OFFSET() { None } else { Some(offset) }, limit: if limit == DEFAULT_SEARCH_LIMIT() { None } else { Some(limit) }, @@ -628,6 +639,7 @@ impl SearchQueryWithIndex { federation_options, q, vector, + media, offset, limit, page, @@ -658,6 +670,7 @@ impl SearchQueryWithIndex { SearchQuery { q, vector, + media, offset: offset.unwrap_or(DEFAULT_SEARCH_OFFSET()), limit: limit.unwrap_or(DEFAULT_SEARCH_LIMIT()), page, @@ -984,14 +997,27 @@ pub fn prepare_search<'t>( let deadline = std::time::Instant::now() + std::time::Duration::from_secs(10); + let q = query.q.as_deref(); + let media = query.media.as_ref(); + + let search_query = match (q, media) { + (Some(text), None) => milli::vector::SearchQuery::Text(text), + (q, media) => milli::vector::SearchQuery::Media { q, media }, + }; + embedder - .embed_search(query.q.as_ref().unwrap(), Some(deadline)) + .embed_search(search_query, Some(deadline)) .map_err(milli::vector::Error::from) .map_err(milli::Error::from)? 
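            // Mapping used above: a text-only query is embedded as
            // `SearchQuery::Text(q)`, while any request carrying `media` (with or
            // without `q`) is embedded as `SearchQuery::Media { q, media }`. The
            // route-level checks added in the next patch ensure `media` only reaches
            // this point together with `hybrid`, and never alongside a raw `vector`.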
} }; - - search.semantic(embedder_name.clone(), embedder.clone(), *quantized, Some(vector)); + search.semantic( + embedder_name.clone(), + embedder.clone(), + *quantized, + Some(vector), + query.media.clone(), + ); } SearchKind::Hybrid { embedder_name, embedder, quantized, semantic_ratio: _ } => { if let Some(q) = &query.q { @@ -1003,6 +1029,7 @@ pub fn prepare_search<'t>( embedder.clone(), *quantized, query.vector.clone(), + query.media.clone(), ); } } @@ -1127,6 +1154,7 @@ pub fn perform_search( locales, // already used in prepare_search vector: _, + media: _, hybrid: _, offset: _, ranking_score_threshold: _, diff --git a/crates/milli/src/search/hybrid.rs b/crates/milli/src/search/hybrid.rs index b63f6288f..c906e1eb7 100644 --- a/crates/milli/src/search/hybrid.rs +++ b/crates/milli/src/search/hybrid.rs @@ -7,6 +7,7 @@ use roaring::RoaringBitmap; use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy}; use crate::search::new::{distinct_fid, distinct_single_docid}; use crate::search::SemanticSearch; +use crate::vector::SearchQuery; use crate::{Index, MatchingWords, Result, Search, SearchResult}; struct ScoreWithRatioResult { @@ -225,12 +226,9 @@ impl Search<'_> { return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); } - // no vector search against placeholder search - let Some(query) = search.query.take() else { - return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); - }; // no embedder, no semantic search - let Some(SemanticSearch { vector, embedder_name, embedder, quantized }) = semantic else { + let Some(SemanticSearch { vector, embedder_name, embedder, quantized, media }) = semantic + else { return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); }; @@ -241,9 +239,17 @@ impl Search<'_> { let span = tracing::trace_span!(target: "search::hybrid", "embed_one"); let _entered = span.enter(); + let q = search.query.as_deref(); + let media = media.as_ref(); + + let query = match (q, media) { + (Some(text), None) => SearchQuery::Text(text), + (q, media) => SearchQuery::Media { q, media }, + }; + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(3); - match embedder.embed_search(&query, Some(deadline)) { + match embedder.embed_search(query, Some(deadline)) { Ok(embedding) => embedding, Err(error) => { tracing::error!(error=%error, "Embedding failed"); @@ -257,8 +263,13 @@ impl Search<'_> { } }; - search.semantic = - Some(SemanticSearch { vector: Some(vector_query), embedder_name, embedder, quantized }); + search.semantic = Some(SemanticSearch { + vector: Some(vector_query), + embedder_name, + embedder, + quantized, + media, + }); // TODO: would be better to have two distinct functions at this point let vector_results = search.execute()?; diff --git a/crates/milli/src/search/mod.rs b/crates/milli/src/search/mod.rs index 62183afc3..97d542524 100644 --- a/crates/milli/src/search/mod.rs +++ b/crates/milli/src/search/mod.rs @@ -12,7 +12,7 @@ use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats}; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::index::MatchingStrategy; use crate::score_details::{ScoreDetails, ScoringStrategy}; -use crate::vector::Embedder; +use crate::vector::{Embedder, Embedding}; use crate::{ execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index, Result, SearchContext, TimeBudget, UserError, @@ -32,6 +32,7 @@ pub mod similar; #[derive(Debug, Clone)] pub struct 
SemanticSearch { vector: Option>, + media: Option, embedder_name: String, embedder: Arc, quantized: bool, @@ -93,9 +94,10 @@ impl<'a> Search<'a> { embedder_name: String, embedder: Arc, quantized: bool, - vector: Option>, + vector: Option, + media: Option, ) -> &mut Search<'a> { - self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector }); + self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector, media }); self } @@ -231,24 +233,28 @@ impl<'a> Search<'a> { degraded, used_negative_operator, } = match self.semantic.as_ref() { - Some(SemanticSearch { vector: Some(vector), embedder_name, embedder, quantized }) => { - execute_vector_search( - &mut ctx, - vector, - self.scoring_strategy, - universe, - &self.sort_criteria, - &self.distinct, - self.geo_param, - self.offset, - self.limit, - embedder_name, - embedder, - *quantized, - self.time_budget.clone(), - self.ranking_score_threshold, - )? - } + Some(SemanticSearch { + vector: Some(vector), + embedder_name, + embedder, + quantized, + media: _, + }) => execute_vector_search( + &mut ctx, + vector, + self.scoring_strategy, + universe, + &self.sort_criteria, + &self.distinct, + self.geo_param, + self.offset, + self.limit, + embedder_name, + embedder, + *quantized, + self.time_budget.clone(), + self.ranking_score_threshold, + )?, _ => execute_search( &mut ctx, self.query.as_deref(), From 2b3327ea74357cf6823bd8a89f447c2773c221d1 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:11:00 +0200 Subject: [PATCH 061/101] Use `media` to determine search kind --- .../meilisearch/src/routes/indexes/search.rs | 46 +++++++++++-------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 035ba71d8..697ae9241 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ b/crates/meilisearch/src/routes/indexes/search.rs @@ -483,28 +483,30 @@ pub fn search_kind( index_uid: String, index: &milli::Index, ) -> Result { + let is_placeholder_query = + if let Some(q) = query.q.as_deref() { q.trim().is_empty() } else { true }; + let non_placeholder_query = !is_placeholder_query; + let is_media = query.media.is_some(); // handle with care, the order of cases matters, the semantics is subtle - match (query.q.as_deref(), &query.hybrid, query.vector.as_deref()) { - // empty query, no vector => placeholder search - (Some(q), _, None) if q.trim().is_empty() => Ok(SearchKind::KeywordOnly), - // no query, no vector => placeholder search - (None, _, None) => Ok(SearchKind::KeywordOnly), - // hybrid.semantic_ratio == 1.0 => vector - (_, Some(HybridQuery { semantic_ratio, embedder }), v) if **semantic_ratio == 1.0 => { - SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) - } - // hybrid.semantic_ratio == 0.0 => keyword - (_, Some(HybridQuery { semantic_ratio, embedder: _ }), _) if **semantic_ratio == 0.0 => { + match (is_media, non_placeholder_query, &query.hybrid, query.vector.as_deref()) { + // media + vector => error + (true, _, _, Some(_)) => Err(MeilisearchHttpError::MediaAndVector.into()), + // media + !hybrid => error + (true, _, None, _) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), + // vector + !hybrid => error + (_, _, None, Some(_)) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), + // hybrid S0 => keyword + (_, _, Some(HybridQuery { semantic_ratio, embedder: _ }), _) if **semantic_ratio == 0.0 => { Ok(SearchKind::KeywordOnly) } - 
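// [editor's note, not part of the patch] Written out as a decision table over
// (is_media, non_placeholder_query, hybrid, vector), the new match reads, first
// matching row wins:
//
//   media + vector                        => Err(MediaAndVector)
//   media without `hybrid`                => Err(MissingSearchHybrid)
//   vector without `hybrid`               => Err(MissingSearchHybrid)
//   hybrid with semantic_ratio == 0.0     => KeywordOnly
//   placeholder `q`, no media, no vector  => KeywordOnly (placeholder search)
//   hybrid with semantic_ratio == 1.0     => semantic
//   non-placeholder `q` + hybrid          => hybrid
//   placeholder `q` + hybrid              => semantic
//   non-placeholder `q` alone             => KeywordOnly
//
// For instance, a request with only `media` and `hybrid` now reaches the semantic
// arm even though `q` is absent, a combination the previous `(q, hybrid, vector)`
// match could not express.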
// no query, hybrid, vector => semantic - (None, Some(HybridQuery { semantic_ratio: _, embedder }), Some(v)) => { - SearchKind::semantic(index_scheduler, index_uid, index, embedder, Some(v.len())) + // !q + !vector => placeholder search + (false, false, _, None) => Ok(SearchKind::KeywordOnly), + // hybrid S100 => semantic + (_, _, Some(HybridQuery { semantic_ratio, embedder }), v) if **semantic_ratio == 1.0 => { + SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) } - // query, no hybrid, no vector => keyword - (Some(_), None, None) => Ok(SearchKind::KeywordOnly), - // query, hybrid, maybe vector => hybrid - (Some(_), Some(HybridQuery { semantic_ratio, embedder }), v) => SearchKind::hybrid( + // q + hybrid => hybrid + (_, true, Some(HybridQuery { semantic_ratio, embedder }), v) => SearchKind::hybrid( index_scheduler, index_uid, index, @@ -512,7 +514,11 @@ pub fn search_kind( **semantic_ratio, v.map(|v| v.len()), ), - - (_, None, Some(_)) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), + // !q + hybrid => semantic + (_, false, Some(HybridQuery { semantic_ratio: _, embedder }), v) => { + SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) + } + // q => keyword + (false, true, None, None) => Ok(SearchKind::KeywordOnly), } } From c593fbe648ec7aedf62285cd7aa8459e9ac068d8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:11:29 +0200 Subject: [PATCH 062/101] Analytics --- .../src/routes/indexes/search_analytics.rs | 12 ++++++++++++ .../meilisearch/src/routes/multi_search_analytics.rs | 1 + 2 files changed, 13 insertions(+) diff --git a/crates/meilisearch/src/routes/indexes/search_analytics.rs b/crates/meilisearch/src/routes/indexes/search_analytics.rs index b16e2636e..07f79eba7 100644 --- a/crates/meilisearch/src/routes/indexes/search_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/search_analytics.rs @@ -61,6 +61,8 @@ pub struct SearchAggregator { semantic_ratio: bool, hybrid: bool, retrieve_vectors: bool, + // Number of requests containing `media` + total_media: usize, // every time a search is done, we increment the counter linked to the used settings matching_strategy: HashMap, @@ -101,6 +103,7 @@ impl SearchAggregator { let SearchQuery { q, vector, + media, offset, limit, page, @@ -175,6 +178,11 @@ impl SearchAggregator { if let Some(ref vector) = vector { ret.max_vector_size = vector.len(); } + + if media.is_some() { + ret.total_media = 1; + } + ret.retrieve_vectors |= retrieve_vectors; if query.is_finite_pagination() { @@ -277,6 +285,7 @@ impl Aggregate for SearchAggregator { show_ranking_score_details, semantic_ratio, hybrid, + total_media, total_degraded, total_used_negative_operator, ranking_score_threshold, @@ -327,6 +336,7 @@ impl Aggregate for SearchAggregator { self.retrieve_vectors |= retrieve_vectors; self.semantic_ratio |= semantic_ratio; self.hybrid |= hybrid; + self.total_media += total_media; // pagination self.max_limit = self.max_limit.max(max_limit); @@ -403,6 +413,7 @@ impl Aggregate for SearchAggregator { show_ranking_score_details, semantic_ratio, hybrid, + total_media, total_degraded, total_used_negative_operator, ranking_score_threshold, @@ -450,6 +461,7 @@ impl Aggregate for SearchAggregator { "hybrid": { "enabled": hybrid, "semantic_ratio": semantic_ratio, + "total_media": total_media, }, "pagination": { "max_limit": max_limit, diff --git a/crates/meilisearch/src/routes/multi_search_analytics.rs b/crates/meilisearch/src/routes/multi_search_analytics.rs index 
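// [editor's note, not part of the patch] The `total_media` wiring above follows
// the aggregator's existing pattern: each request contributes 0 or 1, merging
// sums the counters, and the final event reports the sum under the `hybrid`
// object. Condensed, using the names from this hunk:
//
//   if media.is_some() { ret.total_media = 1; }   // per request
//   self.total_media += total_media;              // when two aggregators merge
//   "hybrid": { "semantic_ratio": semantic_ratio, "total_media": total_media }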
3fa23f630..c24875797 100644 --- a/crates/meilisearch/src/routes/multi_search_analytics.rs +++ b/crates/meilisearch/src/routes/multi_search_analytics.rs @@ -42,6 +42,7 @@ impl MultiSearchAggregator { federation_options, q: _, vector: _, + media: _, offset: _, limit: _, page: _, From 11e7c0d75f53e8b2b798194daf38fa12d94e6a5a Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 00:09:48 +0200 Subject: [PATCH 063/101] Fix tests --- crates/index-scheduler/src/scheduler/test.rs | 15 +- .../src/scheduler/test_embedders.rs | 215 ++++++++++++------ crates/meilisearch/tests/search/hybrid.rs | 2 +- 3 files changed, 155 insertions(+), 77 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index 2c492525f..e9f21dfe4 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -690,11 +690,20 @@ fn test_settings_update() { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); - let configs = index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap(); + let embedders = index.embedding_configs(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); + let IndexEmbeddingConfig { name, config, fragments } = configs.first().unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"default"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(config.embedder_options); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); } #[test] diff --git a/crates/index-scheduler/src/scheduler/test_embedders.rs b/crates/index-scheduler/src/scheduler/test_embedders.rs index 305894d0a..a9b920bd2 100644 --- a/crates/index-scheduler/src/scheduler/test_embedders.rs +++ b/crates/index-scheduler/src/scheduler/test_embedders.rs @@ -3,13 +3,14 @@ use std::collections::BTreeMap; use big_s::S; use insta::assert_json_snapshot; use meili_snap::{json_string, snapshot}; -use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; +use meilisearch_types::milli::vector::SearchQuery; use meilisearch_types::milli::{self, obkv_to_json}; use meilisearch_types::settings::{SettingEmbeddingSettings, Settings, Unchecked}; use meilisearch_types::tasks::KindWithContent; use milli::update::IndexDocumentsMethod::*; +use milli::vector::db::IndexEmbeddingConfig; use crate::insta_snapshot::snapshot_index_scheduler; use crate::test_utils::read_json; @@ -85,28 +86,51 @@ fn import_vectors() { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); - let configs = index.embedding_configs(&rtxn).unwrap(); + let embedders = index.embedding_configs(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } = + let IndexEmbeddingConfig { name, config: fakerest_config, fragments } = configs.get(0).unwrap(); + let info = embedders.embedder_info(&rtxn, 
name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } = + let IndexEmbeddingConfig { name, config: simple_hf_config, fragments } = configs.get(1).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"1"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); let configs = index_scheduler.embedders("doggos".to_string(), configs).unwrap(); - let (hf_embedder, _, _) = configs.get(&simple_hf_name).unwrap(); - let beagle_embed = hf_embedder.embed_search("Intel the beagle best doggo", None).unwrap(); - let lab_embed = hf_embedder.embed_search("Max the lab best doggo", None).unwrap(); - let patou_embed = hf_embedder.embed_search("kefir the patou best doggo", None).unwrap(); + let hf_runtime = configs.get(&simple_hf_name).unwrap(); + let hf_embedder = &hf_runtime.embedder; + let beagle_embed = hf_embedder + .embed_search(SearchQuery::Text("Intel the beagle best doggo"), None) + .unwrap(); + let lab_embed = + hf_embedder.embed_search(SearchQuery::Text("Max the lab best doggo"), None).unwrap(); + let patou_embed = hf_embedder + .embed_search(SearchQuery::Text("kefir the patou best doggo"), None) + .unwrap(); (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) }; @@ -166,22 +190,38 @@ fn import_vectors() { let rtxn = index.read_txn().unwrap(); // Ensure the document has been inserted into the relevant bitmap - let configs = index.embedding_configs(&rtxn).unwrap(); + let embedders = index.embedding_configs(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = - configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, fragments } = configs.get(0).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0]>"); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); - let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _,
fragments } = configs.get(1).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"1"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); let embeddings = index.embeddings(&rtxn, 0).unwrap(); - assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); - assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == lab_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -239,25 +279,41 @@ fn import_vectors() { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + let embedders = index.embedding_configs(); // Ensure the document has been inserted into the relevant bitmap - let configs = index.embedding_configs(&rtxn).unwrap(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = - configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, fragments } = configs.get(0).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0]>"); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); - let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, fragments } = configs.get(1).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"1"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); let embeddings = index.embeddings(&rtxn, 0).unwrap(); // automatically changed to patou because set to regenerate - assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == patou_embed, @"true"); // remained beagle - assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -400,7 +456,7 @@ fn
import_vectors_first_and_embedder_later() { // all the vectors linked to the newly specified embedder have been removed // Only the unknown embedders stay in the document DB snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1,2,3]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4,5]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); - let conf = index.embedding_configs(&rtxn).unwrap(); + let conf = index.embedding_configs().embedding_configs(&rtxn).unwrap(); // even though we specified the vector for the ID 3, it shouldn't be marked // as user provided since we explicitly marked it as NOT user provided. snapshot!(format!("{conf:#?}"), @r###" @@ -426,19 +482,28 @@ fn import_vectors_first_and_embedder_later() { }, quantized: None, }, - user_provided: RoaringBitmap<[1, 2]>, + fragments: FragmentConfigs( + [], + ), }, ] "###); + let info = + index.embedding_configs().embedder_info(&rtxn, "my_doggo_embedder").unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[1, 2, 3]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[1, 2]>"); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; + let (embedding, _) = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty(), "{embedding:?}"); // the document with the id 3 should keep its original embedding let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embeddings = &embeddings["my_doggo_embedder"]; + let (embeddings, _) = &embeddings["my_doggo_embedder"]; snapshot!(embeddings.len(), @"1"); assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); @@ -493,7 +558,7 @@ fn import_vectors_first_and_embedder_later() { "###); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; + let (embedding, _) = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); @@ -501,7 +566,7 @@ // the document with the id 4 should generate an embedding let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; + let (embedding, _) = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); } @@ -603,33 +668,35 @@ fn delete_document_containing_vector() { .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); - let conf = index.embedding_configs(&rtxn).unwrap(); + let conf = index.embedding_configs().embedding_configs(&rtxn).unwrap(); snapshot!(format!("{conf:#?}"), @r###" - [ - IndexEmbeddingConfig { - name: "manual", - config: EmbeddingConfig { - embedder_options: UserProvided( - EmbedderOptions { - dimensions: 3, - distribution: None, - }, - ), - prompt: PromptData { - template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}",
max_bytes: Some( - 400, - ), + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, }, - quantized: None, + ), + prompt: PromptData { + template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", + max_bytes: Some( + 400, + ), }, - user_provided: RoaringBitmap<[0]>, + quantized: None, }, - ] - "###); + fragments: FragmentConfigs( + [], + ), + }, + ] + "###); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["manual"]; + let (embedding, _) = &embeddings["manual"]; assert!(!embedding.is_empty(), "{embedding:?}"); index_scheduler @@ -647,30 +714,32 @@ fn delete_document_containing_vector() { .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); - let conf = index.embedding_configs(&rtxn).unwrap(); + let conf = index.embedding_configs().embedding_configs(&rtxn).unwrap(); snapshot!(format!("{conf:#?}"), @r###" - [ - IndexEmbeddingConfig { - name: "manual", - config: EmbeddingConfig { - embedder_options: UserProvided( - EmbedderOptions { - dimensions: 3, - distribution: None, - }, - ), - prompt: PromptData { - template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", - max_bytes: Some( - 400, - ), + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, }, - quantized: None, + ), + prompt: PromptData { + template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", + max_bytes: Some( + 400, + ), }, - user_provided: RoaringBitmap<[]>, + quantized: None, }, - ] - "###); + fragments: FragmentConfigs( + [], + ), + }, + ] + "###); } #[test] diff --git a/crates/meilisearch/tests/search/hybrid.rs b/crates/meilisearch/tests/search/hybrid.rs index be2a724b0..d95e6fb64 100644 --- a/crates/meilisearch/tests/search/hybrid.rs +++ b/crates/meilisearch/tests/search/hybrid.rs @@ -499,7 +499,7 @@ async fn query_combination() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Invalid request: missing `hybrid` parameter when `vector` is present.", + "message": "Invalid request: missing `hybrid` parameter when `vector` or `media` are present.", "code": "missing_search_hybrid", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#missing_search_hybrid" From e54fc592485b19dbfb8f647b542cc5738a8057bf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Sun, 29 Jun 2025 23:47:15 +0200 Subject: [PATCH 064/101] Fix snaps --- ...r__scheduler__test__settings_update-7.snap | 17 ++++++++++++++++ ...er__test_embedders__import_vectors-14.snap | 12 +++++++++++ ...er__test_embedders__import_vectors-27.snap | 15 ++++++++++++++ ...er__test_embedders__import_vectors-40.snap | 15 ++++++++++++++ ...ler__test_embedders__import_vectors-8.snap | 15 +++++++++----- .../after_registering_settings_task.snap | 2 +- .../settings_update_processed.snap | 2 +- .../Intel to kefir succeeds.snap | 2 +- .../import_vectors/Intel to kefir.snap | 2 +- .../import_vectors/adding Intel succeeds.snap | 2 +- 
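[Editor's note: the following illustration is not part of the patch series.] The
snapshot churn in this patch traces back to the API rework exercised by the tests
above: embedder metadata now sits behind an `index.embedding_configs()` handle,
the Rest embedder options gained empty `search_fragments`/`indexing_fragments`
maps (hence the re-recorded settings snapshots below), and `index.embeddings(..)`
yields a tuple per embedder name. A hedged sketch of the resulting read pattern,
with method names as used in the hunks above and `docid` plus the error handling
purely illustrative:

    // Assumes an open `index` and a read transaction `rtxn`, as in the tests.
    let embedders = index.embedding_configs();
    // Per-embedder configuration now carries `fragments` instead of the old
    // `user_provided` bitmap.
    for IndexEmbeddingConfig { name, config, fragments } in
        embedders.embedding_configs(&rtxn).unwrap()
    {
        // The docid bitmaps moved into a per-embedder `embedder_info`,
        // alongside an internal embedder id.
        let info = embedders.embedder_info(&rtxn, &name).unwrap().unwrap();
        let _user_provided = info.embedding_status.user_provided_docids();
        let _skip_regenerate = info.embedding_status.skip_regenerate_docids();
        let _ = (config, fragments);
    }
    // Embeddings come back keyed by embedder name as tuples; the tests
    // destructure the first element to reach the vectors themselves.
    let embeddings = index.embeddings(&rtxn, docid).unwrap();
    let (vectors, _) = &embeddings["my_doggo_embedder"];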
.../import_vectors/after adding Intel.snap | 2 +- ...ter_registering_settings_task_vectors.snap | 2 +- .../settings_update_processed_vectors.snap | 2 +- crates/meilisearch/tests/dumps/mod.rs | 9 ++++++--- crates/meilisearch/tests/features/mod.rs | 20 ++++++++++++------- 15 files changed, 96 insertions(+), 23 deletions(-) create mode 100644 crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap create mode 100644 crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap create mode 100644 crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap create mode 100644 crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap new file mode 100644 index 000000000..82134b838 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap @@ -0,0 +1,17 @@ +--- +source: crates/index-scheduler/src/scheduler/test.rs +expression: config.embedder_options +--- +{ + "Rest": { + "api_key": "My super secret", + "distribution": null, + "dimensions": 4, + "url": "http://localhost:7777", + "request": "{{text}}", + "search_fragments": {}, + "indexing_fragments": {}, + "response": "{{embedding}}", + "headers": {} + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap new file mode 100644 index 000000000..19b5cab92 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap @@ -0,0 +1,12 @@ +--- +source: crates/index-scheduler/src/scheduler/test_embedders.rs +expression: simple_hf_config.embedder_options +--- +{ + "HuggingFace": { + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "distribution": null, + "pooling": "useModel" + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap new file mode 100644 index 000000000..0fc8bd531 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap @@ -0,0 +1,15 @@ +--- +source: crates/index-scheduler/src/scheduler/test_embedders.rs +expression: doc +--- +{ + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + "noise": [ + 0.1, + 0.2, + 0.3 + ] + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap new file mode 100644 index 000000000..0942e4d82 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap @@ -0,0 +1,15 @@ +--- +source: crates/index-scheduler/src/scheduler/test_embedders.rs +expression: doc +--- +{ + "doggo": 
"kefir", + "breed": "patou", + "_vectors": { + "noise": [ + 0.1, + 0.2, + 0.3 + ] + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap index 19b5cab92..29f35d9c1 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap @@ -1,12 +1,17 @@ --- source: crates/index-scheduler/src/scheduler/test_embedders.rs -expression: simple_hf_config.embedder_options +expression: fakerest_config.embedder_options --- { - "HuggingFace": { - "model": "sentence-transformers/all-MiniLM-L6-v2", - "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "Rest": { + "api_key": "My super secret", "distribution": null, - "pooling": "useModel" + "dimensions": 384, + "url": "http://localhost:7777", + "request": "{{text}}", + "search_fragments": {}, + "indexing_fragments": {}, + "response": "{{embedding}}", + "headers": {} } } diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap index c66a6b5b3..a52f18079 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: 
Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap index b7faefa8a..b99e15852 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test.rs [] 
---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { 
displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap index c8955e2b6..12e03a28b 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { 
displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: 
NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap index 23e43860f..2ea2ebb17 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, 
document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), 
"B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap index 732527fa8..a2a263b6f 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap +++ 
b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, 
status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, 
kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap index 5e01ffcdf..29fc6abf4 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), 
model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: 
NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap index 1172d1118..ae943bf48 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, 
separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: 
Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap index 3653eeb9a..9ada7580a 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: 
NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { 
displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index 3d3bc01db..9b111186d 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -2188,7 +2188,8 @@ async fn import_dump_v6_containing_experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -2314,7 +2315,8 @@ async fn import_dump_v6_containing_batches_and_enqueued_tasks() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -2420,7 +2422,8 @@ async fn generate_and_import_dump_containing_vectors() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); diff --git a/crates/meilisearch/tests/features/mod.rs b/crates/meilisearch/tests/features/mod.rs index d0d457d3e..ec5838d35 100644 --- a/crates/meilisearch/tests/features/mod.rs +++ b/crates/meilisearch/tests/features/mod.rs @@ -25,7 +25,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -41,7 +42,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -57,7 +59,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": 
false + "chatCompletions": false, + "multimodal": false } "###); @@ -74,7 +77,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -91,7 +95,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); } @@ -115,7 +120,8 @@ async fn experimental_feature_metrics() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -162,7 +168,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`, `compositeEmbedders`, `chatCompletions`", + "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`, `compositeEmbedders`, `chatCompletions`, `multimodal`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" From c1a132fa068e252d2554cd5acab489e9eea804b2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 30 Jun 2025 13:54:04 +0200 Subject: [PATCH 065/101] `multimodal` experimental feature --- crates/index-scheduler/src/features.rs | 13 +++++++++++++ crates/meilisearch-types/src/features.rs | 1 + .../meilisearch/src/analytics/segment_analytics.rs | 3 +++ crates/meilisearch/src/routes/features.rs | 11 +++++++++++ crates/meilisearch/src/routes/indexes/settings.rs | 8 ++++++++ crates/meilisearch/src/search/mod.rs | 3 +++ 6 files changed, 39 insertions(+) diff --git a/crates/index-scheduler/src/features.rs b/crates/index-scheduler/src/features.rs index 78ffc0766..b52a659a6 100644 --- a/crates/index-scheduler/src/features.rs +++ b/crates/index-scheduler/src/features.rs @@ -144,6 +144,19 @@ impl RoFeatures { .into()) } } + + pub fn check_multimodal(&self, disabled_action: &'static str) -> Result<()> { + if self.runtime.multimodal { + Ok(()) + } else { + Err(FeatureNotEnabledError { + disabled_action, + feature: "multimodal", + issue_link: "https://github.com/orgs/meilisearch/discussions/846", + } + .into()) + } + } } impl FeatureData { diff --git a/crates/meilisearch-types/src/features.rs b/crates/meilisearch-types/src/features.rs index 9ec2d321f..3c78035e8 100644 --- a/crates/meilisearch-types/src/features.rs +++ b/crates/meilisearch-types/src/features.rs @@ -21,6 +21,7 @@ pub struct RuntimeTogglableFeatures { pub get_task_documents_route: bool, pub composite_embedders: bool, pub chat_completions: bool, + pub multimodal: bool, } #[derive(Default, Debug, Clone, Copy)] diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index 668a7fded..0abc5c817 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -197,6 +197,7 @@ struct Infos { experimental_max_number_of_batched_tasks: usize, experimental_limit_batched_tasks_total_size: u64, experimental_network: bool, + experimental_multimodal: bool, experimental_chat_completions: bool, 
experimental_get_task_documents_route: bool, experimental_composite_embedders: bool, @@ -303,6 +304,7 @@ impl Infos { get_task_documents_route, composite_embedders, chat_completions, + multimodal, } = features; // We're going to override every sensible information. @@ -322,6 +324,7 @@ impl Infos { experimental_reduce_indexing_memory_usage, experimental_network: network, experimental_chat_completions: chat_completions, + experimental_multimodal: multimodal, experimental_get_task_documents_route: get_task_documents_route, experimental_composite_embedders: composite_embedders, experimental_embedding_cache_entries, diff --git a/crates/meilisearch/src/routes/features.rs b/crates/meilisearch/src/routes/features.rs index 179b9cf68..1a1f89b2d 100644 --- a/crates/meilisearch/src/routes/features.rs +++ b/crates/meilisearch/src/routes/features.rs @@ -54,6 +54,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { get_task_documents_route: Some(false), composite_embedders: Some(false), chat_completions: Some(false), + multimodal: Some(false), })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { @@ -100,6 +101,8 @@ pub struct RuntimeTogglableFeatures { pub composite_embedders: Option, #[deserr(default)] pub chat_completions: Option, + #[deserr(default)] + pub multimodal: Option, } impl From for RuntimeTogglableFeatures { @@ -113,6 +116,7 @@ impl From for RuntimeTogg get_task_documents_route, composite_embedders, chat_completions, + multimodal, } = value; Self { @@ -124,6 +128,7 @@ impl From for RuntimeTogg get_task_documents_route: Some(get_task_documents_route), composite_embedders: Some(composite_embedders), chat_completions: Some(chat_completions), + multimodal: Some(multimodal), } } } @@ -138,6 +143,7 @@ pub struct PatchExperimentalFeatureAnalytics { get_task_documents_route: bool, composite_embedders: bool, chat_completions: bool, + multimodal: bool, } impl Aggregate for PatchExperimentalFeatureAnalytics { @@ -155,6 +161,7 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { get_task_documents_route: new.get_task_documents_route, composite_embedders: new.composite_embedders, chat_completions: new.chat_completions, + multimodal: new.multimodal, }) } @@ -181,6 +188,7 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { get_task_documents_route: Some(false), composite_embedders: Some(false), chat_completions: Some(false), + multimodal: Some(false), })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { @@ -223,6 +231,7 @@ async fn patch_features( .composite_embedders .unwrap_or(old_features.composite_embedders), chat_completions: new_features.0.chat_completions.unwrap_or(old_features.chat_completions), + multimodal: new_features.0.multimodal.unwrap_or(old_features.multimodal), }; // explicitly destructure for analytics rather than using the `Serialize` implementation, because @@ -237,6 +246,7 @@ async fn patch_features( get_task_documents_route, composite_embedders, chat_completions, + multimodal, } = new_features; analytics.publish( @@ -249,6 +259,7 @@ async fn patch_features( get_task_documents_route, composite_embedders, chat_completions, + multimodal, }, &req, ); diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index a4b7a5219..308977a6e 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ 
b/crates/meilisearch/src/routes/indexes/settings.rs @@ -755,6 +755,14 @@ fn validate_settings( if matches!(embedder.indexing_embedder, Setting::Set(_)) { features.check_composite_embedders("setting `indexingEmbedder`")?; } + + if matches!(embedder.indexing_fragments, Setting::Set(_)) { + features.check_multimodal("setting `indexingFragments`")?; + } + + if matches!(embedder.search_fragments, Setting::Set(_)) { + features.check_multimodal("setting `searchFragments`")?; + } } } diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 6d8639504..1c987a70c 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -972,6 +972,9 @@ pub fn prepare_search<'t>( time_budget: TimeBudget, features: RoFeatures, ) -> Result<(milli::Search<'t>, bool, usize, usize), ResponseError> { + if query.media.is_some() { + features.check_multimodal("passing `media` in a search query")?; + } let mut search = index.search(rtxn); search.time_budget(time_budget); if let Some(ranking_score_threshold) = query.ranking_score_threshold { From e30c24b5bfa6aa8e1782cfe9043c50b80f403222 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 1 Jul 2025 23:52:44 +0200 Subject: [PATCH 066/101] Prompt: relax lifetime constraints --- crates/milli/src/prompt/document.rs | 11 ++++++----- crates/milli/src/prompt/fields.rs | 24 ++++++++++++------------ crates/milli/src/prompt/mod.rs | 4 ++-- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/crates/milli/src/prompt/document.rs b/crates/milli/src/prompt/document.rs index b00c4cb42..1125c8fba 100644 --- a/crates/milli/src/prompt/document.rs +++ b/crates/milli/src/prompt/document.rs @@ -144,18 +144,19 @@ impl ValueView for Document<'_> { use crate::update::new::document::Document as DocumentTrait; #[derive(Debug)] -pub struct ParseableDocument<'doc, D> { +pub struct ParseableDocument<'a, 'doc, D: DocumentTrait<'a> + Debug> { document: D, doc_alloc: &'doc Bump, + _marker: std::marker::PhantomData<&'a ()>, } -impl<'doc, D> ParseableDocument<'doc, D> { +impl<'a, 'doc, D: DocumentTrait<'a> + Debug> ParseableDocument<'a, 'doc, D> { pub fn new(document: D, doc_alloc: &'doc Bump) -> Self { - Self { document, doc_alloc } + Self { document, doc_alloc, _marker: std::marker::PhantomData } } } -impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc, D> { +impl<'a, D: DocumentTrait<'a> + Debug> ObjectView for ParseableDocument<'a, '_, D> { fn as_value(&self) -> &dyn ValueView { self } @@ -195,7 +196,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc } } -impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> { +impl<'a, D: DocumentTrait<'a> + Debug> ValueView for ParseableDocument<'a, '_, D> { fn as_debug(&self) -> &dyn Debug { self } diff --git a/crates/milli/src/prompt/fields.rs b/crates/milli/src/prompt/fields.rs index 8d006f0b7..5a842268c 100644 --- a/crates/milli/src/prompt/fields.rs +++ b/crates/milli/src/prompt/fields.rs @@ -121,10 +121,10 @@ impl ObjectView for FieldValue<'_, D> { pub struct OwnedFields<'a, D: ObjectView>(Vec>); #[derive(Debug)] -pub struct BorrowedFields<'a, 'map, D: ObjectView> { +pub struct BorrowedFields<'a, 'doc, 'map, D: ObjectView> { document: &'a D, field_id_map: &'a RefCell>, - doc_alloc: &'a Bump, + doc_alloc: &'doc Bump, } impl<'a, D: ObjectView> OwnedFields<'a, D> { @@ -138,11 +138,11 @@ impl<'a, D: ObjectView> OwnedFields<'a, D> { } } -impl<'a, 'map, D: ObjectView> BorrowedFields<'a, 
'map, D> { +impl<'a, 'doc, 'map, D: ObjectView> BorrowedFields<'a, 'doc, 'map, D> { pub fn new( document: &'a D, field_id_map: &'a RefCell>, - doc_alloc: &'a Bump, + doc_alloc: &'doc Bump, ) -> Self { Self { document, field_id_map, doc_alloc } } @@ -170,7 +170,7 @@ impl ArrayView for OwnedFields<'_, D> { } } -impl ArrayView for BorrowedFields<'_, '_, D> { +impl ArrayView for BorrowedFields<'_, '_, '_, D> { fn as_value(&self) -> &dyn ValueView { self } @@ -212,7 +212,7 @@ impl ArrayView for BorrowedFields<'_, '_, D> { } } -impl ValueView for BorrowedFields<'_, '_, D> { +impl ValueView for BorrowedFields<'_, '_, '_, D> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } @@ -288,11 +288,11 @@ impl ValueView for OwnedFields<'_, D> { } } -struct ArraySource<'a, 'map, D: ObjectView> { - s: &'a BorrowedFields<'a, 'map, D>, +struct ArraySource<'a, 'doc, 'map, D: ObjectView> { + s: &'a BorrowedFields<'a, 'doc, 'map, D>, } -impl fmt::Display for ArraySource<'_, '_, D> { +impl fmt::Display for ArraySource<'_, '_, '_, D> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "[")?; for item in self.s.values() { @@ -303,11 +303,11 @@ impl fmt::Display for ArraySource<'_, '_, D> { } } -struct ArrayRender<'a, 'map, D: ObjectView> { - s: &'a BorrowedFields<'a, 'map, D>, +struct ArrayRender<'a, 'doc, 'map, D: ObjectView> { + s: &'a BorrowedFields<'a, 'doc, 'map, D>, } -impl fmt::Display for ArrayRender<'_, '_, D> { +impl fmt::Display for ArrayRender<'_, '_, '_, D> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { for item in self.s.values() { write!(f, "{}", item.render())?; diff --git a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index f1b4ddf89..03b20a090 100644 --- a/crates/milli/src/prompt/mod.rs +++ b/crates/milli/src/prompt/mod.rs @@ -107,8 +107,8 @@ impl Prompt { } pub fn render_document< - 'a, // lifetime of the borrow of the document - 'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents + 'a, // lifetime of the borrow of the document + 'doc, // lifetime of the allocator, will live for an entire chunk of documents >( &self, external_docid: &str, From 9ce5598fef9d966621710192934ebb6cd45bdbd2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 1 Jul 2025 23:55:07 +0200 Subject: [PATCH 067/101] parsed vectors: embeddings is None when it is null when read from DB --- crates/milli/src/vector/parsed_vectors.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index 36e80677a..8ff5a2201 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -150,7 +150,8 @@ impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor { regenerate = Some(value); } Ok(Some("embeddings")) => { - let value: &RawValue = match map.next_value() { + let value: &RawValue = match map.next_value::<&RawValue>() { + Ok(value) if value.get() == "null" => continue, Ok(value) => value, Err(error) => { return Ok(Err(RawVectorsError::DeserializeEmbeddings { From b086c51a232dd76406525c7caa128daa9bc5b10d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 1 Jul 2025 23:57:14 +0200 Subject: [PATCH 068/101] new settings indexer --- .../src/update/new/extract/vectors/mod.rs | 294 ++++++++++++------ .../milli/src/update/new/indexer/extract.rs | 25 +- crates/milli/src/update/new/indexer/mod.rs | 67 +++- 3 files changed, 262 insertions(+), 124 deletions(-) diff --git 
a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 3b8f5fa58..c08fadb14 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -1,5 +1,4 @@ use std::cell::RefCell; -use std::collections::BTreeMap; use std::fmt::Debug; use bumpalo::collections::Vec as BVec; @@ -16,15 +15,17 @@ use crate::update::new::indexer::settings_changes::SettingsChangeExtractor; use crate::update::new::thread_local::MostlySend; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; +use crate::update::settings::SettingsDelta; use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta}; use crate::vector::error::{ EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistributionBump, }; use crate::vector::extractor::{ - DocumentTemplateExtractor, Extractor as VectorExtractor, RequestFragmentExtractor, + DocumentTemplateExtractor, Extractor as VectorExtractor, ExtractorDiff, + RequestFragmentExtractor, }; use crate::vector::session::{EmbedSession, Input, Metadata, OnEmbed}; -use crate::vector::settings::{EmbedderAction, ReindexAction}; +use crate::vector::settings::ReindexAction; use crate::vector::{Embedding, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment}; use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; @@ -260,44 +261,31 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { } } -pub struct SettingsChangeEmbeddingExtractor<'a, 'b> { - embedders: &'a EmbeddingConfigs, - old_embedders: &'a EmbeddingConfigs, - embedder_actions: &'a BTreeMap, - embedder_category_id: &'a std::collections::HashMap, +pub struct SettingsChangeEmbeddingExtractor<'a, 'b, SD> { + settings_delta: &'a SD, embedder_stats: &'a EmbedderStats, sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, } -impl<'a, 'b> SettingsChangeEmbeddingExtractor<'a, 'b> { +impl<'a, 'b, SD: SettingsDelta> SettingsChangeEmbeddingExtractor<'a, 'b, SD> { #[allow(clippy::too_many_arguments)] pub fn new( - embedders: &'a EmbeddingConfigs, - old_embedders: &'a EmbeddingConfigs, - embedder_actions: &'a BTreeMap, - embedder_category_id: &'a std::collections::HashMap, + settings_delta: &'a SD, embedder_stats: &'a EmbedderStats, sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, threads: &'a ThreadPoolNoAbort, ) -> Self { let possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution); - Self { - embedders, - old_embedders, - embedder_actions, - embedder_category_id, - embedder_stats, - sender, - threads, - possible_embedding_mistakes, - } + Self { settings_delta, embedder_stats, sender, threads, possible_embedding_mistakes } } } -impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbeddingExtractor<'_, '_> { +impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor> + for SettingsChangeEmbeddingExtractor<'_, '_, SD> +{ type Data = RefCell>; fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result { @@ -309,44 +297,49 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding documents: impl Iterator>>, context: &'doc DocumentContext, ) -> crate::Result<()> { - let embedders = self.embedders.inner_as_ref(); - let old_embedders = self.old_embedders.inner_as_ref(); + let embedders = self.settings_delta.new_embedders(); 
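
Everything the settings-change extractor previously threaded through four separate parameters (new embedders, old embedders, embedder actions, and new category ids) is now read off the single `SettingsDelta` value at the point of use. Abridged from the call sites in this patch, the surface the extractor relies on looks roughly like the sketch below; this is inferred from usage, not the trait's actual definition in `crate::update::settings`, and the exact types may differ:

    // Interface sketch inferred from this patch's call sites. `FragmentDiff`
    // pairs an optional old fragment with its new counterpart, as consumed by
    // `settings_change_autogenerated` further down.
    trait SettingsDeltaSketch {
        fn new_embedders(&self) -> &RuntimeEmbedders;
        fn old_embedders(&self) -> &RuntimeEmbedders;
        fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction>;
        fn new_embedder_category_id(&self) -> &HashMap<String, u8>;
        fn try_for_each_fragment_diff<F, E>(&self, embedder_name: &str, for_each: F) -> Result<(), E>
        where
            F: FnMut(FragmentDiff) -> Result<(), E>;
    }
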
+ let old_embedders = self.settings_delta.old_embedders(); let unused_vectors_distribution = UnusedVectorsDistributionBump::new_in(&context.doc_alloc); let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); - for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { - // if the embedder is not in the embedder_actions, we don't need to reindex. - if let Some((embedder_id, reindex_action)) = - self.embedder_actions - .get(embedder_name) - // keep only the reindex actions - .and_then(EmbedderAction::reindex) - // map the reindex action to the embedder_id - .map(|reindex| { - let embedder_id = self.embedder_category_id.get(embedder_name).expect( - "An embedder_category_id must exist for all reindexed embedders", - ); - (*embedder_id, reindex) - }) - { - all_chunks.push(( - Chunks::new( - embedder, - embedder_id, - embedder_name, - prompt, - context.data, - &self.possible_embedding_mistakes, - self.embedder_stats, - self.threads, - self.sender, - &context.doc_alloc, - ), - reindex_action, - )) - } + let embedder_configs = context.index.embedding_configs(); + for (embedder_name, action) in self.settings_delta.embedder_actions().iter() { + let Some(reindex_action) = action.reindex() else { + continue; + }; + let runtime = embedders + .get(embedder_name) + .expect("A runtime must exist for all reindexed embedders"); + let embedder_info = embedder_configs + .embedder_info(&context.rtxn, embedder_name)? + .unwrap_or_else(|| { + // new embedder + EmbedderInfo { + embedder_id: *self + .settings_delta + .new_embedder_category_id() + .get(embedder_name) + .expect( + "An embedder_category_id must exist for all reindexed embedders", + ), + embedding_status: EmbeddingStatus::new(), + } + }); + all_chunks.push(( + Chunks::new( + runtime, + embedder_info, + embedder_name.as_str(), + context.data, + &self.possible_embedding_mistakes, + self.embedder_stats, + self.threads, + self.sender, + &context.doc_alloc, + ), + reindex_action, + )); + } - for document in documents { let document = document?; @@ -360,6 +353,16 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding for (chunks, reindex_action) in &mut all_chunks { let embedder_name = chunks.embedder_name(); let current_vectors = current_vectors.vectors_for_key(embedder_name)?; + let (old_is_user_provided, _) = + chunks.is_user_provided_must_regenerate(document.docid()); + let old_has_fragments = old_embedders + .get(embedder_name) + .map(|embedder| embedder.fragments().is_empty()) + .unwrap_or_default(); + + let new_has_fragments = chunks.has_fragments(); + + let fragments_changed = old_has_fragments ^ new_has_fragments; // if the vectors for this document have been already provided, we don't need to reindex.
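
Each chunk now also records whether the embedder switched between template-based and fragment-based extraction before the per-action regeneration logic runs. The intent of `fragments_changed` is an exclusive-or over the two configurations: set exactly when one side uses request fragments and the other does not, so a mode switch forces regeneration even when the rendered prompts would compare equal. One caveat worth double-checking: `old_has_fragments` is computed from `fragments().is_empty()` while `new_has_fragments` comes from `has_fragments()`, so as written the two operands appear to have opposite polarity, which would invert the XOR. The intended check, with both operands meaning "uses fragments" (hypothetical helper for illustration):

    // True exactly when the old and new configurations disagree on fragment use.
    fn extraction_mode_flipped(old_uses_fragments: bool, new_uses_fragments: bool) -> bool {
        old_uses_fragments ^ new_uses_fragments
    }

    // extraction_mode_flipped(false, true) -> true  (template -> fragments)
    // extraction_mode_flipped(true, true)  -> false (fragments on both sides)
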
let (is_new_embedder, must_regenerate) = @@ -368,60 +371,33 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding }); match reindex_action { - ReindexAction::RegeneratePrompts => { + ReindexAction::RegeneratePrompts | ReindexAction::RegenerateFragments(_) => { if !must_regenerate { continue; } // we need to regenerate the prompts for the document - - // Get the old prompt and render the document with it - let Some((_, old_prompt, _)) = old_embedders.get(embedder_name) else { - unreachable!("ReindexAction::RegeneratePrompts implies that the embedder {embedder_name} is in the old_embedders") - }; - let old_rendered = old_prompt.render_document( + chunks.settings_change_autogenerated( + document.docid(), document.external_document_id(), document.current( &context.rtxn, context.index, context.db_fields_ids_map, )?, + self.settings_delta, context.new_fields_ids_map, - &context.doc_alloc, + &unused_vectors_distribution, + old_is_user_provided, + fragments_changed, )?; - - // Get the new prompt and render the document with it - let new_prompt = chunks.prompt(); - let new_rendered = new_prompt.render_document( - document.external_document_id(), - document.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - - // Compare the rendered documents - // if they are different, regenerate the vectors - if new_rendered != old_rendered { - chunks.set_autogenerated( - document.docid(), - document.external_document_id(), - new_rendered, - &unused_vectors_distribution, - )?; - } } ReindexAction::FullReindex => { - let prompt = chunks.prompt(); // if no inserted vectors, then regenerate: true + no embeddings => autogenerate if let Some(embeddings) = current_vectors .and_then(|vectors| vectors.embeddings) // insert the embeddings only for new embedders .filter(|_| is_new_embedder) { - chunks.set_regenerate(document.docid(), must_regenerate); chunks.set_vectors( document.external_document_id(), document.docid(), @@ -431,24 +407,27 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding error: error.to_string(), }, )?, + old_is_user_provided, + true, + must_regenerate, )?; } else if must_regenerate { - let rendered = prompt.render_document( + chunks.settings_change_autogenerated( + document.docid(), document.external_document_id(), document.current( &context.rtxn, context.index, context.db_fields_ids_map, )?, + self.settings_delta, context.new_fields_ids_map, - &context.doc_alloc, - )?; - chunks.set_autogenerated( - document.docid(), - document.external_document_id(), - rendered, &unused_vectors_distribution, + old_is_user_provided, + true, )?; + } else if is_new_embedder { + chunks.set_status(document.docid(), false, true, false, false); } } } @@ -585,7 +564,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { let embedder = &runtime.embedder; let dimensions = embedder.dimensions(); - let fragments = runtime.fragments.as_slice(); + let fragments = runtime.fragments(); let kind = if fragments.is_empty() { ChunkType::DocumentTemplate { document_template: &runtime.document_template, @@ -627,6 +606,117 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { self.status.is_user_provided_must_regenerate(docid) } + #[allow(clippy::too_many_arguments)] + pub fn settings_change_autogenerated<'doc, D: Document<'doc> + Debug, SD: SettingsDelta>( + &mut self, + docid: DocumentId, + external_docid: &'a str, + document: D, + settings_delta: &SD, + fields_ids_map: &'a RefCell, + 
unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, + old_is_user_provided: bool, + full_reindex: bool, + ) -> Result<()> + where + 'a: 'doc, + { + match &mut self.kind { + ChunkType::Fragments { fragments: _, session } => { + let doc_alloc = session.doc_alloc(); + + if old_is_user_provided | full_reindex { + session.on_embed_mut().clear_vectors(docid); + } + + let mut extracted = false; + let extracted = &mut extracted; + + settings_delta.try_for_each_fragment_diff( + session.embedder_name(), + |fragment_diff| { + let extractor = RequestFragmentExtractor::new(fragment_diff.new, doc_alloc) + .ignore_errors(); + let old = if full_reindex { + None + } else { + fragment_diff.old.map(|old| { + RequestFragmentExtractor::new(old, doc_alloc).ignore_errors() + }) + }; + let metadata = Metadata { + docid, + external_docid, + extractor_id: extractor.extractor_id(), + }; + + match extractor.diff_settings(&document, &(), old.as_ref())? { + ExtractorDiff::Removed => { + OnEmbed::process_embedding_response( + session.on_embed_mut(), + crate::vector::session::EmbeddingResponse { + metadata, + embedding: None, + }, + ); + } + ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { + *extracted = true; + session.request_embedding( + metadata, + input, + unused_vectors_distribution, + )?; + } + ExtractorDiff::Unchanged => { /* nothing to do */ } + } + + Result::Ok(()) + }, + )?; + self.set_status( + docid, + old_is_user_provided, + true, + old_is_user_provided & !*extracted, + true, + ); + } + ChunkType::DocumentTemplate { document_template, session } => { + let doc_alloc = session.doc_alloc(); + + let old_embedder = settings_delta.old_embedders().get(session.embedder_name()); + let old_document_template = if full_reindex { + None + } else { + old_embedder.as_ref().map(|old_embedder| &old_embedder.document_template) + }; + let extractor = + DocumentTemplateExtractor::new(document_template, doc_alloc, fields_ids_map); + let old_extractor = old_document_template.map(|old_document_template| { + DocumentTemplateExtractor::new(old_document_template, doc_alloc, fields_ids_map) + }); + let metadata = + Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; + + match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? { + ExtractorDiff::Removed => { + OnEmbed::process_embedding_response( + session.on_embed_mut(), + crate::vector::session::EmbeddingResponse { metadata, embedding: None }, + ); + } + ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { + session.request_embedding(metadata, input, unused_vectors_distribution)?; + } + ExtractorDiff::Unchanged => { /* do nothing */ } + } + self.set_status(docid, old_is_user_provided, true, false, true); + } + } + Ok(()) + } + #[allow(clippy::too_many_arguments)] pub fn update_autogenerated<'doc, OD: Document<'doc> + Debug, ND: Document<'doc> + Debug>( &mut self, @@ -862,6 +952,10 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { Ok(()) } + + fn has_fragments(&self) -> bool { + matches!(self.kind, ChunkType::Fragments { .. 
}) + } } #[allow(clippy::too_many_arguments)] diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index a3e7842c2..abfb4d6da 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -21,7 +21,7 @@ use crate::update::new::indexer::settings_changes::DocumentsIndentifiers; use crate::update::new::merger::merge_and_send_rtree; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::SettingsDelta; -use crate::vector::db::IndexEmbeddingConfig; +use crate::vector::db::{EmbedderInfo, IndexEmbeddingConfig}; use crate::vector::RuntimeEmbedders; use crate::{Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; @@ -333,12 +333,11 @@ pub(super) fn extract_all_settings_changes( finished_extraction: &AtomicBool, field_distribution: &mut BTreeMap, mut index_embeddings: Vec, - modified_docids: &mut RoaringBitmap, embedder_stats: &EmbedderStats, ) -> Result> where MSP: Fn() -> bool + Sync, - SD: SettingsDelta, + SD: SettingsDelta + Sync, { // Create the list of document ids to extract let rtxn = indexing_context.index.read_txn()?; @@ -369,10 +368,7 @@ where // extract the remaining embeddings let extractor = SettingsChangeEmbeddingExtractor::new( - settings_delta.new_embedders(), - settings_delta.old_embedders(), - settings_delta.embedder_actions(), - settings_delta.new_embedder_category_id(), + settings_delta, embedder_stats, embedding_sender, field_distribution, @@ -396,14 +392,25 @@ where let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors"); let _entered = span.enter(); + let embedder_configs = indexing_context.index.embedding_configs(); for config in &mut index_embeddings { + // retrieve infos for existing embedder or create a fresh one + let mut infos = + embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap_or_else(|| { + let embedder_id = + *settings_delta.new_embedder_category_id().get(&config.name).unwrap(); + EmbedderInfo { embedder_id, embedding_status: Default::default() } + }); + 'data: for data in datastore.iter_mut() { let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { + let Some(delta) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided, modified_docids); + delta.apply_to(&mut infos.embedding_status); } + + extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap(); } } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 507d1a650..a6ba3a919 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -23,7 +23,7 @@ use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::progress::{EmbedderStats, Progress}; use crate::update::settings::SettingsDelta; use crate::update::GrenadParameters; -use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; +use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments}; use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort}; @@ -221,7 +221,7 @@ where MSP: Fn() -> bool + Sync, SD: SettingsDelta + Sync, { - delete_old_embedders(wtxn, index, settings_delta)?; + delete_old_embedders_and_fragments(wtxn, index, settings_delta)?; let mut bbbuffers = 
Vec::new(); let finished_extraction = AtomicBool::new(false); @@ -254,16 +254,14 @@ where grenad_parameters: &grenad_parameters, }; - let index_embeddings = index.embedding_configs(wtxn)?; + let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; - let mut modified_docids = roaring::RoaringBitmap::new(); let congestion = thread::scope(|s| -> Result { let indexer_span = tracing::Span::current(); let finished_extraction = &finished_extraction; // prevent moving the field_distribution and document_ids in the inner closure... let field_distribution = &mut field_distribution; - let modified_docids = &mut modified_docids; let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.install(move || { @@ -276,7 +274,6 @@ where finished_extraction, field_distribution, index_embeddings, - modified_docids, &embedder_stats, ) }) @@ -342,7 +339,7 @@ where fn arroy_writers_from_embedder_actions<'indexer>( index: &Index, embedder_actions: &'indexer BTreeMap, - embedders: &'indexer EmbeddingConfigs, + embedders: &'indexer RuntimeEmbedders, index_embedder_category_ids: &'indexer std::collections::HashMap, ) -> Result> { let vector_arroy = index.vector_arroy; @@ -350,7 +347,7 @@ fn arroy_writers_from_embedder_actions<'indexer>( embedders .inner_as_ref() .iter() - .filter_map(|(embedder_name, (embedder, _, _))| match embedder_actions.get(embedder_name) { + .filter_map(|(embedder_name, runtime)| match embedder_actions.get(embedder_name) { None => None, Some(action) if action.write_back().is_some() => None, Some(action) => { @@ -365,25 +362,65 @@ fn arroy_writers_from_embedder_actions<'indexer>( }; let writer = ArroyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized); - let dimensions = embedder.dimensions(); + let dimensions = runtime.embedder.dimensions(); Some(Ok(( embedder_category_id, - (embedder_name.as_str(), embedder.as_ref(), writer, dimensions), + (embedder_name.as_str(), runtime.embedder.as_ref(), writer, dimensions), ))) } }) .collect() } -fn delete_old_embedders(wtxn: &mut RwTxn<'_>, index: &Index, settings_delta: &SD) -> Result<()> +fn delete_old_embedders_and_fragments( + wtxn: &mut RwTxn<'_>, + index: &Index, + settings_delta: &SD, +) -> Result<()> where SD: SettingsDelta, { for action in settings_delta.embedder_actions().values() { - if let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() { - let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized); - let dimensions = reader.dimensions(wtxn)?; - reader.clear(wtxn, dimensions)?; + let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else { + continue; + }; + let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized); + let Some(dimensions) = reader.dimensions(wtxn)? else { + continue; + }; + reader.clear(wtxn, dimensions)?; + } + + // remove all vectors for the specified fragments + for (embedder_name, RemoveFragments { fragment_ids }, was_quantized) in + settings_delta.embedder_actions().iter().filter_map(|(name, action)| { + action.remove_fragments().map(|fragments| (name, fragments, action.was_quantized)) + }) + { + let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else { + continue; + }; + let arroy = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, was_quantized); + let Some(dimensions) = arroy.dimensions(wtxn)? 
else { + continue; + }; + for fragment_id in fragment_ids { + // we must keep the user provided embeddings that ended up in this store + + if infos.embedding_status.user_provided_docids().is_empty() { + // no user provided: clear store + arroy.clear_store(wtxn, *fragment_id, dimensions)?; + continue; + } + + // some user provided, remove only the ids that are not user provided + let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| { + items - infos.embedding_status.user_provided_docids() + })?; + + for to_delete in to_delete { + arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?; + } } } From e6329e77e1c0c470ed7d8db9bee9f6abc18bb01d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 00:00:39 +0200 Subject: [PATCH 069/101] settings fragment_diffs --- crates/milli/src/update/settings.rs | 93 +++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 3dae4f57c..03d44d785 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1578,6 +1578,7 @@ pub struct InnerIndexSettingsDiff { /// The set of only the additional searchable fields. /// If any other searchable field has been modified, is set to None. pub(crate) only_additional_fields: Option>, + fragment_diffs: BTreeMap, usize)>>, // Cache the check to see if all the stop_words, allowed_separators, dictionary, // exact_attributes, proximity_precision are different. @@ -1695,10 +1696,59 @@ impl InnerIndexSettingsDiff { } } + // build the fragment diffs + let mut fragment_diffs = BTreeMap::new(); + for (embedder_name, embedder_action) in &embedding_config_updates { + let Some(new_embedder) = new_settings.runtime_embedders.get(embedder_name) else { + continue; + }; + let regenerate_fragments = + if let Some(ReindexAction::RegenerateFragments(regenerate_fragments)) = + embedder_action.reindex() + { + either::Either::Left( + regenerate_fragments + .iter() + .filter(|(_, action)| { + !matches!( + action, + crate::vector::settings::RegenerateFragment::Remove + ) + }) + .map(|(name, _)| name), + ) + } else { + either::Either::Right( + new_embedder.fragments().iter().map(|fragment| &fragment.name), + ) + }; + + let old_embedder = old_settings.runtime_embedders.get(embedder_name); + + let mut fragments = Vec::new(); + for fragment_name in regenerate_fragments { + let Ok(new) = new_embedder + .fragments() + .binary_search_by_key(&fragment_name, |fragment| &fragment.name) + else { + continue; + }; + let old = old_embedder.as_ref().and_then(|old_embedder| { + old_embedder + .fragments() + .binary_search_by_key(&fragment_name, |fragment| &fragment.name) + .ok() + }); + fragments.push((old, new)); + } + fragment_diffs.insert(embedder_name.clone(), fragments); + } + InnerIndexSettingsDiff { old: old_settings, new: new_settings, primary_key_id, + fragment_diffs, embedding_config_updates, settings_update_only, only_additional_fields, @@ -2341,9 +2391,21 @@ pub trait SettingsDelta { fn old_embedders(&self) -> &EmbeddingConfigs; fn new_embedder_category_id(&self) -> &HashMap; fn embedder_actions(&self) -> &BTreeMap; + fn try_for_each_fragment_diff( + &self, + embedder_name: &str, + for_each: F, + ) -> std::result::Result<(), E> + where + F: FnMut(FragmentDiff) -> std::result::Result<(), E>; fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata; } +pub struct FragmentDiff<'a> { + pub old: Option<&'a RuntimeFragment>, + pub new: &'a RuntimeFragment, +} + impl SettingsDelta for 
InnerIndexSettingsDiff { fn new_embedders(&self) -> &EmbeddingConfigs { &self.new.embedding_configs @@ -2364,6 +2426,37 @@ impl SettingsDelta for InnerIndexSettingsDiff { fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata { &self.new.fields_ids_map } + + fn try_for_each_fragment_diff( + &self, + embedder_name: &str, + mut for_each: F, + ) -> std::result::Result<(), E> + where + F: FnMut(FragmentDiff) -> std::result::Result<(), E>, + { + let Some(fragment_diff) = self.fragment_diffs.get(embedder_name) else { return Ok(()) }; + for (old, new) in fragment_diff { + let Some(new_runtime) = self.new.runtime_embedders.get(embedder_name) else { + continue; + }; + + let new = new_runtime.fragments().get(*new).unwrap(); + + match old { + Some(old) => { + if let Some(old_runtime) = self.old.runtime_embedders.get(embedder_name) { + let old = &old_runtime.fragments().get(*old).unwrap(); + for_each(FragmentDiff { old: Some(old), new })?; + } else { + for_each(FragmentDiff { old: None, new })?; + } + } + None => for_each(FragmentDiff { old: None, new })?, + }; + } + Ok(()) + } } #[cfg(test)] From 2b2e6c0b3a278827e49de4131bfeeec48d39e7bd Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 00:01:59 +0200 Subject: [PATCH 070/101] Settings changes --- crates/index-scheduler/src/lib.rs | 16 +++--- crates/milli/src/test_index.rs | 4 +- .../extract/extract_vector_points.rs | 50 ++++++++++--------- .../src/update/index_documents/extract/mod.rs | 2 +- .../milli/src/update/index_documents/mod.rs | 2 +- .../src/update/index_documents/typed_chunk.rs | 2 +- crates/milli/src/update/settings.rs | 34 ++++++------- crates/milli/src/vector/mod.rs | 31 ++++++++++-- 8 files changed, 85 insertions(+), 56 deletions(-) diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index f551652c1..b2f27d66b 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -882,12 +882,12 @@ impl IndexScheduler { { let embedders = self.embedders.read().unwrap(); if let Some(embedder) = embedders.get(&embedder_options) { - let runtime = Arc::new(RuntimeEmbedder { - embedder: embedder.clone(), + let runtime = Arc::new(RuntimeEmbedder::new( + embedder.clone(), document_template, fragments, - is_quantized: quantized.unwrap_or_default(), - }); + quantized.unwrap_or_default(), + )); return Ok((name, runtime)); } @@ -906,12 +906,12 @@ impl IndexScheduler { embedders.insert(embedder_options, embedder.clone()); } - let runtime = Arc::new(RuntimeEmbedder { - embedder: embedder.clone(), + let runtime = Arc::new(RuntimeEmbedder::new( + embedder.clone(), document_template, fragments, - is_quantized: quantized.unwrap_or_default(), - }); + quantized.unwrap_or_default(), + )); Ok((name, runtime)) }, diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index cfd8c8492..6bb6b1345 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -66,7 +66,7 @@ impl TempIndex { let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.embedding_configs; + let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.runtime_embedders; let mut indexer = indexer::DocumentOperation::new(); match self.index_documents_config.update_method { IndexDocumentsMethod::ReplaceDocuments => { @@ -151,7 +151,7 @@ impl TempIndex { let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = 
db_fields_ids_map.clone(); - let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.embedding_configs; + let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.runtime_embedders; let mut indexer = indexer::DocumentOperation::new(); let external_document_ids: Vec<_> = diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 0a179cfa5..d40e82b92 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -236,8 +236,8 @@ pub fn extract_vector_points( let mut extractors = Vec::new(); - let mut configs = settings_diff.new.embedding_configs.clone().into_inner(); - let old_configs = &settings_diff.old.embedding_configs; + let mut configs = settings_diff.new.runtime_embedders.clone().into_inner(); + let old_configs = &settings_diff.old.runtime_embedders; if reindex_vectors { for (name, action) in settings_diff.embedding_config_updates.iter() { if let Some(action) = action.reindex() { @@ -284,16 +284,16 @@ pub fn extract_vector_points( continue; }; - let fragments = regenerate_fragments + let fragment_diffs = regenerate_fragments .iter() .filter_map(|(name, fragment)| match fragment { crate::vector::settings::RegenerateFragment::Update => { let old_value = old_runtime - .fragments + .fragments() .binary_search_by_key(&name, |fragment| &fragment.name) .ok(); let Ok(new_value) = runtime - .fragments + .fragments() .binary_search_by_key(&name, |fragment| &fragment.name) else { return None; @@ -304,7 +304,7 @@ pub fn extract_vector_points( crate::vector::settings::RegenerateFragment::Remove => None, crate::vector::settings::RegenerateFragment::Add => { let Ok(new_value) = runtime - .fragments + .fragments() .binary_search_by_key(&name, |fragment| &fragment.name) else { return None; @@ -314,8 +314,8 @@ pub fn extract_vector_points( }) .collect(); ExtractionAction::SettingsRegenerateFragments { - old_runtime, - must_regenerate_fragments: fragments, + old_runtime: old_runtime.clone(), + must_regenerate_fragments: fragment_diffs, } } @@ -325,7 +325,9 @@ pub fn extract_vector_points( continue; }; - ExtractionAction::SettingsRegeneratePrompts { old_runtime } + ExtractionAction::SettingsRegeneratePrompts { + old_runtime: old_runtime.clone(), + } } }; @@ -473,11 +475,11 @@ pub fn extract_vector_points( ); continue; } - let has_fragments = !runtime.fragments.is_empty(); + let has_fragments = !runtime.fragments().is_empty(); if has_fragments { regenerate_all_fragments( - &runtime.fragments, + runtime.fragments(), &doc_alloc, new_fields_ids_map, obkv, @@ -492,14 +494,14 @@ pub fn extract_vector_points( old_runtime, } => { if old.must_regenerate() { - let has_fragments = !runtime.fragments.is_empty(); - let old_has_fragments = !old_runtime.fragments.is_empty(); + let has_fragments = !runtime.fragments().is_empty(); + let old_has_fragments = !old_runtime.fragments().is_empty(); let is_adding_fragments = has_fragments && !old_has_fragments; if is_adding_fragments { regenerate_all_fragments( - &runtime.fragments, + runtime.fragments(), &doc_alloc, new_fields_ids_map, obkv, @@ -517,14 +519,16 @@ pub fn extract_vector_points( new_fields_ids_map, ); for (name, (old_index, new_index)) in must_regenerate_fragments { - let Some(new) = runtime.fragments.get(*new_index) else { continue }; + let Some(new) = runtime.fragments().get(*new_index) else { + continue; + }; let new 
= RequestFragmentExtractor::new(new, &doc_alloc).ignore_errors(); let diff = { let old = old_index.as_ref().and_then(|old| { - let old = old_runtime.fragments.get(*old)?; + let old = old_runtime.fragments().get(*old)?; Some( RequestFragmentExtractor::new(old, &doc_alloc) .ignore_errors(), @@ -555,11 +559,11 @@ pub fn extract_vector_points( ); continue; } - let has_fragments = !runtime.fragments.is_empty(); + let has_fragments = !runtime.fragments().is_empty(); if has_fragments { regenerate_all_fragments( - &runtime.fragments, + runtime.fragments(), &doc_alloc, new_fields_ids_map, obkv, @@ -607,7 +611,7 @@ pub fn extract_vector_points( manual_vectors_writer, &mut key_buffer, delta, - &runtime.fragments, + runtime.fragments(), )?; } @@ -720,7 +724,7 @@ fn extract_vector_document_diff( ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id); return Ok(VectorStateDelta::NoChange); } - let has_fragments = !runtime.fragments.is_empty(); + let has_fragments = !runtime.fragments().is_empty(); if has_fragments { let prompt = &runtime.document_template; // Don't give up if the old prompt was failing @@ -753,7 +757,7 @@ fn extract_vector_document_diff( new_fields_ids_map, ); - for new in &runtime.fragments { + for new in runtime.fragments() { let name = &new.name; let fragment = RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); @@ -791,11 +795,11 @@ fn extract_vector_document_diff( return Ok(VectorStateDelta::NoChange); } - let has_fragments = !runtime.fragments.is_empty(); + let has_fragments = !runtime.fragments().is_empty(); if has_fragments { regenerate_all_fragments( - &runtime.fragments, + runtime.fragments(), doc_alloc, new_fields_ids_map, obkv, diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs index cbf4ceba2..b41fd59e1 100644 --- a/crates/milli/src/update/index_documents/extract/mod.rs +++ b/crates/milli/src/update/index_documents/extract/mod.rs @@ -242,7 +242,7 @@ fn send_original_documents_data( let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only()) // no point in indexing vectors without embedders - && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty()); + && (!settings_diff.new.runtime_embedders.inner_as_ref().is_empty()); if index_vectors { let settings_diff = settings_diff.clone(); diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 055b8bbad..658ff1923 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -517,7 +517,7 @@ where let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name); let was_quantized = settings_diff .old - .embedding_configs + .runtime_embedders .get(&embedder_name) .is_some_and(|conf| conf.is_quantized); let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized); diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index 370579a6c..c93e3e0f7 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -673,7 +673,7 @@ pub(crate) fn write_typed_chunk_into_index( let binary_quantized = settings_diff .old - .embedding_configs + .runtime_embedders .get(&embedder_name) .is_some_and(|conf| conf.is_quantized); // FIXME: allow customizing distance diff --git a/crates/milli/src/update/settings.rs 
b/crates/milli/src/update/settings.rs index 03d44d785..c9ab427ea 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1647,9 +1647,9 @@ impl InnerIndexSettingsDiff { // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { - for (embedder_name, runtime) in new_settings.embedding_configs.inner_as_ref() { + for (embedder_name, runtime) in new_settings.runtime_embedders.inner_as_ref() { let was_quantized = old_settings - .embedding_configs + .runtime_embedders .get(embedder_name) .is_some_and(|conf| conf.is_quantized); // skip embedders that don't use document templates @@ -1893,7 +1893,7 @@ pub(crate) struct InnerIndexSettings { pub exact_attributes: HashSet, pub disabled_typos_terms: DisabledTyposTerms, pub proximity_precision: ProximityPrecision, - pub embedding_configs: RuntimeEmbedders, + pub runtime_embedders: RuntimeEmbedders, pub embedder_category_id: HashMap, pub geo_fields_ids: Option<(FieldId, FieldId)>, pub prefix_search: PrefixSearch, @@ -1904,7 +1904,7 @@ impl InnerIndexSettings { pub fn from_index( index: &Index, rtxn: &heed::RoTxn<'_>, - embedding_configs: Option, + runtime_embedders: Option, ) -> Result { let stop_words = index.stop_words(rtxn)?; let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); @@ -1913,13 +1913,13 @@ impl InnerIndexSettings { let mut fields_ids_map = index.fields_ids_map(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); - let embedding_configs = match embedding_configs { + let runtime_embedders = match runtime_embedders { Some(embedding_configs) => embedding_configs, None => embedders(index.embedding_configs().embedding_configs(rtxn)?)?, }; let embedder_category_id = index - .embedder_category_id - .iter(rtxn)? + .embedding_configs() + .iter_embedder_id(rtxn)? 
.map(|r| r.map(|(k, v)| (k.to_string(), v))) .collect::<heed::Result<_>>()?; let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default(); @@ -1960,7 +1960,7 @@ impl InnerIndexSettings { sortable_fields, exact_attributes, proximity_precision, - embedding_configs, + runtime_embedders, embedder_category_id, geo_fields_ids, prefix_search, @@ -2035,12 +2035,12 @@ fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<RuntimeEmbedders> - fn new_embedders(&self) -> &EmbeddingConfigs; - fn old_embedders(&self) -> &EmbeddingConfigs; + fn new_embedders(&self) -> &RuntimeEmbedders; + fn old_embedders(&self) -> &RuntimeEmbedders; fn new_embedder_category_id(&self) -> &HashMap<String, u8>; fn embedder_actions(&self) -> &BTreeMap<String, EmbedderAction>; fn try_for_each_fragment_diff( @@ -2407,12 +2407,12 @@ pub struct FragmentDiff<'a> { } impl SettingsDelta for InnerIndexSettingsDiff { - fn new_embedders(&self) -> &EmbeddingConfigs { - &self.new.embedding_configs + fn new_embedders(&self) -> &RuntimeEmbedders { + &self.new.runtime_embedders } - fn old_embedders(&self) -> &EmbeddingConfigs { - &self.old.embedding_configs + fn old_embedders(&self) -> &RuntimeEmbedders { + &self.old.runtime_embedders } fn new_embedder_category_id(&self) -> &HashMap<String, u8> { diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 87ecd2414..f64223e41 100644 @@ -742,10 +742,27 @@ pub struct RuntimeEmbedders(HashMap<String, Arc<RuntimeEmbedder>>); pub struct RuntimeEmbedder { pub embedder: Arc<Embedder>, pub document_template: Prompt, - pub fragments: Vec<RuntimeFragment>, + fragments: Vec<RuntimeFragment>, pub is_quantized: bool, } +impl RuntimeEmbedder { + pub fn new( + embedder: Arc<Embedder>, + document_template: Prompt, + mut fragments: Vec<RuntimeFragment>, + is_quantized: bool, + ) -> Self { + fragments.sort_unstable_by(|left, right| left.name.cmp(&right.name)); + Self { embedder, document_template, fragments, is_quantized } + } + + /// The runtime fragments sorted by name. + pub fn fragments(&self) -> &[RuntimeFragment] { + self.fragments.as_slice() + } +} + pub struct RuntimeFragment { pub name: String, pub id: u8, @@ -763,8 +780,8 @@ impl RuntimeEmbedders { } /// Get an embedder configuration and template from its name.
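+ /// Returns a borrowed `Arc`, so callers only clone the embedder when they actually keep it.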
- pub fn get(&self, name: &str) -> Option<Arc<RuntimeEmbedder>> { - self.0.get(name).cloned() + pub fn get(&self, name: &str) -> Option<&Arc<RuntimeEmbedder>> { + self.0.get(name) } pub fn inner_as_ref(&self) -> &HashMap<String, Arc<RuntimeEmbedder>> { @@ -774,6 +791,14 @@ impl RuntimeEmbedders { pub fn into_inner(self) -> HashMap<String, Arc<RuntimeEmbedder>> { self.0 } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } } impl IntoIterator for RuntimeEmbedders { From 119d618a7630963be1ce4dcac9a32da8d32b5ffc Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 00:02:14 +0200 Subject: [PATCH 071/101] Do not "upgrade" regenerate fragments to regenerate prompt --- crates/milli/src/update/settings.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index c9ab427ea..242e083f1 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -1679,9 +1679,6 @@ impl InnerIndexSettingsDiff { // fixup reindex to make sure we regenerate all fragments *reindex = match reindex.take() { - Some(ReindexAction::RegenerateFragments(_)) => { - Some(ReindexAction::RegeneratePrompts) - } Some(reindex) => Some(reindex), // We are at least regenerating prompts None => { if write_back.is_none() { From eda309d562701b9d91e3002ac7f6585dc46c2b7d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 00:02:48 +0200 Subject: [PATCH 072/101] make sure fragments are ordered --- crates/milli/src/vector/settings.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 93de37290..4bb4ed92c 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -1150,6 +1150,7 @@ impl SettingsDiff { (left, Setting::NotSet) => left, }; if !regenerate_fragments.is_empty() { + regenerate_fragments.sort_unstable_by(|(left, _), (right, _)| left.cmp(right)); ReindexAction::push_action( reindex_action, ReindexAction::RegenerateFragments(regenerate_fragments), From be640062114d12cfa6c073941f45530dcab988a7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 09:12:18 +0200 Subject: [PATCH 073/101] Fix process export --- .../index-scheduler/src/scheduler/process_export.rs | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 30721065e..2062e1c28 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -150,9 +150,6 @@ impl IndexScheduler { let fields_ids_map = index.fields_ids_map(&index_rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index - .embedding_configs(&index_rtxn) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; // We don't need to keep this one alive as we will // spawn many threads to process the documents @@ -232,17 +229,12 @@ impl IndexScheduler { )); }; - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(docid)); - + for (embedder_name, (embeddings, regenerate)) in embeddings { let embeddings = ExplicitVectors { embeddings: Some( VectorOrArrayOfVectors::from_array_of_vectors(embeddings), ), - regenerate: !user_provided, + regenerate, }; vectors.insert( embedder_name, From
d72e5f5f697a8a0c0dc176284f02e4bb9cb5c767 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 11:29:50 +0200 Subject: [PATCH 074/101] Hide `documentTemplate` and `documentTemplateMaxBytes` when indexing_fragment is defined --- crates/milli/src/vector/settings.rs | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 4bb4ed92c..9ea8d7703 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -1932,8 +1932,18 @@ impl EmbeddingSettings { pooling: Setting::NotSet, api_key: Setting::some_or_not_set(api_key), dimensions: Setting::some_or_not_set(dimensions), - document_template, - document_template_max_bytes, + document_template: if indexing_fragments.is_empty() && search_fragments.is_empty() { + document_template + } else { + Setting::NotSet + }, + document_template_max_bytes: if indexing_fragments.is_empty() + && search_fragments.is_empty() + { + document_template_max_bytes + } else { + Setting::NotSet + }, url: Setting::Set(url), indexing_fragments: if indexing_fragments.is_empty() { Setting::NotSet From 3f5b5df139070e42da0912c9910295985ec17e49 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 11:35:01 +0200 Subject: [PATCH 075/101] Check consistency of fragments --- crates/meilisearch-types/src/settings.rs | 7 ++- crates/milli/src/update/settings.rs | 65 +++++++++++++++++------- crates/milli/src/vector/settings.rs | 6 +++ 3 files changed, 58 insertions(+), 20 deletions(-) diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index d7b163448..9e107a5c3 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -501,8 +501,11 @@ impl Settings { let Setting::Set(mut configs) = self.embedders else { return Ok(self) }; for (name, config) in configs.iter_mut() { let config_to_check = std::mem::take(config); - let checked_config = - milli::update::validate_embedding_settings(config_to_check.inner, name)?; + let checked_config = milli::update::validate_embedding_settings( + config_to_check.inner, + name, + milli::vector::settings::EmbeddingValidationContext::SettingsPartialUpdate, + )?; *config = SettingEmbeddingSettings { inner: checked_config }; } self.embedders = Setting::Set(configs); diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 242e083f1..c2152022b 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -35,8 +35,8 @@ use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::vector::db::{FragmentConfigs, IndexEmbeddingConfig}; use crate::vector::json_template::JsonTemplate; use crate::vector::settings::{ - EmbedderAction, EmbedderSource, EmbeddingSettings, NestingContext, ReindexAction, - SubEmbeddingSettings, WriteBackToDocuments, + EmbedderAction, EmbedderSource, EmbeddingSettings, EmbeddingValidationContext, NestingContext, + ReindexAction, SubEmbeddingSettings, WriteBackToDocuments, }; use crate::vector::{ Embedder, EmbeddingConfig, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment, @@ -1181,13 +1181,20 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { }; embedder_actions.insert(name.clone(), embedder_action); - let new = validate_embedding_settings(updated_settings, &name)?; + let new = validate_embedding_settings( + updated_settings, + &name, + EmbeddingValidationContext::FullSettings, + )?; updated_configs.insert(name, (new, fragments)); } 
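// `UpdateWithoutReindex`: the settings change can be applied without re-extracting embeddings, so we only revalidate the merged settings and, when `quantize` is set, flag the embedder for binary quantization.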
SettingsDiff::UpdateWithoutReindex { updated_settings, quantize } => { tracing::debug!(embedder = name, "update without reindex embedder"); - let new = - validate_embedding_settings(Setting::Set(updated_settings), &name)?; + let new = validate_embedding_settings( + Setting::Set(updated_settings), + &name, + EmbeddingValidationContext::FullSettings, + )?; if quantize { embedder_actions.insert( name.clone(), @@ -1211,7 +1218,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { crate::vector::settings::EmbeddingSettings::apply_default_openai_model( &mut setting, ); - let setting = validate_embedding_settings(setting, &name)?; + let setting = validate_embedding_settings( + setting, + &name, + EmbeddingValidationContext::FullSettings, + )?; embedder_actions.insert( name.clone(), EmbedderAction::with_reindex(ReindexAction::FullReindex, false), @@ -2079,6 +2090,7 @@ fn validate_prompt( pub fn validate_embedding_settings( settings: Setting, name: &str, + context: EmbeddingValidationContext, ) -> Result> { let Setting::Set(settings) = settings else { return Ok(settings) }; let EmbeddingSettings { @@ -2119,10 +2131,10 @@ pub fn validate_embedding_settings( })?; } - if let Some(request) = request.as_ref().set() { - let request = crate::vector::rest::RequestData::new( - request.to_owned(), - indexing_fragments + // if we are working with partial settings, the user could have changed only the `request` and not given again the fragments + if context == EmbeddingValidationContext::FullSettings { + if let Some(request) = request.as_ref().set() { + let indexing_fragments: BTreeMap<_, _> = indexing_fragments .as_ref() .set() .iter() @@ -2130,8 +2142,8 @@ pub fn validate_embedding_settings( .filter_map(|(name, fragment)| { Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) }) - .collect(), - search_fragments + .collect(); + let search_fragments: BTreeMap<_, _> = search_fragments .as_ref() .set() .iter() @@ -2139,12 +2151,29 @@ pub fn validate_embedding_settings( .filter_map(|(name, fragment)| { Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) }) - .collect(), - ) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; - if let Some(response) = response.as_ref().set() { - crate::vector::rest::Response::new(response.to_owned(), &request) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + .collect(); + + let are_fragments_inconsistent = + indexing_fragments.is_empty() ^ search_fragments.is_empty(); + if are_fragments_inconsistent { + return Err(crate::vector::error::NewEmbedderError::rest_inconsistent_fragments( + indexing_fragments.is_empty(), + indexing_fragments, + search_fragments, + )) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into()); + } + + let request = crate::vector::rest::RequestData::new( + request.to_owned(), + indexing_fragments, + search_fragments, + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + if let Some(response) = response.as_ref().set() { + crate::vector::rest::Response::new(response.to_owned(), &request) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + } } } diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 9ea8d7703..b769ce277 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -615,6 +615,12 @@ pub struct SubEmbeddingSettings { pub indexing_embedder: Setting, } +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum 
EmbeddingValidationContext { + FullSettings, + SettingsPartialUpdate, +} + /// Indicates what action should take place during a reindexing operation for an embedder #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum ReindexAction { From ede456c5b0c6021ca9da607b3c7d3cf261a91aac Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 11:35:19 +0200 Subject: [PATCH 076/101] New error: rest inconsistent fragments --- crates/milli/src/vector/error.rs | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index 00d4221e5..b56a5dce9 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; use bumpalo::Bump; use hf_hub::api::sync::ApiError; +use itertools::Itertools as _; use super::parsed_vectors::ParsedVectorsDiff; use super::rest::ConfigurationSource; @@ -453,6 +454,29 @@ impl NewEmbedderError { fault: FaultSource::User, } } + + pub(crate) fn rest_inconsistent_fragments( + indexing_fragments_is_empty: bool, + indexing_fragments: BTreeMap<String, serde_json::Value>, + search_fragments: BTreeMap<String, serde_json::Value>, + ) -> NewEmbedderError { + let message = if indexing_fragments_is_empty { + format!("`indexingFragments` is empty, but `searchFragments` declares {} fragments: {}{}\n - Hint: declare at least one fragment in `indexingFragments` or remove fragments from `searchFragments` by setting them to `null`", + search_fragments.len(), + search_fragments.keys().take(3).join(", "), if search_fragments.len() > 3 { ", ..." } else { "" } + ) + } else { + format!("`searchFragments` is empty, but `indexingFragments` declares {} fragments: {}{}\n - Hint: declare at least one fragment in `searchFragments` or remove fragments from `indexingFragments` by setting them to `null`", + indexing_fragments.len(), + indexing_fragments.keys().take(3).join(", "), if indexing_fragments.len() > 3 { ", ..." 
} else { "" } + ) + }; + + Self { + kind: NewEmbedderErrorKind::RestInconsistentFragments { message }, + fault: FaultSource::User, + } + } } #[derive(Debug, Clone, Copy)] @@ -572,6 +596,8 @@ pub enum NewEmbedderErrorKind { CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace }, #[error("cannot infer `dimensions` for an embedder using `indexingFragments`.\n - Note: Specify `dimensions` explicitly or don't use `indexingFragments`.")] RestCannotInferDimensionsForFragment, + #[error("inconsistent fragments: {message}")] + RestInconsistentFragments { message: String }, } pub struct PossibleEmbeddingMistakes { From f6287602e9bbbf69f1296d77db69572cdd1d5990 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 11:35:44 +0200 Subject: [PATCH 077/101] Improve error message when request contains the wrong type of placeholder --- crates/milli/src/vector/rest.rs | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index 9477959ad..41e8ca9f9 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -561,6 +561,7 @@ impl Request { Err(error) => { let message = error.error_message("request", REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER); + let message = format!("{message}\n - Note: this template is using a document template, and so expects to contain the placeholder {REQUEST_PLACEHOLDER:?} rather than {REQUEST_FRAGMENT_PLACEHOLDER:?}"); return Err(NewEmbedderError::rest_could_not_parse_template(message)); } }; @@ -592,15 +593,23 @@ impl RequestFromFragments { request: Value, search_fragments: impl IntoIterator, ) -> Result { - let request = - match InjectableValue::new(request, REQUEST_FRAGMENT_PLACEHOLDER, REPEAT_PLACEHOLDER) { - Ok(template) => template, - Err(error) => { - let message = - error.error_message("request", REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER); - return Err(NewEmbedderError::rest_could_not_parse_template(message)); - } - }; + let request = match InjectableValue::new( + request, + REQUEST_FRAGMENT_PLACEHOLDER, + REPEAT_PLACEHOLDER, + ) { + Ok(template) => template, + Err(error) => { + let message = error.error_message( + "request", + REQUEST_FRAGMENT_PLACEHOLDER, + REPEAT_PLACEHOLDER, + ); + let message = format!("{message}\n - Note: this template is using fragments, and so expects to contain the placeholder {REQUEST_FRAGMENT_PLACEHOLDER:?} rathern than {REQUEST_PLACEHOLDER:?}"); + + return Err(NewEmbedderError::rest_could_not_parse_template(message)); + } + }; let search_fragments: Result<_, NewEmbedderError> = search_fragments .into_iter() From 82a796aea7e0402f605d46b9aabfffd984bdf2b0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 11:36:14 +0200 Subject: [PATCH 078/101] vector settings: fix bug where removed fragments were returned as new --- crates/milli/src/vector/settings.rs | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index b769ce277..1b85dd503 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -2420,8 +2420,17 @@ pub(crate) fn fragments_from_settings( setting: &Setting, ) -> impl Iterator + '_ { let Some(setting) = setting.as_ref().set() else { return Either::Left(None.into_iter()) }; + + let filter_map = |(name, fragment): (&String, &Option)| { + if fragment.is_some() { + Some(name.clone()) + } else { + None + } + }; + if let 
Some(setting) = setting.indexing_fragments.as_ref().set() { - Either::Right(setting.keys().cloned()) + Either::Right(setting.iter().filter_map(filter_map)) } else { let Some(setting) = setting.indexing_embedder.as_ref().set() else { return Either::Left(None.into_iter()); @@ -2429,6 +2438,6 @@ pub(crate) fn fragments_from_settings( let Some(setting) = setting.indexing_fragments.as_ref().set() else { return Either::Left(None.into_iter()); }; - Either::Right(setting.keys().cloned()) + Either::Right(setting.iter().filter_map(filter_map)) } } From 91e77abf4fabfde895a8746fea605c8e87d6653d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 2 Jul 2025 12:15:11 +0200 Subject: [PATCH 079/101] Bump the mini-dashboard to v0.2.20 --- crates/meilisearch/Cargo.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/Cargo.toml b/crates/meilisearch/Cargo.toml index fe00d9fee..83eb439d9 100644 --- a/crates/meilisearch/Cargo.toml +++ b/crates/meilisearch/Cargo.toml @@ -169,5 +169,5 @@ german = ["meilisearch-types/german"] turkish = ["meilisearch-types/turkish"] [package.metadata.mini-dashboard] -assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.19/build.zip" -sha1 = "7974430d5277c97f67cf6e95eec6faaac2788834" +assets-url = "https://github.com/meilisearch/mini-dashboard/releases/download/v0.2.20/build.zip" +sha1 = "82a7ddd7bf14bb5323c3d235d2b62892a98b6a59" From 895db76a517e8b5a4e1d5c2e4457ddd9023453f3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 16:10:05 +0200 Subject: [PATCH 080/101] Fix snaps --- crates/meilisearch/tests/vector/rest.rs | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 87296c36a..e03563bcc 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -1,9 +1,9 @@ use std::collections::BTreeMap; use std::sync::atomic::AtomicUsize; +use std::time::Duration; use meili_snap::{json_string, snapshot}; use reqwest::IntoUrl; -use std::time::Duration; use tokio::sync::mpsc; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, Request, ResponseTemplate}; @@ -408,13 +408,13 @@ async fn bad_request() { .await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" - { - "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found", - "code": "vector_embedding_error", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#vector_embedding_error" - } - "###); + { + "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); // A repeat string appears inside a repeated value let (response, code) = index @@ -437,7 +437,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input.input`: \"{{..}}\" appears nested inside of a value that is itself repeated", + "message": "Error while generating embeddings: user error: in `request.input.input`: \"{{..}}\" appears nested inside of a value that is itself repeated\n - Note: this template 
is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -460,7 +460,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input.repeat`: \"{{..}}\" appears outside of an array", + "message": "Error while generating embeddings: user error: in `request.input.repeat`: \"{{..}}\" appears outside of an array\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -483,7 +483,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #0", + "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #0\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -506,7 +506,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #2", + "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #2\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -529,7 +529,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input[0]`: Expected \"{{text}}\" inside of the repeated value", + "message": "Error while generating embeddings: user error: in `request.input[0]`: Expected \"{{text}}\" inside of the repeated value\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -556,7 +556,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{..}}\", but it was already present in `request.input`", + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{..}}\", but it was already present in `request.input`\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -577,7 
+577,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input`", + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input`\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -598,7 +598,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.repeated.data[1]`: Found \"{{text}}\", but it was already present in `request.repeated.input`", + "message": "Error while generating embeddings: user error: in `request.repeated.data[1]`: Found \"{{text}}\", but it was already present in `request.repeated.input`\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -619,7 +619,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input[0]` (repeated)", + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input[0]` (repeated)\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -920,7 +920,7 @@ async fn bad_settings() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found", + "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" From aa6855cd4ff796f002a18885a00080ac24af31cf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 16:12:23 +0200 Subject: [PATCH 081/101] Vector settings: don't assume which kind of request is asked when looking at a settings update without fragments --- crates/milli/src/update/settings.rs | 122 +++++++++++++++++++++------- 1 file changed, 91 insertions(+), 31 deletions(-) diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index c2152022b..911f51865 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -2131,28 +2131,41 @@ pub fn validate_embedding_settings( })?; } - // if we are working with partial settings, the user could have changed only the `request` and not given again the fragments - if context == EmbeddingValidationContext::FullSettings { - if let Some(request) = 
request.as_ref().set() { - let indexing_fragments: BTreeMap<_, _> = indexing_fragments - .as_ref() - .set() - .iter() - .flat_map(|map| map.iter()) - .filter_map(|(name, fragment)| { - Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) - }) - .collect(); - let search_fragments: BTreeMap<_, _> = search_fragments - .as_ref() - .set() - .iter() - .flat_map(|map| map.iter()) - .filter_map(|(name, fragment)| { - Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) - }) - .collect(); + // used below + enum WithFragments { + Yes { + indexing_fragments: BTreeMap<String, serde_json::Value>, + search_fragments: BTreeMap<String, serde_json::Value>, + }, + No, + Maybe, + } + let with_fragments = { + let has_reset = matches!(indexing_fragments, Setting::Reset) + || matches!(search_fragments, Setting::Reset); + let indexing_fragments: BTreeMap<_, _> = indexing_fragments + .as_ref() + .set() + .iter() + .flat_map(|map| map.iter()) + .filter_map(|(name, fragment)| { + Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) + }) + .collect(); + let search_fragments: BTreeMap<_, _> = search_fragments + .as_ref() + .set() + .iter() + .flat_map(|map| map.iter()) + .filter_map(|(name, fragment)| { + Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) + }) + .collect(); + + let has_fragments = !indexing_fragments.is_empty() || !search_fragments.is_empty(); + + if context == EmbeddingValidationContext::FullSettings { let are_fragments_inconsistent = indexing_fragments.is_empty() ^ search_fragments.is_empty(); if are_fragments_inconsistent { return Err(crate::vector::error::NewEmbedderError::rest_inconsistent_fragments( indexing_fragments.is_empty(), indexing_fragments, search_fragments, )) .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into()); } - } + if has_fragments { + if context == EmbeddingValidationContext::SettingsPartialUpdate + && matches!(document_template, Setting::Set(_)) + { + return Err( + crate::vector::error::NewEmbedderError::rest_document_template_and_fragments( + indexing_fragments.len(), + search_fragments.len(), + ), + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into()); } + WithFragments::Yes { indexing_fragments, search_fragments } + } else if has_reset || context == EmbeddingValidationContext::FullSettings { + WithFragments::No + } else { + // if we are working with partial settings, the user could have changed only the `request` and not given again the fragments + WithFragments::Maybe + } + }; + if let Some(request) = request.as_ref().set() { + let request = match with_fragments { + WithFragments::Yes { indexing_fragments, search_fragments } => { + crate::vector::rest::RequestData::new( + request.to_owned(), + indexing_fragments, + search_fragments, + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) + } + WithFragments::No => crate::vector::rest::RequestData::new( + request.to_owned(), + Default::default(), + Default::default(), + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())), + WithFragments::Maybe => { + let mut indexing_fragments = BTreeMap::new(); + indexing_fragments.insert("test".to_string(), serde_json::json!("test")); + 
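+ // A partial update does not tell us which flavor of template the user intends, so parse the request twice: first as a fragment template (using the placeholder fragment inserted above), then, if that fails, as a classic document-template request.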
crate::vector::rest::RequestData::new( + request.to_owned(), + indexing_fragments, + Default::default(), + ) + .or_else(|_| { + crate::vector::rest::RequestData::new( + request.to_owned(), + Default::default(), + Default::default(), + ) + }) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) + } + }?; + if let Some(response) = response.as_ref().set() { + crate::vector::rest::Response::new(response.to_owned(), &request) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; } } From 7113fcf63a6c344a9211fbbe7a7a8c23ff780689 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 16:17:12 +0200 Subject: [PATCH 082/101] New error --- crates/milli/src/vector/error.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index b56a5dce9..0d737cbfc 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -477,6 +477,19 @@ impl NewEmbedderError { fault: FaultSource::User, } } + + pub(crate) fn rest_document_template_and_fragments( + indexing_fragments_len: usize, + search_fragments_len: usize, + ) -> Self { + Self { + kind: NewEmbedderErrorKind::RestDocumentTemplateAndFragments { + indexing_fragments_len, + search_fragments_len, + }, + fault: FaultSource::User, + } + } } #[derive(Debug, Clone, Copy)] @@ -598,6 +611,8 @@ pub enum NewEmbedderErrorKind { RestCannotInferDimensionsForFragment, #[error("inconsistent fragments: {message}")] RestInconsistentFragments { message: String }, + #[error("cannot pass both fragments and a document template.\n - Note: {indexing_fragments_len} fragments declared in `indexingFragments` and {search_fragments_len} fragments declared in `searchFragments`.\n - Hint: remove the declared fragments or remove the `documentTemplate`")] + RestDocumentTemplateAndFragments { indexing_fragments_len: usize, search_fragments_len: usize }, } pub struct PossibleEmbeddingMistakes { From 428463e45c804a606b4576b500100407bbc5d02e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 2 Jul 2025 16:17:22 +0200 Subject: [PATCH 083/101] Check indexing fragments as well as search fragments --- crates/milli/src/vector/rest.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index 41e8ca9f9..7a16f1a1e 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -110,6 +110,13 @@ impl RequestData { Ok(if indexing_fragments.is_empty() && search_fragments.is_empty() { RequestData::Single(Request::new(request)?) } else { + for (name, value) in indexing_fragments { + JsonTemplate::new(value).map_err(|error| { + NewEmbedderError::rest_could_not_parse_template( + error.parsing(&format!(".indexingFragments.{name}")), + ) + })?; + } RequestData::FromFragments(RequestFromFragments::new(request, search_fragments)?) 
}) } @@ -614,14 +621,12 @@ impl RequestFromFragments { let search_fragments: Result<_, NewEmbedderError> = search_fragments .into_iter() .map(|(name, value)| { - Ok(( - name, - JsonTemplate::new(value).map_err(|error| { - NewEmbedderError::rest_could_not_parse_template( - error.parsing("searchFragments"), - ) - })?, - )) + let json_template = JsonTemplate::new(value).map_err(|error| { + NewEmbedderError::rest_could_not_parse_template( + error.parsing(&format!(".searchFragments.{name}")), + ) + })?; + Ok((name, json_template)) }) .collect(); From 549dc985b8ae6e09306172aa350d5ec11c55cae5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 09:58:41 +0200 Subject: [PATCH 084/101] Old dump import indexer: fix the case where going from Generated to Generated --- .../extract/extract_vector_points.rs | 62 +++++++++---------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index d40e82b92..54fcca75f 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -726,6 +726,35 @@ fn extract_vector_document_diff( } let has_fragments = !runtime.fragments().is_empty(); if has_fragments { + let mut fragment_diff = Vec::new(); + let old_fields_ids_map = old_fields_ids_map.as_fields_ids_map(); + let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); + + let old_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Deletion, + old_fields_ids_map, + ); + + let new_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Addition, + new_fields_ids_map, + ); + + for new in runtime.fragments() { + let name = &new.name; + let fragment = + RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); + + let diff = fragment + .diff_documents(&old_document, &new_document, &()) + .expect("ignoring errors so this cannot fail"); + + fragment_diff.push((name.clone(), diff)); + } + VectorStateDelta::UpdateGeneratedFromFragments(fragment_diff) + } else { let prompt = &runtime.document_template; // Don't give up if the old prompt was failing let old_prompt = Some(&prompt).map(|p| { @@ -741,38 +770,9 @@ fn extract_vector_document_diff( ); VectorStateDelta::NowGenerated(new_prompt) } else { - let mut fragment_diff = Vec::new(); - let old_fields_ids_map = old_fields_ids_map.as_fields_ids_map(); - let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); - - let old_document = crate::update::new::document::KvDelAddDocument::new( - obkv, - DelAdd::Deletion, - old_fields_ids_map, - ); - - let new_document = crate::update::new::document::KvDelAddDocument::new( - obkv, - DelAdd::Addition, - new_fields_ids_map, - ); - - for new in runtime.fragments() { - let name = &new.name; - let fragment = - RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); - - let diff = fragment - .diff_documents(&old_document, &new_document, &()) - .expect("ignoring errors so this cannot fail"); - - fragment_diff.push((name.clone(), diff)); - } - VectorStateDelta::UpdateGeneratedFromFragments(fragment_diff) + tracing::trace!("⏭️ Prompt unmodified, skipping"); + VectorStateDelta::NoChange } - } else { - tracing::trace!("⏭️ Prompt unmodified, skipping"); - VectorStateDelta::NoChange } } else { VectorStateDelta::NowRemoved From a06cb1bfd6a21b283f5aeb7cee7ae0c605580b0c Mon Sep 17 00:00:00 2001 From: 
Louis Dureuil Date: Thu, 3 Jul 2025 10:02:16 +0200 Subject: [PATCH 085/101] Remove `Embed::process_embeddings` and have it be an inherent function of the type that uses it --- .../index_documents/extract/extract_vector_points.rs | 8 -------- crates/milli/src/update/new/extract/vectors/mod.rs | 9 ++++----- crates/milli/src/vector/session.rs | 2 -- 3 files changed, 4 insertions(+), 15 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 54fcca75f..677ff93c9 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -1300,12 +1300,4 @@ impl<'doc> OnEmbed<'doc> for WriteGrenadOnEmbed<'_> { crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)) } } - - fn process_embeddings( - &mut self, - _metadata: crate::vector::session::Metadata<'doc>, - _embeddings: Vec, - ) { - unimplemented!("unused") - } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index c08fadb14..f8e0e7cb5 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -452,6 +452,10 @@ impl OnEmbeddingDocumentUpdates<'_, '_> { fn clear_vectors(&self, docid: DocumentId) { self.sender.set_vectors(docid, self.embedder_id, vec![]).unwrap(); } + + fn process_embeddings(&mut self, metadata: Metadata<'_>, embeddings: Vec) { + self.sender.set_vectors(metadata.docid, self.embedder_id, embeddings).unwrap(); + } } impl<'doc> OnEmbed<'doc> for OnEmbeddingDocumentUpdates<'doc, '_> { @@ -469,11 +473,6 @@ impl<'doc> OnEmbed<'doc> for OnEmbeddingDocumentUpdates<'doc, '_> { ) .unwrap(); } - - fn process_embeddings(&mut self, metadata: Metadata<'doc>, embeddings: Vec) { - self.sender.set_vectors(metadata.docid, self.embedder_id, embeddings).unwrap(); - } - fn process_embedding_error( &mut self, error: crate::vector::hf::EmbedError, diff --git a/crates/milli/src/vector/session.rs b/crates/milli/src/vector/session.rs index dd005e993..5f6d68879 100644 --- a/crates/milli/src/vector/session.rs +++ b/crates/milli/src/vector/session.rs @@ -30,8 +30,6 @@ pub trait OnEmbed<'doc> { unused_vectors_distribution: &Self::ErrorMetadata, metadata: &[Metadata<'doc>], ) -> crate::Error; - - fn process_embeddings(&mut self, metadata: Metadata<'doc>, embeddings: Vec); } pub struct EmbedSession<'doc, C, I> { From bbcabc47bda50573a0289b1ff48b3d18e794d8fb Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 3 Jul 2025 08:06:38 +0000 Subject: [PATCH 086/101] Update version for the next release (v1.16.0) in Cargo.toml --- Cargo.lock | 34 +++++++++++++++++----------------- Cargo.toml | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index be6aa4b21..ceec0a05e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -580,7 +580,7 @@ source = "git+https://github.com/meilisearch/bbqueue#cbb87cc707b5af415ef203bdaf2 [[package]] name = "benchmarks" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "bumpalo", @@ -770,7 +770,7 @@ dependencies = [ [[package]] name = "build-info" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "time", @@ -1774,7 +1774,7 @@ dependencies = [ [[package]] name = "dump" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "big_s", @@ -2006,7 +2006,7 @@ checksum = 
"37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "file-store" -version = "1.15.2" +version = "1.16.0" dependencies = [ "tempfile", "thiserror 2.0.12", @@ -2028,7 +2028,7 @@ dependencies = [ [[package]] name = "filter-parser" -version = "1.15.2" +version = "1.16.0" dependencies = [ "insta", "nom", @@ -2049,7 +2049,7 @@ dependencies = [ [[package]] name = "flatten-serde-json" -version = "1.15.2" +version = "1.16.0" dependencies = [ "criterion", "serde_json", @@ -2194,7 +2194,7 @@ dependencies = [ [[package]] name = "fuzzers" -version = "1.15.2" +version = "1.16.0" dependencies = [ "arbitrary", "bumpalo", @@ -2994,7 +2994,7 @@ dependencies = [ [[package]] name = "index-scheduler" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "backoff", @@ -3230,7 +3230,7 @@ dependencies = [ [[package]] name = "json-depth-checker" -version = "1.15.2" +version = "1.16.0" dependencies = [ "criterion", "serde_json", @@ -3724,7 +3724,7 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "meili-snap" -version = "1.15.2" +version = "1.16.0" dependencies = [ "insta", "md5", @@ -3735,7 +3735,7 @@ dependencies = [ [[package]] name = "meilisearch" -version = "1.15.2" +version = "1.16.0" dependencies = [ "actix-cors", "actix-http", @@ -3830,7 +3830,7 @@ dependencies = [ [[package]] name = "meilisearch-auth" -version = "1.15.2" +version = "1.16.0" dependencies = [ "base64 0.22.1", "enum-iterator", @@ -3849,7 +3849,7 @@ dependencies = [ [[package]] name = "meilisearch-types" -version = "1.15.2" +version = "1.16.0" dependencies = [ "actix-web", "anyhow", @@ -3884,7 +3884,7 @@ dependencies = [ [[package]] name = "meilitool" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "clap", @@ -3918,7 +3918,7 @@ dependencies = [ [[package]] name = "milli" -version = "1.15.2" +version = "1.16.0" dependencies = [ "allocator-api2 0.3.0", "arroy", @@ -4470,7 +4470,7 @@ checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" [[package]] name = "permissive-json-pointer" -version = "1.15.2" +version = "1.16.0" dependencies = [ "big_s", "serde_json", @@ -7258,7 +7258,7 @@ dependencies = [ [[package]] name = "xtask" -version = "1.15.2" +version = "1.16.0" dependencies = [ "anyhow", "build-info", diff --git a/Cargo.toml b/Cargo.toml index 835ef497c..3e57563b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,7 +22,7 @@ members = [ ] [workspace.package] -version = "1.15.2" +version = "1.16.0" authors = [ "Quentin de Quelen ", "Clément Renault ", From 3740755d9c05b6beee8b5c8537b1ba39112c18a8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 10:11:07 +0200 Subject: [PATCH 087/101] Compare to `RawValue::NULL` constant rather than explicit "null" --- crates/milli/src/vector/parsed_vectors.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index 8ff5a2201..b96922bc4 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -151,7 +151,7 @@ impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor { } Ok(Some("embeddings")) => { let value: &RawValue = match map.next_value::<&RawValue>() { - Ok(value) if value.get() == "null" => continue, + Ok(value) if value.get() == RawValue::NULL.get() => continue, Ok(value) => value, Err(error) => { return Ok(Err(RawVectorsError::DeserializeEmbeddings { From 735634e998943adc64a9289272edb3073a3d1e69 Mon Sep 17 
00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 10:32:57 +0200 Subject: [PATCH 088/101] Send owned metadata and clear inputs in case of error --- .../extract/extract_vector_points.rs | 2 +- crates/milli/src/update/new/extract/vectors/mod.rs | 2 +- crates/milli/src/vector/session.rs | 13 ++++++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 677ff93c9..9604c4823 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -1259,7 +1259,7 @@ impl<'doc> OnEmbed<'doc> for WriteGrenadOnEmbed<'_> { error: crate::vector::error::EmbedError, embedder_name: &'doc str, unused_vectors_distribution: &crate::vector::error::UnusedVectorsDistribution, - _metadata: &[crate::vector::session::Metadata<'doc>], + _metadata: bumpalo::collections::Vec<'doc, crate::vector::session::Metadata<'doc>>, ) -> crate::Error { if let FaultSource::Bug = error.fault { crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(error.into())) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index f8e0e7cb5..72a07dea6 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -478,7 +478,7 @@ impl<'doc> OnEmbed<'doc> for OnEmbeddingDocumentUpdates<'doc, '_> { error: crate::vector::hf::EmbedError, embedder_name: &'doc str, unused_vectors_distribution: &UnusedVectorsDistributionBump, - metadata: &[Metadata<'doc>], + metadata: BVec<'doc, Metadata<'doc>>, ) -> crate::Error { if let FaultSource::Bug = error.fault { crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(error.into())) diff --git a/crates/milli/src/vector/session.rs b/crates/milli/src/vector/session.rs index 5f6d68879..b582bd840 100644 --- a/crates/milli/src/vector/session.rs +++ b/crates/milli/src/vector/session.rs @@ -28,7 +28,7 @@ pub trait OnEmbed<'doc> { error: EmbedError, embedder_name: &'doc str, unused_vectors_distribution: &Self::ErrorMetadata, - metadata: &[Metadata<'doc>], + metadata: BVec<'doc, Metadata<'doc>>, ) -> crate::Error; } @@ -143,12 +143,19 @@ impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> { Ok(()) } Err(error) => { + // reset metadata and inputs, and send metadata to the error processing. 
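+                // `process_embedding_error` takes the metadata by value, so move
+                // it out, leaving an empty vector with the same capacity in its
+                // place; the inputs are cleared before returning so stale requests
+                // cannot be resent by a later `request_embedding` call.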
+ let doc_alloc = self.metadata.bump(); + let metadata = std::mem::replace( + &mut self.metadata, + BVec::with_capacity_in(self.inputs.capacity(), doc_alloc), + ); + self.inputs.clear(); return Err(self.on_embed.process_embedding_error( error, self.embedder_name, unused_vectors_distribution, - &self.metadata, - )) + metadata, + )); } }; self.inputs.clear(); From 87f105747f857449e6fd0562c11eb1716db9bcb0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 10:41:20 +0200 Subject: [PATCH 089/101] Add documentation to `Extractor` trait --- crates/milli/src/vector/extractor.rs | 32 +++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/vector/extractor.rs b/crates/milli/src/vector/extractor.rs index cbfc62ee1..2ab541ac1 100644 --- a/crates/milli/src/vector/extractor.rs +++ b/crates/milli/src/vector/extractor.rs @@ -12,19 +12,41 @@ use crate::update::new::document::Document; use crate::vector::RuntimeFragment; use crate::GlobalFieldsIdsMap; +/// Trait for types that extract embedder inputs from a document. +/// +/// An embedder input can then be sent to an embedder by using an [`super::session::EmbedSession`]. pub trait Extractor<'doc> { - type DocumentMetadata; + /// The embedder input that is extracted from documents by this extractor. + /// + /// The inputs have to be comparable for equality so that diffing is possible. type Input: PartialEq; + + /// The error that can happen while extracting from a document. type Error; + /// Metadata associated with a document. + type DocumentMetadata; + + /// Extract the embedder input from a document and its metadata. fn extract<'a, D: Document<'a> + Debug>( &self, doc: D, meta: &Self::DocumentMetadata, ) -> Result, Self::Error>; + /// Unique `id` associated with this extractor. + /// + /// This will serve to decide where to store the vectors in the vector store. + /// The id should be stable for a given extractor. fn extractor_id(&self) -> u8; + /// The result of diffing the embedder inputs extracted from two versions of a document. + /// + /// # Parameters + /// + /// - `old`: old version of the document + /// - `new`: new version of the document + /// - `meta`: metadata associated to the document fn diff_documents<'a, OD: Document<'a> + Debug, ND: Document<'a> + Debug>( &self, old: OD, @@ -39,6 +61,13 @@ pub trait Extractor<'doc> { to_diff(old_input, new_input) } + /// The result of diffing the embedder inputs extracted from a document by two versions of this extractor. + /// + /// # Parameters + /// + /// - `doc`: the document from which to extract the embedder inputs + /// - `meta`: metadata associated to the document + /// - `old`: If `Some`, the old version of this extractor. If `None`, this is equivalent to calling `ExtractorDiff::Added(self.extract(_))`. fn diff_settings<'a, D: Document<'a> + Debug>( &self, doc: D, @@ -51,6 +80,7 @@ pub trait Extractor<'doc> { to_diff(old_input, new_input) } + /// Returns an extractor wrapping `self` and set to ignore all errors arising from extracting with this extractor. 
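+    ///
+    /// Indexing code paths use this when an extraction failure must not abort
+    /// the whole operation (see the `.expect("ignoring errors so this cannot
+    /// fail")` call sites).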
fn ignore_errors(self) -> IgnoreErrorExtractor where Self: Sized, From 0ca652de2811d16733018ffc0c9f203d16307eee Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 10:52:30 +0200 Subject: [PATCH 090/101] Extract vector points: remove the { --- .../update/index_documents/extract/extract_vector_points.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 9604c4823..064cfd154 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -882,8 +882,7 @@ fn regenerate_all_fragments<'a>( let name = &new.name; let new = RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); - let diff = - { new.extract(&obkv_document, &()) }.expect("ignoring errors so this cannot fail"); + let diff = new.extract(&obkv_document, &()).expect("ignoring errors so this cannot fail"); if let Some(value) = diff { fragment_diff.push((name.clone(), value)); } From dfe0c8664ee20300d562f6e059bcd33a4bb4c054 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 3 Jul 2025 11:08:31 +0200 Subject: [PATCH 091/101] Add a version of prompt::Context that has no fields --- crates/milli/src/prompt/context.rs | 34 ++++++++++++++------ crates/milli/src/vector/json_template/mod.rs | 3 +- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/crates/milli/src/prompt/context.rs b/crates/milli/src/prompt/context.rs index 84523333a..8958cb693 100644 --- a/crates/milli/src/prompt/context.rs +++ b/crates/milli/src/prompt/context.rs @@ -6,12 +6,18 @@ use liquid::{ObjectView, ValueView}; #[derive(Debug, Clone)] pub struct Context<'a, D: ObjectView, F: ArrayView> { document: &'a D, - fields: &'a F, + fields: Option<&'a F>, } impl<'a, D: ObjectView, F: ArrayView> Context<'a, D, F> { pub fn new(document: &'a D, fields: &'a F) -> Self { - Self { document, fields } + Self { document, fields: Some(fields) } + } +} + +impl<'a, D: ObjectView> Context<'a, D, Vec> { + pub fn without_fields(document: &'a D) -> Self { + Self { document, fields: None } } } @@ -21,17 +27,27 @@ impl ObjectView for Context<'_, D, F> { } fn size(&self) -> i64 { - 2 + if self.fields.is_some() { + 2 + } else { + 1 + } } fn keys<'k>(&'k self) -> Box> + 'k> { - Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s))) + let keys = if self.fields.is_some() { + either::Either::Left(["doc", "fields"]) + } else { + either::Either::Right(["doc"]) + }; + + Box::new(keys.into_iter().map(KStringCow::from_static)) } fn values<'k>(&'k self) -> Box + 'k> { Box::new( std::iter::once(self.document.as_value()) - .chain(std::iter::once(self.fields.as_value())), + .chain(self.fields.iter().map(|fields| fields.as_value())), ) } @@ -40,13 +56,13 @@ impl ObjectView for Context<'_, D, F> { } fn contains_key(&self, index: &str) -> bool { - index == "doc" || index == "fields" + index == "doc" || (index == "fields" && self.fields.is_some()) } fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> { - match index { - "doc" => Some(self.document.as_value()), - "fields" => Some(self.fields.as_value()), + match (index, &self.fields) { + ("doc", _) => Some(self.document.as_value()), + ("fields", Some(fields)) => Some(fields.as_value()), _ => None, } } diff --git a/crates/milli/src/vector/json_template/mod.rs b/crates/milli/src/vector/json_template/mod.rs index 57a3b67b1..d7ce3e8f1 
100644 --- a/crates/milli/src/vector/json_template/mod.rs +++ b/crates/milli/src/vector/json_template/mod.rs @@ -115,8 +115,7 @@ impl JsonTemplate { doc_alloc: &'doc Bump, ) -> Result { let document = ParseableDocument::new(document, doc_alloc); - let v: Vec = vec![]; - let context = crate::prompt::Context::new(&document, &v); + let context = crate::prompt::Context::without_fields(&document); self.render(&context) } From 6b94033c978a86a85135d9bb3cee18d214483d46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Jul 2025 11:30:24 +0200 Subject: [PATCH 092/101] Correctly export the chat completions settings in dumps --- crates/dump/src/writer.rs | 24 ++++++++++++++++++- crates/index-scheduler/src/processing.rs | 1 + .../src/scheduler/process_dump_creation.rs | 23 ++++++++++++------ 3 files changed, 40 insertions(+), 8 deletions(-) diff --git a/crates/dump/src/writer.rs b/crates/dump/src/writer.rs index 63b006b5c..9f828595a 100644 --- a/crates/dump/src/writer.rs +++ b/crates/dump/src/writer.rs @@ -5,7 +5,7 @@ use std::path::PathBuf; use flate2::write::GzEncoder; use flate2::Compression; use meilisearch_types::batches::Batch; -use meilisearch_types::features::{Network, RuntimeTogglableFeatures}; +use meilisearch_types::features::{ChatCompletionSettings, Network, RuntimeTogglableFeatures}; use meilisearch_types::keys::Key; use meilisearch_types::settings::{Checked, Settings}; use serde_json::{Map, Value}; @@ -51,6 +51,10 @@ impl DumpWriter { KeyWriter::new(self.dir.path().to_path_buf()) } + pub fn create_chat_completions_settings(&self) -> Result { + ChatCompletionsSettingsWriter::new(self.dir.path().join("chat-completions-settings")) + } + pub fn create_tasks_queue(&self) -> Result { TaskWriter::new(self.dir.path().join("tasks")) } @@ -104,6 +108,24 @@ impl KeyWriter { } } +pub struct ChatCompletionsSettingsWriter { + path: PathBuf, +} + +impl ChatCompletionsSettingsWriter { + pub(crate) fn new(path: PathBuf) -> Result { + std::fs::create_dir(&path)?; + Ok(ChatCompletionsSettingsWriter { path }) + } + + pub fn push_settings(&mut self, name: &str, settings: &ChatCompletionSettings) -> Result<()> { + let mut settings_file = File::create(self.path.join(name).with_extension("json"))?; + serde_json::to_writer(&mut settings_file, &settings)?; + settings_file.flush()?; + Ok(()) + } +} + pub struct TaskWriter { queue: BufWriter, update_files: PathBuf, diff --git a/crates/index-scheduler/src/processing.rs b/crates/index-scheduler/src/processing.rs index 2aa7cf859..fdd8e42ef 100644 --- a/crates/index-scheduler/src/processing.rs +++ b/crates/index-scheduler/src/processing.rs @@ -103,6 +103,7 @@ make_enum_progress! { pub enum DumpCreationProgress { StartTheDumpCreation, DumpTheApiKeys, + DumpTheChatCompletionSettings, DumpTheTasks, DumpTheBatches, DumpTheIndexes, diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index a6d785b2f..a6907d739 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -43,7 +43,16 @@ impl IndexScheduler { let rtxn = self.env.read_txn()?; - // 2. dump the tasks + // 2. dump the chat completion settings + // TODO should I skip the export if the chat completion has been disabled? 
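+        // Every (name, settings) pair ends up as
+        // `chat-completions-settings/<name>.json` in the dump, matching what
+        // `ChatCompletionsSettingsWriter::push_settings` writes.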
+ progress.update_progress(DumpCreationProgress::DumpTheChatCompletionSettings); + let mut dump_chat_completion_settings = dump.create_chat_completions_settings()?; + for result in self.chat_settings.iter(&rtxn)? { + let (name, chat_settings) = result?; + dump_chat_completion_settings.push_settings(name, &chat_settings)?; + } + + // 3. dump the tasks progress.update_progress(DumpCreationProgress::DumpTheTasks); let mut dump_tasks = dump.create_tasks_queue()?; @@ -81,7 +90,7 @@ impl IndexScheduler { let mut dump_content_file = dump_tasks.push_task(&t.into())?; - // 2.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. + // 3.1. Dump the `content_file` associated with the task if there is one and the task is not finished yet. if let Some(content_file) = content_file { if self.scheduler.must_stop_processing.get() { return Err(Error::AbortedTask); @@ -105,7 +114,7 @@ impl IndexScheduler { } dump_tasks.flush()?; - // 3. dump the batches + // 4. dump the batches progress.update_progress(DumpCreationProgress::DumpTheBatches); let mut dump_batches = dump.create_batches_queue()?; @@ -138,7 +147,7 @@ impl IndexScheduler { } dump_batches.flush()?; - // 4. Dump the indexes + // 5. Dump the indexes progress.update_progress(DumpCreationProgress::DumpTheIndexes); let nb_indexes = self.index_mapper.index_mapping.len(&rtxn)? as u32; let mut count = 0; @@ -178,7 +187,7 @@ impl IndexScheduler { let documents = index .all_documents(&rtxn) .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; - // 4.1. Dump the documents + // 5.1. Dump the documents for ret in documents { if self.scheduler.must_stop_processing.get() { return Err(Error::AbortedTask); @@ -240,7 +249,7 @@ impl IndexScheduler { atomic.fetch_add(1, Ordering::Relaxed); } - // 4.2. Dump the settings + // 5.2. Dump the settings let settings = meilisearch_types::settings::settings( index, &rtxn, @@ -251,7 +260,7 @@ impl IndexScheduler { Ok(()) })?; - // 5. Dump experimental feature settings + // 6. 
Dump experimental feature settings progress.update_progress(DumpCreationProgress::DumpTheExperimentalFeatures); let features = self.features().runtime_features(); dump.create_experimental_features(features)?; From a051ab3d9ae8ad7bf4262cbf608eb04383a6441d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Jul 2025 12:04:40 +0200 Subject: [PATCH 093/101] Support importing chat completions settings --- crates/dump/src/reader/mod.rs | 9 +++++++++ crates/dump/src/reader/v6/mod.rs | 26 ++++++++++++++++++++++++++ crates/meilisearch/src/lib.rs | 28 +++++++++++++++++----------- 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/crates/dump/src/reader/mod.rs b/crates/dump/src/reader/mod.rs index 2b4440ab7..23e7eec9e 100644 --- a/crates/dump/src/reader/mod.rs +++ b/crates/dump/src/reader/mod.rs @@ -116,6 +116,15 @@ impl DumpReader { } } + pub fn chat_completions_settings( + &mut self, + ) -> Result> + '_>> { + match self { + DumpReader::Current(current) => current.chat_completions_settings(), + DumpReader::Compat(_compat) => Ok(Box::new(std::iter::empty())), + } + } + pub fn features(&self) -> Result> { match self { DumpReader::Current(current) => Ok(current.features()), diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs index 0b4ba5bdd..0c920aadb 100644 --- a/crates/dump/src/reader/v6/mod.rs +++ b/crates/dump/src/reader/v6/mod.rs @@ -1,3 +1,4 @@ +use std::ffi::OsStr; use std::fs::{self, File}; use std::io::{BufRead, BufReader, ErrorKind}; use std::path::Path; @@ -21,6 +22,7 @@ pub type Unchecked = meilisearch_types::settings::Unchecked; pub type Task = crate::TaskDump; pub type Batch = meilisearch_types::batches::Batch; pub type Key = meilisearch_types::keys::Key; +pub type ChatCompletionSettings = meilisearch_types::features::ChatCompletionSettings; pub type RuntimeTogglableFeatures = meilisearch_types::features::RuntimeTogglableFeatures; pub type Network = meilisearch_types::features::Network; @@ -192,6 +194,30 @@ impl V6Reader { ) } + pub fn chat_completions_settings( + &mut self, + ) -> Result> + '_>> { + let entries = fs::read_dir(self.dump.path().join("chat-completions-settings"))?; + Ok(Box::new( + entries + .map(|entry| -> Result> { + let entry = entry?; + let file_name = entry.file_name(); + let path = Path::new(&file_name); + if entry.file_type()?.is_file() && path.extension() == Some(OsStr::new("json")) + { + let name = path.file_stem().unwrap().to_str().unwrap().to_string(); + let file = File::open(entry.path())?; + let settings = serde_json::from_reader(file)?; + Ok(Some((name, settings))) + } else { + Ok(None) + } + }) + .filter_map(|entry| entry.transpose()), + )) + } + pub fn features(&self) -> Option { self.features } diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 871bd688e..b11a4a76d 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -498,14 +498,20 @@ fn import_dump( keys.push(key); } - // 3. Import the runtime features and network + // 3. Import the `ChatCompletionSettings`s. + for result in dump_reader.chat_completions_settings()? { + let (name, settings) = result?; + index_scheduler.put_chat_settings(&name, &settings)?; + } + + // 4. 
Import the runtime features and network let features = dump_reader.features()?.unwrap_or_default(); index_scheduler.put_runtime_features(features)?; let network = dump_reader.network()?.cloned().unwrap_or_default(); index_scheduler.put_network(network)?; - // 3.1 Use all cpus to process dump if `max_indexing_threads` not configured + // 4.1 Use all cpus to process dump if `max_indexing_threads` not configured let backup_config; let base_config = index_scheduler.indexer_config(); @@ -522,7 +528,7 @@ fn import_dump( // /!\ The tasks must be imported AFTER importing the indexes or else the scheduler might // try to process tasks while we're trying to import the indexes. - // 4. Import the indexes. + // 5. Import the indexes. for index_reader in dump_reader.indexes()? { let mut index_reader = index_reader?; let metadata = index_reader.metadata(); @@ -535,20 +541,20 @@ fn import_dump( let mut wtxn = index.write_txn()?; let mut builder = milli::update::Settings::new(&mut wtxn, &index, indexer_config); - // 4.1 Import the primary key if there is one. + // 5.1 Import the primary key if there is one. if let Some(ref primary_key) = metadata.primary_key { builder.set_primary_key(primary_key.to_string()); } - // 4.2 Import the settings. + // 5.2 Import the settings. tracing::info!("Importing the settings."); let settings = index_reader.settings()?; apply_settings_to_builder(&settings, &mut builder); let embedder_stats: Arc = Default::default(); builder.execute(&|| false, &progress, embedder_stats.clone())?; - // 4.3 Import the documents. - // 4.3.1 We need to recreate the grenad+obkv format accepted by the index. + // 5.3 Import the documents. + // 5.3.1 We need to recreate the grenad+obkv format accepted by the index. tracing::info!("Importing the documents."); let file = tempfile::tempfile()?; let mut builder = DocumentsBatchBuilder::new(BufWriter::new(file)); @@ -559,7 +565,7 @@ fn import_dump( // This flush the content of the batch builder. let file = builder.into_inner()?.into_inner()?; - // 4.3.2 We feed it to the milli index. + // 5.3.2 We feed it to the milli index. let reader = BufReader::new(file); let reader = DocumentsBatchReader::from_reader(reader)?; @@ -591,15 +597,15 @@ fn import_dump( index_scheduler.refresh_index_stats(&uid)?; } - // 5. Import the queue + // 6. Import the queue let mut index_scheduler_dump = index_scheduler.register_dumped_task()?; - // 5.1. Import the batches + // 6.1. Import the batches for ret in dump_reader.batches()? { let batch = ret?; index_scheduler_dump.register_dumped_batch(batch)?; } - // 5.2. Import the tasks + // 6.2. Import the tasks for ret in dump_reader.tasks()? 
{
         let (task, file) = ret?;
         index_scheduler_dump.register_dumped_task(task, file)?;

From 6e6fd077d42802057198c523b9b39f4dd8a024e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 3 Jul 2025 13:33:56 +0200
Subject: [PATCH 094/101] Ignore nonexistent chat completions settings folder

---
 crates/dump/src/reader/v6/mod.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/crates/dump/src/reader/v6/mod.rs b/crates/dump/src/reader/v6/mod.rs
index 0c920aadb..449a7e5fe 100644
--- a/crates/dump/src/reader/v6/mod.rs
+++ b/crates/dump/src/reader/v6/mod.rs
@@ -197,7 +197,11 @@ impl V6Reader {
     pub fn chat_completions_settings(
         &mut self,
     ) -> Result<Box<dyn Iterator<Item = Result<(String, ChatCompletionSettings)>> + '_>> {
-        let entries = fs::read_dir(self.dump.path().join("chat-completions-settings"))?;
+        let entries = match fs::read_dir(self.dump.path().join("chat-completions-settings")) {
+            Ok(entries) => entries,
+            Err(e) if e.kind() == ErrorKind::NotFound => return Ok(Box::new(std::iter::empty())),
+            Err(e) => return Err(e.into()),
+        };
         Ok(Box::new(
             entries
                 .map(|entry| -> Result<Option<(String, ChatCompletionSettings)>> {

From 2b75072b0976f6068511dd00fb5e2252ad08280f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 3 Jul 2025 14:04:27 +0200
Subject: [PATCH 095/101] Expose the number of internal chat searches on the
 /metrics route

---
 crates/meilisearch/src/metrics.rs                       | 6 ++++++
 crates/meilisearch/src/routes/chats/chat_completions.rs | 6 ++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/crates/meilisearch/src/metrics.rs b/crates/meilisearch/src/metrics.rs
index 29c1aeae8..1c7d0c3a4 100644
--- a/crates/meilisearch/src/metrics.rs
+++ b/crates/meilisearch/src/metrics.rs
@@ -15,6 +15,12 @@ lazy_static! {
         "Meilisearch number of degraded search requests"
     ))
     .expect("Can't create a metric");
+    pub static ref MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS: IntGauge =
+        register_int_gauge!(opts!(
+            "meilisearch_chat_internal_search_requests",
+            "Meilisearch number of search requests performed by the chat route itself"
+        ))
+        .expect("Can't create a metric");
     pub static ref MEILISEARCH_DB_SIZE_BYTES: IntGauge =
         register_int_gauge!(opts!("meilisearch_db_size_bytes", "Meilisearch DB Size In Bytes"))
             .expect("Can't create a metric");
diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs
index ccbdccbbc..f6030f2bc 100644
--- a/crates/meilisearch/src/routes/chats/chat_completions.rs
+++ b/crates/meilisearch/src/routes/chats/chat_completions.rs
@@ -48,7 +48,9 @@ use crate::analytics::Analytics;
 use crate::error::MeilisearchHttpError;
 use crate::extractors::authentication::policies::ActionPolicy;
 use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _};
-use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS;
+use crate::metrics::{
+    MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS, MEILISEARCH_DEGRADED_SEARCH_REQUESTS,
+};
 use crate::routes::chats::utils::SseEventSender;
 use crate::routes::indexes::search::search_kind;
 use crate::search::{add_search_rules, prepare_search, search_from_kind, SearchQuery};
@@ -286,7 +288,7 @@ async fn process_search_request(
     let output = output?;
     let mut documents = Vec::new();
     if let Ok((ref rtxn, ref search_result)) = output {
-        // aggregate.succeed(search_result);
+        MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS.inc();
         if search_result.degraded {
             MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc();
         }

From 90e6b6416f301c95536b040bb0f9ae302b08a13a Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Thu, 3 Jul
2025 14:35:02 +0200 Subject: [PATCH 096/101] new extractor bugfixes: - fix old_has_fragments - new_is_user_provided is always false when generating fragments, even if no fragment ever matches --- .../src/update/new/extract/vectors/mod.rs | 24 +++++-------------- 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 72a07dea6..4ca68027c 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -357,7 +357,7 @@ impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor> chunks.is_user_provided_must_regenerate(document.docid()); let old_has_fragments = old_embedders .get(embedder_name) - .map(|embedder| embedder.fragments().is_empty()) + .map(|embedder| !embedder.fragments().is_empty()) .unwrap_or_default(); let new_has_fragments = chunks.has_fragments(); @@ -628,9 +628,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { session.on_embed_mut().clear_vectors(docid); } - let mut extracted = false; - let extracted = &mut extracted; - settings_delta.try_for_each_fragment_diff( session.embedder_name(), |fragment_diff| { @@ -660,7 +657,6 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { ); } ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { - *extracted = true; session.request_embedding( metadata, input, @@ -673,13 +669,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { Result::Ok(()) }, )?; - self.set_status( - docid, - old_is_user_provided, - true, - old_is_user_provided & !*extracted, - true, - ); + self.set_status(docid, old_is_user_provided, true, false, true); } ChunkType::DocumentTemplate { document_template, session } => { let doc_alloc = session.doc_alloc(); @@ -732,7 +722,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { where 'a: 'doc, { - let extracted = match &mut self.kind { + match &mut self.kind { ChunkType::DocumentTemplate { document_template, session } => { let doc_alloc = session.doc_alloc(); let ex = DocumentTemplateExtractor::new( @@ -785,7 +775,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { docid, old_is_user_provided, old_must_regenerate, - old_is_user_provided && !extracted, + false, new_must_regenerate, ); @@ -968,7 +958,7 @@ fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>( old_must_regenerate: bool, session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, -) -> Result +) -> Result<()> where OD: Document<'doc> + Debug, ND: Document<'doc> + Debug, @@ -976,7 +966,6 @@ where E::Input: Input, crate::Error: From, { - let mut extracted = false; for extractor in extractors { let new_rendered = extractor.extract(&new_document, meta)?; let must_regenerate = if !old_must_regenerate { @@ -995,7 +984,6 @@ where }; if must_regenerate { - extracted = true; let metadata = Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; @@ -1011,7 +999,7 @@ where } } - Ok(extracted) + Ok(()) } fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>( From 9f0d33ec999920dfdec917aff14604df9f30e6b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Jul 2025 15:05:15 +0200 Subject: [PATCH 097/101] Expose the number of tokens on the chat completions routes --- crates/meilisearch/src/metrics.rs | 5 +++ .../src/routes/chats/chat_completions.rs | 35 ++++++++++++++++--- 2 files changed, 35 insertions(+), 5 
deletions(-) diff --git a/crates/meilisearch/src/metrics.rs b/crates/meilisearch/src/metrics.rs index 1c7d0c3a4..9941bacae 100644 --- a/crates/meilisearch/src/metrics.rs +++ b/crates/meilisearch/src/metrics.rs @@ -21,6 +21,11 @@ lazy_static! { "Meilisearch number of search requests performed by the chat route itself" )) .expect("Can't create a metric"); + pub static ref MEILISEARCH_CHAT_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!( + opts!("meilisearch_chat_tokens_usage", "Meilisearch Chat Tokens Usage"), + &["chat", "model", "type"] + ) + .expect("Can't create a metric"); pub static ref MEILISEARCH_DB_SIZE_BYTES: IntGauge = register_int_gauge!(opts!("meilisearch_db_size_bytes", "Meilisearch DB Size In Bytes")) .expect("Can't create a metric"); diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs index f6030f2bc..a7d878c6e 100644 --- a/crates/meilisearch/src/routes/chats/chat_completions.rs +++ b/crates/meilisearch/src/routes/chats/chat_completions.rs @@ -13,9 +13,9 @@ use async_openai::types::{ ChatCompletionRequestDeveloperMessageContent, ChatCompletionRequestMessage, ChatCompletionRequestSystemMessage, ChatCompletionRequestSystemMessageContent, ChatCompletionRequestToolMessage, ChatCompletionRequestToolMessageContent, - ChatCompletionStreamResponseDelta, ChatCompletionToolArgs, ChatCompletionToolType, - CreateChatCompletionRequest, CreateChatCompletionStreamResponse, FinishReason, FunctionCall, - FunctionCallStream, FunctionObjectArgs, + ChatCompletionStreamOptions, ChatCompletionStreamResponseDelta, ChatCompletionToolArgs, + ChatCompletionToolType, CreateChatCompletionRequest, CreateChatCompletionStreamResponse, + FinishReason, FunctionCall, FunctionCallStream, FunctionObjectArgs, }; use async_openai::Client; use bumpalo::Bump; @@ -49,7 +49,8 @@ use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _}; use crate::metrics::{ - MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS, MEILISEARCH_DEGRADED_SEARCH_REQUESTS, + MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS, MEILISEARCH_CHAT_TOKENS_USAGE, + MEILISEARCH_DEGRADED_SEARCH_REQUESTS, }; use crate::routes::chats::utils::SseEventSender; use crate::routes::indexes::search::search_kind; @@ -490,6 +491,7 @@ async fn streamed_chat( let (tx, rx) = tokio::sync::mpsc::channel(10); let tx = SseEventSender::new(tx); + let workspace_uid = workspace_uid.to_string(); let _join_handle = Handle::current().spawn(async move { let client = Client::with_config(config.clone()); let mut global_tool_calls = HashMap::::new(); @@ -499,6 +501,7 @@ async fn streamed_chat( let output = run_conversation( &index_scheduler, &auth_ctrl, + &workspace_uid, &search_queue, &auth_token, &client, @@ -536,6 +539,7 @@ async fn run_conversation( Data, >, auth_ctrl: &web::Data, + workspace_uid: &str, search_queue: &web::Data, auth_token: &str, client: &Client, @@ -546,12 +550,33 @@ async fn run_conversation( function_support: FunctionSupport, ) -> Result, ()>, SendError> { let mut finish_reason = None; + chat_completion.stream_options = Some(ChatCompletionStreamOptions { include_usage: true }); // safety: unwrap: can only happens if `stream` was set to `false` let mut response = client.chat().create_stream(chat_completion.clone()).await.unwrap(); while let Some(result) = response.next().await { match result { Ok(resp) => { - let choice = &resp.choices[0]; + let choice = 
match resp.choices.get(0) {
+                    Some(choice) => choice,
+                    None => {
+                        if let Some(usage) = resp.usage.as_ref() {
+                            for (r#type, value) in &[
+                                ("prompt", usage.prompt_tokens),
+                                ("completion", usage.completion_tokens),
+                                ("total", usage.total_tokens),
+                            ] {
+                                MEILISEARCH_CHAT_TOKENS_USAGE
+                                    .with_label_values(&[
+                                        workspace_uid,
+                                        &chat_completion.model,
+                                        r#type,
+                                    ])
+                                    .inc_by(*value as u64);
+                            }
+                        }
+                        break;
+                    }
+                };
                 finish_reason = choice.finish_reason;
                 let ChatCompletionStreamResponseDelta { ref tool_calls, .. } = &choice.delta;

From b5e41f0e4612eb4c665994a6a33064a2afac8c02 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 3 Jul 2025 15:18:16 +0200
Subject: [PATCH 098/101] Fix the Mistral incompatibility with the usage of
 OpenAI

---
 .../src/routes/chats/chat_completions.rs | 40 +++++++++----------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs
index a7d878c6e..ea3077e99 100644
--- a/crates/meilisearch/src/routes/chats/chat_completions.rs
+++ b/crates/meilisearch/src/routes/chats/chat_completions.rs
@@ -549,33 +549,33 @@ async fn run_conversation(
     global_tool_calls: &mut HashMap<u32, Call>,
     function_support: FunctionSupport,
 ) -> Result<ControlFlow<Option<FinishReason>, ()>, SendError<Event>> {
+    use DbChatCompletionSource::*;
+
     let mut finish_reason = None;
-    chat_completion.stream_options = Some(ChatCompletionStreamOptions { include_usage: true });
+    chat_completion.stream_options = match source {
+        OpenAi | AzureOpenAi => Some(ChatCompletionStreamOptions { include_usage: true }),
+        Mistral | VLlm => None,
+    };
+
     // safety: unwrap: can only happens if `stream` was set to `false`
     let mut response = client.chat().create_stream(chat_completion.clone()).await.unwrap();
     while let Some(result) = response.next().await {
         match result {
             Ok(resp) => {
-                let choice = match resp.choices.get(0) {
-                    Some(choice) => choice,
-                    None => {
-                        if let Some(usage) = resp.usage.as_ref() {
-                            for (r#type, value) in &[
-                                ("prompt", usage.prompt_tokens),
-                                ("completion", usage.completion_tokens),
-                                ("total", usage.total_tokens),
-                            ] {
-                                MEILISEARCH_CHAT_TOKENS_USAGE
-                                    .with_label_values(&[
-                                        workspace_uid,
-                                        &chat_completion.model,
-                                        r#type,
-                                    ])
-                                    .inc_by(*value as u64);
-                            }
-                        }
-                        break;
+                if let Some(usage) = resp.usage.as_ref() {
+                    for (r#type, value) in &[
+                        ("prompt", usage.prompt_tokens),
+                        ("completion", usage.completion_tokens),
+                        ("total", usage.total_tokens),
+                    ] {
+                        MEILISEARCH_CHAT_TOKENS_USAGE
+                            .with_label_values(&[workspace_uid, &chat_completion.model, r#type])
+                            .inc_by(*value as u64);
                     }
+                }
+                let choice = match resp.choices.first() {
+                    Some(choice) => choice,
+                    None => break,
                 };
                 finish_reason = choice.finish_reason;

From 6397ef12a0f42aee7255b046109ed2a63f3f34d9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Thu, 3 Jul 2025 15:56:56 +0200
Subject: [PATCH 099/101] Use three metrics for the three different tokens

---
 crates/meilisearch/src/metrics.rs            | 20 +++++++++++++++---
 .../src/routes/chats/chat_completions.rs | 21 ++++++++++---------
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/crates/meilisearch/src/metrics.rs b/crates/meilisearch/src/metrics.rs
index 9941bacae..2207e69ff 100644
--- a/crates/meilisearch/src/metrics.rs
+++ b/crates/meilisearch/src/metrics.rs
@@ -21,9 +21,23 @@ lazy_static!
{ "Meilisearch number of search requests performed by the chat route itself" )) .expect("Can't create a metric"); - pub static ref MEILISEARCH_CHAT_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!( - opts!("meilisearch_chat_tokens_usage", "Meilisearch Chat Tokens Usage"), - &["chat", "model", "type"] + pub static ref MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!( + opts!("meilisearch_chat_prompt_tokens_usage", "Meilisearch Chat Prompt Tokens Usage"), + &["workspace", "model"] + ) + .expect("Can't create a metric"); + pub static ref MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE: IntCounterVec = + register_int_counter_vec!( + opts!( + "meilisearch_chat_completion_tokens_usage", + "Meilisearch Chat Completion Tokens Usage" + ), + &["workspace", "model"] + ) + .expect("Can't create a metric"); + pub static ref MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!( + opts!("meilisearch_chat_total_tokens_usage", "Meilisearch Chat Total Tokens Usage"), + &["workspace", "model"] ) .expect("Can't create a metric"); pub static ref MEILISEARCH_DB_SIZE_BYTES: IntGauge = diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs index ea3077e99..9d132a96f 100644 --- a/crates/meilisearch/src/routes/chats/chat_completions.rs +++ b/crates/meilisearch/src/routes/chats/chat_completions.rs @@ -49,7 +49,8 @@ use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _}; use crate::metrics::{ - MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS, MEILISEARCH_CHAT_TOKENS_USAGE, + MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE, MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS, + MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE, MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE, MEILISEARCH_DEGRADED_SEARCH_REQUESTS, }; use crate::routes::chats::utils::SseEventSender; @@ -563,15 +564,15 @@ async fn run_conversation( match result { Ok(resp) => { if let Some(usage) = resp.usage.as_ref() { - for (r#type, value) in &[ - ("prompt", usage.prompt_tokens), - ("completion", usage.completion_tokens), - ("total", usage.total_tokens), - ] { - MEILISEARCH_CHAT_TOKENS_USAGE - .with_label_values(&[workspace_uid, &chat_completion.model, r#type]) - .inc_by(*value as u64); - } + MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE + .with_label_values(&[workspace_uid, &chat_completion.model]) + .inc_by(usage.prompt_tokens as u64); + MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE + .with_label_values(&[workspace_uid, &chat_completion.model]) + .inc_by(usage.completion_tokens as u64); + MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE + .with_label_values(&[workspace_uid, &chat_completion.model]) + .inc_by(usage.total_tokens as u64); } let choice = match resp.choices.first() { Some(choice) => choice, From 32dede35c75d91f63da3e6b6935665ac1d2bd941 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 3 Jul 2025 15:59:14 +0200 Subject: [PATCH 100/101] Update snapshots --- .../upgrade_failure/after_processing_everything.snap | 4 ++-- .../upgrade_failure/register_automatic_upgrade_task.snap | 2 +- .../registered_a_task_while_the_upgrade_task_is_enqueued.snap | 2 +- .../test_failure.rs/upgrade_failure/upgrade_task_failed.snap | 4 ++-- .../upgrade_failure/upgrade_task_failed_again.snap | 4 ++-- .../upgrade_failure/upgrade_task_succeeded.snap | 4 ++-- crates/meilisearch/tests/upgrade/mod.rs | 4 ++-- ...ches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap | 2 
+- ...ches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap | 2 +- ...tches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap | 2 +- ...asks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap | 2 +- ...asks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap | 2 +- ...tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap | 2 +- ..._whole_batch_queue_once_everything_has_been_processed.snap | 2 +- ...e_whole_task_queue_once_everything_has_been_processed.snap | 2 +- 15 files changed, 20 insertions(+), 20 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap index ee18cd1db..0b5d4409d 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/after_processing_everything.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} 3 {uid: 3, batch_uid: 3, status: failed, error: ResponseError { code: 200, message: "Index `doggo` already exists.", error_code: "index_already_exists", error_type: "invalid_request", error_link: "https://docs.meilisearch.com/errors#index_already_exists" }, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} @@ -57,7 +57,7 @@ girafo: { number_of_documents: 0, field_distribution: {} } [timestamp] [4,] ---------------------------------------------------------------------- ### All Batches: -0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.15.2"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", } +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.16.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", } 1 {uid: 1, details: {"primaryKey":"mouse"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"catto":1}}, stop reason: "created batch containing only task with id 1 of type `indexCreation` that cannot be batched with any other task.", } 2 {uid: 2, details: {"primaryKey":"bone"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, stop reason: "created batch containing only task with id 2 of type `indexCreation` that cannot be batched with any other task.", } 3 {uid: 3, details: {"primaryKey":"bone"}, stats: 
{"totalNbTasks":1,"status":{"failed":1},"types":{"indexCreation":1},"indexUids":{"doggo":1}}, stop reason: "created batch containing only task with id 3 of type `indexCreation` that cannot be batched with any other task.", } diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap index 9fa30ee2a..0bfb9c6da 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/register_automatic_upgrade_task.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap index 162798cad..8d374479b 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/registered_a_task_while_the_upgrade_task_is_enqueued.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, status: enqueued, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap index 8f615cb1c..9fc28abbe 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned 
failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} ---------------------------------------------------------------------- ### Status: @@ -37,7 +37,7 @@ catto [1,] [timestamp] [0,] ---------------------------------------------------------------------- ### All Batches: -0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.15.2"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", } +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.16.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", } ---------------------------------------------------------------------- ### Batch to tasks mapping: 0 [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap index a5f9be6e1..33ddf7193 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_failed_again.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, batch_uid: 0, status: failed, error: ResponseError { code: 200, message: "Planned failure for tests.", error_code: "internal", error_type: "internal", error_link: "https://docs.meilisearch.com/errors#internal" }, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} 2 {uid: 2, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} ---------------------------------------------------------------------- @@ -40,7 +40,7 @@ doggo [2,] [timestamp] [0,] ---------------------------------------------------------------------- ### All Batches: -0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.15.2"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", } +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.16.0"}, stats: {"totalNbTasks":1,"status":{"failed":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped 
after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", } ---------------------------------------------------------------------- ### Batch to tasks mapping: 0 [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap index eb738d626..05d366d1e 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_failure.rs/upgrade_failure/upgrade_task_succeeded.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_failure.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 15, 2) }, kind: UpgradeDatabase { from: (1, 12, 0) }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { from: (1, 12, 0), to: (1, 16, 0) }, kind: UpgradeDatabase { from: (1, 12, 0) }} 1 {uid: 1, status: enqueued, details: { primary_key: Some("mouse") }, kind: IndexCreation { index_uid: "catto", primary_key: Some("mouse") }} 2 {uid: 2, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} 3 {uid: 3, status: enqueued, details: { primary_key: Some("bone") }, kind: IndexCreation { index_uid: "doggo", primary_key: Some("bone") }} @@ -43,7 +43,7 @@ doggo [2,3,] [timestamp] [0,] ---------------------------------------------------------------------- ### All Batches: -0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.15.2"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", } +0 {uid: 0, details: {"upgradeFrom":"v1.12.0","upgradeTo":"v1.16.0"}, stats: {"totalNbTasks":1,"status":{"succeeded":1},"types":{"upgradeDatabase":1},"indexUids":{}}, stop reason: "stopped after the last task of type `upgradeDatabase` because they cannot be batched with tasks of any other type.", } ---------------------------------------------------------------------- ### Batch to tasks mapping: 0 [0,] diff --git a/crates/meilisearch/tests/upgrade/mod.rs b/crates/meilisearch/tests/upgrade/mod.rs index 4faa7e0c0..8114ed58b 100644 --- a/crates/meilisearch/tests/upgrade/mod.rs +++ b/crates/meilisearch/tests/upgrade/mod.rs @@ -43,7 +43,7 @@ async fn version_too_old() { std::fs::write(db_path.join("VERSION"), "1.11.9999").unwrap(); let options = Opt { experimental_dumpless_upgrade: true, ..default_settings }; let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err(); - snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. Please generate a dump using the v1.11.9999 and import it in the v1.15.2"); + snapshot!(err, @"Database version 1.11.9999 is too old for the experimental dumpless upgrade feature. 
Please generate a dump using the v1.11.9999 and import it in the v1.16.0"); } #[actix_rt::test] @@ -58,7 +58,7 @@ async fn version_requires_downgrade() { std::fs::write(db_path.join("VERSION"), format!("{major}.{minor}.{patch}")).unwrap(); let options = Opt { experimental_dumpless_upgrade: true, ..default_settings }; let err = Server::new_with_options(options).await.map(|_| ()).unwrap_err(); - snapshot!(err, @"Database version 1.15.3 is higher than the Meilisearch version 1.15.2. Downgrade is not supported"); + snapshot!(err, @"Database version 1.16.1 is higher than the Meilisearch version 1.16.0. Downgrade is not supported"); } #[actix_rt::test] diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap index 4355b9213..f4edae51b 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "progress": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.15.2" + "upgradeTo": "v1.16.0" }, "stats": { "totalNbTasks": 1, diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap index 4355b9213..f4edae51b 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "progress": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.15.2" + "upgradeTo": "v1.16.0" }, "stats": { "totalNbTasks": 1, diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap index 4355b9213..f4edae51b 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/batches_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "progress": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.15.2" + "upgradeTo": "v1.16.0" }, "stats": { "totalNbTasks": 1, diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap 
b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap index ebe246ee5..01d2ea341 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterEnqueuedAt_equal_2025-01-16T16_47_41.snap @@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "canceledBy": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.15.2" + "upgradeTo": "v1.16.0" }, "error": null, "duration": "[duration]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap index ebe246ee5..01d2ea341 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterFinishedAt_equal_2025-01-16T16_47_41.snap @@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "canceledBy": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.15.2" + "upgradeTo": "v1.16.0" }, "error": null, "duration": "[duration]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap index ebe246ee5..01d2ea341 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/tasks_filter_afterStartedAt_equal_2025-01-16T16_47_41.snap @@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "canceledBy": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.15.2" + "upgradeTo": "v1.16.0" }, "error": null, "duration": "[duration]", diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap index c2d7967f0..fb62b35da 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_batch_queue_once_everything_has_been_processed.snap @@ -8,7 +8,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "progress": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.15.2" + "upgradeTo": "v1.16.0" }, "stats": { "totalNbTasks": 1, diff --git a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap 
b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap index 52da67fef..abb4dcdd9 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap +++ b/crates/meilisearch/tests/upgrade/v1_12/snapshots/v1_12_0.rs/check_the_index_scheduler/the_whole_task_queue_once_everything_has_been_processed.snap @@ -12,7 +12,7 @@ source: crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs "canceledBy": null, "details": { "upgradeFrom": "v1.12.0", - "upgradeTo": "v1.15.2" + "upgradeTo": "v1.16.0" }, "error": null, "duration": "[duration]", From a76a3e8f118a48d1bd57775cf9d509e8374305e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Jul 2025 16:01:31 +0200 Subject: [PATCH 101/101] Change the metric name for the search to use a label --- crates/meilisearch/src/metrics.rs | 12 +++++++----- .../meilisearch/src/routes/chats/chat_completions.rs | 6 +++--- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/crates/meilisearch/src/metrics.rs b/crates/meilisearch/src/metrics.rs index 2207e69ff..d52e04cc6 100644 --- a/crates/meilisearch/src/metrics.rs +++ b/crates/meilisearch/src/metrics.rs @@ -15,12 +15,14 @@ lazy_static! { "Meilisearch number of degraded search requests" )) .expect("Can't create a metric"); - pub static ref MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS: IntGauge = - register_int_gauge!(opts!( - "meilisearch_chat_internal_search_requests", + pub static ref MEILISEARCH_CHAT_SEARCH_REQUESTS: IntCounterVec = register_int_counter_vec!( + opts!( + "meilisearch_chat_search_requests", "Meilisearch number of search requests performed by the chat route itself" - )) - .expect("Can't create a metric"); + ), + &["type"] + ) + .expect("Can't create a metric"); pub static ref MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE: IntCounterVec = register_int_counter_vec!( opts!("meilisearch_chat_prompt_tokens_usage", "Meilisearch Chat Prompt Tokens Usage"), &["workspace", "model"] diff --git a/crates/meilisearch/src/routes/chats/chat_completions.rs b/crates/meilisearch/src/routes/chats/chat_completions.rs index 9d132a96f..4f7087ae8 100644 --- a/crates/meilisearch/src/routes/chats/chat_completions.rs +++ b/crates/meilisearch/src/routes/chats/chat_completions.rs @@ -49,8 +49,8 @@ use crate::error::MeilisearchHttpError; use crate::extractors::authentication::policies::ActionPolicy; use crate::extractors::authentication::{extract_token_from_request, GuardedData, Policy as _}; use crate::metrics::{ - MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE, MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS, - MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE, MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE, + MEILISEARCH_CHAT_COMPLETION_TOKENS_USAGE, MEILISEARCH_CHAT_PROMPT_TOKENS_USAGE, + MEILISEARCH_CHAT_SEARCH_REQUESTS, MEILISEARCH_CHAT_TOTAL_TOKENS_USAGE, MEILISEARCH_DEGRADED_SEARCH_REQUESTS, }; use crate::routes::chats::utils::SseEventSender; @@ -290,7 +290,7 @@ async fn process_search_request( let output = output?; let mut documents = Vec::new(); if let Ok((ref rtxn, ref search_result)) = output { - MEILISEARCH_CHAT_INTERNAL_SEARCH_REQUESTS.inc(); + MEILISEARCH_CHAT_SEARCH_REQUESTS.with_label_values(&["internal"]).inc(); if search_result.degraded { MEILISEARCH_DEGRADED_SEARCH_REQUESTS.inc(); }
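The patch above replaces a dedicated gauge with a single counter vector distinguished by a `type` label. As a standalone illustration of that pattern (not part of the patch), the following sketch uses the same `prometheus` and `lazy_static` crates; the `external` label value is a hypothetical second source added only to show how the label axis extends, and `main` is illustrative scaffolding.

use lazy_static::lazy_static;
use prometheus::{opts, register_int_counter_vec, IntCounterVec};

lazy_static! {
    // One metric name; the "type" label distinguishes the request source.
    static ref CHAT_SEARCH_REQUESTS: IntCounterVec = register_int_counter_vec!(
        opts!(
            "meilisearch_chat_search_requests",
            "Meilisearch number of search requests performed by the chat route itself"
        ),
        &["type"]
    )
    .expect("Can't create a metric");
}

fn main() {
    // Each label value becomes its own monotonically increasing time series
    // under the same metric name.
    CHAT_SEARCH_REQUESTS.with_label_values(&["internal"]).inc();
    CHAT_SEARCH_REQUESTS.with_label_values(&["external"]).inc(); // hypothetical second source
}

Two properties of this shape are worth noting: adding a future request source only requires passing a new label value rather than registering a new metric, and a counter (which only increases) models request totals more idiomatically than the gauge it replaces.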